前言:之前写过一篇关于爬取数据的文章,但当时只能在命令行里进行爬取。如果想在网页上进行操作,就需要用Django提供接口,再配合前端页面来完成,那么我们开始吧!(本文仅用于教学,若用于商业用途,后果自负,作者概不承担责任!)
一:爬虫端
既然前面已经讲过如何爬取了,那么我们就直接看代码:
# Crawl one NetEase Cloud Music chart page and save every song it links to as
# an .mp3 file in the local static directory.
# `data['ids']` is the chart id sent by the front end (covered later in the article).
url = 'https://music.163.com/discover/toplist?id=%s' % data['ids']
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}
down_url = 'https://music.163.com/song/media/outer/url?id='

# Single source of truth for the download directory (placeholder: replace with
# the real absolute path of the project's static folder). The original checked
# one path but wrote to another.
save_dir = r'绝对路径static'
# Create the directory once, before the loop. Previously the directory was
# created inside the loop with the download in the `else` branch, so the song
# being processed when the directory was first created was never saved.
os.makedirs(save_dir, exist_ok=True)

respone = requests.get(url, headers=head)
html = etree.HTML(respone.text)
# Every anchor whose href points at a song page.
id_list = html.xpath('//a[contains(@href,"song?")]')

# Number of files opened for writing so far.
i = 0
for link in id_list:
    href = link.xpath('./@href')[0]
    music_id = href.split('=')[1]
    # Ids containing "$" are skipped (presumably unrendered template
    # placeholders rather than real song ids -- confirm against the page).
    if "$" in music_id:
        continue
    titles = link.xpath('./text()')
    if not titles:
        # Anchor without text: there is no song name to use as a file name.
        continue
    music_name = titles[0]
    music_url = down_url + music_id
    music = requests.get(url=music_url, headers=head)
    try:
        with open(os.path.join(save_dir, '%s.mp3' % music_name), "wb") as f:
            i += 1
            print("正在下载第 %i 首歌曲 歌名为: 《%s》 ..." % (i, music_name))
            f.write(music.content)
    except OSError as e:
        # E.g. the song name contains characters that are invalid in a file
        # name; report it and continue with the next song.
        print('OSError', e)
结果:
二:数据库以及node.js
1.数据库
数据库采用MySQL,用于存储我们歌曲榜单的名称以及ID:
2:node.js
采用node.js来读取数据库中的内容并返回给页面显示。榜单那么多,我们不可能记住每一个ID,更不可能在页面上手动输入:
- 封装的MySql模块
const mysql = require("mysql");
// Connection settings for the development environment (local MySQL server).
const devConfig = {
  host: 'localhost',
  user: 'root',
  password: '',
  port: '3306',
  database: 'growup'
}

// Export one connection object so every module that requires this file
// shares the same connection.
module.exports = mysql.createConnection(devConfig)
- node模块(music模块)
// 导入express模块
const express = require('express')
const router = express.Router()
// 导入mysql
const connet = require('./setmysql')
// GET /getmusic: return every row of music_table.
// Response body is { code: 200, result: rows } on success and
// { code: 400, result: <error message> } when the query fails.
router.get('/getmusic', (req, res) => {
  const sql = 'SELECT * FROM music_table'

  connet.query(sql, (err, result) => {
    // Guard clause: report the failure and stop.
    if (err) {
      console.log('[SELECT ERROR] - ', err.message)
      res.send({ code: 400, result: err.message })
      return
    }

    res.send({ code: 200, result: result })
  })
})

module.exports = router
- 主模块(main)
// Entry point of the Node service: configures middleware, mounts the music
// router and starts listening on port 3333.
const express = require('express');
const bodyParser = require('body-parser');
const app = express();
const cors = require('cors');

// Enable CORS for all routes.
app.use(cors());

// Parse application/x-www-form-urlencoded request bodies.
app.use(bodyParser.urlencoded({
  extended: false
}));

// Parse application/json request bodies.
app.use(bodyParser.json());

// Extra cross-origin headers applied to every request.
// The original also forced `Content-Type: application/x-www-form-urlencoded`
// on every RESPONSE. Express only sets `application/json` in res.json() (which
// res.send(object) delegates to) when no Content-Type is present, so JSON
// bodies went out mislabelled. That header is removed here so Express can pick
// the correct type.
app.all('*', function (req, res, next) {
  res.header('Access-Control-Allow-Origin', '*');
  res.header('Access-Control-Allow-Headers', 'X-Requested-With');
  res.header('Access-Control-Allow-Methods', 'PUT,POST,GET,DELETE,OPTIONS');
  res.header('X-Powered-By', ' 3.2.1');
  next();
});

// Mount the music router.
const music = require('./music');
app.use(music);

// Start the server on all interfaces. The listen callback receives no
// arguments, so the unused `res` parameter was dropped.
app.listen(3333, '0.0.0.0', function () {
  console.log('Server start on http://0.0.0.0:3333');
});
三:前端
首先我们看一下页面的显示效果:
逻辑梳理
- 点击"任务下发",开始进行数据库查询,返回查询结果;选中想要爬取的榜单,那么它对应的value会被拿到,也就是数据库里面的id;
- 选中后点击"执行任务",会把数据提交到后台进行爬取,爬取完之后将结果返回;
- 对返回的结果进行过滤并显示数据。
我们使用vue.js来进行前端页面编写,代码如下:
<template>
<div class="music">
<!-- Crawler page: header bar, two-column result area, and a dialog for choosing the chart to crawl. -->
<!-- Header: shows musiclist[0] as the song count, a note about duplicate data, and the button that opens the dialog (gotask) -->
<div class="music_head">
<span style="float: left;margin-left: 20px;color:black">网易云爬取(目前歌曲:<b>{{musiclist[0]}}</b>首)</span>
<span style="margin-left: 10px;color: red">注:因为可能存在重复数据,所以总数据可能不会变化</span>
<el-button type="primary" size="small" style="float: right;margin-top: 10px;margin-right: 20px;" @click="gotask">任务下发</el-button>
</div>
<!-- Result area: numbered names on the left, audio players on the right -->
<div class="music_bottom">
<div class="bottom_left">
<ul>
<!-- slice(33) drops the first 33 characters of each entry; get() in the script prefixes every entry with 'http://192.168.1.107:8000/static/', which is 33 characters long -->
<li v-for="(item,index) in musictitle[2]" :key="index">{{index+1}}、{{item.slice(33)}}</li>
</ul>
</div>
<div class="bottom_right">
<ul>
<!-- One audio player per entry of musiclist[2]; each entry is used directly as the src URL -->
<li v-for="(item1,index1) in musiclist[2]" :key="index1">
<audio style="margin-top: 30px;" :src="item1" controls="controls"></audio>
</li>
</ul>
</div>
</div>
<!-- Dialog: select a chart from musicname, then run the crawl (tasksend) -->
<el-dialog :append-to-body="true" title="进行爬取" :visible.sync="dialogVisible" width="30%">
<table cellpadding="50px" cellspacing="30px" style="width:100%;text-align: center;border:1px solid black">
<tr>
<td>选择榜名:</td>
<td>
<!-- The selected option's muscid is stored in `value` -->
<el-select v-model="value" clearable placeholder="请选择">
<el-option v-for="item in musicname" :key="item.muscid" :label="item.type" :value="item.muscid">
</el-option>
</el-select>
</td>
</tr>
</table>
<span slot="footer" class="dialog-footer">
<el-button type="primary" @click="tasksend">执行任务</el-button>
</span>
</el-dialog>
</div>
</template>
<script>
/**
 * Crawler page component.
 *
 * Loads the chart list from the Node service (port 3333), asks the Django
 * service (port 8000) to crawl the selected chart, and displays the files the
 * Django service reports.
 *
 * `this.$get` / `this.$post` are not defined in this file; presumably they are
 * project-wide HTTP helpers that resolve with the response body (confirm).
 */
export default {
  name: 'music',
  data () {
    return {
      // Whether the "crawl" dialog is visible
      dialogVisible: false,
      // Chart rows returned by /getmusic (options of the select)
      musicname: [],
      // Currently selected chart id (v-model of the select)
      value: '',
      // User tasks
      task: [],
      // Pagination state
      currentPage: 1,
      size: 10,
      // Status text
      status: '进行中...',
      // Response of /seemusic/; index 0 is shown as the song count and
      // index 2 is the list of playable URLs
      musiclist: [],
      title: '',
      // Same response object as musiclist; index 2 is rendered as the name list
      musictitle: [],
      // Number of songs reported by the last crawl
      count: ''
    }
  },
  created () {
    // Load the already-downloaded songs as soon as the component is created.
    this.get()
  },
  components: {},
  mounted () {},
  methods: {
    // Show the generic failure toast shared by every request in this component.
    showServiceError () {
      this.$message({
        type: 'error',
        message: '服务异常,请稍后重试'
      })
    },
    // Open the dialog and load the chart list from the Node service.
    gotask () {
      this.dialogVisible = true
      this.$get('http://192.168.1.107:3333/getmusic').then(res => {
        console.log(res)
        if (res.code === 200) {
          this.musicname = res.result
        } else {
          this.showServiceError()
        }
      }).catch(() => {
        this.showServiceError()
      })
    },
    // Load the downloaded-song listing from the Django service.
    get () {
      this.$get('http://192.168.1.107:8000/seemusic/').then(res => {
        this.musictitle = res
        // Turn each file name into a playable URL IN PLACE. musictitle and
        // musiclist deliberately share this array: the template strips the
        // 33-character prefix again with slice(33) to show the bare name.
        for (let i = 0; i < res[2].length; i++) {
          res[2][i] = 'http://192.168.1.107:8000/static/' + res[2][i]
        }
        console.log(res)
        this.musiclist = res
        console.log(this.musiclist.length)
      }).catch(() => {
        this.showServiceError()
      })
    },
    // Submit the selected chart id to the Django service for crawling.
    tasksend () {
      const loading = this.$loading({
        lock: true,
        text: '数据量可能比较大,正在加载...',
        spinner: 'el-icon-loading',
        background: 'rgba(0, 0, 0, 0.7)'
      })
      this.dialogVisible = false
      const payload = {
        ids: Number(this.value)
      }
      this.$post('http://192.168.1.107:8000/music/', payload).then(res => {
        console.log(res)
        loading.close()
        // The service answers with the text "ok<N>"; the type check keeps a
        // non-string response from throwing on indexOf.
        if (typeof res === 'string' && res.indexOf('ok') > -1) {
          this.count = res.slice(2)
          this.get()
          this.$message({
            type: 'success',
            message: `爬取${this.count}首`
          })
        } else {
          this.showServiceError()
        }
      }).catch(() => {
        // Previously a rejected request had no handler, so the locking
        // full-screen loading overlay was never closed.
        loading.close()
        this.showServiceError()
      })
    },
    // Row action handler; currently only logs its arguments.
    handleSee (index, row) {
      console.log(index, row)
    },
    // Remember the page selected in the pager.
    handleCurrentChange (val) {
      this.currentPage = val
    },
    // Format a timestamp as "YYYY-MM-DD hh:mm:ss" in local time, zero-padding
    // every field except the year to two digits.
    timestampToTime (timestamp) {
      const date = new Date(timestamp)
      const pad = part => part.toString().padStart(2, '0')
      const day = date.getFullYear() + '-' + pad(date.getMonth() + 1) + '-' + pad(date.getDate())
      const time = pad(date.getHours()) + ':' + pad(date.getMinutes()) + ':' + pad(date.getSeconds())
      return day + ' ' + time
    }
  }
}
</script>
<style scoped>
/* Scoped styles for the crawler page: a fixed 900x650 card with a 50px header
   and a two-column, scrollable content area. */
.music {
width: 900px;
height: 650px;
/* background-color: aquamarine; */
margin: 0 auto;
box-shadow: -5px 0 5px #d7eafc, 5px 0 5px #d7eafc, 0 5px 5px #a7ccf0;
}
/* Header bar */
.music_head {
width: 100%;
height: 50px;
/* background-color: pink; */
line-height: 50px;
}
/* Content area */
.music_bottom {
width: 100%;
height: 600px;
border-top: 1px solid black;
/* background-color: antiquewhite; */
display: flex;
overflow: auto;
}
/* Left column (song names) */
.bottom_left {
width: 550px;
height: 100%;
/* background-color: aqua; */
}
/* Each name occupies one 100px row; overflowing text is cut with an ellipsis */
.bottom_left ul li {
height: 100px;
line-height: 100px;
text-align: left;
text-overflow: ellipsis;
white-space: nowrap;
overflow: hidden;
color: black;
}
/* Right column (audio players) */
.bottom_right {
width: 350px;
height: 100%;
/* background-color: pink; */
}
/* Same 100px row height as the left column */
.bottom_right ul li {
height: 100px;
line-height: 100px;
text-align: center;
}
/* Pagination bar */
.ym {
width: 100%;
height: 34px;
/* background-color: skyblue; */
text-align: center;
}
/* Dialog style overrides; /deep/ is used so these scoped rules reach the el-dialog elements */
/deep/ .el-dialog .el-dialog__header {
background-color: #26639c;
}
/deep/ .el-dialog .el-dialog__header .el-dialog__title {
color: #ffffff;
font-size: 20px;
padding: 15px 20px 10px;
}
/deep/ .el-dialog .el-dialog__header .el-dialog__headerbtn {
top: 18px;
border-radius: 50%;
width: 22px;
height: 21px;
background-color: #ffffff;
}
</style>
四:Django模块
我们数据的爬取以及结果返回运用python来解决,通过Django来进行接口的编写:
- 爬取模块
def music(request):
    """Crawl the chart requested by the front end and save its songs as .mp3 files.

    The request body must be UTF-8 encoded JSON containing an ``ids`` field
    (the chart id). Every song anchor found on the chart page is downloaded
    into the static directory.

    Returns an ``HttpResponse`` whose text is ``ok<N>``, where ``N`` is the
    number of files opened for writing.
    """
    data = request.body.decode(encoding="utf8")
    data = json.loads(data)
    print(data['ids'])
    time.sleep(1)

    url = 'https://music.163.com/discover/toplist?id=%s' % data['ids']
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    }
    down_url = 'https://music.163.com/song/media/outer/url?id='

    # Placeholder path: replace with the real location of the static folder.
    save_dir = r'路径static'
    # Create the directory once, before the loop. Previously it was created
    # inside the loop with the download in the `else` branch, so the song being
    # processed when the directory was first created was never saved.
    os.makedirs(save_dir, exist_ok=True)

    respone = requests.get(url, headers=head)
    html = etree.HTML(respone.text)
    # Every anchor whose href points at a song page.
    id_list = html.xpath('//a[contains(@href,"song?")]')

    # Number of files opened for writing so far.
    i = 0
    for link in id_list:
        href = link.xpath('./@href')[0]
        music_id = href.split('=')[1]
        # Ids containing "$" are skipped (presumably unrendered template
        # placeholders rather than real song ids -- confirm against the page).
        if "$" in music_id:
            continue
        titles = link.xpath('./text()')
        if not titles:
            # Anchor without text: there is no song name to use as a file name.
            continue
        music_name = titles[0]
        music_url = down_url + music_id
        # Named song_response so the local does not shadow this view's own name.
        song_response = requests.get(url=music_url, headers=head)
        try:
            with open(os.path.join(save_dir, '%s.mp3' % music_name), "wb") as f:
                i += 1
                print("正在下载第 %i 首歌曲 歌名为: 《%s》 ..." % (i, music_name))
                f.write(song_response.content)
        except OSError as e:
            # E.g. the song name contains characters that are invalid in a
            # file name; report it and continue with the next song.
            print('OSError', e)
    return HttpResponse('ok{0}'.format(i))
- 结果返回处理模块
也就是在歌曲存储到文件夹之后,列出文件夹中的内容并返回给前端,前端再通过静态文件的方式进行访问:
def seemusic(request):
    """List the static directory and report which files are large enough to keep.

    Returns a ``JsonResponse`` wrapping ``[total, kept_count, kept_names]``:
    the number of directory entries, the number of entries larger than
    94386 bytes, and the names of those entries.
    """
    kept_names = []
    total = 0
    for total, filename in enumerate(os.listdir(r'路径static'), start=1):
        print(filename)
        # Files of 94386 bytes or less are treated as empty downloads and left out.
        if os.path.getsize(r'路径static/' + filename) > 94386:
            kept_names.append(filename)
    data = [total, len(kept_names), kept_names]
    print(data)
    return JsonResponse(data, safe=False)
五:结果预览
人生从来没有真正的绝境。无论遭受多少艰辛,无论经历多少苦难,只要一个人的心中还怀着一粒信念的种子,那么总有一天,他就能走出困境,让生命重新开花结果。
本文摘自 :https://blog.51cto.com/u