当前位置:首页 > IT技术 > Web编程 > 正文

Django+node.js+Vue.js进行数据爬虫
2021-09-15 15:06:56

前言:之前写过一篇关于爬取数据的文章,但那只能在命令行里运行。如果想在网页上进行操作,就需要用Django来提供接口,那么我们开始吧!(本文仅用于教学,若用于商业用途,作者概不承担责任!)

Django+node.js+Vue.js进行数据爬虫_node.js

 

 


一:爬虫端

既然前面已经讲过如何爬取了,那么我们就直接看代码:

# Excerpt from the body of the Django view: `data` is the decoded JSON request
# payload, and data['ids'] is the chart id sent by the front end.
url = 'https://music.163.com/discover/toplist?id=%s' % data['ids']
head = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36'
}
down_url = 'https://music.163.com/song/media/outer/url?id='

# Directory the MP3 files are written to.
# NOTE(review): reconstructed from a garbled literal (the backslash escapes had
# been turned into TAB characters) -- confirm the real absolute path.
save_dir = r'D:\twitternodeface\twitternodeface\static'
# Create the directory once, up front. The previous `if not exists: mkdir /
# else: write` shape skipped the song being processed when the directory had
# just been created.
os.makedirs(save_dir, exist_ok=True)

respone = requests.get(url, headers=head)
html = etree.HTML(respone.text)
# Every anchor whose href points at a song page.
song_links = html.xpath('//a[contains(@href,"song?")]')

i = 0  # running number of the song being downloaded
for link in song_links:
    href = link.xpath('./@href')[0]
    music_id = href.split('=')[1]
    # Ids containing "$" are skipped (presumably unrendered template
    # placeholders -- TODO confirm).
    if "$" in music_id:
        continue
    music_name = link.xpath('./text()')[0]
    music_url = down_url + music_id
    music = requests.get(url=music_url, headers=head)
    try:
        with open(os.path.join(save_dir, '%s.mp3' % music_name), "wb") as f:
            i += 1
            print("正在下载第 %i 首歌曲 歌名为: 《%s》 ..." % (i, music_name))
            f.write(music.content)
    except OSError as e:
        # Best effort: report the failure and continue with the next song.
        print('OSError', e)

结果:
Django+node.js+Vue.js进行数据爬虫_vue.js_02


二:数据库以及node.js

1.数据库

数据库采用MySql,用于存储我们歌曲的名称以及ID:
Django+node.js+Vue.js进行数据爬虫_django_03


2.node.js

我们用node.js查询数据库,并把榜单内容提供给页面显示:榜单那么多,我们不可能记住每一个ID,也不可能在页面上手动输入:

  • 封装的MySql模块
const mysql = require("mysql");

// Connection settings for the development environment.
const devConfig = {
    host: 'localhost',
    user: 'root',
    password: '',
    port: '3306',
    database: 'growup'
};

// Single shared connection object, exported for the route modules.
const connection = mysql.createConnection(devConfig);

module.exports = connection
  • node模块(music模块)
// Express router for the music endpoints
const express = require('express')
const router = express.Router()
// Shared MySQL connection
const db = require('./setmysql')

// GET /getmusic: responds with every row of music_table.
// The body is { code: 200, result: rows } on success and
// { code: 400, result: <error message> } when the query fails.
router.get('/getmusic', (req, res) => {
  const sql = 'SELECT * FROM music_table'
  db.query(sql, (err, rows) => {
    if (err) {
      console.log('[SELECT ERROR] - ', err.message);
      res.send({ code: 400, result: err.message })
      return
    }
    res.send({ code: 200, result: rows })
  })
})
module.exports = router
  • 主模块(main)
const express = require('express')
const bodyParser = require('body-parser')
const app = express()
const cors = require('cors');

// Enable CORS for every route
app.use(cors());
// parse application/x-www-form-urlencoded
app.use(bodyParser.urlencoded({
    extended: false
}));
// parse application/json
app.use(bodyParser.json());
// Additional cross-origin response headers, applied to every request
app.all('*', function (req, res, next) {
    res.header("Access-Control-Allow-Origin", "*");
    res.header("Access-Control-Allow-Headers", "X-Requested-With");
    res.header("Access-Control-Allow-Methods", "PUT,POST,GET,DELETE,OPTIONS");
    res.header("X-Powered-By", ' 3.2.1')
    // Content-Type is intentionally NOT preset here. res.send()/res.json()
    // only apply their own default type when none is set, so presetting
    // "application/x-www-form-urlencoded" mislabeled every JSON response.
    next();
})
// Mount the route modules
const music = require('./music')
app.use(music)
// Start the HTTP server on all interfaces
app.listen(3333, '0.0.0.0', function () {
    console.log('Server start on http://0.0.0.0:3333');
})

三:前端

首先我们看一下页面的显示效果:
Django+node.js+Vue.js进行数据爬虫_vue.js_04

逻辑梳理

  1. 点击任务下发开始进行数据库查询,返回查询结果;
  2. 选中想要爬取的榜单,那么它对应的value会被拿到,也就是数据库里面的id;
  3. 选中后点击执行任务,会把数据提交到后台进行爬取,爬取完之后将结果返回;
  4. 进行过滤并进行显示数据

我们使用vue.js来进行前端页面编写,代码如下:

<template>
  <div class="music">
    <!-- Header: musiclist[0] is the first element of the /seemusic/ response (see get()) -->
    <div class="music_head">
      <span style="float: left;margin-left: 20px;color:black">网易云爬取(目前歌曲:<b>{{musiclist[0]}}</b>首)</span>
      <span style="margin-left: 10px;color: red">注:因为可能存在重复数据,所以总数据可能不会变化</span>
      <el-button type="primary" size="small" style="float: right;margin-top: 10px;margin-right: 20px;" @click="gotask">任务下发</el-button>
    </div>
    <!-- Content area: titles on the left, audio players on the right -->
    <div class="music_bottom">
      <div class="bottom_left">
        <!-- slice(33) strips the 33-character 'http://192.168.1.107:8000/static/' prefix that get() prepends to each entry -->
        <ul>
          <li v-for="(item,index) in musictitle[2]" :key="index">{{index+1}}、{{item.slice(33)}}</li>
        </ul>
      </div>
      <div class="bottom_right">
        <!-- Each entry of musiclist[2] is used directly as the audio source URL -->
        <ul>
          <li v-for="(item1,index1) in musiclist[2]" :key="index1">
            <audio style="margin-top: 30px;" :src="item1" controls="controls"></audio>
          </li>
        </ul>
      </div>
    </div>
    <!-- Crawl dialog: opened by gotask(); the select is bound to `value` and its options come from `musicname` -->
    <el-dialog :append-to-body="true" title="进行爬取" :visible.sync="dialogVisible" width="30%">
      <table cellpadding="50px" cellspacing="30px" style="width:100%;text-align: center;border:1px solid black">
        <tr>
          <td>选择榜名:</td>
          <td>
            <el-select v-model="value" clearable placeholder="请选择">
              <el-option v-for="item in musicname" :key="item.muscid" :label="item.type" :value="item.muscid">
              </el-option>
            </el-select>
          </td>
        </tr>
      </table>
      <!-- Footer button submits the selected chart via tasksend() -->
      <span slot="footer" class="dialog-footer">
        <el-button type="primary" @click="tasksend">执行任务</el-button>
      </span>
    </el-dialog>
  </div>
</template>

<script>
// Base URL of the Node/Express service that exposes the chart list.
const NODE_BASE = 'http://192.168.1.107:3333'
// Base URL of the Django service that crawls and serves the songs.
const DJANGO_BASE = 'http://192.168.1.107:8000'
// Prefix prepended to every file name returned by /seemusic/.
// NOTE: the template strips it again with slice(33), so its length must stay 33.
const STATIC_PREFIX = `${DJANGO_BASE}/static/`

/**
 * Music crawler page: lists the downloaded songs, lets the user pick a chart
 * and asks the Django back end to crawl it.
 */
export default {
  name: 'music',
  data () {
    return {
      // Whether the crawl dialog is visible
      dialogVisible: false,
      // Chart rows returned by GET /getmusic
      musicname: [],
      // Chart id currently selected in the dialog
      value: '',
      // User tasks
      task: [],
      // Pagination state
      currentPage: 1,
      size: 10,
      // Status text
      status: '进行中...',
      // /seemusic/ response; index 2 holds the playable URLs
      musiclist: [],
      title: '',
      // Same response object as musiclist; used by the template for the titles
      musictitle: [],
      // Number of songs reported by the last crawl
      count: ''
    }
  },
  created () {
    this.get()
  },
  components: {},
  mounted () {},
  methods: {
    // Show the generic "service unavailable" error toast.
    notifyServiceError () {
      this.$message({
        type: 'error',
        message: '服务异常,请稍后重试'
      })
    },
    // Open the crawl dialog and load the chart list from the Node service.
    gotask () {
      this.dialogVisible = true
      this.$get(`${NODE_BASE}/getmusic`).then(res => {
        console.log(res)
        if (res.code === 200) {
          this.musicname = res.result
        } else {
          this.notifyServiceError()
        }
      }).catch(() => {
        this.notifyServiceError()
      })
    },
    // Load the downloaded songs and turn the file names into playable URLs.
    get () {
      this.$get(`${DJANGO_BASE}/seemusic/`).then(res => {
        // res[2] is the list of file names; rewrite it to absolute URLs.
        res[2] = res[2].map(name => STATIC_PREFIX + name)
        console.log(res)
        // Both fields intentionally share the same response object.
        this.musictitle = res
        this.musiclist = res
        console.log(this.musiclist.length)
      }).catch(() => {
        this.notifyServiceError()
      })
    },
    // Submit the selected chart id for crawling and refresh the list when done.
    tasksend () {
      const loading = this.$loading({
        lock: true,
        text: '数据量可能比较大,正在加载...',
        spinner: 'el-icon-loading',
        background: 'rgba(0, 0, 0, 0.7)'
      })
      this.dialogVisible = false
      const payload = {
        ids: Number(this.value)
      }
      this.$post(`${DJANGO_BASE}/music/`, payload).then(res => {
        console.log(res)
        if (res.includes('ok')) {
          // The response is 'ok' followed by the number of crawled songs.
          this.count = res.slice(2)
          this.get()
          loading.close()
          this.$message({
            type: 'success',
            message: `爬取${this.count}首`
          })
        } else {
          loading.close()
          this.notifyServiceError()
        }
      }).catch(() => {
        // Without this handler a failed request left the locking overlay
        // on screen forever.
        loading.close()
        this.notifyServiceError()
      })
    },
    // Row action handler; currently it only logs its arguments.
    handleSee (index, row) {
      console.log(index, row)
    },
    // Remember the page selected in a pagination control.
    handleCurrentChange (val) {
      this.currentPage = val
    },
    /**
     * Format a timestamp as 'YYYY-MM-DD hh:mm:ss' in local time.
     * Month, day, hours, minutes and seconds are zero-padded to two digits.
     * @param {number|string|Date} timestamp - any value accepted by `new Date()`
     * @returns {string} the formatted date-time
     */
    timestampToTime (timestamp) {
      const date = new Date(timestamp)
      const pad = n => n.toString().padStart(2, '0')
      const day = `${date.getFullYear()}-${pad(date.getMonth() + 1)}-${pad(date.getDate())}`
      const time = `${pad(date.getHours())}:${pad(date.getMinutes())}:${pad(date.getSeconds())}`
      return `${day} ${time}`
    }
  }
}

</script>

<style scoped>
  /* Root container of the page */
  .music {
    width: 900px;
    height: 650px;
    /* background-color: aquamarine; */
    margin: 0 auto;
    box-shadow: -5px 0 5px #d7eafc, 5px 0 5px #d7eafc, 0 5px 5px #a7ccf0;
  }

  /* Header bar */
  .music_head {
    width: 100%;
    height: 50px;
    /* background-color: pink; */
    line-height: 50px;
  }

  /* Content area */
  .music_bottom {
    width: 100%;
    height: 600px;
    border-top: 1px solid black;
    /* background-color: antiquewhite; */
    display: flex;
    overflow: auto;
  }

  /* Left column */
  .bottom_left {
    width: 550px;
    height: 100%;
    /* background-color: aqua; */
  }

  /* One row per song title; long titles are cut off with an ellipsis */
  .bottom_left ul li {
    height: 100px;
    line-height: 100px;
    text-align: left;
    text-overflow: ellipsis;
    white-space: nowrap;
    overflow: hidden;
    color: black;
  }

  /* Right column */
  .bottom_right {
    width: 350px;
    height: 100%;
    /* background-color: pink; */
  }

  /* Same row height as the left column */
  .bottom_right ul li {
    height: 100px;
    line-height: 100px;
    text-align: center;
  }

  /* Pagination (class not referenced by this component's template) */
  .ym {
    width: 100%;
    height: 34px;
    /* background-color: skyblue; */
    text-align: center;
  }

  /* Dialog style overrides; /deep/ lets these scoped rules reach the el-dialog internals */
  /deep/ .el-dialog .el-dialog__header {
    background-color: #26639c;
  }

  /deep/ .el-dialog .el-dialog__header .el-dialog__title {
    color: #ffffff;
    font-size: 20px;
    padding: 15px 20px 10px;
  }

  /deep/ .el-dialog .el-dialog__header .el-dialog__headerbtn {
    top: 18px;
    border-radius: 50%;
    width: 22px;
    height: 21px;
    background-color: #ffffff;
  }

</style>


四:Django模块

我们数据的爬取以及结果返回运用python来解决,通过Django来进行接口的编写:

  1. 爬取模块
def music(request):
    """Crawl one NetEase Cloud Music chart and save its songs as MP3 files.

    The request body must be JSON containing the key ``ids`` (the chart id).
    Every song link found on the chart page is downloaded into the static
    directory as ``<song name>.mp3``.

    Returns an ``HttpResponse`` whose body is ``'ok'`` followed by the number
    of files written successfully, e.g. ``'ok12'``.
    """
    data = json.loads(request.body.decode(encoding="utf8"))
    print(data['ids'])
    time.sleep(1)

    url = 'https://music.163.com/discover/toplist?id=%s' % data['ids']
    head = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    }
    down_url = 'https://music.163.com/song/media/outer/url?id='
    # Placeholder path: replace with the real absolute path of the static directory.
    save_dir = r'路径static'
    # Create the directory once, up front. The previous `if not exists: mkdir /
    # else: write` shape skipped the song being processed when the directory
    # had just been created.
    os.makedirs(save_dir, exist_ok=True)

    respone = requests.get(url, headers=head)
    html = etree.HTML(respone.text)
    # Every anchor whose href points at a song page.
    song_links = html.xpath('//a[contains(@href,"song?")]')

    i = 0  # number of files written successfully
    for link in song_links:
        href = link.xpath('./@href')[0]
        music_id = href.split('=')[1]
        # Ids containing "$" are skipped (presumably unrendered template
        # placeholders -- TODO confirm).
        if "$" in music_id:
            continue
        music_name = link.xpath('./text()')[0]
        song = requests.get(url=down_url + music_id, headers=head)
        try:
            with open(r'路径static/%s.mp3' % music_name, "wb") as f:
                print("正在下载第 %i 首歌曲 歌名为: 《%s》 ..." % (i + 1, music_name))
                f.write(song.content)
            # Count the song only after the write succeeded.
            i += 1
        except OSError as e:
            # Best effort: report the failure and continue with the next song.
            print('OSError', e)

    return HttpResponse('ok{0}'.format(i))
  2. 结果返回处理模块

也就是在歌曲保存到文件夹之后,读取该文件夹的内容并返回文件列表,前端再通过静态文件地址访问这些歌曲:

def seemusic(request):
    """List the downloaded songs in the static directory.

    Returns a ``JsonResponse`` with the list ``[count, valid, names]``:
    ``count`` is the number of entries in the directory, ``valid`` is how many
    of them are larger than the size threshold, and ``names`` holds the file
    names of those valid entries.
    """
    # Placeholder path: replace with the real absolute path of the static directory.
    static_dir = r'路径static'
    # Files of this size or smaller are treated as empty downloads and filtered out.
    min_valid_size = 94386

    # Result set
    result = []
    # Total number of entries
    count = 0
    # Number of valid entries
    i = 0

    # The directory is only created by the crawl view, so it may not exist
    # yet; report an empty library instead of raising.
    if not os.path.isdir(static_dir):
        return JsonResponse([count, i, result], safe=False)

    for j in os.listdir(static_dir):
        print(j)
        count += 1
        if os.path.getsize(os.path.join(static_dir, j)) > min_valid_size:
            result.append(j)
            i += 1
    data = [count, i, result]
    print(data)

    return JsonResponse(data, safe=False)

五:结果预览

Django+node.js+Vue.js进行数据爬虫_django_05


人生从来没有真正的绝境。无论遭受多少艰辛,无论经历多少苦难,只要一个人的心中还怀着一粒信念的种子,那么总有一天,他就能走出困境,让生命重新开花结果。

 

本文摘自 :https://blog.51cto.com/u

开通会员,享受整站包年服务立即开通 >