Implementing a Web Crawler with Node.js and the crawler Package
Background:
crawler documentation (Chinese):
https://node-crawler.readthedocs.io/zh_CN/latest/
Open-source demo:
https://gitee.com/zhui_i/reptile_store.git
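The node-crawler package linked above handles request scheduling and loads each fetched page into cheerio for you. Before walking through the demo, here is a minimal sketch based on the package's documented callback API; the target URL is only a placeholder:

const Crawler = require('crawler')

const c = new Crawler({
  maxConnections: 10, // limit concurrent requests
  // The callback runs for every fetched page; res.$ is a cheerio handle on the response body
  callback: (error, res, done) => {
    if (error) {
      console.error(error)
    } else {
      const $ = res.$
      console.log($('title').text()) // print the page title as a quick check
    }
    done() // tell the crawler this task is finished
  },
})

// Queue one or more URLs to crawl (placeholder URL)
c.queue('https://example.com')

The demo below does not actually use the crawler package; it fetches a JSON API with axios and writes the result to MySQL, which keeps the example small.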
Example:
main.js — the core file; it ties the DB and retile modules together and starts the run:
var DB = require('../DB/mode')
const { getData } = require('./retile')

getData().then((resp) => {
  insertData(resp.data)
})

/** DB model */
function insertData(data) {
  // console.log('getData', data.Data)
  let datas = data.Data
  // Obtain a connection through DB.getConnection
  DB.getConnection(function (err, connection) {
    if (err) {
      console.error(err)
      return
    }
    let post = []
    for (let i in datas) {
      datas[i].forEach((d) => {
        let dArr = []
        for (let j in d) {
          dArr.push(d[j])
        }
        post.push(dArr)
      })
    }
    console.log('post', post)
    connection.query('INSERT INTO newsType VALUES ?', [post], function (err, result) {
      if (err) console.debug('inserted-err', err)
      else console.debug('inserted-result', result)
      connection.release() // release the connection back to the pool for reuse
    })
    DB.end(function (err) {
      // all connections in the pool have ended
      console.log('connection pool closed')
    })
  })
}
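The nested post array relies on the bulk-insert form of the mysql driver: an array of row arrays passed after "VALUES ?" expands into a single multi-row statement. A small illustration using mysql.format (the column values here are made up for the example):

const mysql = require('mysql')

const rows = [
  [1, 'tech', '2023-01-01'],
  [2, 'sports', '2023-01-02'],
]
// mysql.format shows the SQL the driver would actually send
const sql = mysql.format('INSERT INTO newsType VALUES ?', [rows])
console.log(sql)
// => INSERT INTO newsType VALUES (1, 'tech', '2023-01-01'), (2, 'sports', '2023-01-02')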
retile.js — performs the crawl and holds the fetched data in memory:
var axios = require('axios')
var fs = require('fs')
var cheerio = require('cheerio')
var request = require('request')

axios.defaults.headers = {
  accept: '*/*',
  'accept-encoding': 'gzip, deflate, br',
  'accept-language': 'zh-CN,zh;q=0.9',
  channelid: '28',
  'x-channel': '28',
  'x-channel-type': '1',
}

const getData = () =>
  axios.get('https://www.tophub.fun:8888/GetAllType', {
    params: {
      page: 1,
      size: 10,
      sort: 'create_time;',
    },
  })

module.exports = {
  getData: getData,
}
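Note that retile.js pulls JSON straight from an API, so cheerio, fs, and request are imported but never used here. If the target were an HTML page instead, a typical parsing sketch would look like the following; the URL and selector are placeholders, not part of the demo:

const axios = require('axios')
const cheerio = require('cheerio')

// Hypothetical example: fetch an HTML page and extract headline text.
async function getTitles(url) {
  const resp = await axios.get(url)
  const $ = cheerio.load(resp.data) // parse the HTML string
  const titles = []
  $('h2 a').each((i, el) => {
    // collect the text of every matched link (selector is a placeholder)
    titles.push($(el).text().trim())
  })
  return titles
}

getTitles('https://example.com').then((titles) => console.log(titles))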
DB.js — the database read/write file (connection pool):
const mysql = require('mysql') // mysql node driver
const mysqlConfig = require('./config')

// Initialize the database configuration; the default MySQL port is 3306
const pool = mysql.createPool({
  connectionLimit: 10, // number of connections in the pool
  host: mysqlConfig.host,
  user: mysqlConfig.user,
  password: mysqlConfig.password,
  database: mysqlConfig.database,
})

module.exports = pool
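Exporting the pool is what gives main.js access to pool.getConnection and pool.end (used there as DB.getConnection / DB.end). For one-off statements the pool can also run queries directly, checking out and releasing a connection for you; a short sketch, assuming this file is required as ./DB:

const pool = require('./DB') // assumed path; adjust to where DB.js lives

// pool.query acquires a connection, runs the statement, and releases it automatically.
pool.query('SELECT COUNT(*) AS total FROM newsType', (err, results) => {
  if (err) {
    console.error(err)
    return
  }
  console.log('rows in newsType:', results[0].total)
})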
config.js — the connection settings that DB.js loads via require('./config'):

module.exports = {
  host: 'localhost',
  user: 'root',
  password: 'WANG1996',
  database: 'reptileDB',
  port: 3306,
}
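Hard-coding credentials like this is fine for a local demo, but a common variant reads them from environment variables so the file can be committed safely. A sketch (the variable names are arbitrary, not part of the original demo):

// config.js variant: fall back to the demo values when an env variable is unset.
module.exports = {
  host: process.env.DB_HOST || 'localhost',
  user: process.env.DB_USER || 'root',
  password: process.env.DB_PASSWORD || '',
  database: process.env.DB_NAME || 'reptileDB',
  port: Number(process.env.DB_PORT) || 3306,
}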