380 words
|
6.33 分钟
NODE.JS采集练手
2017-05-03
使用类库
- request 发送请求获取网页内容
- co 执行Generator函数
- cheerio 解析 HTML,像 jQuery 一样通过选择器获取页面元素
- node-xlsx 生成excel文档
代码如下
'use strict'
const request = require('request')
const co = require('co')
const cheerio = require('cheerio')
const xlsx = require('node-xlsx')
const fs = require('fs')
// Prefix shared by every per-country ranking page; a country code is appended to it.
let base_url = 'https://aso100.com/app/rank/appid/1188599882/country/'
// Entry page (the "cn" ranking page) that is fetched first to discover the other country links.
let base = 'https://aso100.com/app/rank/appid/1188599882/country/cn'

// Shared mutable state filled in while scraping:
// dataArray holds one [title, name] row per scraped page, urls holds the pages to visit.
let dataArray = []
let urls = []
// Fetch a page by URL.
// url: address handed straight to `request`.
// Returns a promise that resolves with the `body` argument of the request callback
// and rejects with the callback's error when one is reported.
let getUrl = (url) => {
  return new Promise((resolve, reject) => {
    request(url, (err, response, body) => {
      // One error check is enough: the previous version tested `err` twice,
      // which left the reject arm of its ternary unreachable.
      if (err) {
        reject(err)
        return
      }
      resolve(body)
    })
  })
}
// Scrape a single page: download it, read the text of the title and country-name
// elements, log the two values joined together, and append them as one row to dataArray.
// Generator function: it yields the download promise and expects the page markup back.
let getDetail = function* (url) {
  let markup = yield getUrl(url)
  let $ = cheerio.load(markup)
  // Row layout is [title, name], in selector order.
  let row = ['.appinfo-title', '.appinfo-country .name'].map((selector) => $(selector).text())
  console.log(row.join(''))
  dataArray.push(row)
}
// Pause helper used to keep requests from going out too quickly.
// t: delay in milliseconds, passed to setTimeout.
// Returns a promise that resolves (with no value) once the timer fires; it never rejects.
let sleep = (t) => new Promise((done) => {
  setTimeout(done, t)
})
// Discover every per-country page linked from the base page, scrape each one in turn,
// and finally export the collected rows to an Excel workbook.
// Generator function meant to be driven by co: it yields promises (getUrl, sleep)
// and the getDetail generator for each page.
let getUrlArray = function* () {
  let html = yield getUrl(base)
  let $ = cheerio.load(html)

  // Only anchors that carry a data-country attribute point at a country page.
  $('.select-container').find('a').each(function () {
    let country = $(this).attr('data-country')
    if (country) {
      urls.push(base_url + country)
    }
  })

  // Scrape the pages one at a time, waiting 20 seconds before each request
  // so the site is not hit too quickly.
  for (let url of urls) {
    yield sleep(20000)
    yield getDetail(url)
  }

  // Export to Excel. The buffer comes from xlsx.build, so the file is named .xlsx
  // (it used to be written as test.csv, which mislabels the content).
  // No encoding argument is passed because the data is already a buffer.
  let buffer = xlsx.build([{ name: 'mySheetName', data: dataArray }])
  fs.writeFileSync('test.xlsx', buffer)
}
// Execute a generator through co.
// fn: whatever co accepts (here, a generator object).
// Any rejection from co is printed with console.log; nothing is returned to the caller.
let run = (fn) => {
  let logFailure = (err) => {
    console.log(err)
  }
  co(fn).catch(logFailure)
}
// Start scraping: create the getUrlArray generator and hand it to run, which executes it via co.
run(getUrlArray())