剛學 node,寫了個爬小說的爬蟲,爬一定數量的小說以后就會溢出,請幫忙看看是哪里的問題。
爬取的順序:
book主頁url列表組成bookList ->
爬取首頁信息和目錄列表,插入對應的book對象中 ->
爬取book目錄頁的章節列表信息和章節的url,插入對應的book對象中 ->
遍歷bookList & 遍歷book的章節列表url,爬取章節內容插入對應的章節中,當一本書的內容全部插入完成后存成txt文件;這步用async.allLimit()做了限制,爬取完一本小說存為txt文件后,開始爬取下一本
好像我所有爬到的html內容都一直存在,沒有被釋放。
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');
const async = require('async')
const rp = require('request-promise')
// ---- CSS selectors used to parse the crawled pages ----
// Book links on a list page (read by getBookUrlList).
const bookUrlSelector = '.two_main .main_con li .chap .fs14'
// Cover image on a book's main page (read by getBookBaseInfo).
const bookImgSelector = '.main .book_cover img'
// Title link on a book's main page; its text is the book name and its href
// is used as the chapter-menu URL (read by getBookBaseInfo).
const bookName_MenuUrlSelector = '.main .status h1 a'
// Author link on a book's main page.
const bookAuthorSelector = '.main .status .booksub a[href*=userInfo]'
// Category link on a book's main page.
const bookTypeSelector = '.main .status .booksub a[href*=store]'
// Description paragraph on a book's main page.
const bookDescSelector = '.main .status .info_con p'
// Keyword links on a book's main page.
const bookKeywordsSelector = '.main .status .keyword a'
// Chapter links on a book's menu page (read by getBookChaps).
const bookChapSelector = '.chapterBean a'
// Paragraphs of a chapter page (read by getBookChapContent).
const bookChapContentSelector = '#readerFs p'
// Heading on a chapter page, stored as chap.part (read by getBookChapContent).
const bookContentPartSelector = '.reader_con h3'
// ---- Concurrency limits passed to async.allLimit ----
// Limit used while fetching book main pages (limitBookBaseInfo).
let menuSpeed = 10
// Limit used while fetching chapter menus (limitBookChaps).
let chapsSpeed = 10
// Number of books whose chapters are crawled at once (limitBookContent).
let bookSpeed = 1
// Number of chapter pages fetched at once within one book (limitChapsContent).
let contentSpeed = 10
// ---- Shared crawl state ----
// Every book found so far; each phase adds fields to these objects.
let bookList = []
// NOTE(review): not referenced anywhere in the visible code.
let doneBookList = []
// First and last list-page numbers, inclusive (read by bookMainUrlList).
let pageStartCount = 1
let pageEndCount = 1
// Incremented once per list page handled by addBookMainUrl.
let bookIndex = 0
// Incremented once per book handled by addBookBaseInfo.
let menuIndex = 0
// Incremented once per book handled by getBookChaps.
let chapsIndex = 0
// Chapters parsed for the current book; reset to 0 in limitChapsContent.
let doneChap = 0
/**
 * Send a GET request for `url` and hand the response body to a parser.
 *
 * A failed request, or a parser that throws, is logged and the whole fetch is
 * retried at most `retries` more times. Once the retries are used up the last
 * error is reported to `callback` instead of looping forever. An exception
 * thrown by `callback` itself is only logged and never triggers a retry, so
 * `callback` is invoked at most once.
 *
 * @param {string} url - Address to fetch.
 * @param {Function} cb - Parser, called as `cb(html, item)`.
 * @param {*} [item=null] - Value passed through to `cb` as its second argument.
 * @param {?Function} [callback=null] - Node-style completion callback, called
 *   as `callback(null, 'success!!')` on success or `callback(err)` on failure.
 * @param {string} [referer='http://book.zongheng.com'] - Referer header value.
 * @param {number} [retries=5] - Extra attempts allowed after a failure.
 */
function getHtml(url, cb, item = null, callback = null, referer = 'http://book.zongheng.com', retries = 5) {
  const config = {
    headers: {
      'Referer': referer,
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    method: 'GET',
    url: url,
  }
  rp(config)
    .then(html => {
      cb(html, item)
    })
    .then(() => {
      callback && callback(null, 'success!!')
    }, err => {
      // Reached for request failures and for exceptions thrown by cb.
      console.log('err-----------', url, err)
      if (retries > 0) {
        getHtml(url, cb, item, callback, referer, retries - 1)
      } else {
        callback && callback(err)
      }
    })
    .catch(err => {
      // Only an exception thrown by callback can land here.
      console.log('err-----------', url, err)
    })
}
/**
 * Build the URLs of the list pages to crawl, one for each page number from
 * pageStartCount through pageEndCount inclusive (the original author's note
 * describes these as the site's free, completed novels).
 * @return {Array}: the list-page URLs, in page order
 */
function bookMainUrlList() {
  const pageTotal = Math.max(0, pageEndCount - pageStartCount + 1)
  return Array.from({ length: pageTotal }, (unused, offset) => {
    const page = pageStartCount + offset
    return `http://book.zongheng.com/quanben/c0/c0/b9/u1/p${page}/v0/s1/t0/ALL.html`
  })
}
/**
 * Request every list page; each response body is handed to addBookMainUrl.
 */
function initBookList() {
  for (const listPageUrl of bookMainUrlList()) {
    getHtml(listPageUrl, addBookMainUrl)
  }
}
/**
 * Extract the book main-page addresses from a list page.
 * Links whose href contains 'baidu' are dropped.
 * @param html: HTML of a list page
 * @return {Array}: hrefs of the remaining book links, in document order
 */
function getBookUrlList(html) {
  const $ = cheerio.load(html)
  const hrefs = []
  for (const anchor of Array.from($(bookUrlSelector))) {
    const href = anchor.attribs.href
    if (href.indexOf('baidu') !== -1) {
      continue
    }
    hrefs.push(href)
  }
  return hrefs
}
/**
 * Append one `{ mainUrl }` entry to bookList for every book link found on a
 * list page, then count the page as processed by incrementing bookIndex.
 * @param html: HTML of a list page
 */
function addBookMainUrl(html) {
  for (const mainUrl of getBookUrlList(html)) {
    bookList.push({ mainUrl })
  }
  bookIndex++
}
/**
 * Fetch the main page of every book in bookList, at most menuSpeed requests
 * at a time; each page is parsed by getBookBaseInfo.
 * Note: async.allLimit is an alias of async.everyLimit, so the iteration only
 * keeps going while each item reports a truthy result through its callback.
 */
function limitBookBaseInfo() {
  console.log(bookList.length)
  const fetchBaseInfo = (book, callback) => {
    getHtml(book.mainUrl, getBookBaseInfo, book, callback)
  }
  const onAllFetched = (error, result) => {
    if (error) {
      console.log('出錯(cuò)啦?。。。。。。。?!', error)
    }
    if (result) {
      console.log('添加所有書(shū)的基本信息------------done~~~~~~~~^_^')
    }
  }
  async.allLimit(bookList, menuSpeed, fetchBaseInfo, onAllFetched)
}
/**
 * Parse a book's main page and store the extracted fields on `book` through
 * addBookBaseInfo.
 *
 * Fix: `desc` now comes from the description paragraph. It used to be filled
 * with the category text whenever a description was present.
 *
 * @param html: HTML of the book's main page
 * @param book: the bookList entry to fill in
 */
function getBookBaseInfo(html, book) {
  const $ = cheerio.load(html)
  const bookImg = $(bookImgSelector)
  const bookName = $(bookName_MenuUrlSelector)
  const bookAuthor = $(bookAuthorSelector)
  const bookType = $(bookTypeSelector)
  const bookDesc = $(bookDescSelector)
  const bookKeywords = $(bookKeywordsSelector)
  // IDs are the file name at the end of the link, without its extension.
  const idFromHref = href => href.split('/').pop().split('.')[0]
  const descNode = bookDesc[0].children[0]
  const info = {
    sourceID: idFromHref(bookName[0].attribs.href),
    image: bookImg[0].attribs.src,
    name: bookName[0].children[0].data,
    author: {
      authorID: idFromHref(bookAuthor[0].attribs.href),
      authorName: bookAuthor[0].children[0].data
    },
    type: bookType[0].children[0].data,
    // An empty description paragraph has no child node.
    desc: descNode ? descNode.data : '',
    keywords: Array.from(bookKeywords).map(keyword => {
      return keyword.children[0].data
    }),
    menuUrl: bookName[0].attribs.href
  }
  addBookBaseInfo(info, book)
}
/**
 * Copy the parsed base fields from `info` onto `book`, count the book as done
 * by incrementing menuIndex, and log the running total.
 * @param info: fields produced by getBookBaseInfo
 * @param book: the bookList entry to update
 */
function addBookBaseInfo(info, book) {
  const copiedFields = ['sourceID', 'image', 'name', 'author', 'type', 'desc', 'keywords', 'menuUrl']
  for (const field of copiedFields) {
    book[field] = info[field]
  }
  menuIndex++
  console.log(`${menuIndex}本書(shū)的基本信息添加完成`)
}
/**
 * Fetch the chapter-menu page of every book in bookList, at most chapsSpeed
 * requests at a time; each page is parsed by getBookChaps.
 * Note: async.allLimit is an alias of async.everyLimit, so the iteration only
 * keeps going while each item reports a truthy result through its callback.
 *
 * Fix: the error log's string literal had lost its closing quote, which made
 * the function (and the whole file) a syntax error.
 */
function limitBookChaps() {
  console.log('添加book的chaps', bookList.length)
  async.allLimit(bookList, chapsSpeed, (book, callback) => {
    getHtml(book.menuUrl, getBookChaps, book, callback)
  }, (error, result) => {
    if (error) {
      console.log('出錯(cuò)啦?。。。。。。。。?', error)
    }
    if (result) {
      console.log('添加所有書(shū)的chaps------------done~~~~~~~~^_^')
    }
  })
}
/**
 * Parse a book's menu page into `book.chaps`: one entry per chapter link,
 * holding its position, title text and URL. Then log the result and count the
 * book as done by incrementing chapsIndex.
 * @param html: HTML of the book's menu page
 * @param book: the bookList entry to update
 */
function getBookChaps(html, book) {
  const $ = cheerio.load(html)
  const anchors = Array.from($(bookChapSelector))
  const chaps = []
  anchors.forEach((anchor, position) => {
    chaps.push({
      chapID: position,
      chapTitle: anchor.children[0].data,
      chapUrl: anchor.attribs.href
    })
  })
  book.chaps = chaps
  console.log(`${book.sourceID} ${book.name} 的 ${book.chaps.length} 章添加完成`)
  chapsIndex++
}
/**
 * Replace bookList with only the books that have at least one chapter,
 * logging every book that is dropped.
 */
function removeEmptyBook() {
  bookList = bookList.filter(book => {
    const hasChaps = Boolean(book.chaps.length)
    if (!hasChaps) {
      console.log(book.name, book.menuUrl, book.chaps)
      console.log(`${book.sourceID} ${book.name} ${book.menuUrl} chaps為${book.chaps}沒(méi)有內(nèi)容----刪除`)
    }
    return hasChaps
  })
}
/**
 * Crawl the chapter content of every book in bookList, at most bookSpeed
 * books at a time; each book is handled by limitChapsContent.
 * Note: async.allLimit is an alias of async.everyLimit, so the iteration only
 * keeps going while each book reports a truthy result through its callback.
 *
 * Fix: the error log's string literal had lost its closing quote, which made
 * the function (and the whole file) a syntax error.
 */
function limitBookContent() {
  console.log('添加chaps的content', bookList.length)
  async.allLimit(bookList, bookSpeed, (book, callback) => {
    limitChapsContent(book, callback)
  }, (error, result) => {
    if (error) {
      console.log('出錯(cuò)啦?。。。。。。。。?', error)
    }
    if (result) {
      console.log('添加所有書(shū)的chaps------------done~~~~~~~~^_^')
    }
  })
}
/**
 * Fetch every chapter page of `book`, at most contentSpeed requests at a
 * time, then reset doneChap, report completion and write the book to disk.
 *
 * Fix: `callback` is now always invoked. Previously an error (or a falsy
 * overall result) was only logged, so the book-level iteration waiting on
 * this callback never advanced.
 *
 * @param book: bookList entry whose `chaps` are to be filled with content
 * @param callback: Node-style callback; receives `(null, 'success')` when all
 *   chapters were fetched, `(error)` on failure, or `(null, result)` when the
 *   chapter iteration finished with a falsy result
 */
function limitChapsContent(book, callback) {
  async.allLimit(book.chaps, contentSpeed, (chap, cb) => {
    getHtml(chap.chapUrl, getBookChapContent, chap, cb, book.menuUrl)
  }, (error, result) => {
    if (error) {
      console.log('出錯(cuò)啦?。。。。。。。?!', error)
      callback(error)
      return
    }
    if (!result) {
      // Incomplete book: pass the falsy result on instead of writing a file.
      callback(null, result)
      return
    }
    console.log(`book: ${book.name}--已完成,準(zhǔn)備創(chuàng)建txt文件`)
    doneChap = 0
    callback(null, 'success')
    writeToTxt(book)
  })
}
/**
 * Parse a chapter page into `chap.part` (text of the heading) and
 * `chap.content` (one string per paragraph, each prefixed with a space),
 * then count the chapter in doneChap and log the running count.
 *
 * Fixes: a page without the heading, an empty paragraph, or a paragraph whose
 * first child is not a text node no longer throws or yields " undefined";
 * such nodes contribute an empty string. doneChap is incremented only after
 * the page was parsed, so a failed parse is not counted.
 *
 * @param html: HTML of the chapter page
 * @param chap: the chapter entry to fill in
 */
function getBookChapContent(html, chap) {
  const $ = cheerio.load(html)
  const contentList = $(bookChapContentSelector)
  const contentPart = $(bookContentPartSelector)
  // Text of the node's first child, or '' when there is none.
  const firstText = node => {
    const first = node && node.children ? node.children[0] : undefined
    return first && typeof first.data === 'string' ? first.data : ''
  }
  chap.part = firstText(contentPart[0])
  chap.content = Array.from(contentList).map(paragraph => {
    return ' ' + firstText(paragraph)
  })
  doneChap++
  console.log(doneChap)
}
// ---- Crawl orchestration ----
// The phases run in order: list pages -> book base info -> chapter lists ->
// chapter content. Each phase is started by a timer-driven poller once the
// previous phase has finished.
//
// Fixes compared with the earlier version:
// - timer2_2 and timer3_2 are declared here instead of being created as
//   implicit globals inside step_2/step_3.
// - A phase only starts after the previous one was actually started and
//   finished. Comparing counters alone let a poller fire while bookList was
//   still empty (0 === 0).
// - The content phase waits on a flag set after removeEmptyBook(), because
//   removing books makes `chapsIndex === bookList.length` false for good.
// - Dropped clearTimeout calls that targeted the timer that had just fired.
initBookList()

let timer1
let timer2_1
let timer2_2
let timer3_1
let timer3_2
let timer4_1
// Set once limitBookBaseInfo() has been called.
let baseInfoStarted = false
// Set once every chapter list is in and empty books have been removed.
let chapsListed = false

// Progress log: polls until every list page has been processed.
const step_1 = function () {
  console.log('wait.................................')
  if (bookMainUrlList().length === bookIndex) {
    console.log(`${bookList.length}本書(shū)的mainUrl爬取完畢`)
  } else {
    timer1 = setTimeout(step_1, 500)
  }
}
timer1 = setTimeout(step_1, 500)

// Progress log: polls until every book has its base info.
const step_2 = function () {
  console.log('waiting..baseinfo.................................' + bookMainUrlList().length, bookIndex, menuIndex, bookList.length)
  if (menuIndex === bookList.length) {
    console.log(`${bookList.length}本書(shū)添加基本信息------------done~~~~~~~~^_^`)
  } else {
    timer2_2 = setTimeout(step_2, 5000)
  }
}
// Starts the base-info phase once every list page has been processed.
const initBookListDone = function () {
  if (bookMainUrlList().length === bookIndex) {
    limitBookBaseInfo()
    baseInfoStarted = true
    timer2_2 = setTimeout(step_2, 5000)
  } else {
    timer2_1 = setTimeout(initBookListDone, 2000)
  }
}
timer2_1 = setTimeout(initBookListDone, 2000)

// Polls until every book has its chapter list, then drops the empty books.
const step_3 = function () {
  console.log('waiting..bookchaps.................................' + bookList.length, chapsIndex)
  if (chapsIndex === bookList.length) {
    console.log(bookList.length)
    console.log(`${bookList.length} 本書(shū)添加章節(jié)列表(id,title,url)------------done~~~~~~~~^_^`)
    console.log('刪除沒(méi)有內(nèi)容的book')
    removeEmptyBook()
    console.log(`刪除內(nèi)容為空的book后,共有書(shū) ${bookList.length} 本`)
    chapsListed = true
  } else {
    timer3_2 = setTimeout(step_3, 2000)
  }
}
// Starts the chapter-list phase once every book has its base info.
const addBookBaseInfoDone = function () {
  if (baseInfoStarted && menuIndex === bookList.length) {
    console.log('==================================開(kāi)始添加章節(jié)列表')
    limitBookChaps()
    timer3_2 = setTimeout(step_3, 2000)
  } else {
    timer3_1 = setTimeout(addBookBaseInfoDone, 30000)
  }
}
timer3_1 = setTimeout(addBookBaseInfoDone, 30000)

// Starts the content phase once the chapter lists are final.
const addBookChapsDone = function () {
  console.log(`wait........... ${chapsIndex} ${bookList.length}`)
  if (chapsListed) {
    limitBookContent()
  } else {
    timer4_1 = setTimeout(addBookChapsDone, 10000)
  }
}
timer4_1 = setTimeout(addBookChapsDone, 60000)
/**
 * Write a book to `./zhbooks/<book.name>.txt`: each chapter title on its own
 * line, followed by the chapter's paragraphs and a blank line when the
 * chapter has content.
 *
 * Fix for the memory growth: every book stays referenced from bookList, so
 * chapter content used to be kept for the rest of the run. After a successful
 * write each `chap.content` is set to null so it can be garbage-collected.
 * When the write fails the content is left in place.
 *
 * @param book: book whose `chaps` (with `chapTitle` and optional `content`)
 *   are written out
 */
function writeToTxt(book) {
  const pieces = []
  for (const chap of book.chaps) {
    pieces.push(chap.chapTitle + '\n')
    if (chap.content) {
      pieces.push(chap.content.join('\n') + '\n' + '\n')
    }
  }
  const str = pieces.join('')
  console.log("準(zhǔn)備寫(xiě)入文件")
  fs.writeFile(`./zhbooks/${book.name}.txt`, str, function (err) {
    if (err) {
      return console.error(err)
    }
    // The text is on disk now; release the in-memory copy.
    for (const chap of book.chaps) {
      chap.content = null
    }
    console.log("數(shù)據(jù)寫(xiě)入成功!")
  })
}
北大青鳥APTECH成立于1999年。依托北京大學優質雄厚的教育資源和背景,秉承“教育改變生活”的發展理念,致力于培養中國IT技能型緊缺人才,是大數據專業的國家
北大青鳥中博軟件學院創立于2003年,作為華東區著名互聯網學院和江蘇省首批服務外包人才培訓基地,中博成功培育了近30000名軟件工程師走向高薪崗位,合作企業超4
中公教育集團創建于1999年,經過二十年潛心發展,已由一家北大畢業生自主創業的信息技術與教育服務機構,發展為教育服務業的綜合性企業集團,成為集合面授教學培訓、網
達內教育集團成立于2002年,是一家由留學海歸創辦的高端職業教育培訓機構,是中國一站式人才培養平臺、一站式人才輸送平臺。2014年4月3日在美國成功上市,融資1
浪潮集團項目經理。精通Java與.NET 技術, 熟練的跨平臺面向對象開發經驗,技術功底深厚。 授課風格 授課風格清新自然、條理清晰、主次分明、重點難點突出、引人入勝。
曾工作于聯想擔任系統開發工程師,曾在博彥科技股份有限公司擔任項目經理從事移動互聯網管理及研發工作,曾創辦藍懿科技有限責任公司從事總經理職務負責iOS教學及管理工作。
精通HTML5和CSS3;Javascript及主流js庫,具有快速界面開發的能力,對瀏覽器兼容性、前端性能優化等有深入理解。精通網頁制作和網頁游戲開發。
具有10 年的Java 企業應用開發經驗。曾經歷任德國Software AG 技術顧問,美國Dachieve 系統架構師,美國AngelEngineers Inc. 系統架構師。