鍍金池/ 問(wèn)答/HTML/ node 爬蟲(chóng) 堆棧溢出問(wèn)題

node 爬蟲(chóng) 堆棧溢出問(wèn)題

剛學(xué)node 寫(xiě)了個(gè)爬小說(shuō)的爬蟲(chóng) 爬一定數(shù)量的小說(shuō)以后就會(huì) 溢出 請(qǐng)幫忙看看是哪里的問(wèn)題

爬取的順序:
book主頁(yè)url列表組成bookList ->
爬取首頁(yè)信息和目錄列表,插入對(duì)應(yīng)的book對(duì)象中 ->
爬取book目錄頁(yè)的章節(jié)列表信息和章節(jié)的url插入對(duì)應(yīng)的book對(duì)象中 ->
遍歷bookList & 遍歷book的章節(jié)列表url爬取章節(jié)內(nèi)容插入對(duì)應(yīng)的章節(jié)中,當(dāng)一本書(shū)的內(nèi)容全部插入完成后存成txt文件;這步用async.allLimit()做了限制,爬取完一本小說(shuō)存為txt文件后,開(kāi)始爬取下一本

好像我所有爬到的html內(nèi)容都一直存在,沒(méi)有被釋放

const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');
const async = require('async')
const rp = require('request-promise')

const bookUrlSelector = '.two_main .main_con li .chap .fs14'
const bookImgSelector = '.main .book_cover img'
const bookName_MenuUrlSelector = '.main .status h1 a'
const bookAuthorSelector = '.main .status .booksub a[href*=userInfo]'
const bookTypeSelector = '.main .status .booksub a[href*=store]'
const bookDescSelector = '.main .status .info_con p'
const bookKeywordsSelector = '.main .status .keyword a'
const bookChapSelector = '.chapterBean a'
const bookChapContentSelector = '#readerFs p'
const bookContentPartSelector = '.reader_con h3'

let menuSpeed = 10
let chapsSpeed = 10
let bookSpeed = 1
let contentSpeed = 10

let bookList = []
let doneBookList = []
let pageStartCount = 1
let pageEndCount = 1

let bookIndex = 0
let menuIndex = 0
let chapsIndex = 0
let doneChap = 0


/**
 * 發(fā)送request請(qǐng)求,獲取請(qǐng)求url的html內(nèi)容
 */
function getHtml(url, cb, item = null, callback = null, referer = 'http://book.zongheng.com') {
  let config = {
    headers: {
      'Referer': referer,
      'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    },
    method: 'GET',
    url: url,
  };
  rp(config)
    .then(html => {
      cb(html, item)
      callback&&callback(null, 'success!!')
    })
    .catch(err => {
      console.log('err-----------', url, err)
      getHtml(url, cb, item, callback, referer)
    })
}

/**
 * 爬取縱橫中文網(wǎng)站點(diǎn)的免費(fèi)完結(jié)小說(shuō)列表頁(yè),共20頁(yè)
 * @return {Array}: 返回全部列表頁(yè)的arr
 */
function bookMainUrlList() {
  let arr = []
  for (let i = pageStartCount; i <= pageEndCount; i++) {
    arr.push(`http://book.zongheng.com/quanben/c0/c0/b9/u1/p${i}/v0/s1/t0/ALL.html`)
  }
  return arr
}

/**
 * 爬取所有列表頁(yè)的url
 */
function initBookList() {
  bookMainUrlList().map(url => {
    getHtml(url, addBookMainUrl)
  })
}

/**
 * 解析獲取book主頁(yè)的地址
 * @param html: page頁(yè)的html
 * @return {Array}: book主頁(yè)url, 只返回縱橫的
 */
function getBookUrlList(html) {
  let $ = cheerio.load(html);
  let bookUrlList = $(bookUrlSelector)
  return (Array.from((bookUrlList)).filter(book => {
    if (book.attribs.href.indexOf('baidu') === -1) {
      return book
    }
  })).map(book => {
    return book.attribs.href
  })
}

/**
 * 將從每個(gè)列表頁(yè)爬取的 book的主頁(yè)url 組成一個(gè)數(shù)組;創(chuàng)建bookList中book的mainUrl屬性
 * @param html: 列表頁(yè)的html
 */
function addBookMainUrl(html) {
  let bookMainUrl = []
  bookMainUrl = bookMainUrl.concat(getBookUrlList(html))

  bookMainUrl.forEach((url, index) => {
    bookList.push({
      mainUrl: url
    })
  })
  bookIndex++
}

function limitBookBaseInfo() {
  console.log(bookList.length)
  async.allLimit(bookList, menuSpeed, (book, callback) => {
    getHtml(book.mainUrl, getBookBaseInfo, book, callback)
  }, (error, result) => {
    if(error) {
      console.log('出錯(cuò)啦?。。。。。。。?!', error)
    }
    if (result) {
      console.log('添加所有書(shū)的基本信息------------done~~~~~~~~^_^')
    }
  })
}

function getBookBaseInfo(html, book) {
  let $ = cheerio.load(html);
  let bookImg = $(bookImgSelector)
  let bookName = $(bookName_MenuUrlSelector)
  let bookAuthor = $(bookAuthorSelector)
  let bookType = $(bookTypeSelector)
  let bookDesc = $(bookDescSelector)
  let bookKeywords = $(bookKeywordsSelector)

  let info = {
    sourceID: bookName[0].attribs.href.split('/').pop().split('.')[0],
    image: bookImg[0].attribs.src,
    name: bookName[0].children[0].data,
    author: {
      authorID: bookAuthor[0].attribs.href.split('/').pop().split('.')[0],
      authorName: bookAuthor[0].children[0].data
    },
    type: bookType[0].children[0].data,
    desc: bookDesc[0].children[0] ? bookType[0].children[0].data : '',
    keywords: Array.from(bookKeywords).map(keyword => {
      return keyword.children[0].data
    }),
    menuUrl: bookName[0].attribs.href
  }
  addBookBaseInfo(info, book)
}

function addBookBaseInfo(info, book) {
  book.sourceID = info.sourceID
  book.image = info.image
  book.name = info.name
  book.author = info.author
  book.type = info.type
  book.desc = info.desc
  book.keywords = info.keywords
  book.menuUrl = info.menuUrl
  menuIndex++
  console.log(menuIndex + '本書(shū)的基本信息添加完成')
}

function limitBookChaps() {
  console.log('添加book的chaps', bookList.length)
  async.allLimit(bookList, chapsSpeed, (book, callback) => {
    getHtml(book.menuUrl, getBookChaps, book, callback)
  }, (error, result) => {
    if(error) {
      console.log('出錯(cuò)啦?。。。。。。。。?, error)
    }
    if (result) {
      console.log('添加所有書(shū)的chaps------------done~~~~~~~~^_^')
    }
  })
}

function getBookChaps(html, book) {
  let $ = cheerio.load(html)
  let chapList = $(bookChapSelector)
  book.chaps = (Array.from(chapList)).map((chap, index) => {
    return {
      chapID: index,
      chapTitle: chap.children[0].data,
      chapUrl: chap.attribs.href
    }
  })
  chapList = null
  console.log(`${book.sourceID} ${book.name} 的 ${book.chaps.length} 章添加完成`)
  chapsIndex++
}

function removeEmptyBook() {
  bookList = bookList.filter(book => {
    if (book.chaps.length) {
      return book
    } else {
      console.log(book.name, book.menuUrl, book.chaps)
      console.log(`${book.sourceID} ${book.name} ${book.menuUrl} chaps為${book.chaps}沒(méi)有內(nèi)容----刪除`)
    }
  })
}

function limitBookContent() {
  console.log('添加chaps的content', bookList.length)
  async.allLimit(bookList, bookSpeed, (book, callback) => { 
     limitChapsContent(book, callback)
  }, (error, result) => {
    if(error) {
      console.log('出錯(cuò)啦?。。。。。。。。?, error)
    }
    if (result) {
      console.log('添加所有書(shū)的chaps------------done~~~~~~~~^_^')
    }
  })
}

function limitChapsContent(book, callback) {
  async.allLimit(book.chaps, contentSpeed, (chap, cb) => {
    getHtml(chap.chapUrl, getBookChapContent, chap, cb, book.menuUrl)

  }, (error, result) => {
    if (error) {
      console.log('出錯(cuò)啦?。。。。。。。?!', error)
    }
    if (result) {
      console.log(`book: ${book.name}--已完成,準(zhǔn)備創(chuàng)建txt文件`)
      doneChap = 0
      callback(null, 'success')
      writeToTxt(book)
    }
  })
}

function getBookChapContent(html, chap) {
  doneChap++
  let $ = cheerio.load(html)

  let contentList = $(bookChapContentSelector)
  let contentPart = $(bookContentPartSelector)

  chap.part = contentPart[0].children[0].data
  chap.content = (Array.from(contentList)).map((parasContent) => {
    return '    ' + parasContent.children[0].data
  })
  console.log(doneChap)
}


initBookList()
// 當(dāng)爬取的頁(yè)數(shù)index和頁(yè)數(shù)的總count一致的時(shí)候,book的mainUrl爬取完畢等待下一次

let step_1 = function() {
  console.log('wait.................................')
  clearTimeout(timer1)
  if (bookMainUrlList().length === bookIndex) {
    clearTimeout(timer1)
    console.log(`${bookList.length}本書(shū)的mainUrl爬取完畢`)
  } else {
    timer1 = setTimeout(step_1, 500)
  }
}
let timer1 = setTimeout(step_1, 500)


let step_2 = function () {
  // clearTimeout(timer2_2)
  console.log('waiting..baseinfo.................................' + bookMainUrlList().length, bookIndex, menuIndex, bookList.length)
  if (menuIndex === bookList.length) {
    console.log(`${bookList.length}本書(shū)添加基本信息------------done~~~~~~~~^_^`)
  } else {
    timer2_2 = setTimeout(step_2, 5000)
  }
}

let initBookListDone = function () {
  clearTimeout(timer2_1)
  if (bookMainUrlList().length === bookIndex) {
    limitBookBaseInfo()
    let timer2_2 = setTimeout(step_2, 5000)
  } else {
    timer2_1 = setTimeout(initBookListDone, 2000)
  }
}
let timer2_1 = setTimeout(initBookListDone, 2000)


let step_3 = function () {
  // clearTimeout(timer3_2)
  console.log('waiting..bookchaps.................................' + bookList.length, chapsIndex)
  if (chapsIndex === bookList.length) {
    console.log(bookList.length)
    console.log(`${bookList.length} 本書(shū)添加章節(jié)列表(id,title,url)------------done~~~~~~~~^_^`)
    console.log('刪除沒(méi)有內(nèi)容的book')
    removeEmptyBook()
    console.log(`刪除內(nèi)容為空的book后,共有書(shū) ${bookList.length} 本`)
  } else {
    timer3_2 = setTimeout(step_3, 2000)
  }
}
let addBookBaseInfoDone = function () {
  clearTimeout(timer3_1)
  // console.log(menuIndex, bookList)
  if (menuIndex === bookList.length) {
    console.log('==================================開(kāi)始添加章節(jié)列表')
    limitBookChaps()
    let timer3_2 = setTimeout(step_3, 2000)
  } else {
    timer3_1 = setTimeout(addBookBaseInfoDone, 30000)
  }
}
let timer3_1 = setTimeout(addBookBaseInfoDone, 30000)


let addBookChapsDone = function() {
  console.log(`wait........... ${chapsIndex} ${bookList.length}`)
  if (chapsIndex === bookList.length) {
    limitBookContent()
  } else {
    timer4_1 = setTimeout(addBookChapsDone, 10000)
  }
}
let timer4_1 = setTimeout(addBookChapsDone, 60000)



function writeToTxt(book) {
  let str = ''
  // if (book.chaps) {
    for(let chap of book.chaps) {
      str += chap.chapTitle + '\n'
      if(chap.content) {
        str += chap.content.join('\n') + '\n' + '\n'
      }
    }
  // }
  console.log("準(zhǔn)備寫(xiě)入文件");
  fs.writeFile(`./zhbooks/${book.name}.txt`, str, function(err) {
    if (err) {
      return console.error(err)
    }
    console.log("數(shù)據(jù)寫(xiě)入成功!")
  })
}
回答
編輯回答
枕邊人

我試了一下 node進(jìn)程的內(nèi)存漲到900多我就殺掉了 一直再漲 建議讀一點(diǎn)寫(xiě)一點(diǎn) 不要都放在內(nèi)存中

2018年3月19日 00:42