鍍金池/ 問答/HTML/ node大佬請進!電影天堂的node爬蟲,為什么沒進入titleHref.for

node大佬請進!電影天堂的node爬蟲,為什么沒進入titleHref.forEach就崩了??

舊代碼的問題是res.on('end',...)是異步的,與可能titleHref都還是空的。現(xiàn)在我更新了代碼,解決了這個問題,但是怎么打印出結果的ans數(shù)組呢,打印是同步的,老是打印出空數(shù)據(jù)。

新代碼:

const cheerio = require('cheerio');
const http = require('http');
const iconv = require('iconv-lite');

let baseUrl = "http://www.ygdy8.net/html/gndy/dyzz/list_23_";
let Host = "http://www.ygdy8.net/";

const totalPage = 2; //指定爬多少頁數(shù)據(jù)
let ans = [];
//獲取頁面電影數(shù)據(jù)
function getTitleHref(url,page) {
  let startUrl = url+page+".html";
  http.get(startUrl,function(res) {
    const { statusCode } = res;
    let chunks = [];
    res.on('data',function(chunk){
      chunks.push(chunk);
    });
    res.on('end',function(){
      let title = [];
      
      let html = iconv.decode(Buffer.concat(chunks),'gb2312');
      let $ = cheerio.load(html, {decodeEntities: false});
      // console.log($);
      $('.co_content8 .ulink').each(function(i,d) {
        let $d = $(d);
        let titleHref = [];
        titleHref.push({
          href: $d.attr('href')
        });
        getLink(titleHref)
      });
      // console.log(ans);
    });  
  });
}


// /*
//獲取種子鏈接
function getLink(titleHref) {
  console.log('進入getLink');
  console.log(titleHref);
  if(titleHref) {
    titleHref.forEach(function(v,k) {
      console.log('~~~~~~~~~~~~~~~~~~~~');
      let infoUrl = Host + v.href;
      // console.log(infoUrl);
    
        http.get(infoUrl,function(res) {
          const { statusCode } = res;
          const contentType = res.headers['content-type'];
        
          let error;
          if (statusCode !== 200) {
            error = new Error('請求失敗。\n' +
                             `狀態(tài)碼: ${statusCode}`);
          } 
          if (error) {
            console.error(error.message);
            // 消耗響應數(shù)據(jù)以釋放內存
            res.resume();
            return;
          }
          console.log('進入getlink http');
          let chunks = [];
          res.on('data',function(chunk) {  
            chunks.push(chunk);
          });
          res.on('end', function(){
            try {
              let html = iconv.decode(Buffer.concat(chunks),'gb2312');
              let $ = cheerio.load(html, {decodeEntities: false});
              let bt = '';
              bt = $('#Zoom td').children('a').attr('href');
              // console.log(bt);
              // console.log(typeof bt)
              ans.push(bt);
            }catch (e) {
              console.error('bt',e.message);
            }
          })
        }).on('error', (e) => {
          console.error(`錯誤: ${e.message}`);
        });
    });
  }
};
// */
for(let i = 1; i <= totalPage; i++) {
  getTitleHref(baseUrl,i);
};



--------------------------------------------------------分割線-------

const cheerio = require('cheerio');
const http = require('http');
const iconv = require('iconv-lite');

let baseUrl = "http://www.ygdy8.net/html/gndy/dyzz/list_23_";
let Host = "http://www.ygdy8.net/";
let titleHref = [];
const totalPage = 1; //指定爬多少頁數(shù)據(jù)
let res = [];
//獲取頁面電影數(shù)據(jù)
function getTitleHref(url,page) {
  let startUrl = url+page+".html";
  http.get(startUrl,function(res) {
    let chunks = [];

    res.on('data',function(chunk){
      chunks.push(chunk);
    });
    res.on('end',function(){
      let title = [];
      let html = iconv.decode(Buffer.concat(chunks),'gb2312');
      let $ = cheerio.load(html, {decodeEntities: false});
      // console.log($);
      $('.co_content8 .ulink').each(function(i,d) {
        let $d = $(d);
        titleHref.push({
          href: $d.attr('href')
        });
      });
      console.log(titleHref);
    });
    if(page <= totalPage) {
      getTitleHref(url,++page);
    }else {
      console.log(page);
      getLink(titleHref);
    }
   
  });
}

//獲取種子鏈接
function getLink(titleHref) {
  console.log('進入getLink');

  titleHref.forEach(function(v,k) {
    console.log('~~~~~~~~~~~~~~~~~~~~');
    let infoUrl = Host + v.href;
    console.log(infoUrl);
    // try {
      http.get(infoUrl,function(res) {
        console.log('進入getlink http');
        
        let chunks = [];
        res.on('data',function(chunk) {
          chunks.push(chunk);
        });
        res.on('end', function(){
          let html = iconv.decode(Buffer.concat(chunks),'gb2312');
          let $ = cheerio.load(html, {decodeEntities: false});
          
          
          let reg = /.*譯  名/;
          let info = '';
          let bt = '';
          let textInfo = $('.co_content8 #Zoom p').eq(0).text();
          info = textInfo.match(reg)[0];
          bt = $('#Zoom td').children('a').attr('href');
          res.push({
            Info:info,
            Bt:bt
          });
          console.log(res);
        })
        //怎么捕獲錯誤?。?!
        //res.on('error',function(){
        //  console.log('error');
        //})
      })
  // }catch(e) {
  //   console.log(e);
  // }
  });
};

getTitleHref(baseUrl,1)

如上面代碼,異步太多,不知道是哪里的問題,求node大神指教,getLink()函數(shù)里的titleHref.forEach都沒進去到,node線程就崩了?console.log('進入getlink http')也是沒有打印的

回答
編輯回答
魚梓

getLink(titleHref);執(zhí)行完res.on('end',...)還沒執(zhí)行。空數(shù)組怎么進去。

2018年5月11日 09:13