鍍金池/ 問答/PHP  Python/ PHP 動態網站爬取問題

PHP 動態網站爬取問題

URL http://app1.sfda.gov.cn/datas...

clipboard.png

<?php
// Fetches the content.jsp page on app1.sfda.gov.cn through curlOpen() and
// stores the raw response in cfda.txt.
require_once 'curl.func.php';

// Target page. The literal must not carry surrounding whitespace: a space at
// the end of the URL becomes part of the request target and yields a
// malformed request.
$url = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=25&tableName=TABLE25&tableView=%B9%FA%B2%FA%D2%A9%C6%B7&Id=29813&bcId=124356560303886909015737447882';

$config = array();

// Cookie value handed to curlOpen() through the 'cookie' key.
// NOTE(review): presumably copied from a browser session, in which case it
// will expire — confirm and refresh before relying on it.
$config['cookie'] = 'JSESSIONID=CB48D05599167A38CFCDCB53416B6AE1.7; FSSBBIl1UgzbN7N82S=vZCYhV7eHgrdkkhDoKwBA2ck5t.Y0NbG8rONrlN7HoM_GuZzRR6fNkJkR7MJF3u_; FSSBBIl1UgzbN7N82T=2gjgV3eNRxQ1nzOjyrE_N4bSy84kQZ6HotJaeBD3VycZ4kDwb.PVnyEC0aiuxiuFTKyJXv_pFn150mftlM9Yqo4_MKfuJuWrCkEjcOwXZaaZnqPAXlurB5n5wtzNlBShlr1BMYc_g7I9dSbJFg2pdyyW4S3d4DwpxPwQfwYlY1SA758_pgEakKCZafgq_13s2_QXWHN0JKsU_1geEVR2ymIqyNFt7yOTTjorHW2_crSBlqfhnF9kGgGIak1K_83t_jA3SBf6aCp6pp_6UotA50yP6Wb5mGb_4enYZnEYmY23wgeX984XbcM3Jkf0keLOpjjGjuzqIUUXZMNoSBUL286ZJvrmuIcYknISGHtYBSxRFJz62v9auesdmkflTIaF_ta5PUjx0Nml_ejCKW0ynSEEp';

// Extra request headers, sent verbatim.
$config['header'][] = 'User-Agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36';
$config['header'][] = 'Cache-Control: max-age=0';
$config['header'][] = 'Upgrade-Insecure-Requests: 1';
$config['header'][] = 'Content-Type: text/html;encoding=gbk';
$config['header'][] = 'Accept-Encoding: gzip, deflate';
$config['header'][] = 'Accept-Language: zh-CN,zh;q=0.9';
$config['header'][] = 'Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8';
// $config['referer']=$referer;

$result = curlopen($url, $config);

// Do not overwrite cfda.txt with an empty file when the request itself failed.
if ($result === false) {
    exit("Request failed; cfda.txt was not written.\n");
}

file_put_contents('cfda.txt', $result);

curlopen代碼

/**
 * Perform a single HTTP request with cURL and return what curl_exec() yields.
 *
 * Recognised $config keys (all optional, defaults in parentheses):
 *  - post      (false)  POST payload. An array is URL-encoded with
 *                       http_build_query() unless 'isupfile' is not false,
 *                       in which case it is handed to cURL unchanged.
 *  - referer   ($url)   Referer header value.
 *  - cookie    ('')     Either a literal cookie string ("a=1; b=2"), sent as
 *                       the Cookie header, or the path of a cookie-jar file
 *                       that is both read and written.
 *  - useragent (MSIE 8) User-Agent string.
 *  - timeout   (20)     Total transfer timeout in seconds.
 *  - return    (true)   Value for CURLOPT_RETURNTRANSFER.
 *  - proxy     ('')     Proxy address; 'userpwd' supplies its credentials.
 *  - nobody    (false)  Value for CURLOPT_NOBODY.
 *  - header    (array)  List of raw header lines. The special string key
 *                       'ip' is expanded into X-FORWARDED-FOR and CLIENT-IP.
 *  - gzip      (true)   Ask for and transparently decode gzip/deflate.
 *  - ssl       (false)  When true, peer and host verification are disabled.
 *  - maxredirs (0)      Redirect limit. Redirect following is enabled, so the
 *                       default of 0 makes libcurl refuse every redirect.
 *
 * @param string $url    Address to request.
 * @param array  $config Overrides for the defaults listed above.
 * @return string|bool   With 'return' enabled: response headers followed by
 *                       the body (CURLOPT_HEADER is on), or false on failure.
 *                       Otherwise whatever curl_exec() returns.
 */
function curlOpen($url, $config = array())
{
    $defaults = array(
        'post'      => false,
        'referer'   => $url,
        'cookie'    => '',
        'useragent' => 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; customie8)',
        'timeout'   => 20,
        'return'    => true,
        'proxy'     => '',
        'userpwd'   => '',
        'nobody'    => false,
        'header'    => array(),
        'gzip'      => true,
        'ssl'       => false,
        'isupfile'  => false,
        'maxredirs' => 0,
    );
    $arr = array_merge($defaults, $config);

    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, $arr['return']);
    curl_setopt($ch, CURLOPT_NOBODY, $arr['nobody']);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    curl_setopt($ch, CURLOPT_USERAGENT, $arr['useragent']);
    curl_setopt($ch, CURLOPT_REFERER, $arr['referer']);
    curl_setopt($ch, CURLOPT_TIMEOUT, $arr['timeout']);
    curl_setopt($ch, CURLOPT_MAXREDIRS, $arr['maxredirs']);

    // Include the response headers in the returned string.
    curl_setopt($ch, CURLOPT_HEADER, true);

    if ($arr['gzip']) {
        curl_setopt($ch, CURLOPT_ENCODING, 'gzip,deflate');
    }

    if ($arr['ssl']) {
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    }

    if (!empty($arr['cookie'])) {
        $cookie = $arr['cookie'];
        // Heuristic: a value containing '=' that is not an existing file is a
        // literal "name=value; ..." cookie string and must go out as the
        // Cookie header. Anything else keeps the cookie-jar file behaviour.
        $isCookieString = is_string($cookie)
            && strpos($cookie, '=') !== false
            && !is_file($cookie);

        if ($isCookieString) {
            curl_setopt($ch, CURLOPT_COOKIE, $cookie);
        } else {
            curl_setopt($ch, CURLOPT_COOKIEJAR, $cookie);
            curl_setopt($ch, CURLOPT_COOKIEFILE, $cookie);
        }
    }

    if (!empty($arr['proxy'])) {
        //curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_HTTP);
        curl_setopt($ch, CURLOPT_PROXY, $arr['proxy']);
        if (!empty($arr['userpwd'])) {
            curl_setopt($ch, CURLOPT_PROXYUSERPWD, $arr['userpwd']);
        }
    }

    // The IP is a special case: it is passed under the string key 'ip' and
    // expanded into the two spoofing headers below.
    if (!empty($arr['header']['ip'])) {
        array_push(
            $arr['header'],
            'X-FORWARDED-FOR:' . $arr['header']['ip'],
            'CLIENT-IP:' . $arr['header']['ip']
        );
        unset($arr['header']['ip']);
    }

    // Drop empty entries and re-index so cURL receives a plain list.
    $arr['header'] = array_values(array_filter($arr['header']));

    if (!empty($arr['header'])) {
        curl_setopt($ch, CURLOPT_HTTPHEADER, $arr['header']);
    }

    if ($arr['post'] != false) {
        curl_setopt($ch, CURLOPT_POST, true);
        if (is_array($arr['post']) && $arr['isupfile'] === false) {
            $post = http_build_query($arr['post']);
        } else {
            // File uploads (and pre-encoded strings) are passed through as-is.
            $post = $arr['post'];
        }
        curl_setopt($ch, CURLOPT_POSTFIELDS, $post);
    }

    $result = curl_exec($ch);
    //var_dump(curl_getinfo($ch));
    curl_close($ch);

    return $result;
}

curlopen後一直不是想要的內容。即GET請求的內容和瀏覽器顯示的不一致

回答
編輯回答
嘟尛嘴

headless browser 可嘗試一下
對於動態網站,可以先用無頭瀏覽器去訪問,得到 js 運行之後的結果,再分析 html 結構就行了。
比如我知道的有 phantomjs ,還有很多類似的吧

2018年6月25日 07:18
編輯回答
抱緊我

數(shù)據(jù)可能是ajax單獨獲取的,需要抓包看一下
而且建議你把curlopen的代碼貼出來看看

2017年8月24日 05:38