2017-12-02 356 views
0

使用PHP和XPath抓取页面并提取字符串:我需要抓取这个HTML页面……

https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115

enter image description here

……使用PHP和XPath获取名为“PO G.TATARELLA-CERIGNOLA”的表格下方绿色框中的值。

(注:如果您自己打开该页面,可能会看到不同的数值……这没有关系,因为这些数值是动态变化的。)

我用下面这段PHP示例代码来打印该值……

<?php 
    // Surface every notice and warning while experimenting with the scraper.
    error_reporting(E_ALL);
    ini_set('display_errors', 'On');

    // Page to fetch and the absolute XPath of the element whose text is printed.
    $url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115';
    $xpath_for_parsing = '/html/body/div[4]/table/tbody/tr[2]/td[4]/div';

    // Set CURL parameters: pay attention to the PROXY config !!!!
    $request = curl_init();
    curl_setopt_array($request, array(
        CURLOPT_AUTOREFERER    => true,
        CURLOPT_HEADER         => 0,
        CURLOPT_RETURNTRANSFER => 1,
        CURLOPT_URL            => $url,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_PROXY          => '',
    ));
    $data = curl_exec($request);
    curl_close($request);

    // Parse the downloaded markup; parser warnings are suppressed with @.
    $document = new DOMDocument();
    @$document->loadHTML($data);
    $finder = new DOMXPath($document);

    // Keep the text of the last matching node; 'N.D.' is printed when nothing matches.
    $lastValue = 'N.D.';
    $matches = $finder->query($xpath_for_parsing);
    foreach ($matches as $match) {
        $lastValue = $match->nodeValue;
    }

    echo $lastValue;
?> 

这样,我得到的输出是“N.D.”,而不是我想要的值。

页面源代码如下...

enter image description here

在我的代码中,我不想使用“绝对XPath”,所以我尝试使用类似下面的语法(我知道这行不通,但我是XPath新手……)

// NOTE: the predicate div="..." compares the text content of a child <div> element with the
// given string; it does not test the class attribute, so it does not select elements by class.
$xpath_for_parsing = '//*[div="cRiga3 boxtriageS"]'; 

但结果总是一样的。

任何建议/示例?

回答

1

我认为下面的代码应该会有所帮助——您需要调整XPath查询,以便定位特定的表格和特定的单元格内容,但主体代码看起来可以正常工作。我怀疑原始代码的问题在于URL是https,发起cURL请求时通常需要额外的配置。_curlrequest函数中有些设置可以删除,我只是从另一个脚本中把它们复制了过来。

请将$cacert的路径改为您系统上cacert.pem副本的位置,或者改为curl.haxx.se上的在线版本(live version)。

/* Address of the page to scrape; passed to _curlrequest() further down in this script */
$url = 'https://www.sanita.puglia.it/monitorpo/aslfg/monitorps-web/monitorps/monitorPSperASL.do?codNazionale=160115'; 

/**
 * Fetch a URL with cURL and collect the body, transfer info, error text and verbose log.
 *
 * @param string|null $url     Address to request. When null, no request is made and a
 *                             placeholder result (info->http_code = 100) is returned.
 * @param array|null  $options Extra CURLOPT_* => value pairs, applied after the defaults
 *                             below so that they override them.
 *
 * @return object Object with the properties:
 *                response - value returned by curl_exec() (body string, or false on failure),
 *                verbose  - text cURL wrote to its verbose/stderr stream,
 *                info     - curl_getinfo() result cast to an object (e.g. info->http_code),
 *                headers  - always null (response headers are not captured),
 *                errors   - curl_error() text, empty string when there was no error.
 */
function _curlrequest($url=null, $options=null){
    /* Location of the CA bundle; change this to where cacert.pem lives on your system */
    $cacert='c:/wwwroot/cacert.pem';

    /* Result skeleton: the same shape is returned whether or not a request is made */
    $res=(object)array(
        'response' => null,
        'verbose'  => null,
        'info'     => (object)array('http_code' => 100),
        'headers'  => null,
        'errors'   => null
    );

    /* Bail out before allocating anything so no stream or handle is leaked */
    if(is_null($url)) return $res;

    $url=trim($url);

    /* Release the session lock so a slow request does not block other requests of the same session */
    session_write_close();

    /* In-memory stream that receives cURL's verbose output */
    $vbh = fopen('php://temp', 'w+');

    /* Initialise curl request object */
    $curl=curl_init();
    if(strcasecmp((string)parse_url($url,PHP_URL_SCHEME),'https')===0){
        curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, true);
        curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, 2);
        /* Only point cURL at the bundle when it exists; otherwise its default CA store is used */
        if(is_readable($cacert)) curl_setopt($curl, CURLOPT_CAINFO, $cacert);
    }

    /* Define standard options */
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_AUTOREFERER, true);
    curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($curl, CURLOPT_FAILONERROR, true);
    curl_setopt($curl, CURLOPT_HEADER, false);
    curl_setopt($curl, CURLINFO_HEADER_OUT, false);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($curl, CURLOPT_TIMEOUT, 60);
    curl_setopt($curl, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36');
    curl_setopt($curl, CURLOPT_MAXREDIRS, 10);
    /* Empty string: advertise every content encoding this cURL build supports */
    curl_setopt($curl, CURLOPT_ENCODING, '');

    /* Route verbose diagnostics into the in-memory stream */
    curl_setopt($curl, CURLOPT_VERBOSE, true);
    curl_setopt($curl, CURLOPT_NOPROGRESS, true);
    curl_setopt($curl, CURLOPT_STDERR, $vbh);

    /* Assign runtime parameters as options */
    if(isset($options) && is_array($options)){
        foreach($options as $param => $value) curl_setopt($curl, $param, $value);
    }

    /* Execute the request and store responses */
    $res->response=curl_exec($curl);
    $res->info=(object)curl_getinfo($curl);
    $res->errors=curl_error($curl);

    /* Read back everything cURL logged, then release the stream and the handle */
    rewind($vbh);
    $res->verbose=stream_get_contents($vbh);
    fclose($vbh);

    curl_close($curl);
    return $res;
}

/**
 * Parse an HTML string into a DOMDocument, collecting libxml parse errors instead of emitting warnings.
 *
 * @param string|false $data  HTML markup to parse. A falsy value is treated as "no data".
 * @param bool         $debug When true and the parser reported errors, the array of
 *                            libxml errors is returned instead of the document.
 *
 * @return DOMDocument|array|null The parsed document; the libxml error list (debug mode with
 *                                errors); or null after echoing a message when parsing could
 *                                not be attempted.
 */
function getdom($data=false, $debug=false){
    /* Previous libxml error-handling flag, restored before returning so global state is not leaked */
    $previous=null;
    try{
        if(!$data)throw new Exception('No data passed whilst trying to invoke DOMDocument');
        $previous=libxml_use_internal_errors(true);

        $dom = new DOMDocument();
        $dom->validateOnParse=false;
        $dom->standalone=true;
        $dom->strictErrorChecking=false;
        $dom->recover=true;
        $dom->formatOutput=false;
        $dom->loadHTML($data);

        /* Grab whatever the parser complained about, then empty libxml's error buffer */
        $errors=libxml_get_errors();
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return (!empty($errors) && $debug) ? $errors : $dom;

    }catch(Exception $e){
        if($previous!==null) libxml_use_internal_errors($previous);
        echo $e->getMessage();
        return null;
    }
}



/* Fetch the page and only parse it when the server answered 200 */
$obj=_curlrequest($url);
if((int)$obj->info->http_code===200){

    $dom=getdom($obj->response);

    /* getdom() returns null when it was given no data, so only query an actual document */
    if($dom instanceof DOMDocument){
        $xp=new DOMXPath($dom);

        /* Every <div> whose class attribute contains "cRiga3 boxtriageS" */
        $query='//div[ contains(@class,"cRiga3 boxtriageS") ]';
        $col=$xp->query($query);

        /* query() yields false for an invalid expression, hence the type check */
        if($col instanceof DOMNodeList && $col->length > 0){
            /* The text comes from a remote page: escape it before emitting it as HTML */
            foreach($col as $node) echo htmlspecialchars($node->nodeValue, ENT_QUOTES, 'UTF-8') . '<br />';
        }
    }
}

输出如下:

2 
20 
37 
>1h 
1 
2 
24 
10 
5 
7 
32 
29 
0 
3 
25 
5 
0 
0 
6 
2