
Comparing crawled page text

First off, I'm a novice with PHP, so if there's a better or more efficient way of doing what I'm trying to do, please feel free to point it out :)

I came across an old PHP script for crawling a site and checking the response code on each page it finds. I've modified it to do a duplicate-content check: it uses the similar_text function to compare the content of one page (specified by the user) against the content of every page it finds.

It's a bit slow, but it works. The only problem I'm having is that it stops after roughly the first 10 links, and I can't figure out why. I apologize in advance; I know this is quite a lot of code. Any help is greatly appreciated.
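For reference, a minimal sketch of the comparison at the heart of the script, assuming two placeholder URLs that are not part of the code below (similar_text returns the number of matching characters and writes the similarity percentage into its third argument):

<?php
// Minimal sketch of the duplicate-content check; the two
// URLs are placeholders, not part of the original script.
$reference = file_get_contents('http://example.com/page-a');
$candidate = file_get_contents('http://example.com/page-b');

// similar_text() returns the count of matching characters and
// fills $percent (by reference) with the similarity percentage.
$matched = similar_text($reference, $candidate, $percent);
echo 'Match percentage: ' . round($percent, 2) . "%\n";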

<form action="<?php echo $_SERVER['PHP_SELF']; ?>" method="post">  
<div class="row"><label for="page1" class="small label"><strong>Page? </strong>: </label><input type="text" name="page1" id="page1" value="" size="40" /></div>   
<div class="row"><label for="url" class="small label"><strong>Please Enter URL </strong>: </label><input type="text" name="url" id="url" value="" size="40" /></div> 
<div class="row"><label for="maxlinks" class="small label"><strong>Number of links to get </strong>: </label><input type="text" name="maxlinks" id="maxlinks" value="25" size="3" maxlength="3" /></div> 
<div class="row"><label for="linkdepth" class="small label"><strong>Links Maximum depth</strong> : </label> <select name="linkdepth" id="linkdepth" ><option value="1">1</option> 
<option value="2" selected="selected">2</option> 
<option value="3">3</option> 
<option value="4">4</option> 
<option value="5">5</option> 
<option value="6">6</option> 
</select></div> 
<input type="submit" name="submit" style="font-weight: bold" value="Check links" id="submit" /> 
</form> 
<?php 
if (isset($_POST['submit'])){ 
    $page1 = $_POST['page1']; 
    $baseurl = $_POST['url']; 
    $pages = array(); 
    $i = (integer)$_POST['linkdepth']; // maximum link depth 
    $maxlinks = (integer)$_POST['maxlinks']; 

$domain= extract_domain_name($baseurl); 
echo '<p class="small">Extracted domain name: <strong>'.$domain.'</strong>. '; 
echo 'Maximum depth: <strong>'.$i.'</strong></p>'; 
function get_urls($page){ 
    global $domain, $i; 

    $ch = curl_init(); 
    curl_setopt($ch, CURLOPT_URL, $page); 
    curl_setopt($ch, CURLOPT_RETURNTRANSFER,1); 
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 
    curl_setopt($ch, CURLOPT_HEADER, true); 
    /* Spoof the User-Agent header value; just to be safe */ 
    curl_setopt($ch, CURLOPT_USERAGENT, 
     'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)'); 
    /* I set timeout values for the connection and download 
    because I don't want my script to get stuck 
    downloading huge files or trying to connect to 
    a nonresponsive server. These are optional. */ 
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 100); 
    curl_setopt($ch, CURLOPT_TIMEOUT, 100); 
    /* CURLOPT_FAILONERROR is left disabled so that 404 Not Found 
    (and similar) responses still return a body for inspection 
    instead of aborting the transfer */ 
    curl_setopt($ch, CURLOPT_FAILONERROR, 0); 

    /* Download the page */ 
    $html = curl_exec($ch); 
    /* in case of an error; note the result is checked rather than 
    calling curl_exec() a second time */ 
    if($html === false) 
     { 
     echo '<p class="small">Error. Please check URL: <strong style="color:#ae3100">' . curl_error($ch) . '</strong></p>'; 
     } 

    curl_close($ch); 

    if(!$html) return array(); // empty list, so callers can foreach over it safely 
    /* Extract the BASE tag (if present) for 
     relative-to-absolute URL conversions later */ 
     if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i',$html, $matches)){ 

     $base_url=$matches[1]; 
     echo $base_url; 
      } else { 
        $base_url=$page; // base url = the page from which this round of checking starts 
        } 
      $links=array(); 
      $html = str_replace("\n", ' ', $html); 


      preg_match_all('/<a[\s]+[^>]*href\s*=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $m); 
     /* this regexp is a combination of numerous 
      versions I saw online*/ 
       foreach($m[1] as $url) { 
       $url=trim($url); 
       /* get rid of PHPSESSID, #linkname, & and javascript: */ 
       $url=preg_replace(
        array('/([\?&]PHPSESSID=\w+)$/i','/(#[^\/]*)$/i', '/&/','/^(javascript:.*)/i'), 
        array('','','&',''), 
        $url); 

       /* turn relative URLs into absolute URLs. 
        relative2absolute() is defined further down 
        below on this page. */ 

        $url = relative2absolute($base_url, $url); 

        // check if in the same (sub-)$domain 
       if(preg_match("/^http[s]?:\/\/[^\/]*".str_replace('.', '\.', $domain)."/i", $url)) 
       { 
       $depth= substr_count($url, "/")-2 ; 

       /* Counts slashes in URL 
       Responsible for link depth 
       */ 

     if ($depth <= $i){ 
      if(!in_array($url, $links, true)) $links[] = $url; 
      } 
      } 
     } 

    return $links; 

} 

// Returns the next page in the queue that hasn't been crawled yet 
function next_page(){ 
    global $pages; 
    foreach(array_keys($pages) as $k => $page){ 
     if($pages[$page] === NULL){ 
      $k++; 
      echo "[$k] - "; // running counter for the output 
      return $page; 
     } 
    } 
    return NULL; 
} 

function add_urls($page){ // adds new unique urls to the array and checks each url's server header status 
    global $pages, $maxlinks; 

    $start = microtime(true); 
    $urls = get_urls($page); 
    $resptime = microtime(true) - $start; // timing each page makes it possible to see where the crawler stops responding 

    //Start checking for Server Header 
    $ch = curl_init($page); 
    curl_setopt($ch, CURLOPT_NOBODY, 1); 
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); 

    // Execute 
    curl_exec($ch); 
    $info = curl_getinfo($ch); 

    print "$page"; 

// If the status code is 200, then print OK, else NO 
//  if($info['http_code']==200) { 
$page1 = $_POST['page1']; 
$page1data = file_get_contents($page1); // note: the reference page is re-fetched for every crawled page 
$page2 = file_get_contents($page); 

$chars = similar_text($page1data, $page2, $p); // $p receives the similarity as a percentage 
$p = round($p, 2); 

     echo ' - Match Percentage: ' . $p . '%'; 
//  } else { 
//    echo '<strong style="color:#ba3d00"> NO </strong>';} 

      /* echo substr(($resptime),0,5). " seconds"; */ // Activate this to see how much time each page takes to crawl 
      echo '<br/>'; 

     curl_close($ch); // Close handle 

    $pages[$page] = array ('resptime' => floor($resptime * 1000), 'url' => $page); // response time in milliseconds 

    foreach($urls as $url){ 
     if(!array_key_exists($url, $pages) && !in_array($url, $pages) && count($pages)<$maxlinks){ 
      $pages[$url] = NULL; 
     } 

    } 

} 

echo '[1] - '; // this is for the first input url, as it will be extracted from input 
add_urls($baseurl); 

while(($page= next_page()) != NULL) //while there are urls available 


{ 
add_urls($page); 

} 

    echo '<p class="small">Amount of crawled links: <strong>'.count ($pages).'</strong></p>'; 
    if (count($pages)<$maxlinks) echo '<p class="small">Sorry, no more links to crawl!!</p>';// count all extracted Urls 
} 

function extract_domain_name($url){ 
    /* old domain extractor 
    if(preg_match('@^(?:http:\/\/)?([^\/]+)@i', $url, $matches)) { 
     return trim(strtolower($matches[1])); 
    } else { 
     return ''; 
    }*/ 
    preg_match("/^(http:\/\/)?([^\/]+)/i", $url, $matches); 
    $host = $matches[2]; 
    // get last two segments of host name 
    preg_match("/[^\.\/]+\.[^\.\/]+$/", $host, $matches); 
    return $matches[0]; 

} 

function relative2absolute($absolute, $relative) { 
    $p = parse_url($relative); 
    if(isset($p["scheme"])) return $relative; // already an absolute URL 

    // Break the absolute URL into its components; default the 
    // optional parts so the checks below don't raise notices. 
    $scheme = $host = $user = $pass = $path = ''; 
    extract(parse_url($absolute)); 
    $path = dirname($path); 

    if($relative[0] == '/') 
    { 
        $newPath = array_filter(explode("/", $relative)); 
    } 
    else 
    { 
        // Merge the base path with the relative path and 
        // resolve any ".." segments. 
        $aparts = array_filter(explode("/", $path)); 
        $rparts = array_filter(explode("/", $relative)); 
        $cparts = array_merge($aparts, $rparts); 
        $k = 0; 
        $newPath = array(); 
        foreach($cparts as $i => $part) 
        { 
            if($part == '..') 
            { 
                $k = $k - 1; 
                $newPath[$k] = null; 
            } 
            else 
            { 
                $newPath[$k] = $cparts[$i]; 
                $k = $k + 1; 
            } 
        } 
        $newPath = array_filter($newPath); 
    } 

    // Reassemble scheme://user:pass@host/path 
    $path = implode("/", $newPath); 
    $url = ""; 
    if($scheme) 
    { 
        $url = "$scheme://"; 
    } 
    if($user) 
    { 
        $url .= "$user"; 
        if($pass) 
        { 
            $url .= ":$pass"; 
        } 
        $url .= "@"; 
    } 
    if($host) 
    { 
        $url .= "$host/"; 
    } 
    $url .= $path; 
    return $url; 
} 

################################################## 

Answer

If it happens to be stopping after about 30 seconds, add the following to the top of your script: set_time_limit(0);

By default, PHP scripts are terminated after 30 seconds of execution, but you can override that like this.
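A minimal sketch of the placement (the surrounding comments are illustrative, not part of the original answer):

<?php
// Very first statement of the script: remove the default
// 30-second execution limit so a long crawl isn't cut short.
set_time_limit(0);

// ... the rest of the crawler script follows unchanged ...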


I tried adding one below each opening PHP statement, and it still stops after 10 – Batfan 2011-04-28 15:44:55