2011-10-01 100 views
3

我正在使用 Simple HTML DOM 解析器(PHP Simple HTML DOM Parser)对一批子页面进行屏幕抓取。出于某种原因,它能够解析前 40 个子页面,但处理到第 41 个时,脚本在没有任何错误信息的情况下就终止了。

我制作了这个测试页面,并尝试记录脚本中执行的所有操作以及 Simple HTML DOM 解析器中的一些事件,但我一直没能找到错误所在。

有没有人知道,为什么在解析第 41 个 URL 时会失败?或者有没有人知道 Simple HTML DOM 解析器在哪些情况下会失败?

我的测试页:http://snuzzer.dk/pub/shdp/parse.php

这是我的脚本,我使用的是未经修改的 Simple HTML DOM 解析器。值得关注的部分发生在 get_lections() 中,我已经在代码里标记出了调用 Simple HTML DOM 解析器的位置。

// Values stored under the 'status' key of a lection.
define('LECTION_STATUS_REGULAR', 0);
define('LECTION_STATUS_CHANGED', 1);
define('LECTION_STATUS_CANCELLED', 2);

// Values stored under the 'documents' key of a lection.
define('LECTION_DOCUMENTS_NONE', 0);
define('LECTION_DOCUMENTS_TRUE', 1);

// Week count used by get_weeks() to roll week numbers over into the next year.
define('AMOUNT_OF_WEEKS_IN_A_YEAR', 52);

include_once("simple_html_dom.php"); 

/**
 * Reduce an HTML fragment to trimmed plain text encoded as ISO-8859-1.
 *
 * Tags are stripped, HTML entities are decoded as UTF-8, surrounding
 * whitespace is removed and the result is converted from UTF-8 to ISO-8859-1.
 *
 * @param string $text HTML fragment, treated as UTF-8.
 * @return string Tag-free, trimmed text in ISO-8859-1.
 */
function clean_text($text)
{
    // Strip before decoding so that escaped markup such as "&lt;b&gt;" is kept as text.
    $text = strip_tags((string) $text);
    $text = html_entity_decode($text, ENT_QUOTES, "UTF-8");

    // Trim last: removing tags or decoding entities can expose leading/trailing
    // whitespace that trimming up front would leave behind.
    $text = trim($text);

    // utf8_decode() is deprecated as of PHP 8.2; use mbstring when it is loaded
    // and keep utf8_decode() as the fallback for installations without it.
    if (function_exists("mb_convert_encoding"))
    {
        return mb_convert_encoding($text, "ISO-8859-1", "UTF-8");
    }

    return utf8_decode($text);
}

/**
 * Collect the absolute URLs of every lection linked from the weekly schedules.
 *
 * For each entry in $weeks the schedule page is fetched with Simple HTML DOM
 * and the href of every regular, changed and cancelled lection anchor is
 * prefixed with the site host. Progress is echoed inside a <textarea>.
 *
 * @param array $weeks List of arrays with the keys 'week' and 'year'.
 * @return array List of absolute lection URLs; empty when none were found.
 */
function get_links_for_lections($weeks)
{
    // Initialise up front so an empty array (not an undefined variable) is
    // returned when no week yields any lection.
    $links = array();

    // Anchor selectors, in the order regular, changed, cancelled.
    $selectors = array(
        'a[class="s2skemabrik s2bgbox s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2changed s2withlink"]',
        'a[class="s2skemabrik s2bgbox s2cancelled s2withlink"]',
    );

    echo "Finding links<br /><textarea style=\"width:70%;height:150px;\">";

    foreach ($weeks as $week)
    {
        // **
        //
        // THIS IS WHERE I CALL SIMPLE HTML DOM PARSER
        //
        // **

        echo " * Retrieving HTML...\n";
        // NOTE(review): the week number is appended without zero padding, so
        // week 1 of 2012 yields "week=12012" - confirm the format the site expects.
        $html = file_get_html("http://www.lectio.dk/lectio/285/SkemaNy.aspx?type=elev&elevid=2444366210&week=" . $week['week'] . $week['year']);

        // file_get_html() does not return a DOM object when the fetch fails;
        // skip the week instead of dying on ->find().
        if (!is_object($html))
        {
            echo " * Could not retrieve HTML, skipping week\n";
            continue;
        }

        echo " * HTML retrieved...\n";

        foreach ($selectors as $selector)
        {
            foreach ($html->find($selector) as $lection)
            {
                $links[] = "http://www.lectio.dk" . $lection->href;
            }
        }

        // Release the parsed DOM explicitly, as the Simple HTML DOM manual
        // recommends, so memory does not grow with every page fetched.
        $html->clear();
        unset($html);
    }

    echo "</textarea> \n    <hr />";

    return $links;
}

/**
 * Fetch every lection page and extract its details from the table rows.
 *
 * Each page is parsed with Simple HTML DOM; every <tr> is identified by the
 * text of its first <th> and the first <td> supplies the value. Progress and
 * the resulting lection are echoed inside a <textarea> per link.
 *
 * @param array $links List of absolute lection URLs.
 * @return array List of lections. Each lection always has 'status' and
 *               'documents'; 'type', 'title', 'subject', 'teachers',
 *               'location', 'note' and 'homework' are present only when the
 *               matching row exists. Pages that cannot be fetched are skipped.
 */
function get_lections($links)
{
    // Row heading => lection key for rows whose cell is stored as cleaned text.
    // The key doubles as the label used in the progress output.
    $text_rows = array(
        "Type:"    => "type",
        "Titel:"   => "title",
        "Hold:"    => "subject",
        "Lærere:"  => "teachers",
        "Lokaler:" => "location",
        "Note:"    => "note",
        "Lektier:" => "homework",
    );

    // Create array to hold lections
    $lections = array();

    // Loop through links
    $num = 1;
    foreach ($links as $link)
    {
        echo $num . ". " . $link . "<br /> \n     <textarea style=\"width:70%;height:150px;\">";

        // Initialize lection
        $lection = array(
            'status'    => LECTION_STATUS_REGULAR,
            'documents' => LECTION_DOCUMENTS_NONE,
        );

        echo " * Retrieving HTML...\n";
        $html = file_get_html($link);

        // file_get_html() does not return a DOM object when the fetch fails;
        // close the textarea and move on instead of dying on ->find().
        if (!is_object($html))
        {
            echo " * Could not retrieve HTML, skipping lection</textarea><br /><br />";
            $num += 1;
            continue;
        }

        echo " * HTML retrieved\n";

        // Loop through rows
        foreach ($html->find("tr") as $row)
        {
            echo " * New cell\n";

            // Get name of row; rows without a <th> get an empty name and match nothing
            $header = $row->find("th", 0);
            $row_name = is_object($header) ? $header->innertext : "";

            echo " - Row name: \"" . $row_name . "\"\n";

            if (isset($text_rows[$row_name]))
            {
                $field = $text_rows[$row_name];

                echo " - Checking " . $field . "...\n";
                $lection[$field] = clean_text(get_lection_cell_content($row));
                echo " - " . ucfirst($field) . " checked\n";
            }
            elseif ($row_name == "Dokumenter:")
            {
                echo " - Checking documents...\n";

                // We can't get the titles of the documents as we are not logged in
                // Instead we tell the user that there are documents available.
                // Compared against "" so that a cell containing just "0" still counts.
                if (clean_text(get_lection_cell_content($row, true)) !== "")
                {
                    $lection['documents'] = LECTION_DOCUMENTS_TRUE;
                }

                echo " - Documents checked\n";
            }
            elseif ($row_name == "Vises:")
            {
                echo " - Checking status (part 1)...\n";

                // Row tells where the lection is shown; a comma in it means
                // the lection is NOT REGULAR, so it is assumed to be changed.
                // NOTE(review): no "Status:" row is handled anywhere in this
                // function, so LECTION_STATUS_CANCELLED is never assigned.
                $shown = clean_text(get_lection_cell_content($row, true));
                if (strpos($shown, ",") !== false)
                {
                    $lection['status'] = LECTION_STATUS_CHANGED;
                }

                echo " - Status (part 1) checked\n";
            }
        }

        // Release the parsed DOM explicitly, as the Simple HTML DOM manual
        // recommends, so memory does not grow with every page fetched.
        $html->clear();
        unset($html);

        // Add lection to array of lections
        $lections[] = $lection;
        print_r($lection);

        echo " - Lection added!</textarea><br /><br />";

        $num += 1;
    }

    return $lections;
}

/**
 * Return the content of the first <td> of a table row, or "" when the row has none.
 *
 * @param object $row   Simple HTML DOM node for a <tr>.
 * @param bool   $plain True to read the cell's plaintext, false for its innertext.
 * @return string Raw cell content (not yet passed through clean_text()).
 */
function get_lection_cell_content($row, $plain = false)
{
    $cell = $row->find("td", 0);

    if (!is_object($cell))
    {
        return "";
    }

    return $plain ? $cell->plaintext : $cell->innertext;
}

/**
 * Build the list of weeks to fetch: the starting week plus the following
 * $amount_of_weeks weeks, rolling over into later years as needed.
 *
 * @param int $amount_of_weeks Number of weeks to include after the starting week.
 * @return array List of arrays with the integer keys 'week' and 'year',
 *               containing $amount_of_weeks + 1 entries.
 */
function get_weeks($amount_of_weeks)
{
    $weeks = array();

    // Current ISO-8601 week and its matching ISO week-numbering year.
    // 'o' (not 'Y') is used because around New Year the ISO week can belong
    // to a different year than the calendar date.
    $week_now = (int) date('W');
    $year_now = (int) date('o');

    // Demo
    // NOTE(review): this hard-coded week overrides the current week computed
    // above; remove it once the script should follow the real calendar.
    $week_now = 44;

    // Last week to fetch
    $last_week = $week_now + $amount_of_weeks;

    // Add weeks to array
    for ($i = $week_now; $i <= $last_week; $i++)
    {
        // Map the running week counter onto 1..AMOUNT_OF_WEEKS_IN_A_YEAR and
        // count how many year boundaries it has crossed, so ranges spanning
        // more than one year still produce valid week numbers.
        $years_ahead = (int) floor(($i - 1) / AMOUNT_OF_WEEKS_IN_A_YEAR);

        $weeks[] = array(
            'week' => (($i - 1) % AMOUNT_OF_WEEKS_IN_A_YEAR) + 1,
            'year' => $year_now + $years_ahead,
        );
    }

    return $weeks;
}

// Driver: build the week list, collect the lection links for those weeks,
// then scrape every linked lection page.
$weeks    = get_weeks(5);
$links    = get_links_for_lections($weeks);
$lections = get_lections($links);

// Dump the collected lections between two horizontal rules.
echo "<hr />", print_r($lections, true), "<hr />";

回答

1

我运行了这段代码,它工作正常,一直跑到了第 96 个。如果让我猜,我认为你达到了最大执行时间(max execution time)。请尝试在脚本顶部添加:set_time_limit(0); 否则,请尝试调整错误报告级别,并把出现的任何错误贴到这里。

+1

它在你那边能正常运行吗?真奇怪。设置 'set_time_limit(0)' 没有带来任何变化。我想我的虚拟主机不允许我这样做。这看起来并不像是超时,因为页面加载并没有花很长时间。而且我觉得奇怪的是,如果是超时,它为什么每次都停在同一个位置。 – simonbs

+0

我刚刚在另一台服务器上测试过。你是对的,似乎最大的执行时间已经达到。 – simonbs