2012-07-26 66 views
1

这是我尝试用 Simple HTML DOM 解析的页面。我已经完成了 90% 的功能,但由于我刚接触这个库,不太确定接下来该怎么做。使用 Simple HTML DOM 删除某些元素

我想抓取每条新闻的文本,但由于这些文本位于 <p> 元素中,使用 ->innertext 之类的属性会把其中的所有内容都取出来,包括链接。

这是我已经尝试过的代码:

<h1>Scraper Noticias</h1> 

<?php 

include('simple_html_dom.php'); 

/**
 * Plain data holder for a single scraped news item.
 *
 * Every field starts out as null and is filled in through its setter.
 * Members are declared with explicit `public` visibility instead of the
 * PHP 4 style `var` keyword / implicit visibility; access rules are unchanged.
 */
class News {
    /** @var string|null Thumbnail image URL (the <img> src). */
    public $image;

    /** @var string|null Post date text of the news item. */
    public $fechanoticia;

    /** @var string|null Headline of the news item. */
    public $title;

    /** @var string|null Description text of the news item. */
    public $description;

    /** @var string|null Link to the full article. */
    public $sourceurl;

    /** @return string|null The stored thumbnail image URL. */
    public function get_image() {
        return $this->image;
    }

    /** @param string $new_image Thumbnail image URL to store. */
    public function set_image($new_image) {
        $this->image = $new_image;
    }

    /** @return string|null The stored post date text. */
    public function get_fechanoticia() {
        return $this->fechanoticia;
    }

    /** @param string $new_fechanoticia Post date text to store. */
    public function set_fechanoticia($new_fechanoticia) {
        $this->fechanoticia = $new_fechanoticia;
    }

    /** @return string|null The stored headline. */
    public function get_title() {
        return $this->title;
    }

    /** @param string $new_title Headline to store. */
    public function set_title($new_title) {
        $this->title = $new_title;
    }

    /** @return string|null The stored description text. */
    public function get_description() {
        return $this->description;
    }

    /** @param string $new_description Description text to store. */
    public function set_description($new_description) {
        $this->description = $new_description;
    }

    /** @return string|null The stored article link. */
    public function get_sourceurl() {
        return $this->sourceurl;
    }

    /** @param string $new_sourceurl Article link to store. */
    public function set_sourceurl($new_sourceurl) {
        $this->sourceurl = $new_sourceurl;
    }
}

// Create DOM from URL or file.
$html = file_get_html('http://www.uvm.cl/noticias_mas.shtml');

// Depending on the Simple HTML DOM version, file_get_html() can return false
// when the page cannot be loaded; bail out instead of calling find() on a
// non-object.
if (!$html) {
    exit('Could not load the news page.');
}

// Collects one News object per parsed news item.
$parsedNews = array();

// Find all news items: each <p> inside #cont2 is treated as one item.
foreach ($html->find('#cont2 p') as $element) {

    $newItem = new News;

    // Parse the news item's thumbnail image (the last <img> found wins).
    foreach ($element->find('img') as $image) {
        $newItem->set_image($image->src);
    }

    // Parse the news item's post date.
    foreach ($element->find('span.fechanoticia') as $fecha) {
        $newItem->set_fechanoticia($fecha->innertext);
    }

    // Parse the news item's title and source URL. Both come from the same
    // anchors, so a single pass over them is enough.
    foreach ($element->find('a') as $anchor) {
        $newItem->set_title($anchor->innertext);
        $newItem->set_sourceurl("http://www.uvm.cl/" . $anchor->href);
    }

    // Parse the news items' description text.
    // Echo the element itself: the variable previously echoed here was never defined.
    echo $element; //This is the entire <p> tag. How can I get just the text. Not the link?

    // Keep the item; otherwise it is discarded at the end of each iteration.
    $parsedNews[] = $newItem;
}

?> 
+0

我刚测试了一下,它返回了 7 个链接。你是只想要文本、并把链接去掉吗? – 2012-07-26 22:50:33

+0

@保罗:没错。 :) 这正是我遇到的问题。我想要的是文本,不要链接。 – 2012-07-26 22:51:34

+0

见下文...... – 2012-07-26 22:59:50

回答

1

这是我找到的一个解决方案。不过,如果有人能帮忙改进这段代码,我将不胜感激。

<h1>Scraper Noticias</h1> 

<?php 

include('simple_html_dom.php'); 

/**
 * Plain data holder for a single scraped news item.
 *
 * Every field starts out as null and is filled in through its setter.
 * Members are declared with explicit `public` visibility instead of the
 * PHP 4 style `var` keyword / implicit visibility; access rules are unchanged.
 */
class News {
    /** @var string|null Thumbnail image URL (the <img> src). */
    public $image;

    /** @var string|null Post date text of the news item. */
    public $fechanoticia;

    /** @var string|null Headline of the news item. */
    public $title;

    /** @var string|null Description text of the news item. */
    public $description;

    /** @var string|null Link to the full article. */
    public $sourceurl;

    /** @return string|null The stored thumbnail image URL. */
    public function get_image() {
        return $this->image;
    }

    /** @param string $new_image Thumbnail image URL to store. */
    public function set_image($new_image) {
        $this->image = $new_image;
    }

    /** @return string|null The stored post date text. */
    public function get_fechanoticia() {
        return $this->fechanoticia;
    }

    /** @param string $new_fechanoticia Post date text to store. */
    public function set_fechanoticia($new_fechanoticia) {
        $this->fechanoticia = $new_fechanoticia;
    }

    /** @return string|null The stored headline. */
    public function get_title() {
        return $this->title;
    }

    /** @param string $new_title Headline to store. */
    public function set_title($new_title) {
        $this->title = $new_title;
    }

    /** @return string|null The stored description text. */
    public function get_description() {
        return $this->description;
    }

    /** @param string $new_description Description text to store. */
    public function set_description($new_description) {
        $this->description = $new_description;
    }

    /** @return string|null The stored article link. */
    public function get_sourceurl() {
        return $this->sourceurl;
    }

    /** @param string $new_sourceurl Article link to store. */
    public function set_sourceurl($new_sourceurl) {
        $this->sourceurl = $new_sourceurl;
    }
}

// Create DOM from URL or file.
$html = file_get_html('http://www.uvm.cl/noticias_mas.shtml');

// Depending on the Simple HTML DOM version, file_get_html() can return false
// when the page cannot be loaded; bail out instead of calling find() on a
// non-object.
if (!$html) {
    exit('Could not load the news page.');
}

// Collects one fully populated News object per parsed news item.
$parsedNews = array();

// Find all news items: each <p> inside #cont2 is treated as one item.
foreach ($html->find('#cont2 p') as $element) {

    $newItem = new News;

    // Parse the news item's thumbnail image (the last <img> found wins).
    foreach ($element->find('img') as $image) {
        $newItem->set_image($image->src);
    }

    // Parse the news item's post date.
    foreach ($element->find('span.fechanoticia') as $fecha) {
        $newItem->set_fechanoticia($fecha->innertext);
    }

    // Parse the news item's title and source URL. Both come from the same
    // anchors, so a single pass over them is enough.
    foreach ($element->find('a') as $anchor) {
        $newItem->set_title($anchor->innertext);
        $newItem->set_sourceurl("http://www.uvm.cl/" . $anchor->href);
    }

    // Parse the news item's description text.
    // Everything of interest inside the child elements has been captured above,
    // so blank out the links, spans and images; what is left in the paragraph's
    // innertext is the description. This must run AFTER the extraction loops.
    foreach ($element->find('a, span, img') as $node) {
        $node->outertext = '';
    }

    // Store the description on the item as well as printing it.
    $newItem->set_description($element->innertext);
    echo $newItem->get_description();

    // Keep the item; otherwise it is discarded at the end of each iteration.
    $parsedNews[] = $newItem;
}

?> 
0

使用 innertext 代替 outertext

// Print the inner HTML of every anchor found inside the current element,
// each followed by a line break.
foreach ($element->find('a') as $anchor) {
    $anchorHtml = $anchor->innertext;
    echo $anchorHtml, "<br />";
}