2016-01-23 64 views
0

我试图抓取一个网站以获取其中的数据。到目前为止,我至少已经能连接到该网站,但现在当我尝试用抓取到的数据设置文本框的文本时,得到的只是一堆下面这样的输出(即标题所说的“HtmlAgilityPack XPath 返回 HtmlAgilityPack.HtmlNodeCollection”):

HtmlAgilityPack.HtmlNodeCollection 

输出的 HtmlAgilityPack.HtmlNodeCollection 的数量与数据的条数相同。下面是我的代码(我知道写得有点马虎):

using System.Collections.Generic; 
using System.Linq; 
using System.Net; 
using System.Text.RegularExpressions; 
using System.Windows.Forms; 
using System; 
using HtmlAgilityPack; 

namespace WindowsFormsApplication1 
{ 
/// <summary>
/// Form whose button handler downloads a web page, selects paragraph nodes with an
/// XPath query and appends text to <c>textBox1</c>.
/// As written, this version appends the collection's type name instead of the node text.
/// </summary>
public partial class Form1 : Form 
{ 
    // Text of the item currently selected in listBox1 (assigned in listBox1_SelectedIndexChanged).
    string choice; 

    /// <summary>Creates the form and runs InitializeComponent (defined outside this listing).</summary>
    public Form1() 
    { 
     InitializeComponent(); 
    } 

    /// <summary>Handler for comboBox1 selection changes; currently does nothing.</summary>
    public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e) 
    { 

    } 

    /// <summary>
    /// Downloads the school-closings page, selects the paragraph nodes and appends one
    /// chunk of text to textBox1 per selected node.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    public void button1_Click(object sender, System.EventArgs e) 
    { 
     HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument(); 
     htmlDoc.OptionFixNestedTags = true; 

     string urlToLoad = "http://www.nbcwashington.com/weather/school-closings/"; 
     HttpWebRequest request = HttpWebRequest.Create(urlToLoad) as HttpWebRequest; 
     request.Method = "GET"; 

     // Debug output of the URL that is about to be requested.
     Console.WriteLine(request.RequestUri.AbsoluteUri); 
     // NOTE(review): the response is never closed or disposed in this method.
     WebResponse response = request.GetResponse(); 

     htmlDoc.Load(response.GetResponseStream(), true); 
     if (htmlDoc.DocumentNode != null) 
     { 
      // Absolute XPath: every <p> that sits six <div> levels below <body>.
      var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p"); 



      if (articleNodes != null && articleNodes.Any()) 
      { 
       foreach (var articleNode in articleNodes) 
       { 

        // This is the line that produces the unwanted output: it re-runs the same
        // query and appends ToString() of the whole collection, so the loop variable
        // articleNode is never used and each iteration appends the collection's type
        // name ("HtmlAgilityPack.HtmlNodeCollection") rather than a node's text.
        textBox1.AppendText(htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p").ToString()); 

       } 
      } 
     } 

     // NOTE(review): reads a line from standard input inside a UI click handler —
     // presumably left over from a console prototype; confirm it is needed.
     Console.ReadLine(); 
    } 

    /// <summary>Stores the string form of the item selected in listBox1 in <c>choice</c>.</summary>
    private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e) 
    { 
     // NOTE(review): SelectedItem is dereferenced without a null check — confirm it
     // cannot be null when this event fires.
     choice = listBox1.SelectedItem.ToString(); 
    } 



} 
} 

那么我在这里遗漏了什么,或者做错了什么?返回的数据应该类似于:

Warren County Public Schools Closed 
Washington Adventist University Closing at Noon 

感谢您关注此问题。

+0

把响应转换为字符串,看看得到的HTML,我们就能告诉你哪里出了问题。此外,这些针对null的检查大多是不需要的。 – mybirthname

回答

0

没关系,我找到问题了。我想我当时取的是文档节点本身,而不是它的内部文本(InnerText)……代码放在这里,以防有人需要。

using System.Collections.Generic; 
using System.Linq; 
using System.Net; 
using System.Text.RegularExpressions; 
using System.Windows.Forms; 
using System; 
using HtmlAgilityPack; 

namespace WindowsFormsApplication1
{
    /// <summary>
    /// Main form of the sample: when the button is clicked it downloads the school-closings
    /// page, selects the paragraph nodes with an XPath query and lists the text of each
    /// node on its own line in <c>textBox1</c>.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Page that is downloaded and scraped.</summary>
        private const string ClosingsUrl = "http://www.nbcwashington.com/weather/school-closings/";

        /// <summary>Absolute XPath selecting every &lt;p&gt; six &lt;div&gt; levels below &lt;body&gt;.</summary>
        private const string ClosingsXPath = "/html/body/div/div/div/div/div/div/p";

        // Text of the item currently selected in listBox1; null when nothing is selected.
        string choice;

        /// <summary>Creates the form and runs InitializeComponent (defined outside this listing).</summary>
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>Handler for comboBox1 selection changes; currently does nothing.</summary>
        public void comboBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
        }

        /// <summary>
        /// Downloads the closings page and appends the text of every matched paragraph
        /// to textBox1, one entry per line.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        public void button1_Click(object sender, System.EventArgs e)
        {
            HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
            htmlDoc.OptionFixNestedTags = true;

            // Direct cast instead of "as": a wrong request type fails here with a clear
            // InvalidCastException rather than a NullReferenceException one line later.
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(ClosingsUrl);
            request.Method = "GET";

            // Debug output of the URL that is about to be requested.
            Console.WriteLine(request.RequestUri.AbsoluteUri);

            try
            {
                // Dispose the response (and with it the underlying stream/connection)
                // as soon as the document has been parsed.
                using (WebResponse response = request.GetResponse())
                {
                    htmlDoc.Load(response.GetResponseStream(), true);
                }
            }
            catch (WebException ex)
            {
                // Network/HTTP failures are expected for a scraper; report them instead
                // of letting the click handler throw.
                MessageBox.Show("Could not download the closings page: " + ex.Message);
                return;
            }

            // SelectNodes returns null (not an empty collection) when nothing matches.
            var articleNodes = htmlDoc.DocumentNode.SelectNodes(ClosingsXPath);
            if (articleNodes == null)
            {
                return;
            }

            foreach (var articleNode in articleNodes)
            {
                // InnerText still contains HTML entities (e.g. &amp;), so decode them.
                string text = HtmlAgilityPack.HtmlEntity.DeEntitize(articleNode.InnerText);

                // A WinForms TextBox only breaks lines on "\r\n", hence Environment.NewLine.
                textBox1.AppendText(text + Environment.NewLine);
            }
        }

        /// <summary>Stores the string form of the item selected in listBox1 in <c>choice</c>.</summary>
        private void listBox1_SelectedIndexChanged(object sender, System.EventArgs e)
        {
            // SelectedItem is null when the selection is cleared; avoid dereferencing it.
            object selected = listBox1.SelectedItem;
            choice = selected == null ? null : selected.ToString();
        }
    }
}

0

由于articleNodes已经包含了你感兴趣的节点,就没有必要在循环中再次调用SelectNodes()。

此外,您不需要检查null,因为articleNodes是一个集合。它可能是空的,但不应该是null。(编者注:HtmlAgilityPack的SelectNodes()在没有任何节点匹配时实际上会返回null,因此保留null检查更安全。)

试试下面的写法,改为访问InnerHtml(或InnerText)属性:

// SelectNodes() returns null — not an empty collection — when the XPath matches nothing,
// so the result must be checked before running LINQ over it.
var articleNodes = htmlDoc.DocumentNode.SelectNodes("/html/body/div/div/div/div/div/div/p"); 

// For each paragraph, take its inner markup, turn the "<br><span>" separator into a
// space and drop the trailing " </span>"; yield an empty list when nothing matched.
var result = articleNodes == null
    ? new List<string>()
    : articleNodes.Select(x => x.InnerHtml.Replace("<br><span>", " ")
                                          .Replace(" </span>", "")).ToList();