如何在不支付组件的情况下将HTML转换为RTF（Rich Text）？

34

其实有一个简单且免费的解决方案：利用浏览器控件。下面是我用过的技巧：

// Load the HTML into a WebBrowser control, copy the whole rendered document to
// the clipboard, and paste it into a rich-text control.
// Note: this overwrites whatever is currently on the clipboard.
// NOTE(review): clipboard access presumably requires an STA thread ([STAThread]) — confirm for console hosts.
var webBrowser = new WebBrowser();
webBrowser.CreateControl(); // only if needed
webBrowser.DocumentText = *yourhtmlstring*;
// Pump pending window messages until the control reports the assigned markup back.
while (webBrowser.DocumentText != *yourhtmlstring*)
{
    Application.DoEvents();
}
webBrowser.Document.ExecCommand("SelectAll", false, null);
webBrowser.Document.ExecCommand("Copy", false, null);
*yourRichTextControl*.Paste();

这可能比其他方法慢，但至少它是免费的，而且确实可行！

来源

2011-01-31 18:34:04 Spartaco

1

也许你需要的是一个用于编辑HTML的控件？

来源

2008-09-30 08:10:05 GvS

3

这当然不完美，但这里是我用来将HTML转换为纯文本的代码。

/// <summary>
/// Produces a rough plain-text rendering of an HTML string: the head, script and
/// style elements are dropped, structural tags become tabs or line breaks, every
/// remaining tag is stripped, and a handful of character entities are decoded.
/// </summary>
/// <param name="source">The HTML markup to convert. Must not be null.</param>
/// <returns>The extracted text, using "\r" as the line-break character.</returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="source"/> is null.
/// </exception>
/// <remarks>
/// This is a regex-based approximation, not an HTML parser. Rules are applied
/// strictly in the order listed below, and later rules rely on the output of
/// earlier ones.
/// </remarks>
public static string ConvertHtmlToText(string source) {

    if (source == null) {
        throw new System.ArgumentNullException("source");
    }

    // Line breaks in the markup render as a space in a browser, and
    // indentation tabs are not rendered at all.
    string result = source
        .Replace("\r", " ")
        .Replace("\n", " ")
        .Replace("\t", string.Empty);

    // Each entry is { regex pattern, replacement }.
    string[][] rules = new string[][] {
        // Browsers collapse runs of spaces into a single space.
        new string[] { @"( )+", " " },

        // Remove the header. The tags are first reduced to a bare form
        // (attributes and inner spacing stripped) so the element can be matched
        // literally. The lazy ".*?" stops at the first closing tag, so text
        // between two separate elements is not swallowed.
        new string[] { @"<( )*head([^>])*>", "<head>" },
        new string[] { @"(<( )*(/)( )*head( )*>)", "</head>" },
        new string[] { @"(<head>).*?(</head>)", string.Empty },

        // Remove all scripts (same normalize-then-remove approach).
        new string[] { @"<( )*script([^>])*>", "<script>" },
        new string[] { @"(<( )*(/)( )*script( )*>)", "</script>" },
        new string[] { @"(<script>).*?(</script>)", string.Empty },

        // Remove all style sheets (same normalize-then-remove approach).
        new string[] { @"<( )*style([^>])*>", "<style>" },
        new string[] { @"(<( )*(/)( )*style( )*>)", "</style>" },
        new string[] { @"(<style>).*?(</style>)", string.Empty },

        // Table cells become tabs.
        new string[] { @"<( )*td([^>])*>", "\t" },

        // <br> (including the self-closing forms <br/> and <br />) and <li>
        // become single line breaks.
        new string[] { @"<( )*br( )*(/)?( )*>", "\r" },
        new string[] { @"<( )*li( )*>", "\r" },

        // <div>, <tr> and <p> start a new paragraph (double line break).
        new string[] { @"<( )*div([^>])*>", "\r\r" },
        new string[] { @"<( )*tr([^>])*>", "\r\r" },
        new string[] { @"<( )*p([^>])*>", "\r\r" },

        // Strip every remaining tag: links, images, comments, closing tags...
        new string[] { @"<[^>]*>", string.Empty },

        // Decode the supported character entities. This runs after tag
        // stripping so that a decoded "<" or ">" is never mistaken for markup.
        new string[] { @"&nbsp;", " " },
        new string[] { @"&bull;", " * " },
        new string[] { @"&lsaquo;", "<" },
        new string[] { @"&rsaquo;", ">" },
        new string[] { @"&trade;", "(tm)" },
        new string[] { @"&frasl;", "/" },
        new string[] { @"&lt;", "<" },
        new string[] { @"&gt;", ">" },
        new string[] { @"&copy;", "(c)" },
        new string[] { @"&reg;", "(r)" },
        // Drop any other entity-looking token.
        new string[] { @"&(.{2,6});", string.Empty },

        // Remove spaces sandwiched between line breaks and/or tabs.
        new string[] { "(\r)( )+(\r)", "\r\r" },
        new string[] { "(\t)( )+(\t)", "\t\t" },
        new string[] { "(\t)( )+(\r)", "\t\r" },
        new string[] { "(\r)( )+(\t)", "\r\t" },
        // Remove tabs that sit alone between two line breaks.
        new string[] { "(\r)(\t)+(\r)", "\r\r" },
        // Several tabs right after a line break become a single tab.
        new string[] { "(\r)(\t)+", "\r\t" },

        // Cap runs at two line breaks and at four tabs.
        new string[] { "\r{3,}", "\r\r" },
        new string[] { "\t{5,}", "\t\t\t\t" }
    };

    foreach (string[] rule in rules) {
        result = System.Text.RegularExpressions.Regex.Replace(
            result, rule[0], rule[1],
            System.Text.RegularExpressions.RegexOptions.IgnoreCase);
    }

    return result;
}

来源

2008-09-30 21:11:35 Andrew

+8

踩（downvote）的原因在这里有精彩的解释：http://stackoverflow.com/questions/1732348/regex-match-open-tags-except-xhtml-self-contained-tags/1732454#1732454 – 2011-06-30 07:26:14

+0

讽刺的是，这与XSLT可能出错的原因几乎相同：HTML很混乱，很少有文档是适合直接转换的规范XML。我猜一个合适的解决方案会先用一点正则表达式把文档清理得足够干净，然后再进行正式的XSLT转换。 – Menefee 2012-06-23 03:47:07

8

请看CodeProject上这篇关于XHTML2RTF的文章（我不是原作者，我是根据网上找到的代码改编的）。

来源

2009-04-16 03:03:40

+0

非常适合XHTML，但是正如您从阅读该名称时猜测的那样，对于非XHTML /“香草HTML”不适用... – sager89 2014-08-29 15:08:56

+0

太棒了！我用它做了一个控制台应用程序。需要在控制台的Main方法之前添加[STAThread]。 – dforce 2016-11-03 15:02:37

4

在Spartaco答案的基础上扩展，我实现了下面的代码，效果非常好！

' Load the generated HTML into a WebBrowser control created in code, copy the
' whole document to the clipboard, paste it into a RichTextBox and save that
' control's content to DocumentFileName.
Using browser As New WebBrowser()
    browser.CreateControl()
    browser.DocumentText = sbHTMLDoc.ToString()

    ' Pump pending window messages until the control reports the assigned markup back.
    Do Until browser.DocumentText = sbHTMLDoc.ToString()
        Application.DoEvents()
    Loop

    With browser.Document
        .ExecCommand("SelectAll", False, Nothing)
        .ExecCommand("Copy", False, Nothing)
    End With

    Using rtfBox As New RichTextBox()
        rtfBox.Paste()
        rtfBox.SaveFile(DocumentFileName)
    End Using
End Using

来源

2011-02-17 21:01:21 cjbarth

如何在不支付组件的情况下将HTML转换为RTF（Rich Text）？

回答

相关问题