2012-07-23 63 views
0

有人能告诉我如何获取文本坐标吗?这可行吗?我想做一个 Windows 窗体应用程序:用户在文本框中输入单词,应用程序使用 iTextSharp 读取现有的 PDF,如果找到匹配的单词就将其突出显示,并把带有高亮的文本另存为 PDF。到目前为止,我几乎完成了所有工作,包括绘制黄色矩形,但还缺少的是如何获取匹配文本的坐标,以便把它们突出显示。先谢谢了!(顺便说一下:sb 是搜索文本框,tb 是显示 PDF 文本的富文本框;找到单词后,希望先得到其坐标并在该处绘制矩形来高亮,然后保存带高亮的 PDF。)

using System; 
using System.Collections.Generic; 
using System.ComponentModel; 
using System.Data; 
using System.Drawing; 
using System.Linq; 
using System.Text; 
using System.Windows.Forms; 
using System.IO; 
using iTextSharp.text.pdf; 
using iTextSharp.text.pdf.parser; 
using iTextSharp.text; 
using System.Text.RegularExpressions; 

namespace manipulatePDF 
{ 
    /// <summary>
    /// Main form: opens a PDF, shows its extracted text in the rich text box
    /// <c>tb</c>, highlights occurrences of the search term typed into <c>sb</c>,
    /// and saves a copy of the PDF with a yellow marker on every page that
    /// contains the term.
    /// </summary>
    public partial class Form1 : Form
    {
        // Path of the PDF most recently chosen in open_Click; null until a file is opened.
        string oldFile;

        // Text extracted from all pages of the currently opened PDF (mirrored into tb).
        StringBuilder text = new StringBuilder();

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Lets the user pick a PDF, extracts the text of every page and shows it in <c>tb</c>.
        /// </summary>
        private void open_Click(object sender, EventArgs e)
        {
            reset_Click(sender, e);

            openFileDialog1.Filter = "PDF Files (.pdf)|*.pdf";
            openFileDialog1.FilterIndex = 1;

            if (openFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            label1.Text = "File Location: " + openFileDialog1.FileName;
            oldFile = openFileDialog1.FileName;

            PdfReader reader = new PdfReader(oldFile);
            try
            {
                for (int cPage = 1; cPage <= reader.NumberOfPages; cPage++)
                {
                    text.Append(ExtractPageText(reader, cPage));
                }
            }
            finally
            {
                // Close exactly once, after ALL pages were read (the reader used to be
                // closed inside the loop, i.e. right after the first page).
                reader.Close();
            }

            tb.Text = text.ToString();
        }

        /// <summary>
        /// Highlights the search term in <c>tb</c> and writes a copy of the opened PDF
        /// in which every page containing the term carries a yellow marker rectangle.
        /// </summary>
        private void save_Click(object sender, EventArgs e)
        {
            if (string.IsNullOrEmpty(oldFile))
            {
                MessageBox.Show("Open a PDF file before saving.");
                return;
            }

            // An empty search term cannot be highlighted and made the page scan throw.
            string textToSearch = sb.Text;
            if (string.IsNullOrEmpty(textToSearch))
            {
                MessageBox.Show("Enter the text to search for before saving.");
                return;
            }

            saveFileDialog1.InitialDirectory = "C: ";
            saveFileDialog1.Title = "Save the PDF File";
            saveFileDialog1.Filter = "PDF files (*.pdf)|*.pdf";

            if (saveFileDialog1.ShowDialog() != System.Windows.Forms.DialogResult.OK)
            {
                return;
            }

            HighlightMatchesInTextBox(textToSearch);

            string newFile = saveFileDialog1.FileName;
            PdfReader reader = new PdfReader(oldFile);
            try
            {
                using (FileStream fs = new FileStream(newFile, FileMode.Create, FileAccess.Write))
                {
                    // A fresh Document per save: a Document that has been closed
                    // cannot be opened again, so it must not live in a field.
                    Document document = new Document(reader.GetPageSizeWithRotation(1));
                    PdfWriter writer = PdfWriter.GetInstance(document, fs);
                    document.Open();

                    PdfContentByte cb = writer.DirectContent;

                    // Fill alpha must be within 0..1 (it used to be 10).
                    PdfGState graphicsState = new PdfGState();
                    graphicsState.FillOpacity = 0.5f;

                    int count = 0; // running total of occurrences over the pages processed so far
                    for (int cPage = 1; cPage <= reader.NumberOfPages; cPage++)
                    {
                        if (cPage > 1)
                        {
                            // Give every output page the size of its source page.
                            document.SetPageSize(reader.GetPageSizeWithRotation(cPage));
                            document.NewPage();
                        }

                        string currentText = ExtractPageText(reader, cPage);
                        int pageMatches = CountOccurrences(currentText, textToSearch);
                        count += pageMatches;

                        if (count != 0)
                        {
                            if (contain.Checked)
                            {
                                label2.Text = "Number of pages: " + cPage + " - " + textToSearch + " found " + count + " times. \n";
                            }
                            else if (exact.Checked)
                            {
                                // TODO: count only the matches that are bounded by a space or a dot.
                            }
                        }

                        // Copy the source page into the new document.
                        PdfImportedPage page = writer.GetImportedPage(reader, cPage);
                        cb.AddTemplate(page, 0, 0);

                        if (pageMatches > 0)
                        {
                            // Draw the marker on top of the page, semi-transparent so the
                            // text underneath stays readable. The state is set per page and
                            // wrapped in Save/Restore so it does not leak into other drawing.
                            // TODO: the position is still a hard-coded placeholder; real match
                            // coordinates have to come from a custom render listener that
                            // records TextRenderInfo positions during text extraction.
                            cb.SaveState();
                            cb.SetGState(graphicsState);
                            cb.SetColorFill(new CMYKColor(0f, 0f, 1f, 0f));
                            cb.Rectangle(document.PageSize.Width - 500f, 600f, 100f, 100f);
                            cb.Fill();
                            cb.RestoreState();
                        }
                    }

                    // Closing the document finishes the PDF; the reader must stay open until here
                    // because imported pages are read from it.
                    document.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }

        /// <summary>
        /// Clears the displayed text and the extracted-text buffer.
        /// </summary>
        private void reset_Click(object sender, EventArgs e)
        {
            tb.Text = "";
            // Also drop the buffered text, otherwise the next opened file is appended to the old one.
            text.Length = 0;
        }

        /// <summary>
        /// Returns the text of one page using the simple extraction strategy.
        /// </summary>
        /// <param name="reader">Open reader for the PDF.</param>
        /// <param name="pageNumber">1-based page number.</param>
        private static string ExtractPageText(PdfReader reader, int pageNumber)
        {
            // The string is returned as extracted: the former round trip through
            // Encoding.Default replaced every character outside the ANSI code page with '?'.
            ITextExtractionStrategy strategy = new SimpleTextExtractionStrategy();
            return PdfTextExtractor.GetTextFromPage(reader, pageNumber, strategy);
        }

        /// <summary>
        /// Counts the (possibly overlapping) occurrences of <paramref name="needle"/>
        /// in <paramref name="haystack"/> using a current-culture comparison.
        /// </summary>
        private static int CountOccurrences(string haystack, string needle)
        {
            if (string.IsNullOrEmpty(haystack) || string.IsNullOrEmpty(needle))
            {
                return 0;
            }

            int count = 0;
            int at = haystack.IndexOf(needle, 0, StringComparison.CurrentCulture);
            while (at >= 0)
            {
                count++;
                int next = at + 1;
                // Never pass a start index beyond the end of the string to IndexOf.
                at = next < haystack.Length
                    ? haystack.IndexOf(needle, next, StringComparison.CurrentCulture)
                    : -1;
            }

            return count;
        }

        /// <summary>
        /// Gives every occurrence of <paramref name="term"/> in <c>tb</c> a gold background.
        /// Uses a case-sensitive search when <c>contain</c> is checked and a whole-word
        /// search when <c>exact</c> is checked; does nothing when neither is checked.
        /// </summary>
        private void HighlightMatchesInTextBox(string term)
        {
            RichTextBoxFinds options;
            if (contain.Checked)
            {
                options = RichTextBoxFinds.MatchCase;
            }
            else if (exact.Checked)
            {
                options = RichTextBoxFinds.WholeWord;
            }
            else
            {
                // Previously this case looped forever because the index never advanced.
                return;
            }

            int start = 0;
            while (start < tb.TextLength)
            {
                // Find selects the match and returns its position, or -1 when there is none.
                int hit = tb.Find(term, start, tb.TextLength, options);
                if (hit < 0)
                {
                    break;
                }

                tb.SelectionBackColor = Color.Gold;
                start = hit + 1;
            }
        }
    }
+0

请参阅我在这里的回答,并查看该帖子中的链接。我建议始终逐词搜索,因为多个单词可能会跨越多行,处理起来会变得很麻烦。 http://stackoverflow.com/a/6527010/231316 – 2012-07-24 13:20:49

+0

谢谢你的时间! – 2012-07-25 09:00:56

回答

3

嗯,我在 Chris 引用的同一个帖子的另一个回答中,添加了一个可下载的 VB.NET 2010 示例,它正好实现了你需要的功能(示例中 PDF 文本显示在富文本框中)。该代码适用于每种字体类型和字体大小,会找出所搜索的单词/句子的所有匹配项,并将每个匹配项作为带有 x/y 位置的矩形返回给 UI,最后将它们全部高亮并保存为新的 PDF。您只需要提供一些初始参数,例如搜索词、文化比较类型、源 PDF 路径和目标 PDF 路径。唯一没有实现的是搜索词/句子被拆分到多行时的特殊情况,但由于可以使用 TextChunk 类中的 SameLine() 方法,这在代码中应该只是一个简单的改动。

+0

非常感谢... – 2012-07-31 13:38:51