2016-09-15 110 views
0

我想在 PDF 文件中搜索某个单词并将其替换,例如搜索“错误”并替换为“正确”。 我已经用 iText v5.5.9 写出了一个测试程序(参考自 http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm),它基本可以工作(但被替换的原文本似乎仍然显示在最上层)。 我想知道改用 iText 7 是否会更好/更简单,以及是否有人已经实现过或可以提供帮助。 下面是 v5 的测试代码:它用读/写密码从数据库读取 PDF,替换文本后再用同一读/写密码加密并写回。(标题:iText7 .NET 搜索/替换)

' Based on http://www.codeguru.com/columns/vb/manipulating-pdf-files-with-itextsharp-and-vb.net-2012.htm 

Imports System.IO 'Working With Files 
Imports System.Text 'Working With Text 
Imports System.Data.SqlClient 

Imports iTextSharp.text 'Core PDF Text Functionalities 
Imports iTextSharp.text.pdf 'PDF Content 
Imports iTextSharp.text.pdf.parser 'Content Parser 

Imports pdf_clr.LocTextExtraction 'Import LocationTextExtractionStrategy Capabilities 

Public Class Class1 

''' <summary>
''' Loads a PDF from the docstore table, replaces every occurrence of
''' <paramref name="strSearch"/> with <paramref name="strReplace"/> on each page,
''' writes the result to disk as a test check and, when the source id is numeric,
''' passes it to the sp_DOCSTORE_ADD_binary stored procedure.
''' </summary>
''' <param name="strSource">Id of the docstore row to load. The same string is used as the PDF owner (r/w) password.</param>
''' <param name="strDest">Path of the check file to write; when Nothing or empty a random file under C:\tmp\ is used.</param>
''' <param name="iDocType">Not used by this routine; kept for interface compatibility.</param>
''' <param name="strSearch">Text to look for.</param>
''' <param name="strReplace">Text drawn in place of each match.</param>
''' <param name="bCase">True for a case-sensitive search, False to ignore case.</param>
Public Shared Sub ReplacePDFText(ByVal strSource As String, ByVal strDest As String, ByVal iDocType As SByte, ByVal strSearch As String, ByVal strReplace As String, ByVal bCase As Boolean)
    'NOTE(review): the original first assigned "context connection=true" (presumably for running
    'inside SQL CLR) and immediately overwrote it; only the effective value is kept here.
    Dim strSqlConnection As String = "Data Source=SERVER;Initial Catalog=DATABASE;Integrated Security=True"
    Dim dbPDF As Byte() = Nothing 'Document bytes from the database
    Dim strPassword As String = strSource

    'Read the stored PDF. The id is passed as a parameter instead of being concatenated
    'into the SQL text, so a crafted strSource cannot alter the statement.
    Using connection As New SqlConnection(strSqlConnection)
        Using command As New SqlCommand("SELECT pdf FROM docstore WHERE id=@id", connection)
            command.Parameters.AddWithValue("@id", strSource)
            connection.Open()
            'TryCast yields Nothing for a missing row as well as for a NULL (DBNull) column
            dbPDF = TryCast(command.ExecuteScalar(), Byte())
        End Using
    End Using

    If dbPDF Is Nothing Then
        Return 'Nothing to process
    End If

    If String.IsNullOrEmpty(strDest) Then
        strDest = "C:\tmp\" & System.IO.Path.GetRandomFileName() & ".pdf"
    End If

    Dim ownerPassword As Byte() = Encoding.ASCII.GetBytes(strPassword)

    'The comparison mode does not depend on the page, so it is resolved once
    Dim scCase As StringComparison = If(bCase, StringComparison.CurrentCulture, StringComparison.CurrentCultureIgnoreCase)

    Dim pdfFileReader As New PdfReader(dbPDF, ownerPassword) 'Read PDF
    Try
        Using msPDF As New MemoryStream()
            Dim psStamp As New PdfStamper(pdfFileReader, msPDF) 'MemoryStream as destination
            psStamp.Writer.CloseStream = False

            'Re-apply the r/w (owner) password to the output
            psStamp.SetEncryption(Nothing, ownerPassword, PdfWriter.ALLOW_PRINTING, PdfWriter.DO_NOT_ENCRYPT_METADATA)

            For intCurrPage As Integer = 1 To pdfFileReader.NumberOfPages
                Dim lteStrategy As New LocTextExtractionStrategy

                'NOTE(review): replacements are drawn on the UNDER content, i.e. beneath the
                'existing page content; confirm whether GetOverContent is wanted instead.
                Dim pcbContent As PdfContentByte = psStamp.GetUnderContent(intCurrPage)

                lteStrategy.UndercontentCharacterSpacing = pcbContent.CharacterSpacing
                lteStrategy.UndercontentHorizontalScaling = pcbContent.HorizontalScaling

                'The returned text is not needed; the call is made so that the strategy
                'receives the page's text chunks before it is queried for matches.
                PdfTextExtractor.GetTextFromPage(pdfFileReader, intCurrPage, lteStrategy)

                DoSearchReplace(lteStrategy, pcbContent, psStamp, strSearch, strReplace, scCase, "SearchReplaceLayer")
            Next 'page

            psStamp.Close() 'Flushes the stamped document into msPDF
            dbPDF = msPDF.ToArray()
        End Using
    Finally
        pdfFileReader.Close()
    End Try

    ' Write file as check during testing
    File.WriteAllBytes(strDest, dbPDF)

    If IsNumeric(strSource) Then
        Using connection As New SqlConnection(strSqlConnection)
            Using cmd As New SqlCommand("sp_DOCSTORE_ADD_binary", connection) ' updates or inserts into db
                cmd.CommandType = Data.CommandType.StoredProcedure
                ' stored procedure parameters as needed
                cmd.Parameters.Add("@FILE", Data.SqlDbType.VarBinary).Value = dbPDF
                cmd.Parameters.Add("@retvalue", Data.SqlDbType.Int).Direction = Data.ParameterDirection.ReturnValue

                connection.Open()
                cmd.ExecuteNonQuery()
            End Using
        End Using
    End If

End Sub

''' <summary>
''' Covers every occurrence of <paramref name="strSearch"/> found by the extraction strategy
''' with a white rectangle and draws <paramref name="strReplace"/> at the same position,
''' inside an optional-content layer named <paramref name="strLayer"/>.
''' </summary>
''' <param name="lteStrategy">Strategy that has already processed the page and is queried for match rectangles.</param>
''' <param name="pcbContent">Content stream of the page that receives the drawing operations.</param>
''' <param name="psStamp">Stamper whose writer owns the layer that is created.</param>
''' <param name="strSearch">Text to look for.</param>
''' <param name="strReplace">Text drawn over each match.</param>
''' <param name="scCase">Comparison mode passed to the strategy's match lookup.</param>
''' <param name="strLayer">Name of the layer that holds the replacement text.</param>
Public Shared Sub DoSearchReplace(ByRef lteStrategy As LocTextExtractionStrategy, ByRef pcbContent As PdfContentByte, ByRef psStamp As PdfStamper, ByVal strSearch As String, ByVal strReplace As String, ByVal scCase As StringComparison, ByVal strLayer As String)
    'Determine Match(es)
    Dim lstMatches As List(Of iTextSharp.text.Rectangle) = lteStrategy.GetTextLocations(strSearch, scCase)

    'Nothing matched: do not register an empty layer with the writer
    If lstMatches Is Nothing OrElse lstMatches.Count = 0 Then
        Return
    End If

    Dim pdLayer As New PdfLayer(strLayer, psStamp.Writer) 'Layer that holds the replacement text

    'The font is identical for every match, so it is created once
    Dim bfReplace As BaseFont = BaseFont.CreateFont(BaseFont.HELVETICA_OBLIQUE, BaseFont.CP1252, BaseFont.NOT_EMBEDDED)

    For Each rctRect As Rectangle In lstMatches 'Loop Through Each Match

        'Isolate the graphics state per match. Previously the fill colour was set to white
        'only once before the loop and left black by the first iteration, so every later
        'match on the page was blanked out with a black rectangle.
        pcbContent.SaveState()

        'Blank out the area occupied by the matched text
        pcbContent.SetColorFill(BaseColor.WHITE)
        pcbContent.Rectangle(rctRect.Left, rctRect.Bottom, rctRect.Width, rctRect.Height)
        pcbContent.Fill()

        'Draw the replacement text inside the layer. (The original also issued a dark-gray
        'Fill() here with no path defined, which painted nothing and has been dropped.)
        pcbContent.BeginLayer(pdLayer)
        pcbContent.SetGState(New PdfGState())
        pcbContent.SetColorFill(BaseColor.BLACK) 'Fill Letters
        pcbContent.BeginText()
        pcbContent.SetTextMatrix(rctRect.Left, rctRect.Bottom) 'Start at the match's lower-left corner
        pcbContent.SetFontAndSize(bfReplace, 12)
        pcbContent.ShowText(strReplace) 'Replacing Text
        pcbContent.EndText()
        pcbContent.EndLayer()

        pcbContent.RestoreState()

    Next 'rectangle
End Sub

干杯。

+0

你在找VB的解决方案吗?我可以用C#表达我的想法... – mkl

回答

0

基本思路(伪代码)如下:

  1. 实现 IEventListener/ITextExtractionStrategy
  2. 对文档的每一页,把这个类的实例作为参数传给 PdfTextExtractor
  3. 你的类会收到文档中每个事件的通知。你感兴趣的是 TextRenderInfo 类型的事件(即在页面上呈现文本的事件)
  4. 收集这些 TextRenderInfo 事件,并对其排序(按照逻辑阅读顺序),从而得到文档文本的整体视图
  5. 使用正则表达式在整体文本中搜索符合你要求的所有文本,并把匹配到的文本映射回它们所来自的 TextRenderInfo 对象
  6. 根据你已经收集的 TextRenderInfo 对象以及希望替换成的内容,重建 .pdf 文档