2008-10-29 46 views
5

我想在c#中使用PDFSharp来访问现有PDF文档中的PdfItem对象树。如何浏览PDFSharp中的PDF对象树?

我想创建所有对象的层次结构 - 类似于“PDF Explorer”示例 - 但我希望它是一个树而不是所有对象的平面列表。

根节点是document.Internals.Catalog。我想要遍历所有document.Internals.Catalog.Elements,直到我访问了每个元素。

我碰到的一个问题是树中有循环引用,我无法弄清楚如何检测它们。

有没有代码示例?

回答

6

PDFSharp论坛上marihanzo发布的这篇帖子帮我们解决了问题:

http://forum.pdfsharp.net/viewtopic.php?f=2&t=527&p=1603

我们遇到的唯一问题是这些字段中 \r\n 的处理。下面附上代码的副本,以防论坛帖子丢失。

PDFParser.cs

/// <summary>
/// Extracts the text shown by an uncompressed PDF content stream by scanning it
/// byte by byte for text objects and the literal strings inside them.
/// </summary>
/// <remarks>
/// Content-stream operators the scanner reacts to:
/// <list type="bullet">
/// <item><description>BT / ET - begin / end of a text object.</description></item>
/// <item><description>TD / Td - appends "\n\r" to the output.</description></item>
/// <item><description>T*, ' and " - append "\n" to the output.</description></item>
/// <item><description>Tj - appends a single space to the output.</description></item>
/// </list>
/// An operator is recognised only when it is delimited on both sides by a space,
/// CR or LF. The text-rise operator (e.g. "5 Ts" = superscript, "-5 Ts" = subscript)
/// is not interpreted.
/// </remarks>
public class PDFParser
{
    #region Fields

    /// <summary>
    /// Size of the sliding window of most recently read characters that is
    /// inspected to recognise operators.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Operator groups, allocated once instead of once per scanned byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] MoveTextTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };

    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Scans an uncompressed content stream and returns the characters found
    /// inside literal strings "( ... )" of its text objects.
    /// </summary>
    /// <param name="input">Bytes of the uncompressed content stream.</param>
    /// <returns>
    /// The extracted text, or an empty string when <paramref name="input"/> is
    /// null or empty, or when scanning fails unexpectedly.
    /// </returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        try
        {
            // Fully qualified because the file's using directives are not visible here.
            System.Text.StringBuilder text = new System.Text.StringBuilder();

            // True while the scanner is between BT and ET.
            bool inTextObject = false;

            // True when the previous character was an unescaped backslash, so the
            // current character is taken literally (e.g. "\(" yields "(").
            bool nextLiteral = false;

            // 1 while inside a "( ... )" literal string, 0 otherwise.
            int bracketDepth = 0;

            // Sliding window of the last characters read, pre-filled with spaces so
            // an operator at the very start of the stream has a leading delimiter.
            char[] recent = new char[NumberOfCharsToKeep];
            for (int j = 0; j < recent.Length; j++)
            {
                recent[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];

                if (inTextObject)
                {
                    // Outside a literal string the window may end in a positioning or
                    // showing operator; translate it into whitespace in the output.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTextTokens, recent))
                        {
                            // NOTE(review): LF followed by CR is the reverse of the usual
                            // CR LF pair; kept unchanged so existing callers see the same output.
                            text.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, recent))
                        {
                            text.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, recent))
                        {
                            text.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, recent))
                    {
                        // End of the text object.
                        inTextObject = false;
                        text.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Opening bracket: start collecting characters.
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Unescaped closing bracket: stop collecting characters.
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Escape prefix: emit nothing, take the next character as-is.
                            nextLiteral = true;
                        }
                        else
                        {
                            // Keep printable ASCII and the 128..254 range only.
                            if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                            {
                                text.Append(c);
                            }

                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window one position and append the current character.
                for (int j = 0; j < recent.Length - 1; j++)
                {
                    recent[j] = recent[j + 1];
                }
                recent[recent.Length - 1] = c;

                // Start of a text object.
                if (!inTextObject && CheckToken(BeginTextTokens, recent))
                {
                    inTextObject = true;
                }
            }

            return text.ToString();
        }
        catch
        {
            // Deliberate best-effort contract: callers get an empty string,
            // never an exception.
            return "";
        }
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Tells whether the window of recent characters ends with one of the given
    /// operators, delimited on both sides by a space, CR or LF.
    /// </summary>
    /// <param name="tokens">Operators to look for; each may be one or more characters long.</param>
    /// <param name="recent">Window of recent characters, newest last.</param>
    /// <returns>True when any of the operators was just read.</returns>
    /// <remarks>
    /// Every token in the list is tested. A single-character token such as ' or "
    /// must not end the search, otherwise tokens listed after it (e.g. T*) would
    /// never be recognised.
    /// </remarks>
    private static bool CheckToken(string[] tokens, char[] recent)
    {
        if (tokens == null || recent == null)
        {
            return false;
        }

        int last = recent.Length - 1;

        foreach (string token in tokens)
        {
            if (string.IsNullOrEmpty(token))
            {
                continue;
            }

            // Index of the token's first character when it sits just before the
            // trailing delimiter; one more slot is needed for the leading delimiter.
            int start = last - token.Length;
            if (start < 1)
            {
                continue;
            }

            if (!IsDelimiter(recent[last]) || !IsDelimiter(recent[start - 1]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[start + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
            {
                return true;
            }
        }

        return false;
    }

    /// <summary>
    /// True for the characters accepted as operator delimiters: space, CR and LF.
    /// </summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
    #endregion
}

和调用代码:

/// <summary>
/// Opens the PDF at <c>_sDirectory + _sFileName</c> read-only and concatenates the
/// text that <see cref="PDFParser.ExtractTextFromPDFBytes"/> returns for every
/// content element of every page.
/// </summary>
/// <returns>
/// The text collected so far. When an exception interrupts processing, it is wrapped
/// in a <c>PDF_ParseException</c>, whose <c>Log</c> and <c>ToPdf</c> methods are
/// called, and the text gathered up to that point is still returned.
/// </returns>
public override String ExtractText()
{
    String collected = "";
    try
    {
        String pdfPath = this._sDirectory + this._sFileName;
        PdfDocument document = PdfReader.Open(pdfPath, PdfDocumentOpenMode.ReadOnly);

        foreach (PdfPage pdfPage in document.Pages)
        {
            // Index-based access: GetDictionary(index) is what yields the dictionary
            // that carries the stream for each content element.
            int streamIndex = 0;
            while (streamIndex < pdfPage.Contents.Elements.Count)
            {
                PdfDictionary.PdfStream contentStream =
                    pdfPage.Contents.Elements.GetDictionary(streamIndex).Stream;

                PDFParser parser = new PDFParser();
                collected = collected + parser.ExtractTextFromPDFBytes(contentStream.Value);

                streamIndex++;
            }
        }
    }
    catch (Exception failure)
    {
        // Report the failure through the project's exception type; no rethrow,
        // so the caller still receives whatever text was collected.
        PDF_ParseException wrapped = new PDF_ParseException(this, failure);
        wrapped.Log();
        wrapped.ToPdf(this._sDirectoryException);
    }

    return collected;
}
0

读取并解析整个集合,在内存中构建自己的树,然后遍历这棵树。