2016-08-02 46 views
0

这个问题出现在 iTextSharp 5.5.8 和 5.5.9 中(标题:iTextSharp 的 GetTextFromPage 不返回)。我的测试代码如下:

{
    // Collects the extracted page text, the form-field values and the details
    // of any exception raised along the way.
    StringBuilder actual = new StringBuilder();
    PdfReader pdfReader = null;

    try
    {
        pdfReader = new PdfReader(@"Quotation for Macbook 6-16.pdf");
    }
    catch (iTextSharp.text.exceptions.BadPasswordException bpe)
    {
        actual.AppendLine(string.Format("Exception: Bad Password {0}", bpe));
    }
    catch (Exception ex)
    {
        actual.AppendLine(string.Format("Exception: PDFReader {0}", ex));
    }

    // When the reader could not be opened the failure has already been
    // recorded above; dereferencing the null reader would only replace that
    // diagnostic with a NullReferenceException.
    if (pdfReader != null)
    {
        try
        {
            // Page numbers are 1-based.
            int pages = pdfReader.NumberOfPages;
            for (int page = 1; page <= pages; page++)
            {
                try
                {
                    String s = PdfTextExtractor.GetTextFromPage(pdfReader, page);
                    actual.AppendLine(string.Format("{0}", s));
                }
                catch (Exception ex)
                {
                    // A failure on one page must not prevent the remaining pages from being read.
                    actual.AppendLine(string.Format("Exception PDF Page {0}: {1}", page, ex));
                }
            }

            // Append every AcroForm field as "name: value".
            foreach (var field in pdfReader.AcroFields.Fields)
            {
                actual.AppendLine(string.Format("{0}: {1}", field.Key, pdfReader.AcroFields.GetField(field.Key)));
            }
        }
        finally
        {
            // Release the underlying file even if extraction fails.
            pdfReader.Close();
        }
    }
}

我已经用 GetTextFromPage 处理过成千上万个 PDF 文件,但遇到某个特定的 PDF 时,调用根本不返回。我从 GitHub 下载了源代码并单步调试了该文件的处理过程,看起来是 LineDashPattern.cs 中的 InitFirst 被调用时,下面这个循环的条件导致了死循环:

 /**
  * Positions the pattern at the given phase: walks the dash array, subtracting
  * one element length from the phase per step, and stores the element in which
  * the phase ends as the current element.
  *
  * NOTE(review): the walk is driven by float subtraction. When the phase is very
  * large compared to the element lengths, the subtraction can leave the phase
  * unchanged because of float resolution, in which case the loop never ends
  * (reported for a phase of 6.44245E+8 with elements 28.8 and 9.6).
  *
  * @param phase The distance into the dash pattern at which to start.
  */
 private void InitFirst(float phase) { 
     if (dashArray.Size > 0) { 
      // Consume whole elements until the phase is used up (or overshot).
      while (phase > 0) { 
       phase -= dashArray.GetAsNumber(currentIndex).FloatValue; 
       currentIndex = (currentIndex + 1) % DashArray.Size; 
       elemOrdinalNumber++; 
      } 

      if (phase < 0) { 
       // The last step overshot: step back onto that element and expose only
       // the part of it that lies beyond the phase (-phase).
       --elemOrdinalNumber; 
       --currentIndex; 
       currentElem = new DashArrayElem(-phase, IsEven(elemOrdinalNumber)); 
      } else { 
       // The phase ended exactly on an element boundary: use the element at
       // currentIndex in full.
       currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue, 
        IsEven(elemOrdinalNumber)); 
      } 
     } 
    } 

传入的 phase 为 6.44245E+8,dashArray 中有两个元素:28.8 和 9.6。phase 的数值如此之大,导致第一个 while 循环卡住:在 float 的精度下,减去 28.8 不足以使 phase 变小。

我对内部实现了解得不够,否则我会考虑自己修改。

我其实只对提取文本感兴趣,所以如果有某个设置可以让我过滤掉线条处理,对我来说同样可行。

+0

好的。作为循环条件的浮点运算是不可取的...... – mkl

+0

我纠正并测试了这个问题,并将修改后的LineDashPattern.cs文件发送到[email protected]。修改后的版本基本上是用 phase 除以模式总长度,余数再交由现有例程的其余部分处理。 – Lee

+0

我会建议你在这里发布修改后的代码,以便其他需要修复的人也能在手边。 iText 7是目前主要使用的版本,因此可能需要一些时间才能在官方发行版中应用iText 5.5.x修复程序。 – mkl

回答

1

我更新了LineDashPattern.cs文件。我正在使用iTextSharp,据我所知5.5.9是最新版本,所以iText 7可能是Java。

无论如何,下面是我更新后的代码。我在该类中添加了一个私有字段 elts(线型各元素长度之和),修改了 DashArray 属性的 setter,使其根据当前的 dashArray 更新 elts,最后修改了 InitFirst 方法:先用 phase 除以 elts,在一条语句中完成批量计算,然后再进入原有代码去定位实际的元素。

我不清楚一般情况下传入该例程的 phase 值通常有多大,但就我遇到的这个值而言,即使原来的循环能够正常递减 phase,也要循环将近 1700 万次,所以这一改动应该会快得多;而且处理这个 PDF 时该方法会被多次调用,性能提升就更加明显,更不用说它还修复了这个错误。完整的文件代码如下:

/* 
* $Id$ 
* 
* This file is part of the iText (R) project. 
* Copyright (c) 1998-2016 iText Group NV 
* Authors: Bruno Lowagie, Paulo Soares, et al. 
* 
* This program is free software; you can redistribute it and/or modify 
* it under the terms of the GNU Affero General Public License version 3 
* as published by the Free Software Foundation with the addition of the 
* following permission added to Section 15 as permitted in Section 7(a): 
* FOR ANY PART OF THE COVERED WORK IN WHICH THE COPYRIGHT IS OWNED BY 
* ITEXT GROUP. ITEXT GROUP DISCLAIMS THE WARRANTY OF NON INFRINGEMENT 
* OF THIRD PARTY RIGHTS 
* 
* This program is distributed in the hope that it will be useful, but 
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 
* or FITNESS FOR A PARTICULAR PURPOSE. 
* See the GNU Affero General Public License for more details. 
* You should have received a copy of the GNU Affero General Public License 
* along with this program; if not, see http://www.gnu.org/licenses or write to 
* the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, 
* Boston, MA, 02110-1301 USA, or download the license from the following URL: 
* http://itextpdf.com/terms-of-use/ 
* 
* The interactive user interfaces in modified source and object code versions 
* of this program must display Appropriate Legal Notices, as required under 
* Section 5 of the GNU Affero General Public License. 
* 
* In accordance with Section 7(b) of the GNU Affero General Public License, 
* a covered work must retain the producer line in every PDF that is created 
* or manipulated using iText. 
* 
* You can be released from the requirements of the license by purchasing 
* a commercial license. Buying such a license is mandatory as soon as you 
* develop commercial activities involving the iText software without 
* disclosing the source code of your own applications. 
* These activities include: offering paid services to customers as an ASP, 
* serving PDFs on the fly in a web application, shipping iText with a closed 
* source product. 
* 
* For more information, please contact iText Software Corp. at this 
* address: [email protected] 
*/ 

using System.util; 
using iTextSharp.awt.geom; 

namespace iTextSharp.text.pdf.parser { 

    /** 
    * Represents the line dash pattern. The line dash pattern shall control the pattern 
    * of dashes and gaps used to stroke paths. It shall be specified by a dash array and 
    * a dash phase. 
    * 
    * @since 5.5.6 
    */ 
    public class LineDashPattern {

        private PdfArray dashArray;
        private float dashPhase;

        private int currentIndex;
        // Ordinal (1-based) of the current element; only its parity is used:
        // an even ordinal is reported as a gap, an odd one as a dash.
        private int elemOrdinalNumber = 1;
        private DashArrayElem currentElem;

        // Sum of all dash array elements, i.e. the length of one full pass
        // over the dash array. Kept in sync by UpdatePatternLength().
        private float elts = 0.0F;

        /**
        * Creates new {@link LineDashPattern} object.
        * @param dashArray The dash array. See {@link #getDashArray()}
        * @param dashPhase The dash phase. See {@link #getDashPhase()}
        */
        public LineDashPattern(PdfArray dashArray, float dashPhase) {
            this.dashArray = new PdfArray(dashArray);
            this.dashPhase = dashPhase;
            // The field is assigned directly (not through the property), so the
            // cached pattern length has to be computed here explicitly.
            UpdatePatternLength();
            InitFirst(dashPhase);
        }

        /**
        * Getter and setter for the dash array.
        *
        * The dash array’s elements is number that specify the lengths of
        * alternating dashes and gaps; the numbers are nonnegative. The
        * elements are expressed in user space units.
        *
        * Setting the array also refreshes the cached sum of its elements.
        *
        * @return The dash array.
        */
        public PdfArray DashArray {
            get { return dashArray; }
            set {
                dashArray = value;
                UpdatePatternLength();
            }
        }

        /**
        * Getter and setter for the dash phase.
        *
        * The dash phase shall specify the distance into the dash pattern at which
        * to start the dash. The elements are expressed in user space units.
        *
        * @return The dash phase.
        */
        public float DashPhase {
            get { return dashPhase; }
            set { dashPhase = value; }
        }

        /**
        * Calculates and returns the next element which is either gap or dash.
        * @return The next dash array's element.
        */
        public DashArrayElem Next() {
            DashArrayElem ret = currentElem;

            if (dashArray.Size > 0) {
                currentIndex = (currentIndex + 1) % DashArray.Size;
                currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue,
                    IsEven(++elemOrdinalNumber));
            }

            return ret;
        }

        /**
        * Checks whether the dashed pattern is solid or not. It's solid when the
        * size of a dash array is even and sum of all the units off in the array
        * is 0.<br/>
        * For example: [3 0 4 0 5 0 6 0] (sum is 0), [3 0 4 0 5 1] (sum is 1).
        */
        public bool IsSolid() {
            if (dashArray.Size % 2 != 0) {
                return false;
            }

            float unitsOffSum = 0;

            // The "off" (gap) lengths sit at the odd indices.
            for (int i = 1; i < dashArray.Size; i += 2) {
                unitsOffSum += dashArray.GetAsNumber(i).FloatValue;
            }

            return Util.Compare(unitsOffSum, 0) == 0;
        }

        /**
        * Resets the dash array so that the {@link #next()} method will start
        * from the beginning of the dash array.
        */
        public void Reset() {
            currentIndex = 0;
            elemOrdinalNumber = 1;
            // The array handed out by the getter may have been modified by the
            // caller in the meantime, so refresh the cached sum before using it.
            UpdatePatternLength();
            InitFirst(dashPhase);
        }

        /**
        * Recomputes the cached sum of all dash array elements.
        */
        private void UpdatePatternLength() {
            float sum = 0.0F;
            for (int i = 0; i < dashArray.Size; i++) {
                sum += dashArray.GetAsNumber(i).FloatValue;
            }
            elts = sum;
        }

        /**
        * Positions the pattern at the given phase and stores the element in
        * which the phase ends as the current element.
        *
        * Whole repetitions of the pattern are skipped with a single remainder
        * operation instead of being subtracted element by element. The
        * element-wise loop could run for millions of iterations, or never end
        * at all when one element is too small to change a large float phase.
        *
        * @param phase The distance into the dash pattern at which to start.
        */
        private void InitFirst(float phase) {
            int size = dashArray.Size;
            if (size == 0) {
                return;
            }

            // With a zero-length pattern the phase can never be consumed; in
            // that case start at the first element instead of looping.
            if (elts > 0) {
                // An odd-sized array swaps the dash/gap roles on every pass, so
                // only an even number of passes may be skipped without changing
                // the parity of elemOrdinalNumber: use two passes as the period.
                float period = (size % 2 == 0) ? elts : 2 * elts;
                phase %= period;

                // Walk the remaining (less than one period) distance.
                while (phase > 0) {
                    phase -= dashArray.GetAsNumber(currentIndex).FloatValue;
                    currentIndex = (currentIndex + 1) % size;
                    elemOrdinalNumber++;
                }
            }

            if (phase < 0) {
                // The last step overshot: step back onto that element and expose
                // only the part of it that lies beyond the phase. Next() advances
                // currentIndex before reading, so a value of -1 here is harmless.
                --elemOrdinalNumber;
                --currentIndex;
                currentElem = new DashArrayElem(-phase, IsEven(elemOrdinalNumber));
            } else {
                // The phase ended exactly on an element boundary.
                currentElem = new DashArrayElem(dashArray.GetAsNumber(currentIndex).FloatValue,
                    IsEven(elemOrdinalNumber));
            }
        }

        private bool IsEven(int num) {
            return (num % 2) == 0;
        }

        /**
        * A single element of the dash pattern: a length together with a flag
        * telling whether it is a gap or a dash.
        */
        public class DashArrayElem {

            private float val;
            private bool isGap;

            public DashArrayElem(float val, bool isGap) {
                this.val = val;
                this.isGap = isGap;
            }

            public float Value
            {
                get { return val; }
                set { val = value; }
            }

            public bool IsGap
            {
                get { return isGap; }
                set { isGap = value; }
            }
        }
    }
}