2010-03-11 56 views
2

我有一个特定的需求：从 PDF 文件中的特定区域提取文本和图像。该区域可能是用户选定的区域、高亮显示的区域，或者由一组给定的坐标确定。

（标题：从选定区域或坐标中提取 PDF 文本和图像）

就我查到的资料来看，现有的方法都是从整个 PDF 中提取图像和文本，而不是从指定的位置提取。我尝试过 iTextSharp、Syncfusion、Aspose，但没有找到更好的方法。

如果有人能帮助我，我将不胜感激。欢迎分享关于如何在 .NET 中实现这一需求的想法和建议。

此致，Arun.M

回答

1

以下代码使用 Bytescout PDF Extractor SDK 从 PDF 中提取图像：

using System; 
using System.Data; 
using System.Configuration; 
using System.Collections; 
using System.Drawing.Imaging; 
using System.IO; 
using System.Web; 
using System.Web.Security; 
using System.Web.UI; 
using System.Web.UI.WebControls; 
using System.Web.UI.WebControls.WebParts; 
using System.Web.UI.HtmlControls; 
using Bytescout.PDFExtractor; 

namespace ExtractAllImages
{
    /// <summary>
    /// Web page that loads <c>sample1.pdf</c> from the site folder and streams the
    /// first image found in it back to the client as the HTTP response body.
    /// </summary>
    public partial class _Default : System.Web.UI.Page
    {
        /// <summary>
        /// Extracts the first image of the sample PDF and writes it to the response.
        /// If the document contains no images, an empty response is returned.
        /// </summary>
        /// <param name="sender">Event source (unused).</param>
        /// <param name="e">Event data (unused).</param>
        protected void Page_Load(object sender, EventArgs e)
        {
            // This test file will be copied to the project directory on the pre-build event (see the project properties).
            // Resolve it to a physical path: a bare relative name would be looked up in the
            // worker process's current directory rather than in the web application folder.
            String inputFile = Server.MapPath("sample1.pdf");

            // Create Bytescout.PDFExtractor.ImageExtractor instance
            ImageExtractor extractor = new ImageExtractor();
            try
            {
                extractor.RegistrationName = "demo";
                extractor.RegistrationKey = "demo";

                // Load sample PDF document from the resolved physical path
                extractor.LoadDocumentFromFile(inputFile);

                Response.Clear();

                // A single HTTP response can carry only one image, so only the first
                // image of the enumeration is sent; the remaining ones are not visited.
                if (extractor.GetFirstImage())
                {
                    const string imageFileName = "image0.png";

                    // No text may be written to the response here: the body must contain
                    // nothing but the image bytes, otherwise the client receives a corrupt file.
                    // NOTE(review): assumes GetCurrentImageAsArrayOfBytes() yields PNG data,
                    // as the original content type implied - confirm against the SDK docs.
                    Response.ContentType = "image/png";
                    Response.AddHeader("Content-Disposition", "inline;filename=" + imageFileName);

                    // Write the image bytes into the Response output stream
                    Response.BinaryWrite(extractor.GetCurrentImageAsArrayOfBytes());
                }
            }
            finally
            {
                // Release the document and any resources held by the extractor.
                extractor.Dispose();
            }

            Response.End();
        }
    }
}