Monday, 24 February 2014

How to read text from a PDF document using Aspose.PDF

Reading text from a PDF document using Aspose.PDF:

#region "Description"
/*  
  * Text from each page of a PDF document can be extracted using the GetAllTextsFromPdf() method given in this post  
 */
#endregion

#region "Methods"

Method Name: GetAllTextsFromPdf()
/// <summary>
/// Extracts the text of every page of a PDF document supplied as a stream.
/// </summary>
/// <param name="SourceFileStream">
/// Stream containing the PDF document. Its position is reset to 0 before the document is loaded.
/// </param>
/// <returns>
/// Dictionary whose key is the zero-based page index (page number minus one) and whose value
/// is the text extracted from that page.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// <paramref name="SourceFileStream"/> is null.
/// </exception>
public static Dictionary<int, string> GetAllTextsFromPdf(Stream SourceFileStream)
{
    if (SourceFileStream == null)
    {
        throw new System.ArgumentNullException("SourceFileStream");
    }

    // Rewind so the document is read from its first byte even if the caller
    // has already consumed part of the stream.
    SourceFileStream.Position = 0;

    Dictionary<int, string> pageContent = new Dictionary<int, string>();

    using (Document pdfDocument = new Document(SourceFileStream))
    {
        // Pages are addressed from 1 to Count here; the result keys are shifted to start at 0.
        for (int pageNumber = 1; pageNumber <= pdfDocument.Pages.Count; pageNumber++)
        {
            // A new absorber per page, presumably so that its Text holds only
            // the content of the page it has just visited.
            TextAbsorber absorber = new TextAbsorber();
            pdfDocument.Pages[pageNumber].Accept(absorber);

            pageContent[pageNumber - 1] = absorber.Text;
        }
    }

    return pageContent;
}
 }


No comments:

Post a Comment