My Code Funda: How to read texts from pdf document using Aspose.PDF

Reading text from a PDF document using Aspose.PDF:

#region "Description"

* The text of each page of a PDF document can be extracted using the GetAllTextsFromPdf() method given in this post.

#endregion

#region "Methods"

Method Name: GetAllTextsFromPdf()

/// <summary>
/// Extracts the text of every page of a PDF document supplied as a stream.
/// </summary>
/// <param name="SourceFileStream">
/// Stream containing the PDF document. It is rewound to position 0 before
/// being read, so it must support seeking.
/// </param>
/// <returns>
/// A dictionary mapping the zero-based page index (page number minus one)
/// to the text extracted from that page.
/// </returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="SourceFileStream"/> is null.
/// </exception>
public static Dictionary<int, string> GetAllTextsFromPdf(Stream SourceFileStream)
{
    if (SourceFileStream == null)
    {
        throw new ArgumentNullException("SourceFileStream");
    }

    // Rewind so the document is parsed from its first byte even if the
    // caller has already read from (or just written to) the stream.
    SourceFileStream.Position = 0;

    Dictionary<int, string> pageContent;

    using (Document pdfDocument = new Document(SourceFileStream))
    {
        int pageCount = pdfDocument.Pages.Count;
        pageContent = new Dictionary<int, string>(pageCount);

        // The loop indexes Pages from 1 to Count; the returned dictionary
        // is keyed from 0, hence the "- 1" when storing.
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // A fresh absorber per page keeps each page's text separate.
            TextAbsorber absorber = new TextAbsorber();
            pdfDocument.Pages[pageNumber].Accept(absorber);

            pageContent[pageNumber - 1] = absorber.Text;
        }
    }

    return pageContent;
}

My Code Funda