#region "Description"
/*
* The text of each page of a PDF document can be extracted using the
* GetAllTextsFromPdf() method given in this post.
*/
#endregion
#region "Methods"
// Method name: GetAllTextsFromPdf()
/// <summary>
/// Reads a PDF document from the given stream and collects the text of each
/// page into a dictionary.
/// </summary>
/// <param name="SourceFileStream">
/// Stream holding the PDF document. When the stream is seekable it is rewound
/// to position 0 before the document is loaded.
/// </param>
/// <returns>
/// Dictionary whose key is the zero-based page index (first page is key 0) and
/// whose value is the text extracted from that page.
/// </returns>
/// <exception cref="System.ArgumentNullException">
/// Thrown when <paramref name="SourceFileStream"/> is null.
/// </exception>
public static Dictionary<int, string> GetAllTextsFromPdf(Stream SourceFileStream)
{
    if (SourceFileStream == null)
    {
        throw new System.ArgumentNullException("SourceFileStream");
    }

    // Rewind so the document is read from its first byte even if the caller
    // has already consumed part of the stream. Non-seekable streams are
    // handed to the document loader as they are.
    if (SourceFileStream.CanSeek)
    {
        SourceFileStream.Position = 0;
    }

    Dictionary<int, string> pageContent = new Dictionary<int, string>();

    using (Document pdfDocument = new Document(SourceFileStream))
    {
        // The page collection is addressed from 1 to Count here, while the
        // result dictionary is keyed from 0, hence the "- 1" below.
        int pageCount = pdfDocument.Pages.Count;
        for (int pageNumber = 1; pageNumber <= pageCount; pageNumber++)
        {
            // A new absorber per page so each entry holds only the text
            // gathered from that single page.
            TextAbsorber absorber = new TextAbsorber();
            pdfDocument.Pages[pageNumber].Accept(absorber);

            // Indexer assignment adds or overwrites in a single lookup.
            pageContent[pageNumber - 1] = absorber.Text;
        }
    }

    return pageContent;
}
}
No comments:
Post a Comment