Monday, 24 February 2014

How to read text from a PDF document using Aspose.PDF

Reading text from a PDF document using Aspose.PDF:

#region "Description"
/*  
  * The text of each page of a PDF document can be extracted using the GetAllTextsFromPdf() method given in this post.
 */
#endregion

#region "Methods"

Method Name: GetAllTextsFromPdf()
/// <summary>
/// Loads a PDF document from the given stream and extracts the text of every page.
/// </summary>
/// <param name="SourceFileStream">
/// Stream containing the PDF document. When the stream is seekable it is rewound to
/// position 0 before the document is loaded.
/// </param>
/// <returns>
/// A dictionary whose key is the zero-based page index (page number minus one) and whose
/// value is the text extracted from that page.
/// </returns>
/// <exception cref="ArgumentNullException">
/// Thrown when <paramref name="SourceFileStream"/> is null.
/// </exception>
public static Dictionary<int, string> GetAllTextsFromPdf(Stream SourceFileStream)
        {
            if (SourceFileStream == null)
                throw new ArgumentNullException("SourceFileStream");

            // Rewind so that a stream which was already read is loaded from its start.
            // Setting Position on a non-seekable stream throws, so only rewind when possible.
            if (SourceFileStream.CanSeek)
                SourceFileStream.Position = 0;

            Dictionary<int, string> pageContent = new Dictionary<int, string>();

            using (Document pdfDocument = new Document(SourceFileStream))
            {
                // Pages are addressed from 1 to Pages.Count here; the result keys start at 0.
                for (int pageNumber = 1; pageNumber <= pdfDocument.Pages.Count; pageNumber++)
                {
                    // A fresh absorber per page keeps each entry limited to that page's text.
                    TextAbsorber absorber = new TextAbsorber();
                    pdfDocument.Pages[pageNumber].Accept(absorber);

                    // The indexer adds or overwrites, so no ContainsKey/Add pre-check is needed.
                    pageContent[pageNumber - 1] = absorber.Text;
                }
            }
            return pageContent;
        }
 }


C#.NET code to normalize an HTML table by removing rowspan and colspan with the help of HtmlAgilityPack



C#.NET code to normalize an HTML table (remove rowspan & colspan) with the help of HtmlAgilityPack

Description: Converts an HTML financial table to its normalized format, taking care of header, stub and financial cells by marking them.

Header: The header of the financial table
Stub: The first non-financial column of the financial table

Diagrammatic Representation:

a) Before Normalization:

Side
Company Abc
2012
2013
Assets
$5,000
6000
ABC Assets
USD300(1)
$4,000x
In Between SubHead
Debts
2000
$10000

b) After Normalization:

Side
Company Abc
2012
2013
Assets
$5,000
6000
ABC Assets
USD300(1)
$4,000x
In Between SubHead
Debts
2000
$10000

Main Calling method:
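1)    FormatAllTables(): Starting point for normalization. Takes the HTML string containing the table(s) as input and returns the normalized HTML.
2)    NormalizeTable(): Performs the normalization of a single table.
3)    InsertCellForRowSpan(): Creates the placeholder cells in the rows covered by a rowspan.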


Supporting methods:
1)    GetCleanText(): Removes HTML space entities, marker tokens and whitespace from a text.
2)    GetNormalizedCellCountIndex(): Returns the index of the last logical column of a row, counting each cell's colspan.



Way of Calling:
1)        string strTable = "table html content";
2)        Call the FormatAllTables method as given below.
        E.g. strTable = FormatAllTables(strTable);


Method Definitions:

/// <summary>
/// Entry point for normalization: loads the given HTML, runs NormalizeTable on every
/// table element found in it and returns the resulting HTML.
/// </summary>
/// <param name="HtmlDocumentString">HTML markup containing zero or more tables.</param>
/// <returns>
/// The outer HTML of the loaded document after processing. Processing is best effort:
/// a failure is traced and whatever state the document is in at that point is returned.
/// </returns>
public string FormatAllTables(string HtmlDocumentString)
        {
            HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();

            // Nothing to load: return the serialization of the untouched document instead of
            // relying on a caught NullReferenceException to reach the same result.
            if (HtmlDocumentString == null)
                return objHTMLdoc.DocumentNode.OuterHtml;

            try
            {
                // Removes every pair of consecutive spaces as well as tabs and line breaks
                // before parsing.
                // NOTE(review): presumably this keeps whitespace-only text from appearing as
                // child nodes of table/tr, which NormalizeTable indexes by position - confirm.
                HtmlDocumentString = HtmlDocumentString.Replace("  ", "").Replace("\t", "").Replace("\n", "").Replace("\r", "");
                objHTMLdoc.LoadHtml(HtmlDocumentString);

                // SelectNodes yields null (not an empty collection) when nothing matches.
                HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
                if (tableList != null)
                {
                    for (int tableIndex = 0; tableIndex < tableList.Count; tableIndex++)
                    {
                        try
                        {
                            HtmlNode tableCopy = tableList[tableIndex];
                            tableCopy.InnerHtml = NormalizeTable(tableCopy);
                        }
                        catch (Exception ex)
                        {
                            // One failing table must not stop the remaining tables.
                            System.Diagnostics.Trace.TraceWarning("FormatAllTables: table " + tableIndex + " could not be normalized: " + ex);
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and fall through to return the current document.
                System.Diagnostics.Trace.TraceWarning("FormatAllTables: HTML could not be processed: " + ex);
            }
            return objHTMLdoc.DocumentNode.OuterHtml;
        }


/// <summary>
/// Removes rowspan and colspan from a table in place by setting both attributes to "1"
/// and inserting placeholder cells for the positions the spanning cell used to cover.
/// The original span values are kept in the "OriginalRowspan" / "OriginalColspan" attributes.
/// </summary>
/// <param name="table">
/// Table node whose direct children are treated as rows and whose grandchildren are treated as cells.
/// NOTE(review): a tbody/thead wrapper would be treated as a row here - confirm that the
/// input never contains one.
/// </param>
/// <returns>
/// The inner HTML of <paramref name="table"/> after processing. Processing is best effort:
/// if an error occurs it is traced and the table is returned as modified so far.
/// </returns>
static string NormalizeTable(HtmlNode table)
        {
            try
            {
                for (int trindex = 0; trindex < table.ChildNodes.Count; trindex++)
                {
                    HtmlNode trnode = table.ChildNodes[trindex];

                    // ChildNodes.Count is re-read on every pass, so the placeholder cells
                    // inserted for a colspan below are visited as well.
                    for (int tdindex = 0; tdindex < trnode.ChildNodes.Count; tdindex++)
                    {
                        HtmlNode tdnode = trnode.ChildNodes[tdindex];

                        // Rowspan: add placeholder cells to the following rows, then reset the span.
                        if (tdnode.Attributes["rowspan"] != null && tdnode.Attributes["rowspan"].Value != "1")
                        {
                            int mRowSpan = Convert.ToInt32(tdnode.Attributes["rowspan"].Value);
                            InsertCellForRowSpan(ref table, trindex, tdindex, mRowSpan, tdnode);
                            tdnode.Attributes["rowspan"].Value = "1";
                            tdnode.Attributes.Add("OriginalRowspan", "" + mRowSpan + "");
                        }

                        // Colspan: add (colspan - 1) empty cells right after this cell, then reset the span.
                        if (tdnode.Attributes["colspan"] != null && tdnode.Attributes["colspan"].Value != "1")
                        {
                            int mColSpan = Convert.ToInt32(tdnode.Attributes["colspan"].Value);
                            for (int m = 0; m < mColSpan - 1; m++)
                            {
                                HtmlNode newNode = HtmlNode.CreateNode("<td></td>");
                                trnode.InsertAfter(newNode, tdnode);
                            }
                            tdnode.Attributes["colspan"].Value = "1";
                            tdnode.Attributes.Add("OriginalColspan", "" + mColSpan + "");
                        }
                    }
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and return the table as far as it was processed.
                System.Diagnostics.Trace.TraceWarning("NormalizeTable: normalization aborted: " + ex);
            }
            return table.InnerHtml;
        }



/// <summary>
/// Inserts placeholder cells (marked with row-span-cell="true") into the rows that follow
/// <paramref name="rowIndex"/>, one per row while <paramref name="rowspan"/> - 1 is above zero,
/// so that a cell which used to span several rows gets a cell of its own in each of them.
/// Cells in those rows that carry a rowspan themselves are expanded recursively first.
/// </summary>
/// <param name="table">Table node whose direct children are treated as rows. Passed by ref but never reassigned here.</param>
/// <param name="rowIndex">Index (within the table's child nodes) of the row holding the spanning cell.</param>
/// <param name="cellIndex">Column index at which the placeholder is inserted in each following row.</param>
/// <param name="rowspan">Original rowspan value of the spanning cell.</param>
/// <param name="Maintdnode">The spanning cell; its style and colspan attributes are copied to the placeholders.</param>
static void InsertCellForRowSpan(ref HtmlNode table, int rowIndex, int cellIndex, int rowspan, HtmlNode Maintdnode)
        {
            int tdIndex = 0;
            int trIndex = 0;
            int mNormalizedCellCountIndex = 0;
            int mRecursiveRowSpan = 0;
            try
            {
                foreach (HtmlNode trnode in table.ChildNodes)
                {
                    // Only rows after the spanning cell's row are touched, and only while
                    // there are still rows left to cover.
                    if (trIndex > rowIndex && rowspan - 1 > 0)
                    {
                        tdIndex = 0;

                        if (trnode.ChildNodes.Count > 0)
                        {
                            // Index of the row's last logical column (colspans counted).
                            mNormalizedCellCountIndex = GetNormalizedCellCountIndex(trnode);
                            foreach (HtmlNode tdnode in trnode.ChildNodes)
                            {
                                // A cell in this row that spans rows itself is expanded first.
                                if (tdnode.Attributes["rowspan"] != null)
                                {
                                    if (tdnode.Attributes["rowspan"].Value != "1")
                                    {
                                        mRecursiveRowSpan = Convert.ToInt32(tdnode.Attributes["rowspan"].Value);
                                        InsertCellForRowSpan(ref table, trIndex, tdIndex, mRecursiveRowSpan, tdnode);
                                        tdnode.Attributes["rowspan"].Value = "1";
                                        tdnode.Attributes.Add("Rowspanremoved", "true");
                                        tdnode.Attributes.Add("OriginalRowspan", "" + mRecursiveRowSpan + "");
                                    }
                                }

                                // Insert when the row is too short to reach cellIndex, or when
                                // the running column position has reached cellIndex.
                                if (mNormalizedCellCountIndex < cellIndex || tdIndex == cellIndex)
                                {
                                    HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>");
                                    if (Maintdnode.Attributes["style"] != null)
                                    {
                                        // The spanning cell's style goes first, the placeholder's own style after it.
                                        newNode.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;

                                        // Unless both a solid top and a solid bottom border are declared,
                                        // every occurrence of the text "border-top" is removed from the style.
                                        // NOTE(review): presumably meant to drop the top border on
                                        // continuation cells - confirm; the remainder of each
                                        // declaration is left in the style string.
                                        if (!(newNode.Attributes["style"].Value.Trim().Contains("border-top-style:solid") && newNode.Attributes["style"].Value.Trim().Contains("border-bottom-style:solid")))
                                            newNode.Attributes["style"].Value = newNode.Attributes["style"].Value.Replace("border-top", "");
                                    }

                                    if (Maintdnode.Attributes["colspan"] != null)
                                        newNode.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);

                                    if (mNormalizedCellCountIndex < cellIndex)
                                        trnode.InsertAfter(newNode, trnode.LastChild);
                                    else
                                        trnode.InsertBefore(newNode, tdnode);

                                    if (rowspan < 1) return;
                                    rowspan -= 1;

                                    // The row's child list was just modified; leave the enumeration
                                    // immediately and continue with the next row.
                                    break;
                                }

                                // Advance the running column position by the width of this cell.
                                if (tdnode.Attributes["colspan"] != null)
                                {
                                    // ToInt32 for consistency with every other span conversion in this file.
                                    tdIndex += Convert.ToInt32(tdnode.Attributes["colspan"].Value);
                                }
                                else
                                {
                                    tdIndex += 1;
                                }

                            }
                        }
                        else
                        {
                            // Row without any child nodes: the placeholder becomes its only cell.
                            HtmlNode newNode = HtmlNode.CreateNode("<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>");
                            if (Maintdnode.Attributes["style"] != null)
                                newNode.Attributes["style"].Value = Maintdnode.Attributes["style"].Value + ";" + newNode.Attributes["style"].Value;

                            if (Maintdnode.Attributes["colspan"] != null)
                                newNode.Attributes.Add("colspan", Maintdnode.Attributes["colspan"].Value);

                            trnode.AppendChild(newNode);

                            if (rowspan < 1) return;
                            rowspan -= 1;
                        }
                    }
                    else if (rowspan - 1 == 0)
                    {
                        // All rows covered by the span have been handled.
                        break;
                    }
                    trIndex += 1;
                }
            }
            catch (Exception ex)
            {
                // Best effort: report the failure and leave the table as modified so far.
                System.Diagnostics.Trace.TraceWarning("InsertCellForRowSpan: placeholder insertion aborted: " + ex);
            }
        }

 static int GetNormalizedCellCountIndex(HtmlNode trnode)
        {
            int mCell = 0;
            int mCellCountSum = 0;
            try
            {
                if (trnode != null)
                {
                    for (mCell = 0; mCell < trnode.ChildNodes.Count; mCell++)
                    {
                        if (trnode.ChildNodes[mCell].Attributes["colspan"] != null)
                        {
                            mCellCountSum += Convert.ToInt32(trnode.ChildNodes[mCell].Attributes["colspan"].Value);
                        }
                        else
                        {
                            mCellCountSum += 1;
                        }
                    }
                    return mCellCountSum - 1;
                }
            }
            catch
            {
            }
            return trnode.ChildNodes.Count - 1;
        }


  static string GetCleanText(string Text)
        {
            return Text.Replace("&#160;""").Replace("&nbsp;""").Replace("&#xa0;""").Replace("@#double#@""").Replace("@#single#@""").Replace("@#doubletop#@""").Replace("@#singletop#@""").Replace("@#u#@""").Replace(" """).Replace(" """).Replace("\r""").Replace("\n""");
        }