C#/.NET code to normalize an HTML table (remove rowspan and colspan)
with the help of HtmlAgilityPack
Description: Converts an HTML financial table to its normalized
format (every rowspan and colspan expanded into individual cells), taking care of header, stub and financial cells by marking them.
Header: Header of financial table
Stub: First Non-financial column of financial table
Diagrammatic Representation:
a) Before Normalization:
| Side (rowspan=2) | Company Abc (colspan=2)        |
| 2012             | 2013                           |
| Assets           | $5,000             | 6000      |
| ABC Assets       | USD300(1)          | $4,000x   |
| (empty)          | In Between SubHead (colspan=2) |
| Debts            | 2000               | $10000    |
b) After Normalization:
| Side       | Company Abc        | (empty) |
| (empty)    | 2012               | 2013    |
| Assets     | $5,000             | 6000    |
| ABC Assets | USD300(1)          | $4,000x |
| (empty)    | In Between SubHead | (empty) |
| Debts      | 2000               | $10000  |
Main methods:
1) FormatAllTables: Entry point for
normalization. Takes the HTML table markup as a string.
2) NormalizeTable: Performs the
normalization.
3) InsertCellForRowSpan(): Creates the extra cells
needed when a rowspan is removed.
Supporting methods:
1) GetCleanText(): Returns the text with spaces, line breaks and marker tokens removed.
2) GetNormalizedCellCountIndex(): Returns
the index of a row's last cell, counting each colspan as that many cells.
How to call:
1) string
strTable = "table html content";
2) Call the FormatAllTables method as shown below.
E.g. strTable = FormatAllTables(strTable);
Method Definitions:
/// <summary>
/// Entry point: parses the given HTML, normalizes every table element found in it
/// (via <c>NormalizeTable</c>) and returns the resulting markup.
/// </summary>
/// <param name="HtmlDocumentString">HTML markup containing zero or more tables. May be null.</param>
/// <returns>
/// The outer HTML of the parsed document after normalization. When the input is null or
/// cannot be processed, the outer HTML of whatever was loaded so far is returned.
/// </returns>
public string FormatAllTables(string HtmlDocumentString)
{
    HtmlAgilityPack.HtmlDocument objHTMLdoc = new HtmlAgilityPack.HtmlDocument();
    try
    {
        if (HtmlDocumentString != null)
        {
            // NOTE(review): the first literal is reproduced as a single space, exactly as the
            // listing shows it. If it was originally a non-breaking space or "&nbsp;", restore
            // that: removing every plain space also removes the spaces between attributes.
            HtmlDocumentString = HtmlDocumentString
                .Replace(" ", "")
                .Replace("\t", "")
                .Replace("\n", "")
                .Replace("\r", "");
            objHTMLdoc.LoadHtml(HtmlDocumentString);

            HtmlNodeCollection tableList = objHTMLdoc.DocumentNode.SelectNodes("//table");
            if (tableList != null)
            {
                // Walk the list backwards so nested tables are handled before the tables that
                // contain them. Assigning InnerHtml on an outer table replaces its child nodes,
                // so an inner table collected above would otherwise no longer be part of the
                // document by the time it is reached.
                for (int tableIndex = tableList.Count - 1; tableIndex >= 0; tableIndex--)
                {
                    try
                    {
                        HtmlNode tableNode = tableList[tableIndex];
                        tableNode.InnerHtml = NormalizeTable(tableNode);
                    }
                    catch (Exception ex)
                    {
                        // Best effort: one bad table must not stop the remaining ones.
                        System.Diagnostics.Trace.TraceWarning(
                            "FormatAllTables: table {0} could not be normalized: {1}", tableIndex, ex);
                    }
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: fall through and return whatever the document currently holds.
        System.Diagnostics.Trace.TraceWarning("FormatAllTables: normalization failed: {0}", ex);
    }
    return objHTMLdoc.DocumentNode.OuterHtml; // Output
}
/// <summary>
/// Removes rowspan and colspan from a table in place. The children of <paramref name="table"/>
/// are treated as rows and their children as cells.
/// For a cell whose rowspan is not "1", <c>InsertCellForRowSpan</c> is called to add cells to the
/// rows below, the attribute is reset to "1" and the old value is kept in "OriginalRowspan".
/// For a cell whose colspan is not "1", (colspan - 1) empty cells are inserted after it, the
/// attribute is reset to "1" and the old value is kept in "OriginalColspan".
/// </summary>
/// <param name="table">The table node to normalize.</param>
/// <returns>The inner HTML of the table after normalization; empty when <paramref name="table"/> is null.</returns>
static string NormalizeTable(HtmlNode table)
{
    if (table == null)
    {
        return string.Empty;
    }

    try
    {
        for (int trindex = 0; trindex < table.ChildNodes.Count; trindex++)
        {
            HtmlNode trnode = table.ChildNodes[trindex];
            // The count is re-read on every pass because colspan expansion adds cells to this row.
            for (int tdindex = 0; tdindex < trnode.ChildNodes.Count; tdindex++)
            {
                HtmlNode tdnode = trnode.ChildNodes[tdindex];

                #region For rowspan
                if (tdnode.Attributes["rowspan"] != null && tdnode.Attributes["rowspan"].Value != "1")
                {
                    int mRowSpan;
                    // A malformed value skips only this cell instead of throwing and
                    // abandoning the rest of the table.
                    if (int.TryParse(tdnode.Attributes["rowspan"].Value, out mRowSpan))
                    {
                        InsertCellForRowSpan(ref table, trindex, tdindex, mRowSpan, tdnode);
                        tdnode.Attributes["rowspan"].Value = "1";
                        tdnode.Attributes.Add("OriginalRowspan", mRowSpan.ToString());
                    }
                }
                #endregion

                #region For colspan
                if (tdnode.Attributes["colspan"] != null && tdnode.Attributes["colspan"].Value != "1")
                {
                    int mColSpan;
                    if (int.TryParse(tdnode.Attributes["colspan"].Value, out mColSpan))
                    {
                        for (int m = 0; m < mColSpan - 1; m++)
                        {
                            HtmlNode newNode = HtmlNode.CreateNode("<td></td>");
                            trnode.InsertAfter(newNode, tdnode);
                        }
                        tdnode.Attributes["colspan"].Value = "1";
                        tdnode.Attributes.Add("OriginalColspan", mColSpan.ToString());
                    }
                }
                #endregion
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: return the table in whatever state was reached.
        System.Diagnostics.Trace.TraceWarning("NormalizeTable: normalization stopped early: {0}", ex);
    }
    return table.InnerHtml;
}
/// <summary>
/// Adds one filler cell to each of the rows below <paramref name="rowIndex"/> that a
/// row-spanning cell used to cover, so that the rowspan attribute can be dropped.
/// Cells in those rows that carry their own rowspan are expanded recursively on the way.
/// </summary>
/// <param name="table">Table whose children are treated as rows. Never reassigned here; passed by ref to keep the existing call sites valid.</param>
/// <param name="rowIndex">Index of the row holding the spanning cell; only later rows are touched.</param>
/// <param name="cellIndex">Column position at which the filler cell is inserted.</param>
/// <param name="rowspan">Original rowspan of the spanning cell; (rowspan - 1) filler cells are wanted.</param>
/// <param name="Maintdnode">The spanning cell; its style and colspan are copied to every filler cell.</param>
static void InsertCellForRowSpan(ref HtmlNode table, int rowIndex, int cellIndex, int rowspan, HtmlNode Maintdnode)
{
    int trIndex = 0;
    try
    {
        foreach (HtmlNode trnode in table.ChildNodes)
        {
            if (rowspan <= 1)
            {
                // Every row covered by the span has received its filler cell.
                break;
            }

            if (trIndex > rowIndex)
            {
                if (trnode.ChildNodes.Count > 0)
                {
                    int tdIndex = 0; // column position of the current cell, colspans included
                    int mNormalizedCellCountIndex = GetNormalizedCellCountIndex(trnode);
                    foreach (HtmlNode tdnode in trnode.ChildNodes)
                    {
                        // Expand a rowspan found on this cell before deciding where the filler goes.
                        if (tdnode.Attributes["rowspan"] != null && tdnode.Attributes["rowspan"].Value != "1")
                        {
                            int mRecursiveRowSpan;
                            if (int.TryParse(tdnode.Attributes["rowspan"].Value, out mRecursiveRowSpan))
                            {
                                InsertCellForRowSpan(ref table, trIndex, tdIndex, mRecursiveRowSpan, tdnode);
                                tdnode.Attributes["rowspan"].Value = "1";
                                tdnode.Attributes.Add("Rowspanremoved", "true");
                                tdnode.Attributes.Add("OriginalRowspan", mRecursiveRowSpan.ToString());
                            }
                        }

                        if (mNormalizedCellCountIndex < cellIndex || tdIndex == cellIndex)
                        {
                            HtmlNode newNode = CreateRowSpanFillerCell(Maintdnode, true);
                            if (mNormalizedCellCountIndex < cellIndex)
                            {
                                // The row is shorter than the target column: append at the end.
                                trnode.InsertAfter(newNode, trnode.LastChild);
                            }
                            else
                            {
                                trnode.InsertBefore(newNode, tdnode);
                            }
                            rowspan -= 1;
                            // The row's child list was just modified; stop enumerating it.
                            break;
                        }

                        // Advance by the cell's colspan; a missing or malformed value counts as 1.
                        int cellColspan;
                        if (tdnode.Attributes["colspan"] != null
                            && int.TryParse(tdnode.Attributes["colspan"].Value, out cellColspan))
                        {
                            tdIndex += cellColspan;
                        }
                        else
                        {
                            tdIndex += 1;
                        }
                    }
                }
                else
                {
                    // Empty row: the filler becomes its only cell (top border is left untouched here).
                    trnode.AppendChild(CreateRowSpanFillerCell(Maintdnode, false));
                    rowspan -= 1;
                }
            }

            trIndex += 1;
        }
    }
    catch (Exception ex)
    {
        // Best effort: keep whatever cells were already inserted.
        System.Diagnostics.Trace.TraceWarning("InsertCellForRowSpan: row-span fill stopped early: {0}", ex);
    }
}

/// <summary>
/// Builds the empty cell that stands in for a removed rowspan, copying the style and colspan
/// of <paramref name="sourceCell"/>.
/// </summary>
/// <param name="sourceCell">The original row-spanning cell.</param>
/// <param name="dropTopBorder">
/// When true, every "border-top" occurrence is removed from the merged style unless the style
/// contains both "border-top-style:solid" and "border-bottom-style:solid".
/// </param>
/// <returns>A new, detached td node.</returns>
static HtmlNode CreateRowSpanFillerCell(HtmlNode sourceCell, bool dropTopBorder)
{
    HtmlNode filler = HtmlNode.CreateNode(
        "<td style=\"white-space:nowrap;padding-right:5px;padding-left:5px;\" row-span-cell=\"true\" ></td>");

    if (sourceCell.Attributes["style"] != null)
    {
        string mergedStyle = sourceCell.Attributes["style"].Value + ";" + filler.Attributes["style"].Value;
        bool solidTopAndBottom = mergedStyle.Contains("border-top-style:solid")
            && mergedStyle.Contains("border-bottom-style:solid");
        if (dropTopBorder && !solidTopAndBottom)
        {
            mergedStyle = mergedStyle.Replace("border-top", "");
        }
        filler.Attributes["style"].Value = mergedStyle;
    }

    if (sourceCell.Attributes["colspan"] != null)
    {
        filler.Attributes.Add("colspan", sourceCell.Attributes["colspan"].Value);
    }

    return filler;
}
/// <summary>
/// Returns the zero-based index of the last column of a row, counting every child that has a
/// colspan attribute as that many columns and every other child as one.
/// </summary>
/// <param name="trnode">The row node; may be null.</param>
/// <returns>
/// (sum of colspans) - 1. When any colspan value is not a valid integer the plain child count
/// minus one is returned instead. Returns -1 for a null row, the same value an empty row yields.
/// </returns>
static int GetNormalizedCellCountIndex(HtmlNode trnode)
{
    if (trnode == null)
    {
        // Previously the method fell through to trnode.ChildNodes and threw on null.
        return -1;
    }

    int childCount = trnode.ChildNodes.Count;
    int mCellCountSum = 0;
    for (int mCell = 0; mCell < childCount; mCell++)
    {
        if (trnode.ChildNodes[mCell].Attributes["colspan"] == null)
        {
            mCellCountSum += 1;
            continue;
        }

        int span;
        if (!int.TryParse(trnode.ChildNodes[mCell].Attributes["colspan"].Value, out span))
        {
            // Same fallback as before for an unparseable colspan, without relying on an exception.
            return childCount - 1;
        }
        mCellCountSum += span;
    }
    return mCellCountSum - 1;
}
/// <summary>
/// Strips spaces, carriage returns, line feeds and the marker tokens
/// "@#double#@", "@#single#@", "@#doubletop#@", "@#singletop#@" and "@#u#@" from a string.
/// </summary>
/// <param name="Text">The text to clean; may be null.</param>
/// <returns>The cleaned text, or an empty string when <paramref name="Text"/> is null.</returns>
static string GetCleanText(string Text)
{
    if (Text == null)
    {
        return string.Empty;
    }

    // NOTE(review): the published listing repeats Replace(" ", "") several times and one of the
    // literals is broken across two lines. They are all treated as a plain space here; if they
    // originally were different characters (e.g. a non-breaking space), add those to the list.
    // Order is kept: spaces first, then the markers, then the line-break characters.
    string[] tokensToRemove =
    {
        " ",
        "@#double#@",
        "@#single#@",
        "@#doubletop#@",
        "@#singletop#@",
        "@#u#@",
        "\r",
        "\n"
    };

    foreach (string token in tokensToRemove)
    {
        Text = Text.Replace(token, "");
    }
    return Text;
}