using pyparsing to deal with nested tables , wanna keep table's
structure and propertys .
but program was chunked with the </td> tag of inner table.
have any ideas?
here's the program
from pyparsing import *
mytable = """
<table id="leftpage_table" width="156" border="0" cellspacing="0"
cellpadding="0">
<tr id="trtd" height="24">
<td width="153" background="images/bt_kind.gif" align="center"
class="left_menu">system</td>
</tr>
<tr id="trtd_down" height="20">
<td id="trtd_down"><table id="inner_lefgpage_table" width="100%"
height="100%" border="0" cellspacing="0" cellpadding="0">
<tr id="inner_trtd" height="20">
<td background="images/bt_class.gif" align="center">art</td>
</tr>
<tr>
<td background="images/bt_class.gif" align="center">art</td>
</tr>
</table></td>
</tr>
</table>
"""
startTag = Literal("<")
endTag = Literal(">")
idPattern = CaselessLiteral("id").suppress() + Literal("=").suppress()
+ ( quotedString.copy().setParseAction( removeQuotes ) |
Word(srange("[a-zA-Z0-9_~]")))
attrPattern = Combine(Word(alphanums + "_") + Literal("=") + (
quotedString | Word(srange("[a-zA-Z0-9_~:&@#;?/\.]"))))
tablePattern = Forward()
def getItemCloseTag(x):
itemCloseTag = Combine(startTag + Literal("/") + CaselessLiteral(x)
+ endTag).suppress()
return itemCloseTag
def getItemStartTag(x):
itemStartTag = startTag.suppress() +
Keyword(x,caseless=True).suppress() + Group(ZeroOrMore(idPattern)) +
Group(ZeroOrMore(attrPattern)) + endTag.suppress()
return itemStartTag
def getItemPattern(x):
tCloseTag = getItemCloseTag(x)
itemPattern = getItemStartTag(x) + Group(ZeroOrMore(tablePattern))
+ Group(SkipTo(tCloseTag)) + tCloseTag
return itemPattern
def getMultiLevelPattern(x,y):
tCloseTag = getItemCloseTag(x)
itemPattern = getItemStartTag(x) + Group(OneOrMore(y)) + tCloseTag
return itemPattern
tdPattern = getItemPattern(x='td')
trPattern = getMultiLevelPattern('tr',tdPattern)
tablePattern = getMultiLevelPattern('table',trPattern)
t = tablePattern
for toks,strt,end in t.scanString(mytable):
print toks.asList()
OutPut:
[['leftpage_table'], ['width="156"', 'border="0"', 'cellspacing="0"',
'cellpadding="0"'], [['trtd'], ['height="24"'], [[], ['width="153"',
'background="images/bt_kind.gif"', 'align="center"',
'class="left_menu"'], [], ['system']], ['trtd_down'], ['height="20"'],
[['trtd_down'], [], [], ['<table id="inner_lefgpage_table" width="100%"
height="100%" border="0" cellspacing="0" cellpadding="0">\n <tr
id="inner_trtd" height="20">\n <td
background="images/bt_class.gif" align="center">art']], [], [], [[],
['background="images/bt_class.gif"', 'align="center"'], [], ['art']]]]