Rik wrote:
> Regex could be the way to go.
Argh! No! That way lies nightmares. Get the XML_HTMLSax3 class from PEAR
and use that.
Here's an example that should parse TR, TD and TH tags (ignoring others)
including ROWSPAN and COLSPAN attributes. It creates an array of arrays
representing rows of cells. It uses 0-based indices.
<?php
/**
 * SAX-style handler that collects the text content of HTML table cells
 * into an array of rows, each row being an array of cell strings.
 *
 * Only TR, TD and TH opening tags affect the cell position; every other tag
 * is ignored. ROWSPAN and COLSPAN are honoured by recording which grid slots
 * are already covered, so a cell that follows a spanning cell is stored at
 * its real column index. All indices are 0-based.
 *
 * The handler method signatures match what the driver script registers via
 * set_element_handler() / set_data_handler().
 */
class TableParser
{
    /** Index of the row currently being filled; -1 until the first TR is opened. */
    private $currow = -1;

    /** Index of the column currently being filled; -1 until a cell is opened in the row. */
    private $curcol = -1;

    /** Occupancy grid: $shape[row][col] is TRUE once a cell or one of its spans covers the slot. */
    private $shape = array();

    /** Collected text, $data[row][col] => string. Index -1 holds text seen outside any row/cell. */
    private $data = array();

    /**
     * Handles an opening tag and moves to the matching cell co-ordinates.
     *
     * @param mixed  $parser Parser instance supplied by the caller (unused).
     * @param string $tag    Tag name, in any letter case.
     * @param array  $attrs  Attribute name => value map, names in any letter case.
     */
    public function openHandler($parser, $tag, $attrs)
    {
        $tag = strtolower($tag);

        if ($tag === 'tr') {
            // New row: the column counter restarts before the first cell.
            $this->currow++;
            $this->curcol = -1;
            return;
        }

        if ($tag !== 'td' && $tag !== 'th') {
            // Any other tag (including markup nested inside a cell) must not
            // move the position or reserve grid slots.
            return;
        }

        // Advance to the next column, skipping slots already covered by a
        // rowspan/colspan of an earlier cell.
        $this->curcol++;
        while (!empty($this->shape[$this->currow][$this->curcol])) {
            $this->curcol++;
        }

        $rowspan = 1;
        $colspan = 1;
        if (is_array($attrs)) {
            foreach ($attrs as $name => $value) {
                $name = strtolower($name);
                // A span below 1 (e.g. "0" or a non-numeric value) would
                // reserve no slot at all, so clamp to a single cell.
                if ($name === 'rowspan') {
                    $rowspan = max(1, (int) $value);
                } elseif ($name === 'colspan') {
                    $colspan = max(1, (int) $value);
                }
            }
        }

        // Reserve every slot this cell covers.
        for ($i = 0; $i < $rowspan; $i++) {
            for ($j = 0; $j < $colspan; $j++) {
                $x = $this->currow + $i;
                $y = $this->curcol + $j;
                if (!empty($this->shape[$x][$y])) {
                    error_log('Over lap!');
                }
                $this->shape[$x][$y] = TRUE;
            }
        }
    }

    /**
     * Handles a closing tag. Intentionally does nothing: the position only
     * changes on opening tags, so text that follows a closing tag is appended
     * to the most recently opened cell.
     *
     * @param mixed  $parser Parser instance supplied by the caller (unused).
     * @param string $tag    Tag name.
     */
    public function closeHandler($parser, $tag)
    {
    }

    /**
     * Appends character data to the cell at the current position.
     *
     * @param mixed  $parser Parser instance supplied by the caller (unused).
     * @param string $data   Text to append.
     */
    public function dataHandler($parser, $data)
    {
        // Initialise the slot first so the append never touches an undefined offset.
        if (!isset($this->data[$this->currow][$this->curcol])) {
            $this->data[$this->currow][$this->curcol] = '';
        }
        $this->data[$this->currow][$this->curcol] .= $data;
    }

    /**
     * Returns the collected cell text.
     *
     * Entries stored under index -1 (text seen before the first row, or
     * before the first cell of a row) are left out.
     *
     * @return array Row index => (column index => cell text).
     */
    public function getData()
    {
        $rows = array();
        foreach ($this->data as $rowIndex => $cells) {
            if ($rowIndex === -1) {
                continue;
            }
            unset($cells[-1]);
            $rows[$rowIndex] = $cells;
        }
        return $rows;
    }
}
// Example driver: feed a small table through XML_HTMLSax3 and print the
// resulting array of rows. Requires the PEAR XML_HTMLSax3 package on the
// include path; require_once makes a missing package fail right here.
require_once 'XML/HTMLSax3.php';

$sax = new XML_HTMLSax3;
$hdlr = new TableParser;

// Route the parser's events to the TableParser methods of the same names.
$sax->set_object($hdlr);
$sax->set_element_handler('openHandler', 'closeHandler');
$sax->set_data_handler('dataHandler');

$sax->parse('
<table>
<tr>
<td rowspan="2">Tes t table lalala</td>
<td>123</td>
<td>456</td>
</tr>
<tr>
<td>789</td>
<td>ABC</td>
</tr>
<tr>
<td colspan="2" rowspan="2">123 </td>
<td>456</td>
</tr>
<tr>
<td>789</td>
</tr>
</table>
');

print_r($hdlr->getData());
?>
--
Toby A Inkster BSc (Hons) ARCS
[Geek of HTML/SQL/Perl/PHP/Python/Apache/Linux]
[OS: Linux 2.6.12-12mdksmp, up 29 days, 10:43.]
PHP Domain Class
http://tobyinkster.co.uk/blog/2007/0...-domain-class/