I have been using an xml parsing script to parse a number of rss feeds
and return relevant results to a database. The script has worked well
for a couple of years, despite having very crude error-trapping (if it
finds an error in one of the xml files, the script stops). Recently, the
script has stopped working because one of the xml files is badly formed.
So I decided to rewrite the script with better error trapping; the
script should continue with the well-formed xml files and send me an
email telling me what happened.
The prototype script is failing with a "Premature end of script headers"
error. I am trying to work out whether this is:
- a problem with my script, or
- a problem with the web server configuration.
I have been over the code with as close to a fine-tooth comb as I have,
and I can't see anything that would cause a problem.
Here is my code:
<?php
# Code to parse multiple RSS .xml files, identify stories with keywords
# in them, and enter those stories into the DB.
# (These comments were previously hard-wrapped so that their continuation
# lines were bare text inside the PHP block, which is a parse error.)
##### the first section of code is unchanged from the previous (working)
##### version
# SELECT, INSERT user privs for this page
$privs = "insert";
# create list of RSS feeds to parse
$feedsource = array(
    "http://news.bbc.co.uk/rss/newsonline_uk_edition/uk/rss091.xml",
    "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/england/rss.xml",
    "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/scotland/rss.xml",
    "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/wales/rss.xml",
    "http://newsrss.bbc.co.uk/rss/newsonline_uk_edition/northern_ireland/rss.xml"
);
# these provide db connection and various query functions used below
# NOTE(review): $keywords, getTotalNews() and insertNews() are not defined in
# this file; presumably they come from these includes - confirm.
include("../includes/config.inc");
include("../includes/sql.inc");
# State shared between the XML handler callbacks through `global`.
$insideitem = FALSE;   # TRUE while the parser is between <item> and </item>
$tag = "";             # name of the most recently opened element inside an item
$title = "";           # character data collected for the current item's TITLE
$description = "";     # character data collected for the current item's DESCRIPTION
$textdump = "";
$link = "";            # character data collected for the current item's LINK
$itemcount = FALSE;    # set to TRUE by endElement() when a story is inserted
$body1 = "";
/**
 * Opening-tag callback for xml_set_element_handler().
 *
 * While an item is already open, records the element name in the global
 * $tag. Otherwise an "ITEM" element switches the global $insideitem flag on;
 * any other element is ignored.
 *
 * @param mixed  $parser the XML parser invoking the callback (unused)
 * @param string $name   element name as reported by the parser
 * @param array  $attrs  element attributes (unused)
 */
function startElement($parser, $name, $attrs) {
    global $insideitem, $tag, $title, $description, $link;

    if ($insideitem) {
        # already inside an item: just remember which child element opened
        $tag = $name;
        return;
    }

    if ($name == "ITEM") {
        $insideitem = TRUE;
    }
}
/**
 * Closing-tag callback for xml_set_element_handler().
 *
 * Ignores every closing tag except "ITEM". When an item closes, the collected
 * global $title / $description are tested against each entry of the global
 * $keywords as a whole-word regex. On the first keyword that matches, the
 * item's link is compared with the links returned by getTotalNews(); if it is
 * not already present the story is stored with insertNews() and a
 * notification email is sent. Finally the per-item globals are cleared and
 * $insideitem is switched off.
 *
 * Changes from the previous version:
 *  - $itemcount is now declared global, so the file-level flag is updated
 *    (it used to be written to a throwaway local).
 *  - the email body is built in an initialised local instead of appending to
 *    an undefined variable.
 *  - $row['txtLink'] is quoted (a bare txtLink is an undefined constant,
 *    which is a fatal Error on PHP 8).
 *  - the email shows the original text; only the values handed to
 *    insertNews() carry the \' escaping.
 *  - the duplicate scan stops at the first hit, and the keyword loop stops
 *    after the first matching keyword instead of re-scanning the table for
 *    every further match.
 *  - a missing / non-array $keywords no longer breaks count().
 *
 * @param mixed  $parser the XML parser invoking the callback (unused)
 * @param string $name   element name as reported by the parser
 */
function endElement($parser, $name) {
    global $insideitem, $tag, $title, $description, $link, $keywords, $feed;
    global $itemcount;

    if ($name != "ITEM") {
        return;
    }

    foreach ((array) $keywords as $keyword) {
        # create regex which matches a whole word anywhere in a string
        # NOTE(review): the keyword is inserted unescaped, so it is treated as
        # a regex fragment; if keywords are meant to be literal text they
        # should go through preg_quote($keyword, '/') - confirm intent.
        $regex = "/\b(" . $keyword . ")\b/";
        if (!preg_match($regex, $title) && !preg_match($regex, $description)) {
            continue;
        }

        $cleanTitle = trim($title);
        $cleanDescription = trim($description);
        $cleanLink = trim($link);

        # look for a story with the same link among the stories getTotalNews() returns
        $duplicate = FALSE;
        $result = getTotalNews();
        while ($row = mysql_fetch_array($result)) {
            if ($row['txtLink'] == $cleanLink) {
                $duplicate = TRUE;
                break;
            }
        }

        if (!$duplicate) {
            $itemcount = TRUE;
            $datetime = date("Y-m-d H:i:s");
            # NOTE(review): quote-only escaping is kept so insertNews() receives
            # the same values as before; it does not protect against
            # backslashes - a proper escape/parameterised query inside
            # insertNews() would be safer.
            $result = insertNews(
                str_replace("'", "\'", $cleanTitle),
                str_replace("'", "\'", $cleanDescription),
                $cleanLink,
                $feed,
                $datetime
            );

            $body1 = "Item added: " . $cleanTitle
                . " (link: " . $cleanLink . ") - "
                . $cleanDescription . "\n\n";
            mail("in*****@invalid.co.uk", "News Item Added: " . $cleanTitle, $body1,
                "FROM: ne******@railwaysarchive.co.uk");
        }

        # the item has been dealt with; further keywords cannot change the outcome
        break;
    }

    # reset the per-item state ready for the next <item>
    $title = "";
    $description = "";
    $link = "";
    $insideitem = FALSE;
}
/**
 * Character-data callback for xml_set_character_data_handler().
 *
 * Outside an item the data is discarded. Inside an item the data is appended
 * to the global buffer selected by the global $tag: $title for "TITLE",
 * $description for "DESCRIPTION", $link for "LINK"; data for any other
 * element is dropped.
 *
 * @param mixed  $parser the XML parser invoking the callback (unused)
 * @param string $data   the chunk of character data delivered by the parser
 */
function characterData($parser, $data) {
    global $insideitem, $tag, $title, $description, $link;

    if (!$insideitem) {
        return;
    }

    if ($tag == "TITLE") {
        $title .= $data;
    } elseif ($tag == "DESCRIPTION") {
        $description .= $data;
    } elseif ($tag == "LINK") {
        $link .= $data;
    }
}
##### from here onwards the script has been rewritten
# Fetch and parse every feed in $feedsource, carry on when one of them cannot
# be opened or is badly formed, and email a summary of what happened.
# initialise feed counter
$count = 0;
# $passed stays TRUE only if every feed opened and parsed cleanly
$passed = TRUE;
$body = "RSS parse results:\n";
foreach ($feedsource as $feed) {
    # loop through each RSS file in turn
    # Start every feed with clean handler state: a feed that failed in the
    # middle of an <item> would otherwise leave $insideitem set and leak its
    # half-collected text into the next feed.
    $insideitem = FALSE;
    $tag = "";
    $title = "";
    $description = "";
    $link = "";

    $xml_parser = xml_parser_create();
    xml_set_element_handler($xml_parser, "startElement", "endElement");
    xml_set_character_data_handler($xml_parser, "characterData");

    # open the feed once (it used to be opened twice, leaking one handle)
    $fp = fopen($feed, "r");
    if ($fp) {
        # if file can be opened
        $body .= "Success opening " . $feed . "\n";
        $parsed = TRUE;
        do {
            # loop through feed contents
            $data = fread($fp, 4096);
            # Strict checks: a chunk consisting of "0" must not end the loop,
            # and the parser must always be told when the last chunk arrives.
            $final = ($data === FALSE || $data === "" || feof($fp));
            if (!xml_parse($xml_parser, (string) $data, $final)) {
                # stop at the first error; the parser cannot recover from it
                $parsed = FALSE;
                break;
            }
        } while (!$final);

        if ($parsed) {
            # success - reported once per feed rather than once per chunk
            $body .= "Success parsing " . $feed . "\n";
        } else {
            # fail
            $body .= "Failed to parse " . $feed . ": XML error " .
                xml_error_string(xml_get_error_code($xml_parser)) . " at line " .
                xml_get_current_line_number($xml_parser) . "\n";
            $passed = FALSE;
        }
        # close file - only when it was actually opened
        fclose($fp);
    } else {
        # failed to open file
        $body .= "Failed to open " . $feed . "\n";
        $passed = FALSE;
    }
    # free up xml parser
    xml_parser_free($xml_parser);
}
if ($passed) {
    # if no errors
    $passText = "no errors";
} else {
    $passText = "ERRORS";
}
$subject = "Newsfeed report: " . $passText . " at " . date("d-m-Y G:i");
$to = "in*****@invalid.com";
mail($to, $subject, $body);
?>