I have a script which parses the BBC RSS feed once an hour, and drops
any stories with certain keywords into a database. I would like to ban
certain strings; at the moment the script will pick up the word "train"
but also "training"; this is giving me a lot of false positives. Can
anyone assist? I tried a couple of things but nothing that works.
Script is as follows:
<?php
$keywords = array("keyword1", "keyword2", "keyword3");
$bannedwords = array("bannedword1", "bannedword2", "bannedword3");
$feedsource =
"http://news.bbc.co.uk/rss/newsonline_uk_edition/uk/rss091.xml";
$db = mysql_connect("localhost","username","password") or
die(mysql_error());
mysql_select_db("database");
$insideitem = FALSE;
$tag = "";
$title = "";
$description = "";
$textdump = "";
$link = "";
$itemcount = FALSE;
$body1 = "";
function startElement($parser, $name, $attrs) {
global $insideitem, $tag, $title, $description, $link;
if ($insideitem) {
$tag = $name;
} elseif ($name == "ITEM") {
$insideitem = TRUE;
}
}
function endElement($parser, $name) {
global $insideitem, $tag, $title, $description, $link, $keywords;
$numkeywords = count($keywords);
$duplicate = FALSE;
if ($name == "ITEM") {
for($counter1=0; $counter1 < $numkeywords; $counter1++) {
if(stristr($title, $keywords[$counter1]) || strstr($description,
$keywords[$counter1])) {
$sql = "select * from tblRSSfeed";
$result = mysql_query($sql) or die(mysql_error());
while ($row = mysql_fetch_array($result)) {
if($row[txtLink] == trim($link)) {
$duplicate = TRUE;
}
}
if($duplicate == FALSE) {
$itemcount = TRUE;
$datetime = date("Y-m-d H:i:s");
$title = trim(str_replace("'", "\'", $title));
$description = trim(str_replace("'", "\'", $description));
$link = trim($link);
$sql = "INSERT INTO tblRSSfeed VALUES(NULL, '$title',
'$description', '$link', '$datetime')";
mysql_query($sql) or die(mysql_error());
} else {
$duplicate = FALSE;
}
}
}
$title = "";
$description = "";
$link = "";
$insideitem = FALSE;
}
}
function characterData($parser, $data) {
global $insideitem, $tag, $title, $description, $link;
if ($insideitem) {
switch ($tag) {
case "TITLE":
$title .= $data;
break;
case "DESCRIPTION":
$description .= $data;
break;
case "LINK":
$link .= $data;
break;
}
}
}
$xml_parser = xml_parser_create();
xml_set_element_handler($xml_parser, "startElement", "endElement");
xml_set_character_data_handler($xml_parser, "characterData");
$fp = fopen("$feedsource","r") or die("Error reading RSS data.");
while ($data = fread($fp, 4096)) {
xml_parse($xml_parser, $data, feof($fp)) or die(sprintf("XML error: %s
at line %d", xml_error_string(xml_get_error_code($xml_parser)),
xml_get_current_line_number($xml_parser)));
}
fclose($fp);
xml_parser_free($xml_parser);
?>