By using this site, you agree to our updated Privacy Policy and our Terms of Use. Manage your Cookies Settings.
459,492 Members | 1,210 Online
Bytes IT Community
+ Ask a Question
Need help? Post your question and get tips & solutions from a community of 459,492 IT Pros & Developers. It's quick & easy.

reading attributes with no quotes using XmlTextReader

P: n/a
All,

So I am creating a function that gets a short blurb of html from a
blog. I would like to retain all html formating and images. The code
below works well, with the exception of one issue.

My issue:
---------------------
When a blog's html has attributes with no quotes i get an exception.

Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>

Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?

If not, do you like RegExs for this replace?

Then, Does anyone know any RegExs that could do this replace?
Code:
----------------------
/// <summary>
/// Builds a short blurb of at most about <paramref name="len"/> text characters from
/// <paramref name="content"/>, keeping the surrounding tags and attributes when possible.
/// </summary>
/// <remarks>
/// The content is parsed with an XML reader, so markup is only preserved when it is
/// well-formed XML. Anything the reader rejects - for example
/// <c>&lt;p align=center&gt;</c>, whose attribute value is not quoted - is handled by a
/// plain-text fallback that strips the tags instead.
/// </remarks>
/// <param name="content">HTML fragment or plain text to shorten. May be null or empty.</param>
/// <param name="len">Number of text characters to keep; markup is not counted.</param>
/// <returns>
/// The shortened content, ending in " ..." when text was cut, or an empty string when
/// <paramref name="content"/> is null/empty or <paramref name="len"/> is not positive.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (string.IsNullOrEmpty(content) || len <= 0)
    {
        return string.Empty;
    }

    try
    {
        return BuildMarkupBlurb(content, len);
    }
    catch (Exception)
    {
        // Deliberate best effort: whatever the XML path could not handle is
        // reduced to plain text rather than surfaced to the caller.
        return BuildPlainTextBlurb(content, len);
    }
}

/// <summary>Copies well-formed markup node by node until <paramref name="len"/> text characters were written.</summary>
private static string BuildMarkupBlurb(string content, int len)
{
    // Plain text gets a paragraph wrapper so the blurb always consists of elements.
    if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
    {
        content = "<p>" + content + "</p>";
    }

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    XmlWriterSettings xws = new XmlWriterSettings();
    xws.ConformanceLevel = ConformanceLevel.Fragment;
    xws.OmitXmlDeclaration = true;

    // The synthetic <doc> root lets a fragment with several top-level nodes parse.
    using (XmlTextReader xr = new XmlTextReader(new System.IO.StringReader("<doc>" + content + "</doc>")))
    using (XmlWriter xw = XmlWriter.Create(sb, xws))
    {
        xr.Read(); // consume the opening <doc>

        int strCount = 0;   // text characters written so far
        int nodesToEnd = 0; // elements opened in the output and not yet closed
        while (strCount < len && xr.Read())
        {
            if (xr.NodeType == XmlNodeType.EndElement)
            {
                // Depth 0 is the synthetic root; a <doc> nested in the content sits deeper.
                if (xr.Depth == 0)
                {
                    break;
                }

                // Full end tag: "<a></a>" must not become "<a />", which HTML treats as unclosed.
                xw.WriteFullEndElement();
                nodesToEnd--;
            }
            else if (xr.NodeType == XmlNodeType.Element)
            {
                // Must be read before the reader is moved onto the attributes.
                bool selfClosing = xr.IsEmptyElement;

                xw.WriteStartElement(xr.Name);
                while (xr.MoveToNextAttribute())
                {
                    xw.WriteAttributeString(xr.Name, xr.Value);
                }

                if (selfClosing)
                {
                    // <br/>, <img .../>: no EndElement node will follow, so close it now.
                    xw.WriteEndElement();
                }
                else
                {
                    nodesToEnd++;
                }
            }
            else if (xr.NodeType == XmlNodeType.Text)
            {
                string inner = xr.Value;
                int room = len - strCount;
                if (inner.Length > room)
                {
                    // Cut at the last space within the limit; hard-cut when there is none.
                    int cut = inner.LastIndexOf(' ', room);
                    if (cut < 0)
                    {
                        cut = room;
                    }

                    xw.WriteString(inner.Substring(0, cut) + " ...");
                    break; // nothing may follow the ellipsis
                }

                xw.WriteString(inner);
                strCount += inner.Length;
            }
        }

        // Close whatever was still open at the cut-off point.
        for (; nodesToEnd > 0; nodesToEnd--)
        {
            xw.WriteFullEndElement();
        }
    }

    // Read the buffer only after the writer has been disposed (and therefore flushed).
    return sb.ToString();
}

/// <summary>Fallback: strips all tags and trims the remaining text at a word boundary.</summary>
private static string BuildPlainTextBlurb(string content, int len)
{
    // IgnoreCase is required: the pattern only spells out upper-case tag names.
    string output = Regex.Replace(content, "</?([A-Z][A-Z0-9]*)\\b[^>]*>", "", RegexOptions.IgnoreCase);
    if (output.Length <= len)
    {
        return output;
    }

    int cut = output.LastIndexOf(' ', len);
    if (cut < 0)
    {
        cut = len; // no space to break on
    }

    return "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ...</p>";
}

Nov 28 '06 #1
Share this Question
Share on Google+
3 Replies


P: n/a
Your problem, which you might already know, is that you are trying to use
an XML text reader to read non-XML content. XML strictly requires all
attribute values to be enclosed in quotes (single or double). HTML is based
on SGML, which doesn't have such a requirement. XHTML, on the other hand, is
based on XML, and so you shouldn't have any problems with it.

All this to say that there probably isn't a way to make XmlTextReader work
without quotes - if it did, it wouldn't be an XML reader... Unfortunately,
there isn't an SgmlTextReader - which is really what you should be using.

You could try to use regular expressions to turn your content into valid
XML, but I think you'll keep running into new issues with this...first it'll
be missing double quotes, then missing closing tags....

Using a regular expression or even just string manipulation (index of and
substrings) is probably the right way to go...

Karl
--
http://www.openmymind.net/
http://www.fuelindustries.com/
"apiringmvp" <bd******@hotmail.comwrote in message
news:11*********************@j44g2000cwa.googlegro ups.com...
All,

So I am creating a function that gets a short blurb of html from a
blog. I would like to retain all html formating and images. The code
below works well, with the exception of one issue.

My issue:
---------------------
When a blog's html has attributes with no quotes i get an exception.

Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>

Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?

If not, do you like RegExs for this replace?

Then, Does anyone know any RegExs that could do this replace?
Code:
----------------------
/// <summary>
/// Builds a short blurb of at most about <paramref name="len"/> text characters from
/// <paramref name="content"/>, keeping the surrounding tags and attributes when possible.
/// </summary>
/// <remarks>
/// The content is parsed with an XML reader, so markup is only preserved when it is
/// well-formed XML. Anything the reader rejects - for example
/// <c>&lt;p align=center&gt;</c>, whose attribute value is not quoted - is handled by a
/// plain-text fallback that strips the tags instead.
/// </remarks>
/// <param name="content">HTML fragment or plain text to shorten. May be null or empty.</param>
/// <param name="len">Number of text characters to keep; markup is not counted.</param>
/// <returns>
/// The shortened content, ending in " ..." when text was cut, or an empty string when
/// <paramref name="content"/> is null/empty or <paramref name="len"/> is not positive.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (string.IsNullOrEmpty(content) || len <= 0)
    {
        return string.Empty;
    }

    try
    {
        return BuildMarkupBlurb(content, len);
    }
    catch (Exception)
    {
        // Deliberate best effort: whatever the XML path could not handle is
        // reduced to plain text rather than surfaced to the caller.
        return BuildPlainTextBlurb(content, len);
    }
}

/// <summary>Copies well-formed markup node by node until <paramref name="len"/> text characters were written.</summary>
private static string BuildMarkupBlurb(string content, int len)
{
    // Plain text gets a paragraph wrapper so the blurb always consists of elements.
    if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
    {
        content = "<p>" + content + "</p>";
    }

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    XmlWriterSettings xws = new XmlWriterSettings();
    xws.ConformanceLevel = ConformanceLevel.Fragment;
    xws.OmitXmlDeclaration = true;

    // The synthetic <doc> root lets a fragment with several top-level nodes parse.
    using (XmlTextReader xr = new XmlTextReader(new System.IO.StringReader("<doc>" + content + "</doc>")))
    using (XmlWriter xw = XmlWriter.Create(sb, xws))
    {
        xr.Read(); // consume the opening <doc>

        int strCount = 0;   // text characters written so far
        int nodesToEnd = 0; // elements opened in the output and not yet closed
        while (strCount < len && xr.Read())
        {
            if (xr.NodeType == XmlNodeType.EndElement)
            {
                // Depth 0 is the synthetic root; a <doc> nested in the content sits deeper.
                if (xr.Depth == 0)
                {
                    break;
                }

                // Full end tag: "<a></a>" must not become "<a />", which HTML treats as unclosed.
                xw.WriteFullEndElement();
                nodesToEnd--;
            }
            else if (xr.NodeType == XmlNodeType.Element)
            {
                // Must be read before the reader is moved onto the attributes.
                bool selfClosing = xr.IsEmptyElement;

                xw.WriteStartElement(xr.Name);
                while (xr.MoveToNextAttribute())
                {
                    xw.WriteAttributeString(xr.Name, xr.Value);
                }

                if (selfClosing)
                {
                    // <br/>, <img .../>: no EndElement node will follow, so close it now.
                    xw.WriteEndElement();
                }
                else
                {
                    nodesToEnd++;
                }
            }
            else if (xr.NodeType == XmlNodeType.Text)
            {
                string inner = xr.Value;
                int room = len - strCount;
                if (inner.Length > room)
                {
                    // Cut at the last space within the limit; hard-cut when there is none.
                    int cut = inner.LastIndexOf(' ', room);
                    if (cut < 0)
                    {
                        cut = room;
                    }

                    xw.WriteString(inner.Substring(0, cut) + " ...");
                    break; // nothing may follow the ellipsis
                }

                xw.WriteString(inner);
                strCount += inner.Length;
            }
        }

        // Close whatever was still open at the cut-off point.
        for (; nodesToEnd > 0; nodesToEnd--)
        {
            xw.WriteFullEndElement();
        }
    }

    // Read the buffer only after the writer has been disposed (and therefore flushed).
    return sb.ToString();
}

/// <summary>Fallback: strips all tags and trims the remaining text at a word boundary.</summary>
private static string BuildPlainTextBlurb(string content, int len)
{
    // IgnoreCase is required: the pattern only spells out upper-case tag names.
    string output = Regex.Replace(content, "</?([A-Z][A-Z0-9]*)\\b[^>]*>", "", RegexOptions.IgnoreCase);
    if (output.Length <= len)
    {
        return output;
    }

    int cut = output.LastIndexOf(' ', len);
    if (cut < 0)
    {
        cut = len; // no space to break on
    }

    return "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ...</p>";
}
Nov 28 '06 #2

P: n/a
You're stuck with string manipulation, and it's not likely to be the
easiest task.

I have to ask - if it's from a blog, why can't you syndicate the RSS and
consume it?

--
--
Regards

John Timney (MVP)
VISIT MY WEBSITE:
http://www.johntimney.com
http://www.johntimney.com/blog
"apiringmvp" <bd******@hotmail.comwrote in message
news:11*********************@j44g2000cwa.googlegro ups.com...
All,

So I am creating a function that gets a short blurb of html from a
blog. I would like to retain all html formating and images. The code
below works well, with the exception of one issue.

My issue:
---------------------
When a blog's html has attributes with no quotes i get an exception.

Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>

Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?

If not, do you like RegExs for this replace?

Then, Does anyone know any RegExs that could do this replace?
Code:
----------------------
/// <summary>
/// Builds a short blurb of at most about <paramref name="len"/> text characters from
/// <paramref name="content"/>, keeping the surrounding tags and attributes when possible.
/// </summary>
/// <remarks>
/// The content is parsed with an XML reader, so markup is only preserved when it is
/// well-formed XML. Anything the reader rejects - for example
/// <c>&lt;p align=center&gt;</c>, whose attribute value is not quoted - is handled by a
/// plain-text fallback that strips the tags instead.
/// </remarks>
/// <param name="content">HTML fragment or plain text to shorten. May be null or empty.</param>
/// <param name="len">Number of text characters to keep; markup is not counted.</param>
/// <returns>
/// The shortened content, ending in " ..." when text was cut, or an empty string when
/// <paramref name="content"/> is null/empty or <paramref name="len"/> is not positive.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (string.IsNullOrEmpty(content) || len <= 0)
    {
        return string.Empty;
    }

    try
    {
        return BuildMarkupBlurb(content, len);
    }
    catch (Exception)
    {
        // Deliberate best effort: whatever the XML path could not handle is
        // reduced to plain text rather than surfaced to the caller.
        return BuildPlainTextBlurb(content, len);
    }
}

/// <summary>Copies well-formed markup node by node until <paramref name="len"/> text characters were written.</summary>
private static string BuildMarkupBlurb(string content, int len)
{
    // Plain text gets a paragraph wrapper so the blurb always consists of elements.
    if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
    {
        content = "<p>" + content + "</p>";
    }

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    XmlWriterSettings xws = new XmlWriterSettings();
    xws.ConformanceLevel = ConformanceLevel.Fragment;
    xws.OmitXmlDeclaration = true;

    // The synthetic <doc> root lets a fragment with several top-level nodes parse.
    using (XmlTextReader xr = new XmlTextReader(new System.IO.StringReader("<doc>" + content + "</doc>")))
    using (XmlWriter xw = XmlWriter.Create(sb, xws))
    {
        xr.Read(); // consume the opening <doc>

        int strCount = 0;   // text characters written so far
        int nodesToEnd = 0; // elements opened in the output and not yet closed
        while (strCount < len && xr.Read())
        {
            if (xr.NodeType == XmlNodeType.EndElement)
            {
                // Depth 0 is the synthetic root; a <doc> nested in the content sits deeper.
                if (xr.Depth == 0)
                {
                    break;
                }

                // Full end tag: "<a></a>" must not become "<a />", which HTML treats as unclosed.
                xw.WriteFullEndElement();
                nodesToEnd--;
            }
            else if (xr.NodeType == XmlNodeType.Element)
            {
                // Must be read before the reader is moved onto the attributes.
                bool selfClosing = xr.IsEmptyElement;

                xw.WriteStartElement(xr.Name);
                while (xr.MoveToNextAttribute())
                {
                    xw.WriteAttributeString(xr.Name, xr.Value);
                }

                if (selfClosing)
                {
                    // <br/>, <img .../>: no EndElement node will follow, so close it now.
                    xw.WriteEndElement();
                }
                else
                {
                    nodesToEnd++;
                }
            }
            else if (xr.NodeType == XmlNodeType.Text)
            {
                string inner = xr.Value;
                int room = len - strCount;
                if (inner.Length > room)
                {
                    // Cut at the last space within the limit; hard-cut when there is none.
                    int cut = inner.LastIndexOf(' ', room);
                    if (cut < 0)
                    {
                        cut = room;
                    }

                    xw.WriteString(inner.Substring(0, cut) + " ...");
                    break; // nothing may follow the ellipsis
                }

                xw.WriteString(inner);
                strCount += inner.Length;
            }
        }

        // Close whatever was still open at the cut-off point.
        for (; nodesToEnd > 0; nodesToEnd--)
        {
            xw.WriteFullEndElement();
        }
    }

    // Read the buffer only after the writer has been disposed (and therefore flushed).
    return sb.ToString();
}

/// <summary>Fallback: strips all tags and trims the remaining text at a word boundary.</summary>
private static string BuildPlainTextBlurb(string content, int len)
{
    // IgnoreCase is required: the pattern only spells out upper-case tag names.
    string output = Regex.Replace(content, "</?([A-Z][A-Z0-9]*)\\b[^>]*>", "", RegexOptions.IgnoreCase);
    if (output.Length <= len)
    {
        return output;
    }

    int cut = output.LastIndexOf(' ', len);
    if (cut < 0)
    {
        cut = len; // no space to break on
    }

    return "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ...</p>";
}

Nov 28 '06 #3

P: n/a
You are going to run into very serious problems using an XMLTextReader
to operate on HTML. HTML is almost always NOT valid XML.

You'd rather use regular expressions to manipulate the text.

On 28 Nov 2006 07:24:56 -0800, "apiringmvp" <bd******@hotmail.com>
wrote:
>All,

So I am creating a function that gets a short blurb of html from a
blog. I would like to retain all html formating and images. The code
below works well, with the exception of one issue.

My issue:
---------------------
When a blog's html has attributes with no quotes i get an exception.

Here's the example of the blog I am dealing with.
<p align=center>Some text from the blog.</p>

Questions:
----------------------
Is there a way to get the XmlTextReader to allow attributes without
quotes?

If not, do you like RegExs for this replace?

Then, Does anyone know any RegExs that could do this replace?
Code:
----------------------
/// <summary>
/// Builds a short blurb of at most about <paramref name="len"/> text characters from
/// <paramref name="content"/>, keeping the surrounding tags and attributes when possible.
/// </summary>
/// <remarks>
/// The content is parsed with an XML reader, so markup is only preserved when it is
/// well-formed XML. Anything the reader rejects - for example
/// <c>&lt;p align=center&gt;</c>, whose attribute value is not quoted - is handled by a
/// plain-text fallback that strips the tags instead.
/// </remarks>
/// <param name="content">HTML fragment or plain text to shorten. May be null or empty.</param>
/// <param name="len">Number of text characters to keep; markup is not counted.</param>
/// <returns>
/// The shortened content, ending in " ..." when text was cut, or an empty string when
/// <paramref name="content"/> is null/empty or <paramref name="len"/> is not positive.
/// </returns>
public static string GetContentShortBlurb(string content, int len)
{
    if (string.IsNullOrEmpty(content) || len <= 0)
    {
        return string.Empty;
    }

    try
    {
        return BuildMarkupBlurb(content, len);
    }
    catch (Exception)
    {
        // Deliberate best effort: whatever the XML path could not handle is
        // reduced to plain text rather than surfaced to the caller.
        return BuildPlainTextBlurb(content, len);
    }
}

/// <summary>Copies well-formed markup node by node until <paramref name="len"/> text characters were written.</summary>
private static string BuildMarkupBlurb(string content, int len)
{
    // Plain text gets a paragraph wrapper so the blurb always consists of elements.
    if (!content.TrimStart(' ', '\r', '\n').StartsWith("<", StringComparison.Ordinal))
    {
        content = "<p>" + content + "</p>";
    }

    System.Text.StringBuilder sb = new System.Text.StringBuilder();
    XmlWriterSettings xws = new XmlWriterSettings();
    xws.ConformanceLevel = ConformanceLevel.Fragment;
    xws.OmitXmlDeclaration = true;

    // The synthetic <doc> root lets a fragment with several top-level nodes parse.
    using (XmlTextReader xr = new XmlTextReader(new System.IO.StringReader("<doc>" + content + "</doc>")))
    using (XmlWriter xw = XmlWriter.Create(sb, xws))
    {
        xr.Read(); // consume the opening <doc>

        int strCount = 0;   // text characters written so far
        int nodesToEnd = 0; // elements opened in the output and not yet closed
        while (strCount < len && xr.Read())
        {
            if (xr.NodeType == XmlNodeType.EndElement)
            {
                // Depth 0 is the synthetic root; a <doc> nested in the content sits deeper.
                if (xr.Depth == 0)
                {
                    break;
                }

                // Full end tag: "<a></a>" must not become "<a />", which HTML treats as unclosed.
                xw.WriteFullEndElement();
                nodesToEnd--;
            }
            else if (xr.NodeType == XmlNodeType.Element)
            {
                // Must be read before the reader is moved onto the attributes.
                bool selfClosing = xr.IsEmptyElement;

                xw.WriteStartElement(xr.Name);
                while (xr.MoveToNextAttribute())
                {
                    xw.WriteAttributeString(xr.Name, xr.Value);
                }

                if (selfClosing)
                {
                    // <br/>, <img .../>: no EndElement node will follow, so close it now.
                    xw.WriteEndElement();
                }
                else
                {
                    nodesToEnd++;
                }
            }
            else if (xr.NodeType == XmlNodeType.Text)
            {
                string inner = xr.Value;
                int room = len - strCount;
                if (inner.Length > room)
                {
                    // Cut at the last space within the limit; hard-cut when there is none.
                    int cut = inner.LastIndexOf(' ', room);
                    if (cut < 0)
                    {
                        cut = room;
                    }

                    xw.WriteString(inner.Substring(0, cut) + " ...");
                    break; // nothing may follow the ellipsis
                }

                xw.WriteString(inner);
                strCount += inner.Length;
            }
        }

        // Close whatever was still open at the cut-off point.
        for (; nodesToEnd > 0; nodesToEnd--)
        {
            xw.WriteFullEndElement();
        }
    }

    // Read the buffer only after the writer has been disposed (and therefore flushed).
    return sb.ToString();
}

/// <summary>Fallback: strips all tags and trims the remaining text at a word boundary.</summary>
private static string BuildPlainTextBlurb(string content, int len)
{
    // IgnoreCase is required: the pattern only spells out upper-case tag names.
    string output = Regex.Replace(content, "</?([A-Z][A-Z0-9]*)\\b[^>]*>", "", RegexOptions.IgnoreCase);
    if (output.Length <= len)
    {
        return output;
    }

    int cut = output.LastIndexOf(' ', len);
    if (cut < 0)
    {
        cut = len; // no space to break on
    }

    return "<p>" + output.Substring(0, cut).Replace("\r\n", "</p>\r\n<p>") + " ...</p>";
}
--

Bits.Bytes.
http://bytes.thinkersroom.com
Nov 28 '06 #4

This discussion thread is closed

Replies have been disabled for this discussion.