Nightcrawler wrote:
I am currently using the HttpWebRequest and HttpWebResponse to pull
webpages down from a few urls.
// Fetch the page at `url` and decode the entire response body as UTF-7.
string url = "some url";
WebRequest created = WebRequest.Create(url);
HttpWebRequest request = (HttpWebRequest)created;
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
{
    // The encoding is fixed to UTF-7 here; the charset announced by the
    // server (Content-Type header / META tag) is not consulted.
    Stream body = response.GetResponseStream();
    StreamReader reader = new StreamReader(body, Encoding.UTF7);
    string html = reader.ReadToEnd();
}
My code works, but my question is: am I doing it the right way
(especially the encoding part)? Some of the websites I pull content
from contain characters that do not exist in the English
alphabet, and currently the only way for these to be read correctly by
my StreamReader is to use UTF7 encoding. Is this really the
only way?
I am a bit surprised by the UTF-7; that is a rare encoding - at least
where I surf.
Otherwise, Martin Honnen is correct - you need to look at the HTTP header
and the HTML META tag.
See the code attached below for a starting point.
Arne
================================================== =======
/// <summary>
/// Downloads a web page and decodes it with the character set announced by
/// the HTTP Content-Type header, unless the document itself names a charset
/// in a META tag, in which case the META value is used.
/// </summary>
public class HttpDownloadCharset
{
    // Charset assumed when neither the header nor the document names one.
    private const string DefaultCharset = "ISO-8859-1";

    // Size of the scratch buffer used while draining the response stream.
    private const int ReadChunkSize = 8192;

    // Upper bound on the capacity pre-allocated from the Content-Length
    // header, so a bogus header value cannot force a huge allocation.
    private const int MaxInitialCapacity = 1000000;

    // Matches "charset=xxx" with optional whitespace and optional quotes
    // around the value; the value may contain '_', ':' and '.' (e.g. Shift_JIS).
    private static readonly Regex encpat = new Regex(
        @"charset\s*=\s*[""']?\s*([A-Za-z0-9_:.\-]+)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    // Matches a charset declaration inside a single META tag, regardless of
    // attribute order. Covers both the HTTP-EQUIV/CONTENT form and the short
    // "meta charset=..." form, with or without a trailing "/>".
    private static readonly Regex metaencpat = new Regex(
        @"<meta\b[^>]*?\bcharset\s*=\s*[""']?\s*([A-Za-z0-9_:.\-]+)",
        RegexOptions.IgnoreCase | RegexOptions.Compiled);

    /// <summary>
    /// Extracts the charset name from a Content-Type value.
    /// </summary>
    /// <param name="contenttype">Content-Type value; may be null or empty.</param>
    /// <returns>The charset name, or "ISO-8859-1" when none is present.</returns>
    private static string ParseContentType(string contenttype)
    {
        if (contenttype != null)
        {
            Match m = encpat.Match(contenttype);
            if (m.Success)
            {
                return m.Groups[1].Value;
            }
        }
        return DefaultCharset;
    }

    /// <summary>
    /// Looks for a charset declared in a META tag of the document.
    /// </summary>
    /// <param name="html">Document text (decoded as ASCII for sniffing).</param>
    /// <param name="defenc">Charset to return when no META tag declares one.</param>
    /// <returns>The charset from the META tag, or <paramref name="defenc"/>.</returns>
    private static string ParseMetaContentType(String html, String defenc)
    {
        Match m = metaencpat.Match(html);
        // A META tag without a charset must not override the header charset,
        // so anything other than an explicit match falls back to defenc.
        return m.Success ? m.Groups[1].Value : defenc;
    }

    /// <summary>
    /// Reads a stream to its end, whatever its length.
    /// </summary>
    /// <param name="stm">Stream to drain; not closed by this method.</param>
    /// <param name="expectedLength">Length hint (e.g. Content-Length); values
    /// that are not positive mean "unknown".</param>
    /// <returns>Exactly the bytes that were read.</returns>
    private static byte[] ReadAll(Stream stm, long expectedLength)
    {
        int capacity = 0;
        if (expectedLength > 0)
        {
            capacity = (int)Math.Min(expectedLength, (long)MaxInitialCapacity);
        }
        using (MemoryStream collected = new MemoryStream(capacity))
        {
            byte[] chunk = new byte[ReadChunkSize];
            int n;
            while ((n = stm.Read(chunk, 0, chunk.Length)) > 0)
            {
                collected.Write(chunk, 0, n);
            }
            // ToArray copies only the bytes written, so no padding is decoded later.
            return collected.ToArray();
        }
    }

    /// <summary>
    /// Maps a charset label to an Encoding, trying <paramref name="name"/>,
    /// then <paramref name="fallbackName"/>, then ISO-8859-1.
    /// </summary>
    private static Encoding ResolveEncoding(string name, string fallbackName)
    {
        foreach (string candidate in new string[] { name, fallbackName })
        {
            try
            {
                return Encoding.GetEncoding(candidate);
            }
            catch (ArgumentException)
            {
                // Unknown or malformed label in the page/header: try the next candidate.
            }
            catch (NotSupportedException)
            {
                // Encoding not available on this platform: try the next candidate.
            }
        }
        return Encoding.GetEncoding(DefaultCharset);
    }

    /// <summary>
    /// Downloads the resource at <paramref name="urlstr"/> and returns it as text.
    /// </summary>
    /// <param name="urlstr">Absolute HTTP(S) URL to fetch.</param>
    /// <returns>The response body decoded with the detected charset.</returns>
    /// <exception cref="ArgumentException">The server answered with a status
    /// other than 200 OK.</exception>
    public static string Download(string urlstr)
    {
        HttpWebRequest req = (HttpWebRequest)WebRequest.Create(urlstr);
        using (HttpWebResponse resp = (HttpWebResponse)req.GetResponse())
        {
            if (resp.StatusCode != HttpStatusCode.OK)
            {
                throw new ArgumentException("URL " + urlstr + " returned " + resp.StatusDescription);
            }

            string headerEnc = ParseContentType(resp.ContentType);

            byte[] body;
            using (Stream stm = resp.GetResponseStream())
            {
                body = ReadAll(stm, resp.ContentLength);
            }

            // Sniff the raw bytes as ASCII to find a META charset; markup is
            // ASCII-compatible in the encodings this approach can detect.
            string sniffed = Encoding.ASCII.GetString(body);
            string enc = ParseMetaContentType(sniffed, headerEnc);

            return ResolveEncoding(enc, headerEnc).GetString(body);
        }
    }
}