By using this site, you agree to our updated Privacy Policy and our Terms of Use. Manage your Cookies Settings.
435,594 Members | 3,017 Online
Bytes IT Community
+ Ask a Question
Need help? Post your question and get tips & solutions from a community of 435,594 IT Pros & Developers. It's quick & easy.

Trying to scrape a website in C#, but failing. Looking for assistance please

P: n/a
Hi people,

I have now spent far too much time on this small problem, I am
hopefully going to hand it over to you guys that are cleverer than I in
this respect.

Problem. I need to get all firms in a certain postal code area (say
"E1") from the FSA website. I can do this interactively, but I
thought I would save time and automate it. Big mistake, I have taken
hours investigating an so far failed to get the second search results
page displayed.

Step 1.
Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter
postal code of "E1", press "Submit".

Step 2.
At bottom of screen, you can select 21 further pages via URI such as
http://www.fsa.gov.uk/register/firmM...o?pageNumber=3 for page
3.

I though this would be easy to code in VS2005, but I have come to a
complete standstill and would like some help please.

Full code in a button Click event below.

First connection to the site is only to get the cookie, nothing else.
Second connection is the entry of postcode "E1" and pressing "Submit"
button
Third connection is the selection of Page 2 to display companies 11 to
20.

You will need to add following to top of basic windforms app.
using System.Net;
using System.IO;
using System.Net.Cache;
---------------------------------
private void button2_Click(object sender, EventArgs e)
{
string address;
string postData;
CookieContainer cookies = new CookieContainer();
HttpWebRequest webRequest;
HttpWebResponse webResponse;
StreamReader responseReader;
StreamWriter requestWriter;
string responseData;
Cookie cookie ;

//
// First connection to site, purely to get the cookie out.
// (WORKS)
//
address = "http://www.fsa.gov.uk/register/firmMainSearch.do";
postData = "";

webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

cookie = webResponse.Cookies[0];

//
// Second connection to site to emulate sending postcode E1 and
pressing Submit button
// (WORKS)
//
postData =
"pageNumber=0&firmName=&postcodeOut=E1&postcodeIn= &searchType=1&currAuthorisedInd=on&ddd=Submit";
address = "http://www.fsa.gov.uk/register/firmMainSearch.do;" +
cookie.Name + "="+cookie.Value;
webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "POST";
webRequest.CookieContainer = cookies;
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";
byte[] bytes = Encoding.ASCII.GetBytes(postData);
webRequest.ContentLength = bytes.Length;

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;";// +cookie.Name +
"=" + cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CachePolicy = new
HttpRequestCachePolicy(HttpRequestCacheLevel.NoCac heNoStore);
requestWriter = new StreamWriter(webRequest.GetRequestStream());
requestWriter.Write(postData);
requestWriter.Close();
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();
//
// Third connection, attempting to go to the second screen to read
off the subsequent companies
// (DOES NOT WORK)
//
string Secret_address =
"http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=2";

webRequest = WebRequest.Create(Secret_address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;" + cookie.Name + "="
+ cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";

webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

webBrowser1.DocumentText = responseData;

}

Nov 12 '06 #1
Share this Question
Share on Google+
3 Replies


P: n/a
<Ad**********@googlemail.comwrote in message
news:11*********************@f16g2000cwb.googlegro ups.com...

I think the main problem here is that the page in question has a data
control on it which supports paging. That's fine, of course, so long as
you're in the same session. But I have a feeling that every subsequent call
made by your code instantiates a *new* session, so the server doesn't
understand what to do.

I'm not sure there is a way round this...
Nov 12 '06 #2

P: n/a
Whats the actuall error you get?

Ciaran

"Ad**********@googlemail.com" wrote:
Hi people,

I have now spent far too much time on this small problem, I am
hopefully going to hand it over to you guys that are cleverer than I in
this respect.

Problem. I need to get all firms in a certain postal code area (say
"E1") from the FSA website. I can do this interactively, but I
thought I would save time and automate it. Big mistake, I have taken
hours investigating an so far failed to get the second search results
page displayed.

Step 1.
Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter
postal code of "E1", press "Submit".

Step 2.
At bottom of screen, you can select 21 further pages via URI such as
http://www.fsa.gov.uk/register/firmM...o?pageNumber=3 for page
3.

I though this would be easy to code in VS2005, but I have come to a
complete standstill and would like some help please.

Full code in a button Click event below.

First connection to the site is only to get the cookie, nothing else.
Second connection is the entry of postcode "E1" and pressing "Submit"
button
Third connection is the selection of Page 2 to display companies 11 to
20.

You will need to add following to top of basic windforms app.
using System.Net;
using System.IO;
using System.Net.Cache;
---------------------------------
private void button2_Click(object sender, EventArgs e)
{
string address;
string postData;
CookieContainer cookies = new CookieContainer();
HttpWebRequest webRequest;
HttpWebResponse webResponse;
StreamReader responseReader;
StreamWriter requestWriter;
string responseData;
Cookie cookie ;

//
// First connection to site, purely to get the cookie out.
// (WORKS)
//
address = "http://www.fsa.gov.uk/register/firmMainSearch.do";
postData = "";

webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

cookie = webResponse.Cookies[0];

//
// Second connection to site to emulate sending postcode E1 and
pressing Submit button
// (WORKS)
//
postData =
"pageNumber=0&firmName=&postcodeOut=E1&postcodeIn= &searchType=1&currAuthorisedInd=on&ddd=Submit";
address = "http://www.fsa.gov.uk/register/firmMainSearch.do;" +
cookie.Name + "="+cookie.Value;
webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "POST";
webRequest.CookieContainer = cookies;
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";
byte[] bytes = Encoding.ASCII.GetBytes(postData);
webRequest.ContentLength = bytes.Length;

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;";// +cookie.Name +
"=" + cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CachePolicy = new
HttpRequestCachePolicy(HttpRequestCacheLevel.NoCac heNoStore);
requestWriter = new StreamWriter(webRequest.GetRequestStream());
requestWriter.Write(postData);
requestWriter.Close();
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();
//
// Third connection, attempting to go to the second screen to read
off the subsequent companies
// (DOES NOT WORK)
//
string Secret_address =
"http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=2";

webRequest = WebRequest.Create(Secret_address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;" + cookie.Name + "="
+ cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";

webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

webBrowser1.DocumentText = responseData;

}

Nov 13 '06 #3

P: n/a
Thanks all.

Fixed by using WebBrowser control instead of separate WebRequest /
WebResponse objects.


Ciaran O''Donnell wrote:
Whats the actuall error you get?

Ciaran

"Ad**********@googlemail.com" wrote:
Hi people,

I have now spent far too much time on this small problem, I am
hopefully going to hand it over to you guys that are cleverer than I in
this respect.

Problem. I need to get all firms in a certain postal code area (say
"E1") from the FSA website. I can do this interactively, but I
thought I would save time and automate it. Big mistake, I have taken
hours investigating an so far failed to get the second search results
page displayed.

Step 1.
Goto site http://www.fsa.gov.uk/register/firmSearchForm.do and enter
postal code of "E1", press "Submit".

Step 2.
At bottom of screen, you can select 21 further pages via URI such as
http://www.fsa.gov.uk/register/firmM...o?pageNumber=3 for page
3.

I though this would be easy to code in VS2005, but I have come to a
complete standstill and would like some help please.

Full code in a button Click event below.

First connection to the site is only to get the cookie, nothing else.
Second connection is the entry of postcode "E1" and pressing "Submit"
button
Third connection is the selection of Page 2 to display companies 11 to
20.

You will need to add following to top of basic windforms app.
using System.Net;
using System.IO;
using System.Net.Cache;
---------------------------------
private void button2_Click(object sender, EventArgs e)
{
string address;
string postData;
CookieContainer cookies = new CookieContainer();
HttpWebRequest webRequest;
HttpWebResponse webResponse;
StreamReader responseReader;
StreamWriter requestWriter;
string responseData;
Cookie cookie ;

//
// First connection to site, purely to get the cookie out.
// (WORKS)
//
address = "http://www.fsa.gov.uk/register/firmMainSearch.do";
postData = "";

webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

cookie = webResponse.Cookies[0];

//
// Second connection to site to emulate sending postcode E1 and
pressing Submit button
// (WORKS)
//
postData =
"pageNumber=0&firmName=&postcodeOut=E1&postcodeIn= &searchType=1&currAuthorisedInd=on&ddd=Submit";
address = "http://www.fsa.gov.uk/register/firmMainSearch.do;" +
cookie.Name + "="+cookie.Value;
webRequest = WebRequest.Create(address) as HttpWebRequest;
webRequest.Method = "POST";
webRequest.CookieContainer = cookies;
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";
byte[] bytes = Encoding.ASCII.GetBytes(postData);
webRequest.ContentLength = bytes.Length;

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;";// +cookie.Name +
"=" + cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CachePolicy = new
HttpRequestCachePolicy(HttpRequestCacheLevel.NoCac heNoStore);
requestWriter = new StreamWriter(webRequest.GetRequestStream());
requestWriter.Write(postData);
requestWriter.Close();
webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();
//
// Third connection, attempting to go to the second screen to read
off the subsequent companies
// (DOES NOT WORK)
//
string Secret_address =
"http://www.fsa.gov.uk/register/firmMainSearch.do?pageNumber=2";

webRequest = WebRequest.Create(Secret_address) as HttpWebRequest;
webRequest.Method = "GET";
webRequest.CookieContainer = cookies;
webRequest.ContentType = "application/x-www-form-urlencoded";
webRequest.CookieContainer.Add(cookie);
webRequest.Accept = "image/gif, image/x-xbitmap, image/jpeg,
image/pjpeg, application/x-shockwave-flash, application/vnd.ms-excel,
application/vnd.ms-powerpoint, application/msword, */*";

webRequest.Referer =
"http://www.fsa.gov.uk/register/firmMainSearch.do;" + cookie.Name + "="
+ cookie.Value;
webRequest.Headers.Add("Cookie", cookie.Name + "=" + cookie.Value);
webRequest.Headers.Add("Accept-Encoding", "gzip, deflate");
webRequest.Headers.Add("Accept-Language", "en-us");
webRequest.Headers.Add("UA-CPU", "x86");
webRequest.KeepAlive = true;
webRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows
NT 5.1; .NET CLR 2.0.50727)";

webResponse = (HttpWebResponse)webRequest.GetResponse();
responseReader = new StreamReader(webResponse.GetResponseStream());
responseData = responseReader.ReadToEnd();
responseReader.Close();
webResponse.Close();

webBrowser1.DocumentText = responseData;

}
Nov 14 '06 #4

This discussion thread is closed

Replies have been disabled for this discussion.