473,387 Members | 1,859 Online
Bytes | Software Development & Data Engineering Community
Post Job

Home Posts Topics Members FAQ

Join Bytes to post your question to a community of 473,387 software developers and data experts.

crawler crawler....help needed

I got this code from java.sun.com and tried modifying it in every way I could think of, but to no avail; it still isn't working. Please help me out, and post a good working web crawler if you have one. I need help as soon as possible.


import java.applet.Applet;
import java.text.*;
import java.awt.*;
import java.awt.List;
import java.awt.event.*;
import java.util.*;


import java.net.*;
import java.io.*;

public class WebCrawler extends Applet implements ActionListener, Runnable {
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
public static final String DISALLOW = "Disallow:";
public static final int SEARCH_LIMIT = 50;

Panel panelMain;
List listMatches;
Label labelStatus;

Vector vectorToSearch;
Vector vectorSearched;
Vector vectorMatches;

Thread searchThread;

TextField textURL;
Choice choiceType;

public void init() {

panelMain = new Panel();
panelMain.setLayout(new BorderLayout(5, 5));

Panel panelEntry = new Panel();
panelEntry.setLayout(new BorderLayout(5, 5));

Panel panelURL = new Panel();
panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelURL = new Label("Starting URL: ", Label.RIGHT);
panelURL.add(labelURL);
textURL = new TextField("", 40);
panelURL.add(textURL);
panelEntry.add("North", panelURL);

Panel panelType = new Panel();
panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelType = new Label("Content type: ", Label.RIGHT);
panelType.add(labelType);
choiceType = new Choice();
choiceType.addItem("text/html");
choiceType.addItem("audio/basic");
choiceType.addItem("audio/au");
choiceType.addItem("audio/aiff");
choiceType.addItem("audio/wav");
choiceType.addItem("video/mpeg");
choiceType.addItem("video/x-avi");
panelType.add(choiceType);
panelEntry.add("South", panelType);

panelMain.add("North", panelEntry);

Panel panelListButtons = new Panel();
panelListButtons.setLayout(new BorderLayout(5, 5));

Panel panelList = new Panel();
panelList.setLayout(new BorderLayout(5, 5));
Label labelResults = new Label("Search results");
panelList.add("North", labelResults);
Panel panelListCurrent = new Panel();
panelListCurrent.setLayout(new BorderLayout(5, 5));
listMatches = new List(10);
panelListCurrent.add("North", listMatches);
labelStatus = new Label("");
panelListCurrent.add("South", labelStatus);
panelList.add("South", panelListCurrent);

panelListButtons.add("North", panelList);

Panel panelButtons = new Panel();
Button buttonSearch = new Button(SEARCH);
buttonSearch.addActionListener(this);
panelButtons.add(buttonSearch);
Button buttonStop = new Button(STOP);
buttonStop.addActionListener(this);
panelButtons.add(buttonStop);

panelListButtons.add("South", panelButtons);

panelMain.add("South", panelListButtons);

add(panelMain);
setVisible(true);

repaint();

vectorToSearch = new Vector();
vectorSearched = new Vector();
vectorMatches = new Vector();

URLConnection.setDefaultAllowUserInteraction(false );
}

public void start() {
}

public void stop() {
if (searchThread != null) {
setStatus("stopping...");
searchThread = null;
}
}

public void destroy() {
}


public void paint(Graphics g) {
g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);

panelMain.paint(g);
panelMain.paintComponents(g);
}

public void run() {
String strURL = textURL.getText();
String strTargetType = choiceType.getSelectedItem();
int numberSearched = 0;
int numberFound = 0;

if (strURL.length() == 0) {
setStatus("ERROR: must enter a starting URL");
return;
}

vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
listMatches.removeAll();

vectorToSearch.addElement(strURL);

while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
strURL = (String) vectorToSearch.elementAt(0);

setStatus("searching " + strURL);

URL url;
try {
url = new URL(strURL);
} catch (MalformedURLException e) {
setStatus("ERROR: invalid URL " + strURL);
break;
}

vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);

if (url.getProtocol().compareTo("http") != 0)
break;


try {
URLConnection urlConnection = url.openConnection();

urlConnection.setAllowUserInteraction(false);

InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream );
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break;

byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();

if (Thread.currentThread() != searchThread)
break;

String lowerCaseContent = content.toLowerCase();

int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;

if (Thread.currentThread() != searchThread)
break;

index++;
String remaining = content.substring(index);

StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();

URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
}

if (urlLink.getProtocol().compareTo("http") != 0)
break;

if (Thread.currentThread() != searchThread)
break;

try {
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkS tream);
linkStream.close();

if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {

}
}

if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
}

numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}

if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;

}

void setStatus(String status) {
labelStatus.setText(status);
}

public void actionPerformed(ActionEvent event) {
String command = event.getActionCommand();

if (command.compareTo(SEARCH) == 0) {
setStatus("searching...");

if (searchThread == null) {
searchThread = new Thread(this);
}
searchThread.start();
}
else if (command.compareTo(STOP) == 0) {
stop();
}
}
public static void main (String argv[])
{
Frame f = new Frame("My Crawler");
WebCrawler applet = new WebCrawler();
f.add("Center", applet);

/* Behind a firewall set your proxy and port here!
*/
String prox = "192.168.16.230";
Properties props= new Properties(System.getProperties());
props.put("http.proxySet","true");
props.put("http.proxyHost", "prox");
props.put("http.proxyPort", "8080");


Properties newprops = new Properties(props);
System.setProperties(newprops);



applet.init();
applet.start();
f.pack();
f.show();
}

}
Jan 5 '07 #1
3 1974
I got this code from java.sun.com and tried modifying it in every way I could think of, but to no avail; it still isn't working. Please help me out, and post a good working web crawler if you have one. I need help as soon as possible.


import java.applet.Applet;
import java.text.*;
import java.awt.*;
import java.awt.List;
import java.awt.event.*;
import java.util.*;


import java.net.*;
import java.io.*;

public class WebCrawler extends Applet implements ActionListener, Runnable {
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
public static final String DISALLOW = "Disallow:";
public static final int SEARCH_LIMIT = 50;

Panel panelMain;
List listMatches;
Label labelStatus;

Vector vectorToSearch;
Vector vectorSearched;
Vector vectorMatches;

Thread searchThread;

TextField textURL;
Choice choiceType;

public void init() {

panelMain = new Panel();
panelMain.setLayout(new BorderLayout(5, 5));

Panel panelEntry = new Panel();
panelEntry.setLayout(new BorderLayout(5, 5));

Panel panelURL = new Panel();
panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelURL = new Label("Starting URL: ", Label.RIGHT);
panelURL.add(labelURL);
textURL = new TextField("", 40);
panelURL.add(textURL);
panelEntry.add("North", panelURL);

Panel panelType = new Panel();
panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelType = new Label("Content type: ", Label.RIGHT);
panelType.add(labelType);
choiceType = new Choice();
choiceType.addItem("text/html");
choiceType.addItem("audio/basic");
choiceType.addItem("audio/au");
choiceType.addItem("audio/aiff");
choiceType.addItem("audio/wav");
choiceType.addItem("video/mpeg");
choiceType.addItem("video/x-avi");
panelType.add(choiceType);
panelEntry.add("South", panelType);

panelMain.add("North", panelEntry);

Panel panelListButtons = new Panel();
panelListButtons.setLayout(new BorderLayout(5, 5));

Panel panelList = new Panel();
panelList.setLayout(new BorderLayout(5, 5));
Label labelResults = new Label("Search results");
panelList.add("North", labelResults);
Panel panelListCurrent = new Panel();
panelListCurrent.setLayout(new BorderLayout(5, 5));
listMatches = new List(10);
panelListCurrent.add("North", listMatches);
labelStatus = new Label("");
panelListCurrent.add("South", labelStatus);
panelList.add("South", panelListCurrent);

panelListButtons.add("North", panelList);

Panel panelButtons = new Panel();
Button buttonSearch = new Button(SEARCH);
buttonSearch.addActionListener(this);
panelButtons.add(buttonSearch);
Button buttonStop = new Button(STOP);
buttonStop.addActionListener(this);
panelButtons.add(buttonStop);

panelListButtons.add("South", panelButtons);

panelMain.add("South", panelListButtons);

add(panelMain);
setVisible(true);

repaint();

vectorToSearch = new Vector();
vectorSearched = new Vector();
vectorMatches = new Vector();

URLConnection.setDefaultAllowUserInteraction(false );
}

public void start() {
}

public void stop() {
if (searchThread != null) {
setStatus("stopping...");
searchThread = null;
}
}

public void destroy() {
}


public void paint(Graphics g) {
g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);

panelMain.paint(g);
panelMain.paintComponents(g);
}

public void run() {
String strURL = textURL.getText();
String strTargetType = choiceType.getSelectedItem();
int numberSearched = 0;
int numberFound = 0;

if (strURL.length() == 0) {
setStatus("ERROR: must enter a starting URL");
return;
}

vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
listMatches.removeAll();

vectorToSearch.addElement(strURL);

while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
strURL = (String) vectorToSearch.elementAt(0);

setStatus("searching " + strURL);

URL url;
try {
url = new URL(strURL);
} catch (MalformedURLException e) {
setStatus("ERROR: invalid URL " + strURL);
break;
}

vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);

if (url.getProtocol().compareTo("http") != 0)
break;


try {
URLConnection urlConnection = url.openConnection();

urlConnection.setAllowUserInteraction(false);

InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream );
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break;

byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();

if (Thread.currentThread() != searchThread)
break;

String lowerCaseContent = content.toLowerCase();

int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;

if (Thread.currentThread() != searchThread)
break;

index++;
String remaining = content.substring(index);

StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();

URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
}

if (urlLink.getProtocol().compareTo("http") != 0)
break;

if (Thread.currentThread() != searchThread)
break;

try {
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkS tream);
linkStream.close();

if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {

}
}

if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
}

numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}

if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;

}

void setStatus(String status) {
labelStatus.setText(status);
}

public void actionPerformed(ActionEvent event) {
String command = event.getActionCommand();

if (command.compareTo(SEARCH) == 0) {
setStatus("searching...");

if (searchThread == null) {
searchThread = new Thread(this);
}
searchThread.start();
}
else if (command.compareTo(STOP) == 0) {
stop();
}
}
public static void main (String argv[])
{
Frame f = new Frame("My Crawler");
WebCrawler applet = new WebCrawler();
f.add("Center", applet);

/* Behind a firewall set your proxy and port here!
*/
String prox = "192.168.16.230";
Properties props= new Properties(System.getProperties());
props.put("http.proxySet","true");
props.put("http.proxyHost", "prox");
props.put("http.proxyPort", "8080");


Properties newprops = new Properties(props);
System.setProperties(newprops);



applet.init();
applet.start();
f.pack();
f.show();
}

}
Jan 5 '07 #2
r035198x
13,262 8TB
Expand|Select|Wrap|Line Numbers
  1.  
  2.  
  3. import java.applet.Applet;
  4. import java.text.*;
  5. import java.awt.*;
  6. import java.awt.List;
  7. import java.awt.event.*;
  8. import java.util.*;
  9.  
  10.  
  11. import java.net.*;
  12. import java.io.*; 
  13.  
  14. public class WebCrawler extends Applet implements ActionListener, Runnable {
  15. public static final String SEARCH = "Search";
  16. public static final String STOP = "Stop";
  17. public static final String DISALLOW = "Disallow:";
  18. public static final int SEARCH_LIMIT = 50;
  19.  
  20. Panel panelMain;
  21. List listMatches;
  22. Label labelStatus;
  23.  
  24. Vector vectorToSearch;
  25. Vector vectorSearched;
  26. Vector vectorMatches;
  27.  
  28. Thread searchThread;
  29.  
  30. TextField textURL;
  31. Choice choiceType;
  32.  
  33. public void init() {
  34.  
  35. panelMain = new Panel();
  36. panelMain.setLayout(new BorderLayout(5, 5));
  37.  
  38. Panel panelEntry = new Panel();
  39. panelEntry.setLayout(new BorderLayout(5, 5));
  40.  
  41. Panel panelURL = new Panel();
  42. panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
  43. Label labelURL = new Label("Starting URL: ", Label.RIGHT);
  44. panelURL.add(labelURL);
  45. textURL = new TextField("", 40);
  46. panelURL.add(textURL);
  47. panelEntry.add("North", panelURL);
  48.  
  49. Panel panelType = new Panel();
  50. panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
  51. Label labelType = new Label("Content type: ", Label.RIGHT);
  52. panelType.add(labelType);
  53. choiceType = new Choice();
  54. choiceType.addItem("text/html");
  55. choiceType.addItem("audio/basic");
  56. choiceType.addItem("audio/au");
  57. choiceType.addItem("audio/aiff");
  58. choiceType.addItem("audio/wav");
  59. choiceType.addItem("video/mpeg");
  60. choiceType.addItem("video/x-avi");
  61. panelType.add(choiceType);
  62. panelEntry.add("South", panelType);
  63.  
  64. panelMain.add("North", panelEntry);
  65.  
  66. Panel panelListButtons = new Panel();
  67. panelListButtons.setLayout(new BorderLayout(5, 5));
  68.  
  69. Panel panelList = new Panel();
  70. panelList.setLayout(new BorderLayout(5, 5));
  71. Label labelResults = new Label("Search results");
  72. panelList.add("North", labelResults);
  73. Panel panelListCurrent = new Panel();
  74. panelListCurrent.setLayout(new BorderLayout(5, 5));
  75. listMatches = new List(10);
  76. panelListCurrent.add("North", listMatches);
  77. labelStatus = new Label("");
  78. panelListCurrent.add("South", labelStatus);
  79. panelList.add("South", panelListCurrent);
  80.  
  81. panelListButtons.add("North", panelList);
  82.  
  83. Panel panelButtons = new Panel();
  84. Button buttonSearch = new Button(SEARCH);
  85. buttonSearch.addActionListener(this);
  86. panelButtons.add(buttonSearch);
  87. Button buttonStop = new Button(STOP);
  88. buttonStop.addActionListener(this);
  89. panelButtons.add(buttonStop);
  90.  
  91. panelListButtons.add("South", panelButtons);
  92.  
  93. panelMain.add("South", panelListButtons);
  94.  
  95. add(panelMain);
  96. setVisible(true);
  97.  
  98. repaint();
  99.  
  100. vectorToSearch = new Vector();
  101. vectorSearched = new Vector();
  102. vectorMatches = new Vector();
  103.  
  104. URLConnection.setDefaultAllowUserInteraction(false );
  105. }
  106.  
  107. public void start() {
  108. }
  109.  
  110. public void stop() {
  111. if (searchThread != null) {
  112. setStatus("stopping...");
  113. searchThread = null;
  114. }
  115. }
  116.  
  117. public void destroy() {
  118. }
  119.  
  120.  
  121. public void paint(Graphics g) {
  122. g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);
  123.  
  124. panelMain.paint(g);
  125. panelMain.paintComponents(g);
  126. }
  127.  
  128. public void run() {
  129. String strURL = textURL.getText();
  130. String strTargetType = choiceType.getSelectedItem();
  131. int numberSearched = 0;
  132. int numberFound = 0;
  133.  
  134. if (strURL.length() == 0) {
  135. setStatus("ERROR: must enter a starting URL");
  136. return;
  137. }
  138.  
  139. vectorToSearch.removeAllElements();
  140. vectorSearched.removeAllElements();
  141. vectorMatches.removeAllElements();
  142. listMatches.removeAll();
  143.  
  144. vectorToSearch.addElement(strURL);
  145.  
  146. while ((vectorToSearch.size() > 0)
  147. && (Thread.currentThread() == searchThread)) {
  148. strURL = (String) vectorToSearch.elementAt(0);
  149.  
  150. setStatus("searching " + strURL);
  151.  
  152. URL url;
  153. try {
  154. url = new URL(strURL);
  155. } catch (MalformedURLException e) {
  156. setStatus("ERROR: invalid URL " + strURL);
  157. break;
  158. }
  159.  
  160. vectorToSearch.removeElementAt(0);
  161. vectorSearched.addElement(strURL);
  162.  
  163. if (url.getProtocol().compareTo("http") != 0)
  164. break;
  165.  
  166.  
  167. try {
  168. URLConnection urlConnection = url.openConnection();
  169.  
  170. urlConnection.setAllowUserInteraction(false);
  171.  
  172. InputStream urlStream = url.openStream();
  173. String type
  174. = urlConnection.guessContentTypeFromStream(urlStream );
  175. if (type == null)
  176. break;
  177. if (type.compareTo("text/html") != 0)
  178. break;
  179.  
  180. byte b[] = new byte[1000];
  181. int numRead = urlStream.read(b);
  182. String content = new String(b, 0, numRead);
  183. while (numRead != -1) {
  184. if (Thread.currentThread() != searchThread)
  185. break;
  186. numRead = urlStream.read(b);
  187. if (numRead != -1) {
  188. String newContent = new String(b, 0, numRead);
  189. content += newContent;
  190. }
  191. }
  192. urlStream.close();
  193.  
  194. if (Thread.currentThread() != searchThread)
  195. break;
  196.  
  197. String lowerCaseContent = content.toLowerCase();
  198.  
  199. int index = 0;
  200. while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
  201. {
  202. if ((index = lowerCaseContent.indexOf("href", index)) == -1)
  203. break;
  204. if ((index = lowerCaseContent.indexOf("=", index)) == -1)
  205. break;
  206.  
  207. if (Thread.currentThread() != searchThread)
  208. break;
  209.  
  210. index++;
  211. String remaining = content.substring(index);
  212.  
  213. StringTokenizer st
  214. = new StringTokenizer(remaining, "\t\n\r\">#");
  215. String strLink = st.nextToken();
  216.  
  217. URL urlLink;
  218. try {
  219. urlLink = new URL(url, strLink);
  220. strLink = urlLink.toString();
  221. } catch (MalformedURLException e) {
  222. setStatus("ERROR: bad URL " + strLink);
  223. continue;
  224. }
  225.  
  226. if (urlLink.getProtocol().compareTo("http") != 0)
  227. break;
  228.  
  229. if (Thread.currentThread() != searchThread)
  230. break;
  231.  
  232. try {
  233. URLConnection urlLinkConnection
  234. = urlLink.openConnection();
  235. urlLinkConnection.setAllowUserInteraction(false);
  236. InputStream linkStream = urlLink.openStream();
  237. String strType
  238. = urlLinkConnection.guessContentTypeFromStream(linkStream);
  239. linkStream.close();
  240.  
  241. if (strType == null)
  242. break;
  243. if (strType.compareTo("text/html") == 0) {
  244. if ((!vectorSearched.contains(strLink))
  245. && (!vectorToSearch.contains(strLink))) {
  246.  
  247. }
  248. }
  249.  
  250. if (strType.compareTo(strTargetType) == 0) {
  251. if (vectorMatches.contains(strLink) == false) {
  252. listMatches.add(strLink);
  253. vectorMatches.addElement(strLink);
  254. numberFound++;
  255. if (numberFound >= SEARCH_LIMIT)
  256. break;
  257. }
  258. }
  259. } catch (IOException e) {
  260. setStatus("ERROR: couldn't open URL " + strLink);
  261. continue;
  262. }
  263. }
  264. } catch (IOException e) {
  265. setStatus("ERROR: couldn't open URL " + strURL);
  266. break;
  267. }
  268.  
  269. numberSearched++;
  270. if (numberSearched >= SEARCH_LIMIT)
  271. break;
  272. }
  273.  
  274. if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
  275. setStatus("reached search limit of " + SEARCH_LIMIT);
  276. else
  277. setStatus("done");
  278. searchThread = null;
  279.  
  280. }
  281.  
  282. void setStatus(String status) {
  283. labelStatus.setText(status);
  284. }
  285.  
  286. public void actionPerformed(ActionEvent event) {
  287. String command = event.getActionCommand();
  288.  
  289. if (command.compareTo(SEARCH) == 0) {
  290. setStatus("searching...");
  291.  
  292. if (searchThread == null) {
  293. searchThread = new Thread(this);
  294. }
  295. searchThread.start();
  296. }
  297. else if (command.compareTo(STOP) == 0) {
  298. stop();
  299. }
  300. }
  301. public static void main (String argv[])
  302. {
  303. Frame f = new Frame("My Crawler");
  304. WebCrawler applet = new WebCrawler();
  305. f.add("Center", applet);
  306.  
  307. /* Behind a firewall set your proxy and port here!
  308. */
  309. String prox = "192.168.16.230";
  310. Properties props= new Properties(System.getProperties());
  311. props.put("http.proxySet","true");
  312. props.put("http.proxyHost", "prox");
  313. props.put("http.proxyPort", "8080");
  314.  
  315.  
  316. Properties newprops = new Properties(props);
  317. System.setProperties(newprops);
  318.  
  319.  
  320.  
  321. applet.init();
  322. applet.start();
  323. f.pack();
  324. f.show();
  325. }
  326.  
  327. }
  328.  
  329.  
  330.  
Try this. I just removed one space that was causing a compile error, and it worked for me. If it does not work, post the error that you get.
Jan 5 '07 #3
r035198x
13,262 8TB
I got this code from java.sun.com and tried modifying it in every way I could think of, but to no avail; it still isn't working. Please help me out, and post a good working web crawler if you have one. I need help as soon as possible.


import java.applet.Applet;
import java.text.*;
import java.awt.*;
import java.awt.List;
import java.awt.event.*;
import java.util.*;


import java.net.*;
import java.io.*;

public class WebCrawler extends Applet implements ActionListener, Runnable {
public static final String SEARCH = "Search";
public static final String STOP = "Stop";
public static final String DISALLOW = "Disallow:";
public static final int SEARCH_LIMIT = 50;

Panel panelMain;
List listMatches;
Label labelStatus;

Vector vectorToSearch;
Vector vectorSearched;
Vector vectorMatches;

Thread searchThread;

TextField textURL;
Choice choiceType;

public void init() {

panelMain = new Panel();
panelMain.setLayout(new BorderLayout(5, 5));

Panel panelEntry = new Panel();
panelEntry.setLayout(new BorderLayout(5, 5));

Panel panelURL = new Panel();
panelURL.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelURL = new Label("Starting URL: ", Label.RIGHT);
panelURL.add(labelURL);
textURL = new TextField("", 40);
panelURL.add(textURL);
panelEntry.add("North", panelURL);

Panel panelType = new Panel();
panelType.setLayout(new FlowLayout(FlowLayout.LEFT, 5, 5));
Label labelType = new Label("Content type: ", Label.RIGHT);
panelType.add(labelType);
choiceType = new Choice();
choiceType.addItem("text/html");
choiceType.addItem("audio/basic");
choiceType.addItem("audio/au");
choiceType.addItem("audio/aiff");
choiceType.addItem("audio/wav");
choiceType.addItem("video/mpeg");
choiceType.addItem("video/x-avi");
panelType.add(choiceType);
panelEntry.add("South", panelType);

panelMain.add("North", panelEntry);

Panel panelListButtons = new Panel();
panelListButtons.setLayout(new BorderLayout(5, 5));

Panel panelList = new Panel();
panelList.setLayout(new BorderLayout(5, 5));
Label labelResults = new Label("Search results");
panelList.add("North", labelResults);
Panel panelListCurrent = new Panel();
panelListCurrent.setLayout(new BorderLayout(5, 5));
listMatches = new List(10);
panelListCurrent.add("North", listMatches);
labelStatus = new Label("");
panelListCurrent.add("South", labelStatus);
panelList.add("South", panelListCurrent);

panelListButtons.add("North", panelList);

Panel panelButtons = new Panel();
Button buttonSearch = new Button(SEARCH);
buttonSearch.addActionListener(this);
panelButtons.add(buttonSearch);
Button buttonStop = new Button(STOP);
buttonStop.addActionListener(this);
panelButtons.add(buttonStop);

panelListButtons.add("South", panelButtons);

panelMain.add("South", panelListButtons);

add(panelMain);
setVisible(true);

repaint();

vectorToSearch = new Vector();
vectorSearched = new Vector();
vectorMatches = new Vector();

URLConnection.setDefaultAllowUserInteraction(false );
}

public void start() {
}

public void stop() {
if (searchThread != null) {
setStatus("stopping...");
searchThread = null;
}
}

public void destroy() {
}


public void paint(Graphics g) {
g.drawRect(0, 0, getSize().width - 1, getSize().height - 1);

panelMain.paint(g);
panelMain.paintComponents(g);
}

public void run() {
String strURL = textURL.getText();
String strTargetType = choiceType.getSelectedItem();
int numberSearched = 0;
int numberFound = 0;

if (strURL.length() == 0) {
setStatus("ERROR: must enter a starting URL");
return;
}

vectorToSearch.removeAllElements();
vectorSearched.removeAllElements();
vectorMatches.removeAllElements();
listMatches.removeAll();

vectorToSearch.addElement(strURL);

while ((vectorToSearch.size() > 0)
&& (Thread.currentThread() == searchThread)) {
strURL = (String) vectorToSearch.elementAt(0);

setStatus("searching " + strURL);

URL url;
try {
url = new URL(strURL);
} catch (MalformedURLException e) {
setStatus("ERROR: invalid URL " + strURL);
break;
}

vectorToSearch.removeElementAt(0);
vectorSearched.addElement(strURL);

if (url.getProtocol().compareTo("http") != 0)
break;


try {
URLConnection urlConnection = url.openConnection();

urlConnection.setAllowUserInteraction(false);

InputStream urlStream = url.openStream();
String type
= urlConnection.guessContentTypeFromStream(urlStream );
if (type == null)
break;
if (type.compareTo("text/html") != 0)
break;

byte b[] = new byte[1000];
int numRead = urlStream.read(b);
String content = new String(b, 0, numRead);
while (numRead != -1) {
if (Thread.currentThread() != searchThread)
break;
numRead = urlStream.read(b);
if (numRead != -1) {
String newContent = new String(b, 0, numRead);
content += newContent;
}
}
urlStream.close();

if (Thread.currentThread() != searchThread)
break;

String lowerCaseContent = content.toLowerCase();

int index = 0;
while ((index = lowerCaseContent.indexOf("<a", index)) != -1)
{
if ((index = lowerCaseContent.indexOf("href", index)) == -1)
break;
if ((index = lowerCaseContent.indexOf("=", index)) == -1)
break;

if (Thread.currentThread() != searchThread)
break;

index++;
String remaining = content.substring(index);

StringTokenizer st
= new StringTokenizer(remaining, "\t\n\r\">#");
String strLink = st.nextToken();

URL urlLink;
try {
urlLink = new URL(url, strLink);
strLink = urlLink.toString();
} catch (MalformedURLException e) {
setStatus("ERROR: bad URL " + strLink);
continue;
}

if (urlLink.getProtocol().compareTo("http") != 0)
break;

if (Thread.currentThread() != searchThread)
break;

try {
URLConnection urlLinkConnection
= urlLink.openConnection();
urlLinkConnection.setAllowUserInteraction(false);
InputStream linkStream = urlLink.openStream();
String strType
= urlLinkConnection.guessContentTypeFromStream(linkS tream);
linkStream.close();

if (strType == null)
break;
if (strType.compareTo("text/html") == 0) {
if ((!vectorSearched.contains(strLink))
&& (!vectorToSearch.contains(strLink))) {

}
}

if (strType.compareTo(strTargetType) == 0) {
if (vectorMatches.contains(strLink) == false) {
listMatches.add(strLink);
vectorMatches.addElement(strLink);
numberFound++;
if (numberFound >= SEARCH_LIMIT)
break;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strLink);
continue;
}
}
} catch (IOException e) {
setStatus("ERROR: couldn't open URL " + strURL);
break;
}

numberSearched++;
if (numberSearched >= SEARCH_LIMIT)
break;
}

if (numberSearched >= SEARCH_LIMIT || numberFound >= SEARCH_LIMIT)
setStatus("reached search limit of " + SEARCH_LIMIT);
else
setStatus("done");
searchThread = null;

}

void setStatus(String status) {
labelStatus.setText(status);
}

public void actionPerformed(ActionEvent event) {
String command = event.getActionCommand();

if (command.compareTo(SEARCH) == 0) {
setStatus("searching...");

if (searchThread == null) {
searchThread = new Thread(this);
}
searchThread.start();
}
else if (command.compareTo(STOP) == 0) {
stop();
}
}
/**
 * Runs the crawler as a stand-alone application by hosting the applet in
 * a {@link Frame}.
 *
 * <p>Before the applet is initialised, the JVM-wide HTTP proxy system
 * properties are set to the hard-coded host and port below; edit them to
 * match your network.
 *
 * @param argv command-line arguments (unused)
 */
public static void main(String argv[]) {
    Frame frame = new Frame("My Crawler");
    WebCrawler applet = new WebCrawler();
    frame.add("Center", applet);

    /* Behind a firewall set your proxy and port here! */
    String proxyHost = "192.168.16.230";
    String proxyPort = "8080";

    // Set the properties directly on the system property table. The
    // previous code stored the literal string "prox" as the proxy host
    // instead of the value of the variable, so the proxy was never used.
    System.setProperty("http.proxySet", "true");
    System.setProperty("http.proxyHost", proxyHost);
    System.setProperty("http.proxyPort", proxyPort);

    applet.init();
    applet.start();
    frame.pack();
    // Frame.show() is deprecated; setVisible(true) is its replacement.
    frame.setVisible(true);
}

}
Please do not double post
Jan 5 '07 #4

Sign in to post your reply or Sign up for a free account.

Similar topics

2
by: Metropolis | last post by:
Hello All, I am currently trying to teach a web crawler how to identify blogs, that is I am trying to determine a fairly inclusive set of criteria that will help my crawler to identify them. ...
1
by: Benjamin Lefevre | last post by:
I am currently developing a web crawler, mainly crawling mobile pages (wml, mobile xhtml) but not only (also html/xml/...), and I ask myself which speed I can reach. This crawler is developed in...
1
by: Steve Ocsic | last post by:
Hi, I've coded a basic crawler whereby you enter the URL and it will then crawl the said URL. What I would like to do now is to take it one step further and do the following: 1. pick up the...
0
by: Nicolas | last post by:
I need HELP!!!!! The crawler (Google or other) don't index my web site unless the web site is currently visited If there is nobody visiting those .aspx page therefor activating the aspnet no...
3
by: Bill | last post by:
Has anyone used/tested Request.Browser.Crawler ? Is it reliable, or are there false positives/negatives? Thanks!
13
by: abhinav | last post by:
Hi guys.I have to implement a topical crawler as a part of my project.What language should i implement C or Python?Python though has fast development cycle but my concern is speed also.I want to...
4
by: Petrosa | last post by:
Hey all, I have a project that i need to make a web crawler to find links in a website, and then represent the site's structure in a 3D tree. I have found an example at...
0
by: kishorealla | last post by:
Hello I need to create a web bot/crawler/spider that would go into different web sites and collect data for us and store in a database. The crawler needs to 'READ' the options on a website (either...
4
by: sonich | last post by:
I need simple web crawler, I found Ruya, but it's seems not currently maintained. Does anybody know good web crawler on python or with python interface?
0
by: taylorcarr | last post by:
A Canon printer is a smart device known for being advanced, efficient, and reliable. It is designed for home, office, and hybrid workspace use and can also be used for a variety of purposes. However,...
0
by: Charles Arthur | last post by:
How do i turn on java script on a villaon, callus and itel keypad mobile phone
0
by: aa123db | last post by:
Variable and constants Use var or let for variables and const for constants. Var foo ='bar'; Let foo ='bar';const baz ='bar'; Functions function $name$ ($parameters$) { } ...
0
by: ryjfgjl | last post by:
In our work, we often receive Excel tables with data in the same format. If we want to analyze these data, it can be difficult to analyze them because the data is spread across multiple Excel files...
0
by: emmanuelkatto | last post by:
Hi All, I am Emmanuel katto from Uganda. I want to ask what challenges you've faced while migrating a website to cloud. Please let me know. Thanks! Emmanuel
0
BarryA
by: BarryA | last post by:
What are the essential steps and strategies outlined in the Data Structures and Algorithms (DSA) roadmap for aspiring data scientists? How can individuals effectively utilize this roadmap to progress...
1
by: nemocccc | last post by:
hello, everyone, I want to develop a software for my android phone for daily needs, any suggestions?
1
by: Sonnysonu | last post by:
This is the data of csv file 1 2 3 1 2 3 1 2 3 1 2 3 2 3 2 3 3 the lengths should be different i have to store the data by column-wise with in the specific length. suppose the i have to...
0
by: Hystou | last post by:
There are some requirements for setting up RAID: 1. The motherboard and BIOS support RAID configuration. 2. The motherboard has 2 or more available SATA protocol SSD/HDD slots (including MSATA, M.2...

By using Bytes.com and its services, you agree to our Privacy Policy and Terms of Use.

To disable or enable advertisements and analytics tracking please visit the manage ads & tracking page.