Expand|Select|Wrap|Line Numbers
- #! /usr/bin/python
- import urllib
- import re
- import sys
- def crawl(urllist,done):
- curl=urllist[0].upper()
- f = urllib.urlopen(curl)
- rx=re.compile("href=\"(http://[a-zA-Z0-9_\./\?&%=#\-]+)[\s\"]")
- src=f.read()
- src.replace('\n',' ')
- ma =rx.findall(src)
- for i in range(0,len(ma)):
- ma[i]=ma[i].upper()
- urllist=urllist+ma
- done.append(curl.upper())
- print "**Done**"+curl
- for i in range(0,len(done)):
- while urllist.count(done[i]):
- urllist.pop(urllist.index(done[i]))
- if len(urllist)>0:
- crawl(urllist,done)
- url=sys.argv[1]
- url=url.upper()
- print "Seed="+url
- urllist=[url]
- done=[]
- crawl(urllist,done)
File "./crawler.py", line 35, in crawl
crawl(urllist,done)
File "./crawler.py", line 11, in crawl
f = urllib.urlopen(curl)
File "/usr/lib/python2.4/urllib.py", line 82, in urlopen
return opener.open(url)
File "/usr/lib/python2.4/urllib.py", line 190, in open
return getattr(self, name)(url)
File "/usr/lib/python2.4/urllib.py", line 313, in open_http
h.endheaders()
File "/usr/lib/python2.4/httplib.py", line 798, in endheaders
self._send_output()
File "/usr/lib/python2.4/httplib.py", line 679, in _send_output
self.send(msg)
File "/usr/lib/python2.4/httplib.py", line 646, in send
self.connect()
File "/usr/lib/python2.4/httplib.py", line 630, in connect
raise socket.error, msg
IOError: [Errno socket error] (110, 'Connection timed out')
is there a way around this problem?