I’m new to programming. I’m currently learning Python and writing a web crawler that extracts all the text from a web page and then crawls on to further URLs to collect the text there as well. The idea is to put all the extracted text into a .txt file, one word per line, so the text has to be tokenized. All punctuation marks, duplicate words and stop words have to be removed.
The program should crawl the web to a certain depth and collect the URLs and text at each depth (level). I decided on a depth of 3. I divided the code into two parts: part one collects the URLs and part two extracts the text. Here are my problems:
- The program is extremely slow.
- I'm not sure if it functions properly.
- Is there a better way to extract text?
- Are there any available modules to help clean the text, i.e. removing duplicates and stop words? (A rough sketch of what I have in mind follows this list.)
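Here is that sketch, assuming NLTK is a suitable module for this (I haven't verified the details against my setup):

# Rough idea for the cleaning step. Assumes nltk is installed and its
# 'punkt' and 'stopwords' data have been fetched with nltk.download().
import string
import nltk
from nltk.corpus import stopwords

def clean_words(text):
    stop = set(stopwords.words('english'))
    seen = set()
    words = []
    for token in nltk.word_tokenize(text.lower()):
        token = token.strip(string.punctuation)    # drop punctuation marks
        if token and token not in stop and token not in seen:
            seen.add(token)                         # skip stop words and duplicates
            words.append(token)
    return words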
(Please note: the majority of the code (the first part) was written by James Mills. I found it online, it looked helpful, so I used it; I just modified it and added my own code to it.)
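To show what I mean by collecting URLs level by level in part one, this is the rough shape of it (just an illustration of the idea, using the Fetcher class from the listing below, not the actual code):

# Illustration only: collect URLs level by level down to a fixed depth.
def collect_levels(start_url, depth=3):
    current = [start_url]
    levels = []                     # levels[0] = root URLs, levels[1] = level 2, ...
    for _ in range(depth):
        found = []
        for url in current:
            fetcher = Fetcher(url)
            fetcher.fetch()
            for link in fetcher:
                if link not in found:   # avoid duplicates within a level
                    found.append(link)
        levels.append(found)
        current = found
    return levels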
Thanks,
Kal
import sys
import urllib2
import urlparse
from BeautifulSoup import BeautifulSoup, NavigableString

__version__ = "0.1"
__copyright__ = "CopyRight (C) 2008 by James Mills"
__license__ = "GPL"
__author__ = "James Mills"
__author_email__ = "James Mills, James dot Mills st dotred dot com dot au"

USAGE = "%prog [options] <url>"
VERSION = "%prog v" + __version__
AGENT = "%s/%s" % (__name__, __version__)
def encodeHTML(s=""):
    """encodeHTML(s) -> str

    Encode HTML special characters from their ASCII form to
    HTML entities.
    """
    return s.replace("&", "&amp;") \
            .replace("<", "&lt;") \
            .replace(">", "&gt;") \
            .replace("\"", "&quot;") \
            .replace("'", "&#039;") \
            .replace("--", "&mdash;")
class Fetcher(object):
    """Fetches a page and collects the absolute URLs of its <a> links."""

    def __init__(self, url):
        self.url = url
        self.urls = []

    def __contains__(self, x):
        return x in self.urls

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        #print "\nFollowing %s" % url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)

    def fetch(self):
        result = self.open()
        if result is None:    # open() failed, nothing to fetch
            return
        request, handle = result
        self._addHeaders(request)
        tags = []
        try:
            content = unicode(handle.open(request).read(), errors="ignore")
            soup = BeautifulSoup(content)
            tags = soup('a')
        except urllib2.HTTPError, error:
            if error.code == 404:
                print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
            else:
                print >> sys.stderr, "ERROR: %s" % error
        except urllib2.URLError, error:
            print >> sys.stderr, "ERROR: %s" % error
        for tag in tags:
            try:
                href = tag["href"]
                if href is not None:
                    url = urlparse.urljoin(self.url, encodeHTML(href))
                    if url not in self:
                        #print " Found: %s" % url
                        self.urls.append(url)
            except KeyError:
                pass
################################################################################
# I created 3 lists (root, level2 and level3). Each list saves the URLs of
# that level, i.e. depth. I chose to create 3 lists so I have the flexibility
# of testing the text at each level. Also, the 3 lists can easily be combined
# into one list.
################################################################################

# Level 1:
root = Fetcher('http://www.wantasimplewebsite.co.uk/index.html')
root.fetch()   # fetch() already skips duplicate links on the page

print "\nRoot URLs are:"
for i, url in enumerate(root):
    print "%d. %s" % (i+1, url)

# Level 2:
level2 = []
for url in root:   # traverse every URL in root and fetch the URLs from it
    temp = Fetcher(url)
    temp.fetch()
    for link in temp:
        if link not in level2:   # avoid duplicate links
            level2.append(link)

print "\nLevel2 URLs are:"
for i, url in enumerate(level2):
    print "%d. %s" % (i+1, url)

# Level 3:
level3 = []
for url in level2:   # traverse every URL in level2 and fetch the URLs from it
    temp = Fetcher(url)
    temp.fetch()
    for link in temp:
        if link not in level3:   # avoid duplicate links
            level3.append(link)

print "\nLevel3 URLs are:"
for i, url in enumerate(level3):
    print "%d. %s" % (i+1, url)
# To do for part two:
# 1. Traverse every link in the lists and extract its web page content.
# 2. Tokenize the text.
# 3. Remove stop words (i.e. and, but, to...)
# 4. Remove duplicates.
# 5. What about stemming?
# 6. Check the spelling.
# 7. Save the result in a file.

html = urllib2.urlopen('http://www.wantasimplewebsite.co.uk/index.html').read()
soup = BeautifulSoup(html)

def printText(tags):
    # Recursively print every piece of text inside the given tags.
    for tag in tags:
        if isinstance(tag, NavigableString):
            print tag,
        else:
            printText(tag)

printText(soup.findAll("body"))
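Eventually I want printText to feed into something that covers steps 1-4 and 7, roughly like this (untested sketch; clean_words is the helper I sketched above, and words.txt is just a placeholder name):

# Untested sketch: extract the visible text from every collected URL, clean it
# with the clean_words() helper sketched earlier, and write one word per line.
def extractText(url):
    try:
        html = urllib2.urlopen(url).read()
    except (urllib2.URLError, IOError):
        return u""
    soup = BeautifulSoup(html)
    # findAll(text=True) returns every NavigableString in the document
    return u" ".join(soup.findAll(text=True))

allWords = []
for url in [root.url] + root.urls + level2 + level3:
    for word in clean_words(extractText(url)):
        if word not in allWords:   # remove duplicates across pages
            allWords.append(word)

output = open("words.txt", "w")
for word in allWords:
    output.write(word.encode("utf-8") + "\n")
output.close()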