I am currently working on HTML processing.
I found a good class for extracting HTML tags, but I don't know how to use it.
I wrote this code to get the A tags; I am hoping for some help, please >>>
Expand|Select|Wrap|Line Numbers
- <%
- import urllib
- from sgmllib import SGMLParser
- import htmlentitydefs
- class BaseHTMLProcessor(SGMLParser):
- def reset(self):
- # extend (called by SGMLParser.__init__)
- self.pieces = []
- SGMLParser.reset(self)
- def unknown_starttag(self, tag, attrs):
- # called for each start tag
- # attrs is a list of (attr, value) tuples
- # e.g. for <pre class="screen">, tag="pre", attrs=[("class", "screen")]
- # Ideally we would like to reconstruct original tag and attributes, but
- # we may end up quoting attribute values that weren't quoted in the source
- # document, or we may change the type of quotes around the attribute value
- # (single to double quotes).
- # Note that improperly embedded non-HTML code (like client-side Javascript)
- # may be parsed incorrectly by the ancestor, causing runtime script errors.
- # All non-HTML code must be enclosed in HTML comment tags (<!-- code -->)
- # to ensure that it will pass through this parser unaltered (in handle_comment).
- strattrs = "".join([' %s="%s"' % (key, value) for key, value in attrs])
- self.pieces.append("<%(tag)s%(strattrs)s>" % locals())
- def unknown_endtag(self, tag):
- # called for each end tag, e.g. for </pre>, tag will be "pre"
- # Reconstruct the original end tag.
- self.pieces.append("</%(tag)s>" % locals())
- def handle_charref(self, ref):
- # called for each character reference, e.g. for "*", ref will be "160"
- # Reconstruct the original character reference.
- self.pieces.append("&#%(ref)s;" % locals())
- def handle_entityref(self, ref):
- # called for each entity reference, e.g. for "©", ref will be "copy"
- # Reconstruct the original entity reference.
- self.pieces.append("&%(ref)s" % locals())
- # standard HTML entities are closed with a semicolon; other entities are not
- if htmlentitydefs.entitydefs.has_key(ref):
- self.pieces.append(";")
- def handle_data(self, text):
- # called for each block of plain text, i.e. outside of any tag and
- # not containing any character or entity references
- # Store the original text verbatim.
- self.pieces.append(text)
- def handle_comment(self, text):
- # called for each HTML comment, e.g. <!-- insert Javascript code here -->
- # Reconstruct the original comment.
- # It is especially important that the source document enclose client-side
- # code (like Javascript) within comments so it can pass through this
- # processor undisturbed; see comments in unknown_starttag for details.
- self.pieces.append("<!--%(text)s-->" % locals())
- def handle_pi(self, text):
- # called for each processing instruction, e.g. <?instruction>
- # Reconstruct original processing instruction.
- self.pieces.append("<?%(text)s>" % locals())
- def handle_decl(self, text):
- # called for the DOCTYPE, if present, e.g.
- # <!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
- # "http://www.w3.org/TR/html4/loose.dtd">
- # Reconstruct original DOCTYPE
- self.pieces.append("<!%(text)s>" % locals())
- def output(self):
- """Return processed HTML as a single string"""
- return "".join(self.pieces)
import cgi  # local import: only needed to escape the links written to the page


class LinkCollector(BaseHTMLProcessor):
    """BaseHTMLProcessor that also records the href of every <a> start tag."""

    def reset(self):
        # reset() is called by the parser's __init__, so the list exists
        # before any tag is handled.
        self.links = []
        BaseHTMLProcessor.reset(self)

    def unknown_starttag(self, tag, attrs):
        # attrs is a list of (attr, value) tuples; sgmllib reports tag and
        # attribute names in lower case.
        if tag == 'a':
            self.links.extend([value for key, value in attrs if key == 'href'])
        # Still let the base class reconstruct the tag.
        BaseHTMLProcessor.unknown_starttag(self, tag, attrs)


url = 'http://google.com'
links = []
try:
    f = urllib.urlopen(url)
except IOError:
    # urlopen raises IOError when the host cannot be resolved or reached
    # (e.g. "Temporary failure in name resolution"); report it instead of
    # letting the page die with a traceback.
    req.write('Could not fetch ' + url)
else:
    try:
        s = f.read()  # The html code
    finally:
        f.close()
    # The handlers are never called by hand: feed() the document to the
    # parser and it invokes them for each construct it finds.
    myparser = LinkCollector()
    myparser.feed(s)
    myparser.close()
    links = myparser.links
    # Show the first two links (fewer if the page has fewer). The values come
    # from a remote page, so escape them before writing them into this one.
    req.write('<br>'.join([cgi.escape(link, True) for link in links[:2]]))
- %>
The traceback >>
Expand|Select|Wrap|Line Numbers
- Traceback (most recent call last):
- File "/usr/lib/python2.5/site-packages/mod_python/importer.py", line 1537, in HandlerDispatch
- default=default_handler, arg=req, silent=hlist.silent)
- File "/usr/lib/python2.5/site-packages/mod_python/importer.py", line 1229, in _process_target
- result = _execute_target(config, req, object, arg)
- File "/usr/lib/python2.5/site-packages/mod_python/importer.py", line 1128, in _execute_target
- result = object(arg)
- File "/usr/lib/python2.5/site-packages/mod_python/psp.py", line 337, in handler
- p.run()
- File "/usr/lib/python2.5/site-packages/mod_python/psp.py", line 243, in run
- exec code in global_scope
- File "/var/www/html/smart/qui.psp", line 86, in <module>
- f = urllib.urlopen(url)
- File "/usr/lib/python2.5/urllib.py", line 82, in urlopen
- return opener.open(url)
- File "/usr/lib/python2.5/urllib.py", line 190, in open
- return getattr(self, name)(url)
- File "/usr/lib/python2.5/urllib.py", line 325, in open_http
- h.endheaders()
- File "/usr/lib/python2.5/httplib.py", line 856, in endheaders
- self._send_output()
- File "/usr/lib/python2.5/httplib.py", line 728, in _send_output
- self.send(msg)
- File "/usr/lib/python2.5/httplib.py", line 695, in send
- self.connect()
- File "/usr/lib/python2.5/httplib.py", line 663, in connect
- socket.SOCK_STREAM):
- IOError: [Errno socket error] (-3, 'Temporary failure in name resolution')
Do you think the error comes from the socket?
Should I add some code to set a timeout?