On Mar 4, 11:42 am, "seber...@spawar.navy.mil"
<seber...@spawar.navy.mil> wrote:
I understand that the web is full of ill-formed XHTML web pages but
this is Microsoft:
http://moneycentral.msn.com/companyreport?Symbol=BBBY
I can't validate it and xml.dom.minidom.parseString won't work on it.
If this was just some teenager's web site I'd move on. Is there any
hope avoiding regular expression hacks to extract the data from this
page?
Chris
How about a pyparsing hack instead? With English-readable expression
names and a few comments, I think this is fairly easy to follow. Also
note the sample statement at the end showing how to use the results
names to access the individual data fields (much easier than indexing
into a 20-element list!).
(You should also verify you are not running afoul of any terms of
service related to the content of this page.)
-- Paul
=============== ========
from pyparsing import *
import urllib
# define matching elements

# run of digits, converted to int at parse time
integer = Word(nums).setParseAction(lambda t: int(t[0]))

# digits followed by "." and more digits, joined into one token and
# converted to float at parse time
real = Combine(Word(nums) + Word(".", nums)).setParseAction(
    lambda t: float(t[0]))

# a real followed by a percent sign; the "%" is dropped from the results
pct = real + Suppress("%")

# digits "/" digits kept as a single string token (no numeric conversion)
date = Combine(Word(nums) + '/' + Word(nums))

# opening and closing <td> tag expressions, both suppressed so the tags
# themselves never show up in the parsed results
tdStart, tdEnd = map(Suppress, makeHTMLTags("td"))

# unit suffix that follows some values
dollarUnits = oneOf("Mil Bil")
# stats are one of two patterns - single value or double value stat,
# wrapped in HTML <td> tags
# also, attach parse action to make sure each matches only once
def statPattern(name, label, statExpr=real):
    """Build the expression for a single-value statistic.

    The expression is: tdStart, the suppressed label, tdEnd, then tdStart,
    the value expression, tdEnd.

    name     -- results name under which the value is stored
    label    -- label text (or expression) expected in the first cell;
                wrapped in Suppress so it is left out of the results
    statExpr -- expression for the value cell (defaults to ``real``)

    Returns the combined expression with an OnlyOnce-wrapped no-op parse
    action attached.
    """
    if isinstance(statExpr, And):
        # For a compound value (e.g. number + units) only the first
        # sub-expression gets the results name.
        # NOTE: this assigns into the caller's And object in place.
        statExpr.exprs[0] = statExpr.exprs[0].setResultsName(name)
    else:
        statExpr = statExpr.setResultsName(name)
    expr = (tdStart + Suppress(label) + tdEnd +
            tdStart + statExpr + tdEnd)
    # the parse action does nothing itself; it exists so that OnlyOnce
    # limits this stat to matching a single time
    return expr.setParseAction(OnlyOnce(lambda t: None))
def bistatPattern(name, label, statExpr1=real, statExpr2=real):
    """Build the expression for a two-value statistic.

    The expression is three consecutive tdStart ... tdEnd cells: the
    suppressed label, then statExpr1, then statExpr2.

    name      -- results name applied to the whole three-cell expression
    label     -- label text (or expression) expected in the first cell;
                 wrapped in Suppress so it is left out of the results
    statExpr1 -- expression for the first value cell (defaults to ``real``)
    statExpr2 -- expression for the second value cell (defaults to ``real``)

    Returns the combined expression with an OnlyOnce-wrapped no-op parse
    action attached.
    """
    expr = (tdStart + Suppress(label) + tdEnd +
            tdStart + statExpr1 + tdEnd +
            tdStart + statExpr2 + tdEnd).setResultsName(name)
    # no-op action wrapped in OnlyOnce so this stat matches a single time
    return expr.setParseAction(OnlyOnce(lambda t: None))
# One expression per statistic to extract.  The first argument is the
# results name used to read the value back; the second is the label to
# look for in the page.
stats = [
    statPattern("last", "Last Price"),
    statPattern("hi", "52 Week High"),
    statPattern("lo", "52 Week Low"),
    # volume figures carry a Mil/Bil suffix, which is dropped
    statPattern("vol", "Volume", real + Suppress(dollarUnits)),
    statPattern("aveDailyVol_13wk", "Average Daily Volume (13wk)",
                real + Suppress(dollarUnits)),
    statPattern("movingAve_50day", "50 Day Moving Average"),
    statPattern("movingAve_200day", "200 Day Moving Average"),
    statPattern("volatility", "Volatility (beta)"),
    bistatPattern("relStrength_last3", "Last 3 Months", pct, integer),
    bistatPattern("relStrength_last6", "Last 6 Months", pct, integer),
    bistatPattern("relStrength_last12", "Last 12 Months", pct, integer),
    bistatPattern("sales", "Sales", real + Suppress(dollarUnits), pct),
    bistatPattern("income", "Income", real + Suppress(dollarUnits), pct),
    # the second dividend-rate cell may hold the literal "NA" instead of
    # a percentage
    bistatPattern("divRate", "Dividend Rate", real, pct | "NA"),
    bistatPattern("divYield", "Dividend Yield", pct, pct),
    statPattern("curQtrEPSest", "Qtr(" + date + ") EPS Estimate"),
    statPattern("curFyEPSest", "FY(" + date + ") EPS Estimate"),
    statPattern("curPE", "Current P/E"),
    # Same label pattern as curFyEPSest above; presumably this relies on
    # the match-only-once parse action so that a second "FY(...)" row
    # falls through to this entry - verify against the page layout.
    statPattern("fwdEPSest", "FY(" + date + ") EPS Estimate"),
    statPattern("fwdPE", "Forward P/E"),
]
# create overall search pattern - things move faster if we verify that
# we are positioned at a <td> tag before going through the MatchFirst group
statSearchPattern = FollowedBy(tdStart) + MatchFirst(stats)
# SETUP IS DONE - now get the HTML source
# read in web page
pg = urllib.urlopen(
    "http://moneycentral.msn.com/companyreport?Symbol=BBBY")
try:
    stockHTML = pg.read()
finally:
    # close the connection even if the read fails
    pg.close()
# extract and merge statistics: searchString yields one result per matched
# stat; summing them (starting from an empty ParseResults) folds them into
# a single result holding every results name
ticker = sum(statSearchPattern.searchString(stockHTML), ParseResults([]))

# print them out
# (the prints are written so that they produce the same output whether
# `print` is a statement or a function)
print(ticker.dump())
# individual fields are read back by results name
print(" ".join(str(value) for value in (
    ticker.last, ticker.hi, ticker.lo, ticker.vol, ticker.volatility)))
-----------------------
prints:
[39.549999999999997, 43.32, 30.920000000000002, 2.3599999999999999,
2.7400000000000002, 40.920000000000002, 37.659999999999997,
0.72999999999999998, 1.5, 55, 15.5, 69, 9.8000000000000007, 62,
6.2999999999999998, 19.399999999999999, 586.29999999999995,
27.199999999999999, 0.0, 'NA', 0.0, 0.0, 0.78000000000000003,
2.1499999999999999, 19.399999999999999, 2.3900000000000001,
18.399999999999999]
- aveDailyVol_13wk: 2.74
- curFyEPSest: 2.15
- curPE: 19.4
- curQtrEPSest: 0.78
- divRate: [0.0, 'NA']
- divYield: [0.0, 0.0]
- fwdEPSest: 2.39
- fwdPE: 18.4
- hi: 43.32
- income: [586.29999999999995, 27.199999999999999]
- last: 39.55
- lo: 30.92
- movingAve_200day: 37.66
- movingAve_50day: 40.92
- relStrength_last12: [9.8000000000000007, 62]
- relStrength_last3: [1.5, 55]
- relStrength_last6: [15.5, 69]
- sales: [6.2999999999999998, 19.399999999999999]
- vol: 2.36
- volatility: 0.73
39.55 43.32 30.92 2.36 0.73