471,334 Members | 1,927 Online
Bytes | Software Development & Data Engineering Community
Post +

Home Posts Topics Members FAQ

Join Bytes to post your question to a community of 471,334 software developers and data experts.

Program inefficiency?

I wrote the following simple program to loop through our help files
and fix some errors (in case you can't see the subtle RE search that's
happening, we're replacing spaces in bookmarks with _'s)

the program works great except for one thing. It's significantly
slower through the later files in the search than through the early
ones... Before anyone criticizes, I recognize that that middle section
could be simplified with a for loop... I just haven't cleaned it
up...

The problem is that the first 300 files take about 10-15 seconds and
the last 300 take about 2 minutes... If we do more than about 1500
files in one run, it just hangs up and never finishes...

Is there a solution here that I'm missing? What am I doing that is so
inefficient?

# File: masseditor.py

import re
import os
import time

def massreplace():
editfile = open("pathname\editfile.txt")
filestring = editfile.read()
filelist = filestring.splitlines()
## errorcheck = re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
for i in range(len(filelist)):
source = open(filelist[i])
starttext = source.read()
interimtext = replacecycle(starttext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
finaltext = replacecycle(interimtext)
source.close()
source = open(filelist[i],"w")
source.write(finaltext)
source.close()
## if errorcheck.findall(finaltext)!=[]:
## print errorcheck.findall(finaltext)
## print filelist[i]
if i == 100:
print "done 100"
print time.clock()
elif i == 300:
print "done 300"
print time.clock()
elif i == 600:
print "done 600"
print time.clock()
elif i == 1000:
print "done 1000"
print time.clock()
print "done"
print i
print time.clock()

# Substitution templates: q1 drops matched group 5 (the offending character)
# from the 7-group href patterns; q2 drops group 3 from the 5-group patterns.
_Q1 = r"\1\2\3\4_\6\7"
_Q2 = r"\1\2_\4\5"

# (pattern, replacement) pairs, compiled ONCE at import time instead of on
# every call -- the original recompiled all of them per invocation.
# One href/name pair per offending character: space, ', -, <, :, ?.
_PATTERNS = [
    (re.compile('(href=|HREF=)+(.*)(#)+(.*)( )+(.*)(">)+'), _Q1),
    (re.compile('(name=")+(.*)( )+(.*)(">)+'), _Q2),
    (re.compile('(href=|HREF=)+(.*)(#)+(.*)(\')+(.*)("> )+'), _Q1),
    (re.compile('(name=")+(.*)(\')+(.*)(">)+'), _Q2),
    (re.compile('(href=|HREF=)+(.*)(#)+(.*)(-)+(.*)(">)+'), _Q1),
    (re.compile('(name=")+(.*)(-)+(.*)(">)+'), _Q2),
    # BUG FIX: the original bound these two '<' patterns to p7/p8 and then
    # immediately rebound p7/p8 to the ':' patterns below, so '<' inside a
    # bookmark was never replaced.  Both sets are kept here.
    (re.compile('(href=|HREF=)+(.*)(#)+(.*)(<)+(.*)(">) +'), _Q1),
    (re.compile('(name=")+(.*)(<)+(.*)(">)+'), _Q2),
    (re.compile('(href=|HREF=")+(.*)(#)+(.*)(:)+(.*)("> )+'), _Q1),
    (re.compile('(name=")+(.*)(:)+(.*)(">)+'), _Q2),
    (re.compile('(href=|HREF=")+(.*)(#)+(.*)(\?)+(.*)(" >)+'), _Q1),
    (re.compile('(name=")+(.*)(\?)+(.*)(">)+'), _Q2),
    # Corner case: unquoted '<a name=...>' anchors containing '-'.
    (re.compile('(a name=)+(.*)(-)+(.*)(></a>)+'), _Q2),
]


def replacecycle(starttext):
    """Run one pass of every bookmark-fixing substitution over starttext.

    The greedy '(.*)' groups mean each pattern rewrites at most one
    offending character per bookmark per pass, so callers repeat this
    until the text stabilises.
    """
    interimtext = starttext
    for pattern, repl in _PATTERNS:
        interimtext = pattern.sub(repl, interimtext)
    return interimtext

massreplace()

Sep 29 '07 #1
14 1296
[...]
the program works great except for one thing. It's significantly
slower through the later files in the search then through the early
ones... Before anyone criticizes, I recognize that that middle section
could be simplified with a for loop... I just haven't cleaned it
up...

The problem is that the first 300 files take about 10-15 seconds and
the last 300 take about 2 minutes... If we do more than about 1500
files in one run, it just hangs up and never finishes...

Is there a solution here that I'm missing? What am I doing that is so
inefficient?
The only thing I see is that you compile all of the RE's every
time you call replacecycle(). They really only need to be
compiled once, but I don't know why that would cause the
progressive slowing.

FWIW, it seems to me like a shell+sed script would be the
obvious solution to the problem.
# File: masseditor.py

import re
import os
import time

def massreplace():
editfile = open("pathname\editfile.txt")
filestring = editfile.read()
filelist = filestring.splitlines()
## errorcheck = re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
for i in range(len(filelist)):
source = open(filelist[i])
starttext = source.read()
interimtext = replacecycle(starttext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
interimtext = replacecycle(interimtext)
finaltext = replacecycle(interimtext)
source.close()
source = open(filelist[i],"w")
source.write(finaltext)
source.close()
## if errorcheck.findall(finaltext)!=[]:
## print errorcheck.findall(finaltext)
## print filelist[i]
if i == 100:
print "done 100"
print time.clock()
elif i == 300:
print "done 300"
print time.clock()
elif i == 600:
print "done 600"
print time.clock()
elif i == 1000:
print "done 1000"
print time.clock()
print "done"
print i
print time.clock()

def replacecycle(starttext):
p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)( )+(.*)(">)+')
p2= re.compile('(name=")+(.*)( )+(.*)(">)+')
p3= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\')+(.*)("> )+')
p4= re.compile('(name=")+(.*)(\')+(.*)(">)+')
p5= re.compile('(href=|HREF=)+(.*)(#)+(.*)(-)+(.*)(">)+')
p6= re.compile('(name=")+(.*)(-)+(.*)(">)+')
p7= re.compile('(href=|HREF=)+(.*)(#)+(.*)(<)+(.*)(">) +')
p8= re.compile('(name=")+(.*)(<)+(.*)(">)+')
p7= re.compile('(href=|HREF=")+(.*)(#)+(.*)(:)+(.*)("> )+')
p8= re.compile('(name=")+(.*)(:)+(.*)(">)+')
p9= re.compile('(href=|HREF=")+(.*)(#)+(.*)(\?)+(.*)(" >)+')
p10= re.compile('(name=")+(.*)(\?)+(.*)(">)+')
p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
q1= r"\1\2\3\4_\6\7"
q2= r"\1\2_\4\5"
interimtext = p1.sub(q1, starttext)
interimtext = p2.sub(q2, interimtext)
interimtext = p3.sub(q1, interimtext)
interimtext = p4.sub(q2, interimtext)
interimtext = p5.sub(q1, interimtext)
interimtext = p6.sub(q2, interimtext)
interimtext = p7.sub(q1, interimtext)
interimtext = p8.sub(q2, interimtext)
interimtext = p9.sub(q1, interimtext)
interimtext = p10.sub(q2, interimtext)
interimtext = p100.sub(q2, interimtext)

return interimtext

massreplace()

--
Grant Edwards grante Yow! Are you still
at SEXUALLY ACTIVE? Did you
visi.com BRING th' REINFORCEMENTS?
Sep 29 '07 #2
On Sat, 2007-09-29 at 15:22 +0000, ha*******@gmail.com wrote:
[...]
def replacecycle(starttext):
p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)( )+(.*)(">)+')
p2= re.compile('(name=")+(.*)( )+(.*)(">)+')
p3= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\')+(.*)("> )+')
p4= re.compile('(name=")+(.*)(\')+(.*)(">)+')
p5= re.compile('(href=|HREF=)+(.*)(#)+(.*)(-)+(.*)(">)+')
p6= re.compile('(name=")+(.*)(-)+(.*)(">)+')
p7= re.compile('(href=|HREF=)+(.*)(#)+(.*)(<)+(.*)(">) +')
p8= re.compile('(name=")+(.*)(<)+(.*)(">)+')
p7= re.compile('(href=|HREF=")+(.*)(#)+(.*)(:)+(.*)("> )+')
p8= re.compile('(name=")+(.*)(:)+(.*)(">)+')
p9= re.compile('(href=|HREF=")+(.*)(#)+(.*)(\?)+(.*)(" >)+')
p10= re.compile('(name=")+(.*)(\?)+(.*)(">)+')
p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
[...]
One obvious opportunity for optimization is to compile those re's only
once at the beginning of the program instead of every time
replacecycle() is called (which is inexplicably called 13 times for each
file).

--
Carsten Haese
http://informixdb.sourceforge.net
Sep 29 '07 #3
I did try moveing the re.compile's up and out of the replacecylce()
but it didn't impact the time in any meaningful way (2 seconds
maybe)...

I'm not sure what an shell+sed script is... I'm fairly new to Python
and my only other coding experience is with VBA... This was my first
Python program

In case it helps... We started with only 6 loops of replacecycle() but
had to keep adding progressively more as we found more and more links
with lots of spaces in them... As we did that, the program's time grew
progressively longer but the length grew multiplicatively with the
added number of cycles... This is exactly what I would have expected
and it leads me to believe that the problem does not lie in the
replacecycle() def but in the masseditor() def... *shrug*

Sep 29 '07 #4
On 2007-09-29, ha*******@gmail.com <ha*******@gmail.comwrote:
I'm not sure what an shell+sed script is...
http://tldp.org/LDP/Bash-Beginners-G...#sect_05_01_01
http://tldp.org/LDP/Bash-Beginners-G...l/chap_05.html
http://www.grymoire.com/Unix/Sed.html

http://www.gnu.org/software/bash/
http://en.wikipedia.org/wiki/Bash

Unfortunately it appears you're using Windows (a particularly bad
choice for this sort of file processing). You can, however,
get bash and sed for Windows if you wish:

http://www.cygwin.com/
In case it helps... We started with only 6 loops of replacecycle() but
had to keep adding progressively more as we found more and more links
with lots of spaces in them...
I would think with the correct RE's you'd only have to call it
once.
As we did that, the program's time grew progressively longer
but the length grew multiplicatively with the added number of
cycles... This is exactly what I would have expected and it
leads me to believe that the problem does not lie in the
replacecycle() def but in the masseditor() def... *shrug*
As the program runs on progressively more files does the
process's memory usage grow without bounds? Does the machine
start swapping?

--
Grant Edwards grante Yow! I'm pretending that
at we're all watching PHIL
visi.com SILVERS instead of RICARDO
MONTALBAN!
Sep 29 '07 #5
no swaps... memory usage is about 14k (these are small Html files)...
no hard drive cranking away or fan on my laptop going nutty... CPU
usage isn't even pegged... that's what makes me think it's not some
sort of bizarre memory leak... Unfortunately, it also means I'm out of
ideas...

Sep 29 '07 #6
For anyone that cares, I figured out the "problem"... the webhelp
files that it hits the wall on are the compiled search files... They
are the only files in the system that have line lengths that are
RIDICULOUS in length... I'm looking at one right now that has 32767
characters all on one line...

I'm absolutely certain that that's the problem...

Thanks for everyone's help

Sep 29 '07 #7
On Sep 29, 5:22 pm, hall.j...@gmail.com wrote:
I wrote the following simple program to loop through our help files
and fix some errors (in case you can't see the subtle RE search that's
happening, we're replacing spaces in bookmarks with _'s)

the program works great except for one thing. It's significantly
slower through the later files in the search then through the early
ones... Before anyone criticizes, I recognize that that middle section
could be simplified with a for loop... I just haven't cleaned it
up...

The problem is that the first 300 files take about 10-15 seconds and
the last 300 take about 2 minutes... If we do more than about 1500
files in one run, it just hangs up and never finishes...

Is there a solution here that I'm missing? What am I doing that is so
inefficient?
Ugh, that was entirely too many regexps for my taste :-)

How about something like:

def attr_ndx_iter(txt, attribute):
    """Yield (start, end) index pairs for the values of *attribute*.

    Scans case-insensitively by splitting a lowercased copy of *txt* on
    'attribute='; the yielded indices are valid against the original
    string because split-chunk lengths are position-preserving.
    NOTE(review): assumes every occurrence is quoted -- for an unquoted
    value the first value character is treated as the "quote" and the
    scan hunts for its next occurrence; confirm inputs are quoted.
    """
    txt = txt.lower()
    attribute = attribute.lower() + '='
    alen = len(attribute)
    chunks = txt.split(attribute)
    if len(chunks) == 1:
        # Attribute never occurs: nothing to yield.
        return

    # Index just past the first 'attribute=' occurrence.
    start = len(chunks[0]) + alen
    end = -1

    for chunk in chunks[1:]:
        qchar = chunk[0]                     # opening quote character
        end = start + chunk.index(qchar, 1)  # position of the closing quote
        yield start + 1, end                 # span of the value, quotes excluded
        start += len(chunk) + alen           # advance past this chunk + next 'attribute='

def substr_map(txt, indices, fn):
    """Return txt with fn applied to each (start, end) slice in indices;
    text outside those slices is passed through untouched."""
    pieces = []
    pos = 0

    for lo, hi in indices:
        pieces += [txt[pos:lo], fn(txt[lo:hi])]
        pos = hi

    pieces.append(txt[pos:])
    return ''.join(pieces)

def transform(s):
    """The transformation applied to attribute values: spaces become underscores."""
    return '_'.join(s.split(' '))

def zap_spaces(txt, *attributes):
    """Underscore the spaces inside the values of each named attribute,
    for every occurrence found in txt."""
    for name in attributes:
        spans = attr_ndx_iter(txt, name)
        txt = substr_map(txt, spans, transform)
    return txt

def mass_replace():
import sys
w = sys.stdout.write

for f in open(r'pathname\editfile.txt'):
try:
open(f, 'w').write(zap_spaces(open(f).read(), 'href',
'name'))
w('.') # progress-meter :-)
except:
print 'Error processing file:', f

minimally-tested'ly y'rs
-- bjorn

Sep 29 '07 #8
thebjorn wrote:
On Sep 29, 5:22 pm, hall.j...@gmail.com wrote:
>I wrote the following simple program to loop through our help files
and fix some errors (in case you can't see the subtle RE search that's
happening, we're replacing spaces in bookmarks with _'s)
(...)

Ugh, that was entirely too many regexps for my taste :-)

How about something like:

def attr_ndx_iter(txt, attribute):
(...)
def substr_map(txt, indices, fn):
(...)
def transform(s):
(...)
def zap_spaces(txt, *attributes):
(...)
def mass_replace():
(...)
Oh yeah, now it's clear as mud.
I do think that the whole program shouldn't take more than 10 lines of
code using one sensible regex (impossible to define without knowing the
real input and output formats).
And (sorry to tell) I'm convinced this is a problem for regexes, in
spite of anybody's personal taste.

Pablo
Sep 29 '07 #9
On Sep 29, 7:55 pm, Pablo Ziliani <pa...@decode.com.arwrote:
thebjorn wrote:
On Sep 29, 5:22 pm, hall.j...@gmail.com wrote:
I wrote the following simple program to loop through our help files
and fix some errors (in case you can't see the subtle RE search that's
happening, we're replacing spaces in bookmarks with _'s)
(...)
Ugh, that was entirely too many regexps for my taste :-)
How about something like:
def attr_ndx_iter(txt, attribute):
(...)
def substr_map(txt, indices, fn):
(...)
def transform(s):
(...)
def zap_spaces(txt, *attributes):
(...)
def mass_replace():
(...)

Oh yeah, now it's clear as mud.
I'm anxiously awaiting your beacon of clarity ;-)
I do think that the whole program shouldn't take more than 10 lines of
code
Well, my mass_replace above is 10 lines, and the actual replacement
code is a one liner. Perhaps you'd care to illustrate how you'd
shorten that while still keeping it "clear"?
using one sensible regex
I have no doubt that it would be possible to do with a single regex.
Whether it would be sensible or not is another matter entirely...
(impossible to define without knowing the real input and output formats).
Of course, but I don't think you can guess too terribly wrong. My
version handles upper and lower case attributes, quoting with single
(') and double (") quotes, and any number of spaces in attribute
values. It maintains all other text as-is, and converts spaces to
underscores in href and name attributes. Did I get anything majorly
wrong?
And (sorry to tell) I'm convinced this is a problem for regexes, in
spite of anybody's personal taste.
Well, let's see it then :-)

smack-smack'ly y'rs
-- bjorn

Sep 29 '07 #10
The search is trying to replace the spaces in our bookmarks (and the
links that go to those bookmarks)...

The bookmark link tag looks like this:

<a href="Web_Sites.htm#A Web Sites">

and the bookmark tag looks like this

<a name="A Web Sites"></a>

some pitfalls I've already run up against...
SOMETIMES (but not often) the a and the href (or name) is split across
a line... this led me to just drop the "<a" from the front
If there are no spaces, SOME (but again, not all) of the "<a name"
tags don't have "'s... this is a problem because we're having to
replace all special characters with _'s...
Some of our bookmarks are quite wordy (we found one yesterday with 11
spaces)
href is sometimes all caps (HREF)

As you can imagine, there are a lot of corner cases and I felt it was
easier just to be inefficient and write out all the regex cases and
loop through them repeatedly... I've also got to work around the stuff
already in the system (for example, I need to make certain I'm looking
behind the #'s in the bookmark links, otherwise I'll end up replacing
legitimate -'s in external web site addresses)

I think Pablo is correct that a single (or perhaps two) RE statements
are all that is needed... perhaps:

p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\w\'\?-<: )+(.*)(">)+')
and the corresponding name replace and then the one corner case we ran
into of
p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')

Sep 29 '07 #11
I think he's saying it should look like this:

# File: masseditor.py

import re
import os
import time

# Compiled once at module load -- moving these out of the per-file loop was
# the main speed fix suggested earlier in the thread.
# NOTE(review): '(\w\'\?-<:)+' is a GROUP matching the literal sequence
# word-char, ', ?, -, <, : -- a character class like [\w'?\-<:] was
# probably intended; confirm against the inputs before relying on it.
p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\w\'\?-<:)+(.*)(">)+')
p2= re.compile('(name=")+(.*)(\w\'\?-<:)+(.*)(">)+')
# Corner case: unquoted '<a name=...>' anchors containing '-'.
p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
# q1 drops group 5 (the bad character) and inserts '_'; q2 likewise drops
# group 3 for the 5-group patterns.
q1= r"\1\2\3\4_\6\7"
q2= r"\1\2_\4\5"

def massreplace():
    """Rewrite every file named in editfile.txt in place, applying the
    module-level substitutions (p1/p2/p100) 13 times so bookmarks with
    up to 13 offending characters are fully fixed."""
    editfile = open(r"C:\Program Files\Credit Risk Management\Masseditor\editfile.txt")
    try:
        filelist = editfile.read().splitlines()
    finally:
        editfile.close()

    for i in range(len(filelist)):
        source = open(filelist[i])
        try:
            text = source.read()
        finally:
            source.close()

        # BUG FIXES vs the posted version: the inner loop reused 'i'
        # (clobbering the file index so the wrong file was rewritten),
        # every pass of p1 restarted from the ORIGINAL text instead of
        # chaining off the previous pass, and the final write referenced
        # an undefined name 'finaltext'.
        for _ in range(13):
            text = p1.sub(q1, text)
            text = p2.sub(q2, text)
            text = p100.sub(q2, text)

        source = open(filelist[i], "w")
        try:
            source.write(text)
        finally:
            source.close()

massreplace()

I'll try that and see how it works...

Sep 29 '07 #12
On Sep 29, 8:32 pm, hall.j...@gmail.com wrote:
It think he's saying it should look like this:

# File: masseditor.py

import re
import os
import time

p1= re.compile('(href=|HREF=)+(.*)(#)+(.*)(\w\'\?-<:)+(.*)(">)+')
p2= re.compile('(name=")+(.*)(\w\'\?-<:)+(.*)(">)+')
p100= re.compile('(a name=)+(.*)(-)+(.*)(></a>)+')
q1= r"\1\2\3\4_\6\7"
q2= r"\1\2_\4\5"

def massreplace():
editfile = open("C:\Program Files\Credit Risk Management\Masseditor
\editfile.txt")
filestring = editfile.read()
filelist = filestring.splitlines()

for i in range(len(filelist)):
source = open(filelist[i])
starttext = source.read()

for i in range (13):
interimtext = p1.sub(q1, starttext)
interimtext= p2.sub(q2, interimtext)
interimtext= p100.sub(q2, interimtext)
source.close()
source = open(filelist[i],"w")
source.write(finaltext)
source.close()

massreplace()

I'll try that and see how it works...
Ok, if you want a single RE... How about:
# Sample input covering the thread's corner cases: a tag wrapped across
# lines, upper-case HREF, an unquoted name value, and spaces around '='.
test = '''
<a href="Web_Sites.htm#A Web Sites">
<a name="A Web Sites"></a>
<a
href="Web_Sites.htm#A Web Sites">
<a
name="A Web Sites"></a>
<a HREF="Web_Sites.htm#A Web Sites">
<a name=Quoteless></a>
<a name = "oo ps"></a>
'''

import re

# Single pass, two alternatives: branch 1 captures the fragment after '#'
# inside a quoted href value; branch 2 captures a (possibly unquoted)
# name value.  VERBOSE keeps the pattern readable; IGNORECASE covers HREF.
r = re.compile(r'''
(?:href=['"][^#]+[#]([^"']+)["'])
| (?:name=['"]?([^'">]+))
''', re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

def zap_space(m):
    """re.sub callback: underscore every space in the matched text."""
    whole = m.group(0)
    return whole.replace(' ', '_')

print r.sub(zap_space, test)

It prints out

<a href="Web_Sites.htm#A_Web_Sites">
<a name="A_Web_Sites"></a>
<a
href="Web_Sites.htm#A_Web_Sites">
<a
name="A_Web_Sites"></a>
<a HREF="Web_Sites.htm#A_Web_________________________ ____Sites">
<a name=Quoteless></a>
<a name = "oo ps"></a>

-- bjorn

Sep 29 '07 #13
On Sep 29, 2:32 pm, hall.j...@gmail.com wrote:
It think he's saying it should look like this:

(line noise snipped)
Or you can let BeautifulSoup do the dirty job for you and forget all
this ugliness:
from BeautifulSoup import BeautifulSoup  # third-party HTML parser (BS3 API)

# Let a real parser find the <a> tags instead of regexes: for each anchor,
# underscore the spaces in its href and name attribute values, if present.
# NOTE(review): 'text' is not defined in this snippet -- presumably the
# file contents read by the caller; confirm before running.
soup = BeautifulSoup(text)
for a in soup.findAll('a'):
    for attr in 'href','name':
        val = a.get(attr)
        if val:
            a[attr] = val.replace(' ','_')
print soup  # Python 2 print of the re-serialised document
George

Sep 29 '07 #14
On Sat, Sep 29, 2007 at 12:05:26PM -0700, thebjorn wrote:
Ok, if you want a single RE... How about:
...
r = re.compile(r'''
(?:href=['"][^#]+[#]([^"']+)["'])
| (?:name=['"]?([^'">]+))
''', re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)
maybe a little bit easier to read with ungreedy operators:

# Same two-branch pattern as the previous post, rewritten with non-greedy
# quantifiers.  BUG FIX: the posted version had an unbalanced ')' closing
# the second branch -- "(?:name=['\"]?(.+?)['\">]))" -- which makes
# re.compile raise an error before the pattern can ever be used.
r = re.compile(r'''
(?:href=['"].+?[#](.+?)["'])
| (?:name=['"]?(.+?)['">])
''', re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE)

flo.

Oct 1 '07 #15

This discussion thread is closed

Replies have been disabled for this discussion.

By using Bytes.com and its services, you agree to our Privacy Policy and Terms of Use.

To disable or enable advertisements and analytics tracking please visit the manage ads & tracking page.