....is to forget they are sorted???

While trying to optimize some NUCULAR libraries I discovered

that the best way to merge 2 sorted lists together

into a new sorted list is to just append

them and re-sort. The following test case demonstrates this.

It can be criticized in many ways: it only tests lists of the same

size,

it only uses "hashed" data, etcetera...

Still, my testing shows "resort everything" is consistently

two times faster than an explicit python merge even for fairly large

data.

I'm beginning to think

a "sorted list merger" might make a nice tiny extension module

(at least for my purposes).

See timing demonstration code below. Let me know if there

is a better way or if the test is fatally flawed, please.

--- Aaron Watters

====

http://www.xfeedme.com/nucular/pydis...greedy+bastard

==========snip: test code below

"test different ways to merge two sorted lists"

def ObviousMerge(L1, L2):

"obvious way"

len1 = len(L1)

len2 = len(L2)

resultlen = len1+len2

result = [None] * resultlen

count = 0

index1 = 0

index2 = 0

while count<resultlen:

if index1<len1:

elt1 = L1[index1]

if index2<len2:

elt2 = L2[index2]

if elt1<elt2:

result[count] = elt1

index1+=1

else:

result[count] = elt2

index2+=1

else:

result[count] = elt1

index1+=1

else:

result[count] = L2[index2]

index2+=1

count+=1

return result

def AggregateTailMerge(L1, L2):

"obvious way, aggregating the tail"

len1 = len(L1)

len2 = len(L2)

resultlen = len1+len2

result = [None] * resultlen

count = 0

index1 = 0

index2 = 0

while index1<len1 and index2<len2:

elt1 = L1[index1]

elt2 = L2[index2]

if elt1<elt2:

result[count] = elt1

index1+=1

else:

result[count] = elt2

index2+=1

count+=1

if index1<len1:

result[count:] = L1[index1:]

if index2<len2:

result[count:] = L2[index2:]

return result

# could aggregate head and tail using bisect: skipped

def ResortEverythingMerge(L1, L2):

"aggregate everything using list append and sort"

result = L1+L2

result.sort()

return result

mergeFunctions = [ResortEverythingMerge, ObviousMerge,

AggregateTailMerge, ]

# make some test data

def makeAListOfHashes(length, data):

import md5

return [md5.md5(str(i)+data).hexdigest() for i in xrange(length)]

print "constructing global test data"

SIZES = [10, 100, 1000, 10000, 100000, 1000000]

LISTS = [ (L, makeAListOfHashes(L,"A"), makeAListOfHashes(L, "B") )

for L in SIZES ]

for (length, L1, L2) in LISTS:

L1.sort()

L2.sort()

print "done with making global test data"

print

def timings(mergerFunction):

from time import time

fname = mergerFunction.__name__

print

print "now timing", mergerFunction

print

for (length, L1, L2) in LISTS:

now = time()

result = f(L1, L2)

elapsed = time() - now

print " for", length, "elapsed %3.5f"%elapsed, "for", fname

if __name__=="__main__":

print

print "running timings"

for f in mergeFunctions:

timings(f)

================ snip run output below:

constructing global test data

done with making global test data

running timings

now timing <function ResortEverythingMerge at 0x20000000004f4de8>

for 10 elapsed 0.00003 for ResortEverythingMerge

for 100 elapsed 0.00006 for ResortEverythingMerge

for 1000 elapsed 0.00057 for ResortEverythingMerge

for 10000 elapsed 0.00626 for ResortEverythingMerge

for 100000 elapsed 0.12484 for ResortEverythingMerge

for 1000000 elapsed 1.60117 for ResortEverythingMerge

now timing <function ObviousMerge at 0x20000000004f47d0>

for 10 elapsed 0.00008 for ObviousMerge

for 100 elapsed 0.00027 for ObviousMerge

for 1000 elapsed 0.00259 for ObviousMerge

for 10000 elapsed 0.02587 for ObviousMerge

for 100000 elapsed 0.27862 for ObviousMerge

for 1000000 elapsed 2.89965 for ObviousMerge

now timing <function AggregateTailMerge at 0x20000000004f4cf8>

for 10 elapsed 0.00008 for AggregateTailMerge

for 100 elapsed 0.00025 for AggregateTailMerge

for 1000 elapsed 0.00246 for AggregateTailMerge

for 10000 elapsed 0.02366 for AggregateTailMerge

for 100000 elapsed 0.26594 for AggregateTailMerge

for 1000000 elapsed 2.77103 for AggregateTailMerge