Brian Quinlan wrote:
This is less a Python question and more an optimization/probability
question. Imagine that you have a list of objects and their frequency in
a population e.g.
lst = [(a, 0.01), (b, 0.05), (c, 0.50), (d, 0.30), (e, 0.04), (f, 0.10)]
and you want to draw n items from that list (duplicates allowed), with
that probability distribution.
The fastest algorithm that I have been able to devise for doing so is:
O(n * log(len(lst))). Can anyone think of a solution with a better time
complexity? If not, is there an obviously better way to do this
(assuming n is big and the list size is small)?
Any way I tried to slice and dice it, I could not get any faster.
draw2 and draw3 generate code on the fly. draw4 sneakily tries to
trade memory and accuracy for speed but is even slower!
First the times, then the code:
$ ./timeit.py 'from probDistribution import draw as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 13.4 msec per loop
$ ./timeit.py 'from probDistribution import draw2 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 15.2 msec per loop
$ ./timeit.py 'from probDistribution import draw3 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
100 loops, best of 3: 16.2 msec per loop
$ ./timeit.py 'from probDistribution import draw4 as draw; draw(10000,
[0.01, 0.05, 0.50, 0.30, 0.04, 0.10])'
10 loops, best of 3: 30.5 msec per loop
=== CODE probDistribution.py ===
from random import random, randrange
from bisect import bisect
def draw(n, lst):
    """Draw n samples from the distribution given by lst and return the
    observed relative frequency of each category.

    lst is a sequence of probabilities (one per category) that should sum
    to 1.0.  Uses a cumulative table plus binary search, so the cost is
    O(n * log(len(lst))).

    Returns a list of floats, parallel to lst, each entry c_i / n where
    c_i is how many of the n draws fell in category i.
    """
    # Build the cumulative distribution, e.g.
    # [0.01, 0.05, 0.50, ...] -> [0.01, 0.06, 0.56, ...]
    ps = []
    total = 0.0
    for p in lst:
        total += p
        ps.append(total)
    chosen = [0] * len(lst)  # per-category draw counts
    last_index = len(lst) - 1
    for _ in range(n):
        r = random()
        # Clamp the index: float rounding can leave ps[-1] slightly
        # below 1.0, in which case bisect would return len(lst) and
        # raise IndexError for an unlucky r.
        chosen[min(bisect(ps, r), last_index)] += 1
    # Rescale counts back to observed probabilities.
    return [float(c) / n for c in chosen]
def draw2(n, lst):
    """Like draw(), but uses dynamic code generation of this form:

        chosen = [0] * 6
        for i in range(10000):
            r = random()
            if r < 0.01: chosen[0] += 1
            elif r < 0.06: chosen[1] += 1
            ...
            elif r < 0.90: chosen[4] += 1
            else: chosen[5] += 1

    Requires len(lst) > 1 (the single-category corner case is not
    covered by the generated if/elif chain).

    Returns a list of observed relative frequencies, parallel to lst.
    """
    assert len(lst) > 1, "Corner case NOT covered"
    lines = [
        'chosen = [0] * %i' % (len(lst),),
        'for i in range(%i):' % (n,),
        '    r = random()',
    ]
    last = 0.0
    lstmax = len(lst) - 1
    for i, p in enumerate(lst):
        last += p
        if i == 0:
            lines.append('    if r < %g: chosen[%i] += 1' % (last, i))
        elif i == lstmax:
            # The final branch is a plain else so rounding in the
            # cumulative sum cannot leave any r unhandled.
            lines.append('    else: chosen[%i] += 1' % (i,))
        else:
            lines.append('    elif r < %g: chosen[%i] += 1' % (last, i))
    # exec is safe here: the executed text is built entirely from our
    # own numeric formatting, never from untrusted input.
    ns = {'random': random}
    exec('\n'.join(lines), ns)
    # Rescale counts back to observed probabilities.
    return [float(c) / n for c in ns['chosen']]
def draw3(n, lst):
    """Like draw(), but uses dynamic code generation of this form:

        chosen = [0] * 6
        for i in range(10000):
            r = random()
            chosen[-1 + (
                ((r < 0.01) and 1)
                or ((r < 0.06) and 2)
                ...
                or ((r < 0.90) and 5)
                or 6
            )] += 1

    The and/or chain yields the 1-based index of the first threshold
    that r falls under (or the last category as a fallback); the -1
    converts it to a 0-based list index.

    Requires len(lst) > 1 (the single-category corner case is not
    covered by the generated expression).

    Returns a list of observed relative frequencies, parallel to lst.
    """
    assert len(lst) > 1, "Corner case NOT covered"
    codestr = 'chosen = [0] * %i\n' % (len(lst),)
    codestr += 'for i in range(%i):\n    r = random()\n' % (n,)
    codestr += '    chosen[-1 + (\n'
    last = 0.0
    lstmax = len(lst) - 1
    for i, p in enumerate(lst):
        last += p
        if i == 0:
            codestr += '        ((r < %g) and 1)\n' % (last,)
        elif i == lstmax:
            # Unconditional fallback: rounding in the cumulative sum
            # cannot leave any r unhandled.
            codestr += '        or %i\n    )] += 1\n' % (i + 1,)
        else:
            codestr += '        or ((r < %g) and %i)\n' % (last, i + 1)
    # Uncomment to inspect the generated source instead of running it:
    # return codestr
    # exec is safe here: the executed text is built entirely from our
    # own numeric formatting, never from untrusted input.
    ns = {'random': random}
    exec(codestr, ns)
    # Rescale counts back to observed probabilities.
    return [float(c) / n for c in ns['chosen']]
def draw4(n, lst, precision=0.01, maxbins=10000):
    """Like draw(), trading memory and accuracy for (hoped-for) speed by
    coarse-quantizing the probability axis into equal-width bins.

    n draws are first scattered uniformly over int(1/precision) bins;
    each category then claims the contiguous run of bins covering its
    slice of the cumulative distribution, so its expected share of the
    draws is its probability rounded to a multiple of `precision`.

    precision: bin width on the probability axis, 0 < precision < 1.
    maxbins:   safety cap on the number of bins implied by precision.

    Requires len(lst) > 1.  Returns a list of observed relative
    frequencies, parallel to lst.
    """
    assert len(lst) > 1, "Corner case NOT covered"
    assert 0.0 < precision < 1.0 and (1.0 / precision) < maxbins
    binmax = int(1.0 / precision)
    chosenbin = [0] * binmax
    for _ in range(n):  # range, not xrange: Python 3 compatible
        chosenbin[randrange(binmax)] += 1
    # Walk the cumulative distribution, summing each category's run of
    # bins; [left, right) is the half-open bin range for the category.
    left, right = 0, 0
    chosen = [0] * len(lst)
    last = 0.0
    for i, p in enumerate(lst):
        last += p
        right = int(last / precision)
        chosen[i] = sum(chosenbin[left:right])
        left = right
    # Rescale counts back to observed probabilities.
    return [float(c) / n for c in chosen]
=== END CODE ===