I was messing around with a project, trying out various

compression schemes. One of them was for arithmetic

coding. My implementation is .. novel ... because it

uses the Rational library for buring those CPU cycles.

Here's the output when you run it on the command-line

with the file itself as the training set and some text

from the file

% python arithmetic_code r.py arithmetic_code r.py "def for in tot R"

Orig. 136 bits, compr. 85 bits, ratio = 66%

242524315643209 44671345436

Was it 'def for in tot R'?

Guess so.

That second line is the compressed text * 2**85.

For funsies, here it is.

# A very slow arithmetic coder for Python.

#

# "Rationals explode quickly in term of space and ... time."

# -- comment in Rational.py (probably Tim Peters)

#

# Really. Don't use this for real work. Read Mark Nelson's

# Dr. Dobb's article on the topic at

# http://dogma.net/markn/articles/arith/part1.htm

# It's readable, informative and even includes clean sample code.

#

# Contributed to the public domain. (Like they would want it ;)

# Andrew Dalke < dalke @ dalke scientific . com >

import sys

import Rational, math

R = Rational.ration al

def train(text):

"""text -> 0-order probability statistics as a dictionary

Text must not contain the NUL (0x00) character because that's

used to indicate the end of data.

"""

assert "\x00" not in text

counts = {}

for c in text:

counts[c]=counts.get(c,0 )+1

counts["\x00"] = 1

tot_letters = sum(counts.valu es())

tot = 0

d = {}

prev = R(0)

for c, count in counts.items():

next = R(tot + count, tot_letters)

d[c] = (prev, next)

prev = next

tot = tot + count

assert tot == tot_letters

return d

def encode(text, probs):

"""text and the 0-order probability statistics -> longval, nbits

The encoded number is rational(longva l, 2**nbits)

"""

minval = R(0)

maxval = R(1)

for c in text + "\x00":

prob_range = probs[c]

delta = maxval - minval

maxval = minval + prob_range[1] * delta

minval = minval + prob_range[0] * delta

# I tried without the /2 just to check. Doesn't work.

# Keep scaling up until the error range is >= 1. That

# gives me the minimum number of bits needed to resolve

# down to the end-of-data character.

delta = (maxval - minval)/2

nbits = 0L

while delta < 1:

nbits = nbits + 1

delta = delta << 1

if nbits == 0:

return 0, 0

else:

avg = (maxval + minval)<<(nbits-1) # using -1 instead of /2

# Could return a rational instead ...

return avg.n/avg.d, nbits # the division truncation is deliberate

def decode(longval, nbits, probs):

"""decoe the number to a string using the given statistics"""

val = R(longval, 1L<<nbits)

letters = []

probs_items = [(c, minval, maxval) for (c, (minval, maxval))

in probs.items()]

while 1:

for (c, minval, maxval) in probs_items:

if minval <= val < maxval:

break

else:

raise AssertionError( "not found")

if c == "\x00":

break

letters.append( c)

delta = maxval - minval

val = (val - minval)/delta

return "".join(letters )

if __name__ == "__main__":

# getopt? optparse? What are they?

import sys

trainfilename = sys.argv[1] # must give a filename

phrase = sys.argv[2] # must give text to compress (slowly!)

probs = train(open(trai nfilename).read ())

n, nbits = encode(phrase, probs)

# +1 for the NUL terminator or equivalent

print "Orig. %d bits, compr. %d bits, ratio = %3.f%%" % (

(len(phrase)+1) *8, nbits, (100.*nbits) / (len(phrase)*8) )

print n

new_phrase = decode(n, nbits, probs)

print "Was it '%s'?" % (new_phrase,)

if phrase == new_phrase:

print "Guess so."

else:

print "Why not?!"