On Wed, 27 Apr 2005 15:54:54 -0700, Michael Spencer <ma**@telcopartners.com> wrote:
andrea wrote:I was thinking to code the huffman algorithm and trying to compress
something with it, but I've got a problem.
How can I represent for example a char with only 3 bits??I had a look to the compression modules but I can't understand them much...
... I understand I can't do it easily in python, but maybe I could define a
new type in C and use it to do those dirty works, what do you think?
Why do you need to create 'very small types'?
You only need actual bit-twiddling when you do the encoding/de-coding right?
If you create an encoding map 'codes' as a dict of strings of '1' and '0',
encoding might look like (untested):
def encode(stream):
outchar = count = 0
for char in stream:
for bit in codes[char]:
(outchar << 1) | (bit == "1")
count +=1
if count ==8:
yield chr(outchar)
outchar = count = 0
if count:
outchar <<= 8-count # make last bit field big-endian like others? yield chr(outchar)
I think I prefer little-endian bit streams though, e.g. (just hacked, not tested beyond
what you see and not recommended as efficient either way, esp the decode, but it might
be illustrative of something for Andrea ;-)
----< trivcodec.py >------------------------------------------------
#trivcodec.py
def encode(stream):
outchar = count = 0
for char in stream:
#print '%r code: %r'%(char, codes[char])
bits, width = codes[char]
outchar |= (bits<<count)
count += width
while count >= 8:
yield chr(outchar%0xff)
outchar >>= 8
count -= 8
if count:
yield chr(outchar)
def decode(stream):
codebuf = count = 0
width = 1
for codebyte in stream:
codebyte = ord(codebyte)
codebuf |= (codebyte<<count)
count += 8
if width>count:
continue
while width <= count:
code = (codebuf&((1<<width)-1), width)
if code not in chars:
width += 1
continue
yield chars[code]
codebuf >>= width
count -= width
width = 1
width = 1
#trivial encoding dict: a=0 b=01 C=0011 D=0111 E=1011 F=1111
codes = {'a':(0x00,1), 'b':(0x01,2),
'C':(0x3+0x4*0,4), 'D':(0x3+0x4*1,4),
'E':(0x3+0x4*2,4), 'F':(0x3+0x4*3,4)}
chars = dict([(v,k) for k,v in codes.items()]) # reverse
def test(*tests):
if not tests: tests = ['abaFECbaabbDF']
for charstream in tests:
print
codestream = ''.join(list(encode(charstream)))
print '%r [%s] -(encode)-> %r [%s]' % (
charstream, len(charstream), codestream, len(codestream))
recovered = ''.join(list(decode(codestream)))
print '%r [%s] -(decode)-> %r [%s]' % (
codestream, len(codestream), charstream, len(charstream))
if __name__ == '__main__':
import sys
test(*sys.argv[1:])
--------------------------------------------------------------------
Result:
[22:02] C:\pywk\clp>py24 trivcodec.py
'abaFECbaabbDF' [13] -(encode)-> '\xf2;Q\xf7' [4]
'\xf2;Q\xf7' [4] -(decode)-> 'abaFECbaabbDF' [13]
[22:02] C:\pywk\clp>py24 trivcodec.py a b C D E F
'a' [1] -(encode)-> '\x00' [1]
'\x00' [1] -(decode)-> 'a' [1]
'b' [1] -(encode)-> '\x01' [1]
'\x01' [1] -(decode)-> 'b' [1]
'C' [1] -(encode)-> '\x03' [1]
'\x03' [1] -(decode)-> 'C' [1]
'D' [1] -(encode)-> '\x07' [1]
'\x07' [1] -(decode)-> 'D' [1]
'E' [1] -(encode)-> '\x0b' [1]
'\x0b' [1] -(decode)-> 'E' [1]
'F' [1] -(encode)-> '\x0f' [1]
'\x0f' [1] -(decode)-> 'F' [1]
[22:02] C:\pywk\clp>py24 trivcodec.py
'abaFECbaabbDF' [13] -(encode)-> '\xf2;Q\xf7' [4]
'\xf2;Q\xf7' [4] -(decode)-> 'abaFECbaabbDF' [13]
[22:02] C:\pywk\clp>py24 trivcodec.py aaaaaaaa bbbb CC DD EE FF
'aaaaaaaa' [8] -(encode)-> '\x00' [1]
'\x00' [1] -(decode)-> 'aaaaaaaa' [8]
'bbbb' [4] -(encode)-> 'U' [1]
'U' [1] -(decode)-> 'bbbb' [4]
'CC' [2] -(encode)-> '3' [1]
'3' [1] -(decode)-> 'CC' [2]
'DD' [2] -(encode)-> 'w' [1]
'w' [1] -(decode)-> 'DD' [2]
'EE' [2] -(encode)-> '\xbb' [1]
'\xbb' [1] -(decode)-> 'EE' [2]
'FF' [2] -(encode)-> '\x00' [1]
'\x00' [1] -(decode)-> 'FF' [2]
[22:03] C:\pywk\clp>py24 trivcodec.py aaaaaaaabbbbCCDDEEFF
'aaaaaaaabbbbCCDDEEFF' [20] -(encode)-> '\x00U3w\xbb\x00' [6]
'\x00U3w\xbb\x00' [6] -(decode)-> 'aaaaaaaabbbbCCDDEEFF' [20]
Regards,
Bengt Richter