I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:
Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9
Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8
As you can see, one of the last two bytes gets altered in all cases.
The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.
So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...
The script follows:
#!/usr/local/bin/python2.3
# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details
from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os
def strip_trailing_ dots(n):
tmp = []
for i in range(len(n)):
if n[i][-1] == "," or n[i][-1] == ".":
tmp.append(n[i][:-1])
else:
tmp.append(n[i])
return tmp
def findmimetype(bo dy, filename):
tail4 = string.lower(fi lename[-5:])
tail3 = string.lower(fi lename[-4:])
if tail4 == ".jpeg":
return "image/jpeg"
if tail3 == ".jpg":
return "image/jpeg"
if tail3 == ".png":
return "image/png"
if tail3 == ".jpe":
return "image/jpeg"
if tail3 == ".gif":
return "image/gif"
return None
def insert_picture( conn, image, filename):
hash = sha.new(image)
qhash = libpq.PgQuoteBy tea(hash.digest ())
candidates = conn.query("SEL ECT id, picture FROM pictures WHERE hash = " + qhash )
if candidates.ntup les > 0:
print "Found possible mathces " + str(candidates. ntuples)
for x in range(candidate s.ntuples):
old = candidates.getv alue(x, 1)
old.open("r")
oldpic = old.read()
old.close()
if oldpic == image:
print "Found a match"
ret = (candidates.get value(x,0), 1)
return ret
mime = findmimetype(im age, filename)
print "attempting to get mimetype"
if mime == None:
print "No mimetype found"
ret = (0, 0)
return ret
mime = libpq.PgQuoteSt ring(mime)
mimeres = conn.query("SEL ECT id FROM mimetypes WHERE mimetype = " + mime)
if mimeres.ntuples == 0:
conn.query("INS ERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
mimeres = conn.query("SEL ECT id FROM mimetypes WHERE mimetype = " + mime)
mimetype = mimeres.getvalu e(0,0)
picture = conn.lo_creat(" rw")
picture.open("r w")
picture.write(i mage)
picture.close()
tmp = conn.query("INS ERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
temp = conn.query("SEL ECT id FROM pictures WHERE OID = " + str(tmp.oidValu e))
id = temp.getvalue(0 ,0)
ret = (id, 0)
return ret
def try_decode_and_ insert_uuencode d(conn, id):
begin = regex.compile(" begin [0-9]+ \(.*\)")
conn.query("BEG IN")
basedir = "kuvat"
message = conn.query("SEL ECT data FROM fragments_bodie s WHERE message = " + str(id) + " ORDER BY line")
# print message.ntuples
keywords = []
picids = []
newpicids = []
n = 0
s = ""
picid = 0
print 'Starting message id ' + str(id)
while n < message.ntuples :
# print "length of row " + str(n)
# print str(message.get length(n, 0))
# print "Got length"
abcddummy = message.getvalu e(n, 0)
# print "Got value"
s = message.getvalu e(n, 0)
# print "Got s"
if begin.match(s) > 0:
# if match_beginning (s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# file = get_file_name(s )
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples ):
# print "Decodind row " + str(k)
s = message.getvalu e(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(bin ascii.a2b_uu(s) )
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(bin ascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROL LBACK")
return
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(bod y, "")
# print "Attempting to hash body"
# hash = sha.new(body)
# qhash = libpq.PgQuoteBy tea(hash.digest ())
# qbody = libpq.PgQuoteBy tea(body)
# print "Attempting to find whether the pic already exists"
print "Mimetype returned " + str(findmimetyp e(body, file))
# temporary = open("dummy", "wb")
# temporary.write (body)
# temporary.close ()
# dummy.write("ds fds")
print "Calling insert function"
picid, exists = insert_picture( conn, body, file)
print "Returned from insert function with value " + str(picid)
if picid > 0:
# already = conn.query("SEL ECT id FROM pictures WHERE hash = " + qhash)
# if already.ntuples == 0:
# print "Attempting to find mimetype"
# mimetype = findmimetype(bo dy, file)
# print "Found mimetype"
# if mimetype != None:
# o = conn.query("INS ERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteSt ring(mimetype) + ")")
# already = conn.query("SEL ECT id FROM pictures WHERE OID = " + str(o.oidValue( )));
# already = conn.query("SEL ECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SEL ECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
# conn.query("INS ERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteSt ring(mimetype) + ")")
# print "Attempting to get id"
# already = conn.query("SEL ECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
# picid = already.getvalu e(0, 0)
print picid
print "Attempting to OK dir"
if os.access(based ir + "/tmp", os.F_OK) != 1:
os.mkdir(basedi r + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
print "File ok"
picids.append(p icid)
if exists == 0:
newpicids.appen d(picid)
if file != "":
keywords.append (file)
# else:
# picid = already.getvalu e(0, 0)
# if already.ntuples == 0:
# conn.query("ROL LBACK")
# return
# picids.append(p icid)
# if already.ntuples == 0:
# print "already.ntuple s == 0, ROLLBACKing"
# conn.query("ROL LBACK")
# return
# print "Appending picid"
# picids.append(p icid)
# print "Picid appended"
else:
tmptmp = string.split(s)
tmpkey = strip_trailing_ dots(tmptmp)
if len(tmpkey) > 0:
for j in range(len(tmpke y)):
keywords.append (tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
print "Found " + str(len(picids) ) + " pictures (" + str(len(newpici ds)) + " new ones)"
# print "Finding Subject"
head = conn.query("SEL ECT contents FROM fragments_heade r_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_heade r_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0 ,0)
# print str(blah)
blahblah = string.split(st r(blah))
# print "Stripping"
abctmpkey = strip_trailing_ dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
# print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpke y)):
keywords.append (tmpkey[j])
o = conn.query("INS ERT INTO messages DEFAULT VALUES")
mid = conn.query("SEL ECT id FROM messages WHERE OID = " + str(o.oidValue) )
messageid = mid.getvalue(0, 0)
nresult = conn.query("SEL ECT contents FROM fragments_heade r_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_heade r_names WHERE header ILIKE 'Newsgroups')")
if nresult.ntuples > 0:
for x in range(nresult.n tuples):
newsgroups = string.split(nr esult.getvalue( x, 0), ",")
if len(newsgroups) > 0:
for y in range (len(newsgroups )):
newsgroup = libpq.PgQuoteSt ring(newsgroups[y])
ngroupres = conn.query("SEL ECT id FROM newsgroups WHERE name = " + newsgroup)
if ngroupres.ntupl es > 0:
newsgid = ngroupres.getva lue(0, 0)
else:
conn.query("INS ERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
ngrtmpres = conn.query("SEL ECT id FROM newsgroups WHERE name = " + newsgroup)
newsgid = ngrtmpres.getva lue(0, 0)
conn.query("INS ERT INTO messages_ngroup s_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
else:
print "An empty Newsgroups: header at messag " + str(id)
conn.query("ROL LBACK")
return
else:
print "No Newsgroups: header at message " + str(id)
conn.query("ROL LBACK")
return
for x in range(len(picid s)):
conn.query("INS ERT INTO messages_pictur es_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
if len(keywords) > 0:
for x in range(len(tmpke y)):
qword = libpq.PgQuoteSt ring(str(keywor ds[x]))
tmp = conn.query("SEL ECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INS ERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SEL ECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalu e(0, 0))
for y in range(len(picid s)):
conn.query("INS ERT INTO keywords_glue(w ord, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_heade r_contents.line , fragments_heade r_names.header, "
dummytwo = " fragments_heade r_contents.cont ents FROM fragments_heade r_names, fragments_heade r_contents"
dummythree = " WHERE fragments_heade r_contents.mess age = " + str(id)
dummyfour = " AND fragments_heade r_contents.head er = fragments_heade r_names.id"
head = conn.query(dumm yone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntup les):
qhead = libpq.PgQuoteSt ring(str(head.g etvalue(h, 1)))
qcont = libpq.PgQuoteSt ring(str(head.g etvalue(h, 2)))
tmp = conn.query("SEL ECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INS ERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SEL ECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalu e(0, 0))
line = str(head.getval ue(0, 0))
conn.query("INS ERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DEL ETE FROM fragments_heade r_contents WHERE message = " + str(id))
conn.query("DEL ETE FROM fragments_bodie s WHERE message = " + str(id))
conn.query("COM MIT")
if len(newpicids) > 0:
tmpdir = basedir + "/tmp/"
for i in range(len(newpi cids)):
picid = newpicids[i]
tmppicname = tmpdir + str(picid)
permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
print tmppicname
print permpicname
if os.access(based ir + "/" + str(picid%1000) , os.F_OK) != 1:
os.mkdir(basedi r + "/" + str(picid%1000) )
os.link(tmppicn ame, permpicname)
os.unlink(tmpdi r +str(picid))
else:
print "No pictures found"
conn.query("ROL LBACK")
return
database = libpq.PQconnect db('dbname = kuvat')
items = database.query( "SELECT message FROM whole_attachmen ts")
# try_decode_and_ insert_uuencode d(database, 5407)
for i in range(items.ntu ples):
try:
print 'Starting call ' + str(i)
try_decode_and_ insert_uuencode d(database, items.getvalue( items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
except:
print 'Some other error occurred at message " + str(i) + ", trying to continue...'