Problem with uudecode

Juho Saarikko

I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:

#!/usr/local/bin/python2.3

# Insert message contents into the database, for each message-id already there
#
# Copyright 2004 by Juho Saarikko
# License: GNU General Public License (GPL) version 2
# See www.gnu.org for details

from pyPgSQL import libpq
import nntplib
import sys
import string
import regex
import sha
import imghdr
import binascii
import StringIO
import os

def strip_trailing_dots(n):
tmp = []
for i in range(len(n)):
if n[i][-1] == "," or n[i][-1] == ".":
tmp.append(n[i][:-1])
else:
tmp.append(n[i])
return tmp

def findmimetype(body, filename):
tail4 = string.lower(filename[-5:])
tail3 = string.lower(filename[-4:])
if tail4 == ".jpeg":
return "image/jpeg"
if tail3 == ".jpg":
return "image/jpeg"
if tail3 == ".png":
return "image/png"
if tail3 == ".jpe":
return "image/jpeg"
if tail3 == ".gif":
return "image/gif"
return None

def insert_picture(conn, image, filename):
hash = sha.new(image)
qhash = libpq.PgQuoteBytea(hash.digest())
candidates = conn.query("SELECT id, picture FROM pictures WHERE hash = " + qhash )
if candidates.ntuples > 0:
print "Found possible mathces " + str(candidates.ntuples)
for x in range(candidates.ntuples):
old = candidates.getvalue(x, 1)
old.open("r")
oldpic = old.read()
old.close()
if oldpic == image:
print "Found a match"
ret = (candidates.getvalue(x,0), 1)
return ret
mime = findmimetype(image, filename)
print "attempting to get mimetype"
if mime == None:
print "No mimetype found"
ret = (0, 0)
return ret
mime = libpq.PgQuoteString(mime)
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
if mimeres.ntuples == 0:
conn.query("INSERT INTO mimetypes (mimetype) VALUES (" + mime + ")")
mimeres = conn.query("SELECT id FROM mimetypes WHERE mimetype = " + mime)
mimetype = mimeres.getvalue(0,0)
picture = conn.lo_creat("rw")
picture.open("rw")
picture.write(image)
picture.close()
tmp = conn.query("INSERT INTO pictures (hash, mimetype, picture) VALUES (" + qhash + ", " +str(mimetype) + ", " + picture.name + ")")
temp = conn.query("SELECT id FROM pictures WHERE OID = " + str(tmp.oidValue))
id = temp.getvalue(0,0)
ret = (id, 0)
return ret

def try_decode_and_insert_uuencoded(conn, id):
begin = regex.compile("begin [0-9]+ \(.*\)")
conn.query("BEGIN")
basedir = "kuvat"
message = conn.query("SELECT data FROM fragments_bodies WHERE message = " + str(id) + " ORDER BY line")
# print message.ntuples

keywords = []
picids = []
newpicids = []
n = 0
s = ""
picid = 0
print 'Starting message id ' + str(id)
while n < message.ntuples:
# print "length of row " + str(n)
# print str(message.getlength(n, 0))
# print "Got length"
abcddummy = message.getvalue(n, 0)
# print "Got value"
s = message.getvalue(n, 0)
# print "Got s"
if begin.match(s) > 0:
# if match_beginning(s) > 0:
# print "Begin matched"
body = []
file = begin.group(1)
# file = get_file_name(s)
# print "Starting to decode, at line " + str(n + 1)
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(s))
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROLLBACK")
return
# print "Got to end, at line " + str(n)
# print "Attempting to join body"
body = string.join(body, "")
# print "Attempting to hash body"
# hash = sha.new(body)
# qhash = libpq.PgQuoteBytea(hash.digest())
# qbody = libpq.PgQuoteBytea(body)
# print "Attempting to find whether the pic already exists"
print "Mimetype returned " + str(findmimetype(body, file))
# temporary = open("dummy", "wb")
# temporary.write(body)
# temporary.close()
# dummy.write("dsfds")
print "Calling insert function"
picid, exists = insert_picture(conn, body, file)
print "Returned from insert function with value " + str(picid)
if picid > 0:
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# if already.ntuples == 0:
# print "Attempting to find mimetype"
# mimetype = findmimetype(body, file)
# print "Found mimetype"
# if mimetype != None:
# o = conn.query("INSERT INTO pictures (picture, hash, mimetype) VALUES (" + qbody + ", " + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# already = conn.query("SELECT id FROM pictures WHERE OID = " + str(o.oidValue()));
# already = conn.query("SELECT id FROM pictures WHERE data = " + qbody)
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to insert hash and mimetype"
# conn.query("INSERT INTO pictures (hash, mimetype) VALUES (" + qhash + ", " + libpq.PgQuoteString(mimetype) + ")")
# print "Attempting to get id"
# already = conn.query("SELECT id FROM pictures WHERE hash = " + qhash)
# print "Attempting to get value"
# picid = already.getvalue(0, 0)
print picid
print "Attempting to OK dir"
if os.access(basedir + "/tmp", os.F_OK) != 1:
os.mkdir(basedir + "/tmp")
fh = open(basedir + "/tmp/" + str(picid), "wb")
fh.write(body)
fh.close()
print "File ok"
picids.append(picid)
if exists == 0:
newpicids.append(picid)
if file != "":
keywords.append(file)
# else:
# picid = already.getvalue(0, 0)
# if already.ntuples == 0:
# conn.query("ROLLBACK")
# return
# picids.append(picid)
# if already.ntuples == 0:
# print "already.ntuples == 0, ROLLBACKing"
# conn.query("ROLLBACK")
# return
# print "Appending picid"
# picids.append(picid)
# print "Picid appended"
else:
tmptmp = string.split(s)
tmpkey = strip_trailing_dots(tmptmp)
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
# print "Adding 1 to n"
n = n + 1
if len(picids) > 0:
print "Found " + str(len(picids)) + " pictures (" + str(len(newpicids)) + " new ones)"
# print "Finding Subject"
head = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ilike 'Subject')")
if head.ntuples > 0:
# print "Splitting Subject"
blah = head.getvalue(0,0)
# print str(blah)
blahblah = string.split(str(blah))
# print "Stripping"
abctmpkey = strip_trailing_dots(blahblah)
# print "Stripping done"
# print "Really"
tmpkey = abctmpkey
# print "Subject split"
if len(tmpkey) > 0:
for j in range(len(tmpkey)):
keywords.append(tmpkey[j])
o = conn.query("INSERT INTO messages DEFAULT VALUES")
mid = conn.query("SELECT id FROM messages WHERE OID = " + str(o.oidValue))
messageid = mid.getvalue(0, 0)
nresult = conn.query("SELECT contents FROM fragments_header_contents WHERE message = " + str(id) + " AND header = (SELECT id FROM fragments_header_names WHERE header ILIKE 'Newsgroups')")
if nresult.ntuples > 0:
for x in range(nresult.ntuples):
newsgroups = string.split(nresult.getvalue(x, 0), ",")
if len(newsgroups) > 0:
for y in range (len(newsgroups)):
newsgroup = libpq.PgQuoteString(newsgroups[y])
ngroupres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
if ngroupres.ntuples > 0:
newsgid = ngroupres.getvalue(0, 0)
else:
conn.query("INSERT INTO newsgroups (name) VALUES (" + newsgroup + ")")
ngrtmpres = conn.query("SELECT id FROM newsgroups WHERE name = " + newsgroup)
newsgid = ngrtmpres.getvalue(0, 0)
conn.query("INSERT INTO messages_ngroups_glue (message, newsgroup) VALUES (" + str(messageid) + ", " + str(newsgid) + ")")
else:
print "An empty Newsgroups: header at messag " + str(id)
conn.query("ROLLBACK")
return
else:
print "No Newsgroups: header at message " + str(id)
conn.query("ROLLBACK")
return
for x in range(len(picids)):
conn.query("INSERT INTO messages_pictures_glue (message, picture) VALUES (" + str(messageid) + ", " + str(picids[x]) + ")")
if len(keywords) > 0:
for x in range(len(tmpkey)):
qword = libpq.PgQuoteString(str(keywords[x]))
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
if tmp.ntuples == 0:
conn.query("INSERT INTO keywords_words (keyword) VALUES (" + qword + ")")
tmp = conn.query("SELECT id FROM keywords_words WHERE keyword = " + qword)
keyid = str(tmp.getvalue(0, 0))
for y in range(len(picids)):
conn.query("INSERT INTO keywords_glue(word, picture) VALUES (" + keyid + ", " + str(picids[y]) + ")")
dummyone = "SELECT fragments_header_contents.line, fragments_header_names.header,"
dummytwo = " fragments_header_contents.contents FROM fragments_header_names, fragments_header_contents"
dummythree = " WHERE fragments_header_contents.message = " + str(id)
dummyfour = " AND fragments_header_contents.header = fragments_header_names.id"
head = conn.query(dummyone + dummytwo + dummythree + dummyfour)
if head.ntuples > 0:
for h in range(head.ntuples):
qhead = libpq.PgQuoteString(str(head.getvalue(h, 1)))
qcont = libpq.PgQuoteString(str(head.getvalue(h, 2)))
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
if tmp.ntuples == 0:
conn.query("INSERT INTO header_names (header) VALUES (" + qhead + ")")
tmp = conn.query("SELECT id FROM header_names WHERE header = " + qhead)
headid = str(tmp.getvalue(0, 0))
line = str(head.getvalue(0, 0))
conn.query("INSERT INTO header_contents (header, message, line, contents) VALUES (" + headid + ", " + str(messageid) + ", " + line + ", " + qcont + ")")
conn.query("DELETE FROM fragments_header_contents WHERE message = " + str(id))
conn.query("DELETE FROM fragments_bodies WHERE message = " + str(id))
conn.query("COMMIT")
if len(newpicids) > 0:
tmpdir = basedir + "/tmp/"
for i in range(len(newpicids)):
picid = newpicids[i]
tmppicname = tmpdir + str(picid)
permpicname = basedir + "/" + str(picid%1000) + "/" + str(picid)
print tmppicname
print permpicname
if os.access(basedir + "/" + str(picid%1000), os.F_OK) != 1:
os.mkdir(basedir + "/" + str(picid%1000))
os.link(tmppicname, permpicname)
os.unlink(tmpdir +str(picid))
else:
print "No pictures found"
conn.query("ROLLBACK")
return
database = libpq.PQconnectdb('dbname = kuvat')
items = database.query("SELECT message FROM whole_attachments")

# try_decode_and_insert_uuencoded(database, 5407)

for i in range(items.ntuples):
try:
print 'Starting call ' + str(i)
try_decode_and_insert_uuencoded(database, items.getvalue(items.ntuples - 1 - i,0))
print ' returned from call ' + str(i)
except:
print 'Some other error occurred at message " + str(i) + ", trying to continue...'

Jul 18 '05 #1

Subscribe Post Reply

2703

Ville Vainio

>>>>> "Juho" == Juho Saarikko <so***@but.no.spam> writes:

Juho> I made a Python script which takes Usenet message bodies
Juho> from a database, decodes uuencoded contents and inserts them
Juho> as Large Object into a PostGreSQL database. However, it
Juho> appears that the to last few bytes

I skimmed through your program, and noticed that you use binascii
module uuencode/decode. Have you given the "uu" module a try, to see
if it works better?

Also, get rid of "regex" module, it even gives a DeprecationWarning
suggesting switching to "re".

--
Ville Vainio http://tinyurl.com/2prnb

Jul 18 '05 #2

Juho Saarikko

On Tue, 25 May 2004 22:04:24 +0300, Ville Vainio wrote:

>> "Juho" == Juho Saarikko <so***@but.no.spam> writes:

Juho> I made a Python script which takes Usenet message bodies
Juho> from a database, decodes uuencoded contents and inserts them
Juho> as Large Object into a PostGreSQL database. However, it
Juho> appears that the to last few bytes

I skimmed through your program, and noticed that you use binascii
module uuencode/decode. Have you given the "uu" module a try, to see
if it works better?

I did examine the uu module, but it would seem that I'd had to parse the
message first anyway to get the file name and the non-binary parts of the
message as keywords. Besides, as I understand it, the uu module uses the
binascii module, so if there's something wrong with the binascii module,
the uu module can't possibly work well.

Oh well, I would had to write the parsing engine anyway (or learn to
use the e-mail classes), to properly handle mime and yenc messages. And I
suppose I'd better start using imagemagic to verify the mimetype of
decoded files, instead of just believing the filename. And join together
files that have been spread over multiple messages. Work, work, work...
Also, get rid of "regex" module, it even gives a DeprecationWarning
suggesting switching to "re".

I would, if I knew how to make regular expressions; I found the uu-parsing
snippet from the net and built my script around it, but the
regular expression doesn't seem to work with the re module.

Jul 18 '05 #3

Steve Holden

Juho Saarikko wrote:

I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.

The script also outputs the decoded file to disk for debugging purposes,
and the database large object and filesystem file match so it can't be a
PostGreSQL problem.

So, if anyone has any idea what is wrong, please tell me ? I can't found
any reason why the bytes would get mangled...

The script follows:

[...]
I note that you are dumping words rather than bytes. Is it possible that
the last byte isn't actually a part of the file, that
endianness makes the last byte look like the penultimate byte, and that
what you are seeing is simply noise?

If not then it should probably be looked into ...

regards
Steve

Jul 18 '05 #4

Juho Saarikko

On Tue, 25 May 2004 18:54:44 -0400, Steve Holden wrote:

I note that you are dumping words rather than bytes. Is it possible that
the last byte isn't actually a part of the file, that
endianness makes the last byte look like the penultimate byte, and that
what you are seeing is simply noise?
Well, ImageMagick complains that the image contains errors (altought
Eye of Gnome shows it with no artifacts), so it's likely to be part of the
file itself.

I get both

"display: Premature end of JPEG file"

and

"display: Invalid JPEG file structure: two SOI markers"

errors. The later error prevent ImageMagick's display-command from
displaying the image (but not Eye of Gnome).
If not then it should probably be looked into ...

Looked, looked, but where to start ? The bug could be anywhere from my
script to binascii module to the nntp module to the string.join -function.

Jul 18 '05 #5

Tim Roberts

Juho Saarikko <so***@but.no.spam> wrote:

I made a Python script which takes Usenet message bodies from a database,
decodes uuencoded contents and inserts them as Large Object into a
PostGreSQL database. However, it appears that the to last few bytes
of uudecoded data are always mangled. Take a look of this hexdump output:

Originals (decoded with Pan, each line is from a different file):
000c2c0 e1bf 00ff 2541 a9e4 a724 d9ff
0011a10 ff54 00d9
00093e0 fb4f a80d ffd9 c200 ffef 00d9

Decoded by the script:
000c2c0 e1bf 00ff 2541 a9e4 a724 d0ff
0011a10 ff54 00d8
00093e0 fb4f a80d ffd9 c200 ffef 00d8

As you can see, one of the last two bytes gets altered in all cases.
As others have pointed out, it's really the last byte that is getting
altered.
for k in range(n+1, message.ntuples):
# print "Decodind row " + str(k)
s = message.getvalue(k, 0)
if s[:3] == "end":
n = k + 1
break
try:
body.append(binascii.a2b_uu(s))
except:
try:
bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3
body.append(binascii.a2b_uu(s[:bytes]))
except:
print "Broken attachment in message " + str(id)
conn.query("ROLLBACK")
return

Your computation of the number of bytes in the uuencoded string will come
up one short: you're not accounting for the length byte. That will have
exactly the effect you describe. You lose the last encoded character,
which means you'll miss the last 6 bits of the file. Change it to this:

bytes = (((ord(s[0])-32) & 63) * 4 + 3) / 3 + 1

However, you should not need to wrap the first binascii.a2b_uu call with
try/except at all. What is happening that causes the error in the first
place? I suspect if you fix the root cause, you could eliminate the except
clause altogether.
--
- Tim Roberts, ti**@probo.com
Providenza & Boekelheide, Inc.

Jul 18 '05 #6

Similar topics

Windows python/Pmw-tkinter problem

by: Bruce Davis | last post by:

I'm having a problem on windows (both 2000 and XP) with a multi-threaded tkinter gui application. The problem appears to be a deadlock condition when a child thread pops up a Pmw dialog window in...

Python

Virtual function problem

by: Kostatus | last post by:

I have a virtual function in a base class, which is then overwritten by a function of the same name in a publically derived class. When I call the function using a pointer to the derived class...

C / C++

117

Solution to the halting Problem?

by: Peter Olcott | last post by:

www.halting-problem.com

C / C++

Problem with virtual method

by: Jon Davis | last post by:

If I have a class with a virtual method, and a child class that overrides the virtual method, and then I create an instance of the child class AS A base class... BaseClass bc = new ChildClass();...

C# / C Sharp

Problem: Request time out

by: Ammar | last post by:

Dear All, I'm facing a small problem. I have a portal web site, that contains articles, for each article, the end user can send a comment about the article. The problem is: I the comment length...

ASP.NET

SP1 Problem SOAPException doesn't return quote and Umlaute correcty

by: Dany | last post by:

Our web service was working fine until we installed .net Framework 1.1 service pack 1. Uninstalling SP1 is not an option because our largest customer says service packs marked as "critical" by...

.NET Framework

uuDecode problem

by: py | last post by:

Hi, I am encoding a string such as... data = someFile.readlines() encoded = for line in data: encoded.append(binascii.b2a_uu(stringToEncode)) return encoded

Python

Determining Dropdown Value Problem

by: Mike Collins | last post by:

I cannot get the correct drop down list value from a drop down I have on my web form. I get the initial value that was loaded in the list. It was asked by someone else what the autopostback was...

C# / C Sharp

Problem with exec

by: Nelluru | last post by:

Hi, I am using PHP 5.2.5 and IIS 5.1 on Windows XP SP3 machine. I am trying to execute an exe by using exec or system command. When I run this php script from the command line it works fine...

PHP

How to turn on java script in a villaon keypad mobile phone

by: Charles Arthur | last post by:

How do i turn on java script on a villaon, callus and itel keypad mobile phone

Java

Navigating the Data Structures and Algorithms (DSA)

by: BarryA | last post by:

What are the essential steps and strategies outlined in the Data Structures and Algorithms (DSA) roadmap for aspiring data scientists? How can individuals effectively utilize this roadmap to progress...

Algorithms / Advanced Math

Looking to do Android software development, any suggestions? Is flutter better?

by: nemocccc | last post by:

hello, everyone, I want to develop a software for my android phone for daily needs, any suggestions?

General

What is ONU?

by: marktang | last post by:

ONU (Optical Network Unit) is one of the key components for providing high-speed Internet services. Its primary function is to act as an endpoint device located at the user's premises. However,...

General

Problem With Comparison Operator <=> in G++

by: Oralloy | last post by:

Hello folks, I am unable to find appropriate documentation on the type promotion of bit-fields when using the generalised comparison operator "<=>". The problem is that using the GNU compilers,...

C / C++

Maximizing Business Potential: The Nexus of Website Design and Digital Marketing

by: jinu1996 | last post by:

In today's digital age, having a compelling online presence is paramount for businesses aiming to thrive in a competitive landscape. At the heart of this digital strategy lies an intricately woven...

Online Marketing

Discussion: How does Zigbee compare with other wireless protocols in smart home applications?

by: tracyyun | last post by:

Dear forum friends, With the development of smart home technology, a variety of wireless communication protocols have appeared on the market, such as Zigbee, Z-Wave, Wi-Fi, Bluetooth, etc. Each...

General

AI Job Threat for Devs

by: agi2029 | last post by:

Let's talk about the concept of autonomous AI software engineers and no-code agents. These AIs are designed to manage the entire lifecycle of a software development project—planning, coding, testing,...

Career Advice

Access Europe - Using VBA to create a class based on a table - Wed 1 May

by: isladogs | last post by:

The next Access Europe User Group meeting will be on Wednesday 1 May 2024 starting at 18:00 UK time (6PM UTC+1) and finishing by 19:30 (7.30PM). In this session, we are pleased to welcome a new...

Microsoft Access / VBA