I'm attempting to do the following: Use iterparse() instead of parsing the file into memory completely.
A) Read/scan/iterate/etc. through a semi-large XML file (about 135 MB)
B) Grab specific fields and output to a tab-delimited text file
[...]
# Destination for the tab-delimited rows; opened for writing (truncates any
# existing output.txt). NOTE(review): nothing visible here closes this handle.
out = open('output.txt','w')
# Parses the entire catalog into an in-memory tree in one go.
cat = etree.parse('catalog.xml')
untested:
# Stream the catalog one <Item> element at a time rather than building the
# whole tree up front; `tag='Item'` restricts the events to those elements.
for _, item in etree.iterparse('catalog.xml', tag='Item'):
    # Do some cleanup to save memory: detach the siblings that precede the
    # current item, since they have already been handled.
    parent = item.getparent()
    if parent is not None:  # a root-level match has no parent to prune
        previous_item = item.getprevious()
        while previous_item is not None:
            parent.remove(previous_item)
            previous_item = item.getprevious()

    # Now read the data: map each <ItemVal> child's ValueId to its value.
    collect = {}
    for child in item:
        if child.tag != 'ItemVal':
            continue
        collect[child.get('ValueId')] = child.get('value')

    # Emit one tab-delimited row: ID, name, description, image.
    # A missing attribute or field becomes an empty column instead of
    # raising KeyError (or printing "None") part-way through the export.
    item_id = item.get('ID')
    row = [item_id] + [
        collect.get(key) for key in ('name', 'description', 'image')
    ]
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("\t".join(value or '' for value in row))
Stefan