Hi,
I am new to Python coding, so I would appreciate some suggestions on my code.
We have huge XML files (we receive 150 files an hour to process, of which 20 are about 5 MB each) and we are trying to convert them to CSV using XSLT.
The XML is very complex: it has various sub-arrays and child tags, so we have to create a new XSLT for each array.
As a result, for one master tag we ended up creating 100 XSLT files.
We have written a while loop that applies each XSLT in turn to produce a CSV, so the Python code runs the 100 XSLTs sequentially to create 100 CSV files. Is there any way I can run the code in parallel?
Code:
import lxml.etree as ET
import sys, getopt
import os.path
import gzip
# timeit library is used to benchmark performance
from timeit import default_timer as timer
# Helper to detect gzipped input (from Stack Overflow: "python - How to tell if a file is gzip compressed?")
def is_gz_file(filepath):
    """Return True if the file at *filepath* starts with the gzip magic bytes.

    Only the first two bytes are read; the rest of the file is not inspected,
    so a file shorter than two bytes (including an empty one) yields False.

    Args:
        filepath: Path of the file to probe.

    Returns:
        True when the first two bytes are ``0x1f 0x8b``, otherwise False.
    """
    with open(filepath, 'rb') as test_f:
        return test_f.read(2) == b'\x1f\x8b'
def main(argv):
    """Apply one or more XSLT stylesheets to a single XML input file.

    Each ``-s`` stylesheet is paired, in order, with the ``-o`` output path
    given at the same position; surplus entries on either side are ignored.
    The XML input is parsed once (after transparent gzip expansion when the
    file starts with the gzip magic bytes) and reused for every stylesheet.
    The elapsed time of every stage is printed to stdout.

    Args:
        argv: Command-line arguments without the program name, e.g.
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 on an unknown option, an unreadable XML
            input or an unreadable stylesheet; with the default status
            after ``-h``.
    """
    usage = ('parse.py -x <xmlfile> -s <xslfile> -o <outfile> '
             '[ -s <xslfile> -o <outfile> ... ]')

    # Read command-line arguments
    xmlfile = ''
    xslfile = []
    outfile = []
    try:
        opts, _args = getopt.getopt(
            argv, "hx:s:o:", ["xfile=", "sfile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        # The long names compared here must be the ones declared to getopt
        # above, otherwise the long options are parsed but silently dropped.
        elif opt in ("-x", "--xfile"):
            xmlfile = arg
        elif opt in ("-s", "--sfile"):
            xslfile.append(arg)
        elif opt in ("-o", "--ofile"):
            outfile.append(arg)

    # Check input xml file can be read
    if not os.path.isfile(xmlfile):
        print('cannot open xml %s' % xmlfile)
        sys.exit(2)

    # Check if input is gz compressed and uncompress if necessary
    if is_gz_file(xmlfile):
        # Expand, then parse xml input; the handle is closed once parsed.
        start = timer()
        with gzip.open(xmlfile, 'rb') as exp:
            print('%s gzip.open %s' % (timer() - start, xmlfile))
            start = timer()
            xml = ET.parse(exp)
            print('%s ET.parse %s' % (timer() - start, xmlfile))
    else:
        # Parse xml input
        start = timer()
        xml = ET.parse(xmlfile)
        print('%s ET.parse %s' % (timer() - start, xmlfile))

    # The last '/'-separated component of the input path is handed to every
    # stylesheet as the string parameter ``var1``; it does not change between
    # iterations, so it is built once.
    source_name = ET.XSLT.strparam(xmlfile.split('/')[-1])

    # Iterate xslts / outputs. zip() stops at the shorter list, matching the
    # "i < len(xslfile) and i < len(outfile)" bound of an index loop.
    # NOTE(review): the iterations share only the parsed ``xml`` tree, so they
    # look like candidates for concurrent execution -- confirm lxml's
    # threading rules before parallelising.
    for xsl_path, out_path in zip(xslfile, outfile):
        # Check input xslt can be read
        if not os.path.isfile(xsl_path):
            print('cannot open xslt %s' % xsl_path)
            sys.exit(2)

        start = timer()
        xsl = ET.parse(xsl_path)
        print('%s ET.parse %s %s' % (timer() - start, xmlfile, xsl_path))

        start = timer()
        transform = ET.XSLT(xsl)
        print('%s ET.XSLT %s %s' % (timer() - start, xmlfile, xsl_path))

        start = timer()
        newdom = transform(xml, var1=source_name)
        elapsed = timer() - start
        # Stringify once; the length is both reported and used to decide
        # whether an output file is worth creating.
        result_len = len(str(newdom))
        print('%s transform %s %s %s'
              % (elapsed, xmlfile, xsl_path, result_len))

        # Skip empty results so no zero-length output file is created.
        if result_len > 0:
            with open(out_path, 'wb') as f:
                start = timer()
                # Writes the XSLT result object itself, not str(newdom);
                # presumably it is accepted by a binary file's write() --
                # TODO confirm on the lxml / Python version in use.
                f.write(newdom)
                print('%s f.write %s %s'
                      % (timer() - start, xmlfile, xsl_path))
# Run the converter only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv[1:])