Hello,
Below is my code, which converts a PDF to XML format.
However, it produces an XML file only when the input is an XFA-based PDF (a PDF form).
I need to convert any type of PDF to XML, and the XML should contain information about text values, tables, images, and objects/drawings, along with their x,y coordinates.
Is there any way to get this type of XML from a PDF?
Thank you!
import PyPDF2
import re
def findInDict(needle, haystack):
    """Return the first value stored under key ``needle`` in a nested dict.

    The search is depth-first in key order: every key of ``haystack`` is
    compared with ``needle`` and, when its value is itself a ``dict`` (or a
    subclass), that value is searched before the next key is examined.

    Args:
        needle: Key to look for.
        haystack: Mapping to search; must provide ``keys()`` and item access.

    Returns:
        The value found under ``needle``, or ``None`` when no such key is
        found. A nested match whose value is ``None`` is indistinguishable
        from "not found", so the search continues past it.
    """
    # Keyed by id() because dicts are unhashable; the object itself is kept
    # as the value so its id cannot be recycled while the search is running.
    visited = {}

    def _search(node):
        # Search one mapping, skipping nested dicts that were already entered.
        visited[id(node)] = node
        # Snapshot the keys: item access may have side effects that add
        # entries to a dict that is still being iterated.
        for key in list(node.keys()):
            try:
                value = node[key]
            except Exception:
                # Best effort: skip entries whose value cannot be read, but
                # let KeyboardInterrupt/SystemExit through.
                continue
            if key == needle:
                return value
            # A dict that was already entered is either an ancestor (a
            # reference cycle) or was fully searched without a match, so
            # descending into it again cannot produce a new result.
            if isinstance(value, dict) and id(value) not in visited:
                found = _search(value)
                if found is not None:
                    return found
        return None

    return _search(haystack)
def create_xml_PDFform(xfa):
    """Append the data of every stream-like entry in ``xfa`` to an XML file.

    Each entry is dereferenced with ``getObject()`` and its payload read with
    ``getData()``; entries for which that fails are skipped. The collected
    payloads are appended, in order, to
    ``C:\\Users\\tanvi_karekar\\<pdf_file>.xml``, where ``pdf_file`` is the
    module-level name set by the script's entry point.

    Args:
        xfa: Sequence of entries to extract from, or ``None`` when there is
            nothing to extract (in which case nothing is written).

    Returns:
        None.

    Raises:
        IOError/OSError: If the output file cannot be opened or written.
            These are no longer silently discarded, so a missing output file
            is always accompanied by an error.
    """
    if xfa is None:
        # Nothing to extract; avoid a TypeError from iterating None.
        return

    payloads = []
    for entry in xfa:
        try:
            payloads.append(entry.getObject().getData())
        except Exception:
            # Best effort: entries that cannot be dereferenced or that carry
            # no data are skipped rather than aborting the export.
            continue

    if not payloads:
        # Do not create or touch the output file when there is no data.
        return

    out_path = 'C:\\Users\\tanvi_karekar\\' + str(pdf_file) + '.xml'
    # Append mode is kept from the original; the context manager guarantees
    # the handle is closed even if a write fails.
    with open(out_path, 'ab') as out:
        for payload in payloads:
            out.write(payload)
if __name__ == '__main__':
    # Base name shared by the input PDF and the output XML. It must stay a
    # module-level name because create_xml_PDFform() reads it to build the
    # output path.
    pdf_file = 'sampleDoc3'
    pdf_file_path = 'C:\\Users\\tanvi_karekar\\' + str(pdf_file) + '.pdf'
    # Keep the PDF open until extraction has finished: create_xml_PDFform()
    # dereferences the entries via getObject(), which may still need to read
    # from the file. The context manager then closes the handle.
    with open(pdf_file_path, 'rb') as pdfobject:
        pdf = PyPDF2.PdfFileReader(pdfobject)
        xfa = findInDict('/XFA', pdf.resolved_objects)
        if xfa is None:
            # findInDict() returns None when no '/XFA' key was found, i.e.
            # there is no XFA data to export from this PDF.
            print('No /XFA entry found in ' + pdf_file_path + '; no XML written.')
        else:
            create_xml_PDFform(xfa)