Hello, I am trying to create a python script which extract the XML value stored in a SQL database table and then extract particular fields from the XML. So far i have managed to create a script which will get the XML value, get the data from the top levels. However i am having trouble extracting data from the reference section (i have included a picture of the XML and highlighted the section i am trying to export). Below is the script i am using it works up until “message =” section and then reference number variable keeps coming back blank as it cannot find it. Does anyone know what i am doing wrong as it just stops finding things when it gets to the sections with the XLMS parts in the attributes.
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows
import pyodbc
from xml.etree import ElementTree as ET
import re
import datetime
from datetime import datetime
def XMLExistsCheck(XMLSec, XMLValue, IsAtt):
'''
Checks whether XML tag requested is in the XML script
Args:
res (str): String version of response XML.
XMLSec: this is the XML Section the value to be found in
XMLValue: this is the XML value which is being looked for
IsAtt: if its an attribute being looked for then its 1 else 0
Returns:
if it finds the value then the string 'found' is returned
'''
import xml.etree.ElementTree as ET
import re
# Compile once on import. Regular expression to strip XML namespace.
# We don't need it and it amkes aprsing wqith elementree a pain.
strip_namespace_regex = re.compile(' xmlns="[^"]+"')
if IsAtt > 0:
if XMLValue in XMLSec.attrib:
return 'found'
else:
if XMLValue == '':
if XMLSec.text is not None:
return 'found'
if XMLSec.find(XMLValue) is not None:
return 'found'
cnxn = pyodbc.connect(r'Driver=SQL Server;Server=server01;Database=testXMLDB;Trusted_Connection=yes;')
cursor = cnxn.cursor()
cursor.execute("SELECT TOP 1 * FROM XMLtesttable WHERE YEAR(LoadedDateTime) > 2023")
while 1:
row = cursor.fetchone()
if not row:
break
COMMDATE = row.LoadedDateTime
COMMXML = row.XMLData
root = ET.fromstring(COMMXML)
#root = tree.getroot()
Addressee = root.findall('.//Addressee')
for Addressee in Addressee:
ComName = Addressee.attrib['CompanyName'] if XMLExistsCheck(Addressee,'CompanyName',1) == 'found' else ''
Address = Addressee.findall('.//Address')
for Address in Address:
ComPostcode = Address.attrib['PostCode'] if XMLExistsCheck(Address,'PostCode',1) == 'found' else ''
Content = root.findall('.//Content')
for Content in Content:
CommonExtract = Content.findall('.//CommonExtract')
for CommonExtract in CommonExtract:
SystemDate = CommonExtract.attrib['SystemDate'] if XMLExistsCheck(CommonExtract,'SystemDate',1) == 'found' else ''
SystemDateconvert = datetime.strptime(SystemDate, '%Y-%m-%d')
SystemDateYear = SystemDateconvert.year
message = CommonExtract.findall('.//{http://www.origostandards.com/schema/soap/v1}message')
for message in message:
m_content = message.findall('.//{http://www.origostandards.com/schema/ReceiveRemunerationStatement/v1/LoadStatementRequest}m_content')
for m_content in m_content:
payee_firm = m_content.findall('.//{http://www.origostandards.com/schema/ReceiveRemunerationStatement/v1/LoadStatementRequest}payee_firm')
for payee_firm in payee_firm:
reference = payee_firm.findall('.//reference')
for reference in reference:
reference_number = reference.find('reference_number').text
print(ComName)
print(ComPostcode)
print(SystemDate)
print(SystemDateYear)
cursor.close()
cnxn.close()
import xml.etree.ElementTree as ET
import datetime
from datetime import datetime
def XMLExistsCheck(XMLSec, XMLValue, IsAtt):
'''
Checks whether XML tag requested is in the XML script
Args:
res (str): String version of response XML.
XMLSec: this is the XML Section the value to be found in
XMLValue: this is the XML value which is being looked for
IsAtt: if its an attribute being looked for then its 1 else 0
Returns:
if it finds the value then the string 'found' is returned
'''
import xml.etree.ElementTree as ET
import re
# Compile once on import. Regular expression to strip XML namespace.
# We don't need it and it amkes aprsing wqith elementree a pain.
strip_namespace_regex = re.compile(' xmlns="[^"]+"')
if IsAtt > 0:
if XMLValue in XMLSec.attrib:
return 'found'
else:
if XMLValue == '':
if XMLSec.text is not None:
return 'found'
if XMLSec.find(XMLValue) is not None:
return 'found'
tree = ET.parse('c:\TEST.xml')
root = tree.getroot()
Addressee = root.findall('.//Addressee')
for Addressee in Addressee:
ComName = Addressee.attrib['CompanyName'] if XMLExistsCheck(Addressee,'CompanyName',1) == 'found' else ''
Address = Addressee.findall('.//Address')
for Address in Address:
ComPostcode = Address.attrib['PostCode'] if XMLExistsCheck(Address,'PostCode',1) == 'found' else ''
Content = root.findall('.//Content')
for Content in Content:
CommonExtract = Content.findall('.//CommonExtract')
for CommonExtract in CommonExtract:
SystemDate = CommonExtract.attrib['SystemDate'] if XMLExistsCheck(CommonExtract,'SystemDate',1) == 'found' else ''
SystemDateconvert = datetime.strptime(SystemDate, '%Y-%m-%d')
SystemDateYear = SystemDateconvert.year
message = CommonExtract.findall('.//{http://www.origostandards.com/schema/soap/v1}message')
for message in message:
m_content = message.findall('.//{http://www.origostandards.com/schema/ReceiveRemunerationStatement/v1/LoadStatementRequest}m_content')
for m_content in m_content:
payee_firm = m_content.findall('.//{http://www.origostandards.com/schema/ReceiveRemunerationStatement/v1/LoadStatementRequest}payee_firm')
for payee_firm in payee_firm:
reference = payee_firm.findall('.//reference')
for reference in reference:
reference_number = reference.find('reference_number').text
print(ComName)
print(ComPostcode)
print(SystemDate)
print(SystemDateYear)