python中的MemoryError用于大型XML文件
我会在Python中遇到错误,以加载Oracle DB表中的大型XML文件:
self._root = parser._parse_whole(source)
MemoryError
我的XML
文件是2GB
,大小为2GB ,我有8GB RAM
。使用2MB XML
文件,代码正常工作,但2GB XML
将其失败如2分钟后归档。
我是Python的新手,实际上不知道如何解决这个问题。 以下是我的XML文件的样子:
<wmX
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns="http://www.wtest" majorRelease="97" minorRelease="1" currDate="2022-04-04+00:00" producer="wtest.core" name="id:19015 name:GTXML01" currSeqNo="304" >
<finObj chgFlag="i" idName="Instrument" idVal="223434343">
<ids>
<id scheme="WKN">434535434</id>
<id scheme="ISIN">TREE123123</id>
</ids>
<section chgFlag="i" idName="E" idVal="12345235564">
<fld idName="ED001">
<datVl chgFlag="i">4</datVl>
</fld>
<fld idName="ED005">
<datVl chgFlag="i">14</datVl>
</fld>
<fld idName="ED006">
<datVl chgFlag="i">91</datVl>
</fld>
<table chgFlag="i" idName="GV325">
<row chgFlag="i" idVal="1">
<fld idName="GV325A">
<datVl chgFlag="i">1</datVl>
</fld>
<fld idName="GV325B">
<datVl chgFlag="i">2001-12-01</datVl>
</fld>
<fld idName="GV325C">
<datVl chgFlag="i">2006-04-30</datVl>
</fld>
<fld idName="GV325D">
<datVl chgFlag="i">01</datVl>
</fld>
</row>
</table>
</section>
</finObj>
</wmX>
下面是我的Python代码的一部分:
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import xmltodict
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle
username = "XXX"
password = "XXX-"
host = "XXX"
port = XXX
database = "XXXXX"
engine = create_engine(f'oracle://{username}:{password}@{host}:{port}/?service_name={database}', echo=False)
tree = ET.parse(r'P:\Kopie.xml')
table_name = "H_TEST"
root = tree.getroot()
# we need the namespace to call the objects
namespace = root.tag.split('}')[0].strip('{')
# the data is in two section
sections = list(list(root)[0])[1:]
# the default row is the data that is the same in each example
default_row = {
"TR_NAME": root.get("name"),
"TRE_CURRDATE": root.get("currDate"),
}
def add_new_row():
"""Nested function to create new row"""
# we copy to create a depy copy of the default row
new_row = default_row.copy()
# new values
new_row.update({
"SECTION_IDNAME": section.get("idName"),
})
# the field inside is either datVl or txtVl
for f in list(field):
if f.tag[-5:] == "datVl":
new_row.update({
"DATVL_SECTION_DATA": f.text,
"DATVL_SECTION_CHGFLAG": f.get("chgFlag")
})
if f.tag[-5:] == "txtVl":
new_row.update({
"TXTVL_SECTION_DATA": f.text,
})
return new_row
rows = []
# iterate through sections
for section in sections:
# iterate through individual fields
for field in row.findall("xmlns:fld", namespaces={"xmlns": namespace}):
new_row = add_new_row()
new_row.update({
"TABLE_IDNAME": table.get("idName"),
"TABLE_CHGFLAG": table.get("chgFlag"),
})
rows.append(new_row)
# make a DataFrame
df = pd.DataFrame(rows)
# insert in database
I am getting error in Python for loading large XML file in Oracle DB table :
self._root = parser._parse_whole(source)
MemoryError
My XML
file is 2GB
in size and I have 8GB RAM
. With 2MB XML
file the code is working fine but with 2GB XML
file its failing like after 2min.
I am new in Python and dont know actually how to resolve this issue.
Below is how my XML file looks like:
<wmX
xmlns:xs="http://www.w3.org/2001/XMLSchema"
xmlns="http://www.wtest" majorRelease="97" minorRelease="1" currDate="2022-04-04+00:00" producer="wtest.core" name="id:19015 name:GTXML01" currSeqNo="304" >
<finObj chgFlag="i" idName="Instrument" idVal="223434343">
<ids>
<id scheme="WKN">434535434</id>
<id scheme="ISIN">TREE123123</id>
</ids>
<section chgFlag="i" idName="E" idVal="12345235564">
<fld idName="ED001">
<datVl chgFlag="i">4</datVl>
</fld>
<fld idName="ED005">
<datVl chgFlag="i">14</datVl>
</fld>
<fld idName="ED006">
<datVl chgFlag="i">91</datVl>
</fld>
<table chgFlag="i" idName="GV325">
<row chgFlag="i" idVal="1">
<fld idName="GV325A">
<datVl chgFlag="i">1</datVl>
</fld>
<fld idName="GV325B">
<datVl chgFlag="i">2001-12-01</datVl>
</fld>
<fld idName="GV325C">
<datVl chgFlag="i">2006-04-30</datVl>
</fld>
<fld idName="GV325D">
<datVl chgFlag="i">01</datVl>
</fld>
</row>
</table>
</section>
</finObj>
</wmX>
Below is part of my Python code:
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import xmltodict
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle
username = "XXX"
password = "XXX-"
host = "XXX"
port = XXX
database = "XXXXX"
engine = create_engine(f'oracle://{username}:{password}@{host}:{port}/?service_name={database}', echo=False)
tree = ET.parse(r'P:\Kopie.xml')
table_name = "H_TEST"
root = tree.getroot()
# we need the namespace to call the objects
namespace = root.tag.split('}')[0].strip('{')
# the data is in two section
sections = list(list(root)[0])[1:]
# the default row is the data that is the same in each example
default_row = {
"TR_NAME": root.get("name"),
"TRE_CURRDATE": root.get("currDate"),
}
def add_new_row():
"""Nested function to create new row"""
# we copy to create a depy copy of the default row
new_row = default_row.copy()
# new values
new_row.update({
"SECTION_IDNAME": section.get("idName"),
})
# the field inside is either datVl or txtVl
for f in list(field):
if f.tag[-5:] == "datVl":
new_row.update({
"DATVL_SECTION_DATA": f.text,
"DATVL_SECTION_CHGFLAG": f.get("chgFlag")
})
if f.tag[-5:] == "txtVl":
new_row.update({
"TXTVL_SECTION_DATA": f.text,
})
return new_row
rows = []
# iterate through sections
for section in sections:
# iterate through individual fields
for field in row.findall("xmlns:fld", namespaces={"xmlns": namespace}):
new_row = add_new_row()
new_row.update({
"TABLE_IDNAME": table.get("idName"),
"TABLE_CHGFLAG": table.get("chgFlag"),
})
rows.append(new_row)
# make a DataFrame
df = pd.DataFrame(rows)
# insert in database
如果你对这篇内容有疑问,欢迎到本站社区发帖提问 参与讨论,获取更多帮助,或者扫码二维码加入 Web 技术交流群。

绑定邮箱获取回复消息
由于您还没有绑定你的真实邮箱,如果其他用户或者作者回复了您的评论,将不能在第一时间通知您!
发布评论