python中的MemoryError用于大型XML文件

发布于 2025-02-08 12:41:42 字数 4013 浏览 3 评论 0原文

我会在Python中遇到错误，以加载Oracle DB表中的大型XML文件：

 self._root = parser._parse_whole(source)
MemoryError

我的XML文件是2GB，大小为2GB ，我有8GB RAM。使用2MB XML文件，代码正常工作，但2GB XML将其失败如2分钟后归档。

我是Python的新手，实际上不知道如何解决这个问题。以下是我的XML文件的样子：

<wmX
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns="http://www.wtest" majorRelease="97" minorRelease="1" currDate="2022-04-04+00:00" producer="wtest.core" name="id:19015 name:GTXML01" currSeqNo="304" >
    <finObj chgFlag="i" idName="Instrument" idVal="223434343">
        <ids>
            <id scheme="WKN">434535434</id>
            <id scheme="ISIN">TREE123123</id>
        </ids>
        <section chgFlag="i" idName="E" idVal="12345235564">
            <fld idName="ED001">
                <datVl chgFlag="i">4</datVl>
            </fld>
            <fld idName="ED005">
                <datVl chgFlag="i">14</datVl>
            </fld>
            <fld idName="ED006">
                <datVl chgFlag="i">91</datVl>
            </fld>
            <table chgFlag="i" idName="GV325">
                <row chgFlag="i" idVal="1">
                    <fld idName="GV325A">
                        <datVl chgFlag="i">1</datVl>
                    </fld>
                    <fld idName="GV325B">
                        <datVl chgFlag="i">2001-12-01</datVl>
                    </fld>
                    <fld idName="GV325C">
                        <datVl chgFlag="i">2006-04-30</datVl>
                    </fld>
                    <fld idName="GV325D">
                        <datVl chgFlag="i">01</datVl>
                    </fld>
                </row>
            </table>
        </section>
    </finObj>
</wmX>

下面是我的Python代码的一部分：

import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import xmltodict
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle


username = "XXX"
password = "XXX-"
host = "XXX"
port = XXX
database = "XXXXX"
engine = create_engine(f'oracle://{username}:{password}@{host}:{port}/?service_name={database}', echo=False)

tree = ET.parse(r'P:\Kopie.xml')

table_name = "H_TEST"

root = tree.getroot()

# we need the namespace to call the objects
namespace = root.tag.split('}')[0].strip('{')

# the data is in two section
sections = list(list(root)[0])[1:]

# the default row is the data that is the same in each example
default_row = {
    "TR_NAME": root.get("name"),
    "TRE_CURRDATE": root.get("currDate"),
}


def add_new_row():
    """Nested function to create new row"""
    # we copy to create a depy copy of the default row
    new_row = default_row.copy()
    # new values
    new_row.update({
        "SECTION_IDNAME": section.get("idName"),
    })
    # the field inside is either datVl or txtVl
    for f in list(field):
        if f.tag[-5:] == "datVl":
            new_row.update({
                "DATVL_SECTION_DATA": f.text,
                "DATVL_SECTION_CHGFLAG": f.get("chgFlag")
            })
        if f.tag[-5:] == "txtVl":
            new_row.update({
                "TXTVL_SECTION_DATA": f.text,
            })
    return new_row


rows = []

# iterate through sections
for section in sections:
    # iterate through individual fields        
            for field in row.findall("xmlns:fld", namespaces={"xmlns": namespace}):
                new_row = add_new_row()
                new_row.update({
                    "TABLE_IDNAME": table.get("idName"),
                    "TABLE_CHGFLAG": table.get("chgFlag"),
                })
                rows.append(new_row)

# make a DataFrame
df = pd.DataFrame(rows)
# insert in database

原文

I am getting error in Python for loading large XML file in Oracle DB table :

 self._root = parser._parse_whole(source)
MemoryError

My XML file is 2GB in size and I have 8GB RAM. With 2MB XML file the code is working fine but with 2GB XML file its failing like after 2min.

I am new in Python and dont know actually how to resolve this issue.
Below is how my XML file looks like:

<wmX
    xmlns:xs="http://www.w3.org/2001/XMLSchema"
    xmlns="http://www.wtest" majorRelease="97" minorRelease="1" currDate="2022-04-04+00:00" producer="wtest.core" name="id:19015 name:GTXML01" currSeqNo="304" >
    <finObj chgFlag="i" idName="Instrument" idVal="223434343">
        <ids>
            <id scheme="WKN">434535434</id>
            <id scheme="ISIN">TREE123123</id>
        </ids>
        <section chgFlag="i" idName="E" idVal="12345235564">
            <fld idName="ED001">
                <datVl chgFlag="i">4</datVl>
            </fld>
            <fld idName="ED005">
                <datVl chgFlag="i">14</datVl>
            </fld>
            <fld idName="ED006">
                <datVl chgFlag="i">91</datVl>
            </fld>
            <table chgFlag="i" idName="GV325">
                <row chgFlag="i" idVal="1">
                    <fld idName="GV325A">
                        <datVl chgFlag="i">1</datVl>
                    </fld>
                    <fld idName="GV325B">
                        <datVl chgFlag="i">2001-12-01</datVl>
                    </fld>
                    <fld idName="GV325C">
                        <datVl chgFlag="i">2006-04-30</datVl>
                    </fld>
                    <fld idName="GV325D">
                        <datVl chgFlag="i">01</datVl>
                    </fld>
                </row>
            </table>
        </section>
    </finObj>
</wmX>

Below is part of my Python code:

import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import xmltodict
import sqlalchemy
from sqlalchemy import create_engine
import cx_Oracle


username = "XXX"
password = "XXX-"
host = "XXX"
port = XXX
database = "XXXXX"
engine = create_engine(f'oracle://{username}:{password}@{host}:{port}/?service_name={database}', echo=False)

tree = ET.parse(r'P:\Kopie.xml')

table_name = "H_TEST"

root = tree.getroot()

# we need the namespace to call the objects
namespace = root.tag.split('}')[0].strip('{')

# the data is in two section
sections = list(list(root)[0])[1:]

# the default row is the data that is the same in each example
default_row = {
    "TR_NAME": root.get("name"),
    "TRE_CURRDATE": root.get("currDate"),
}


def add_new_row():
    """Nested function to create new row"""
    # we copy to create a depy copy of the default row
    new_row = default_row.copy()
    # new values
    new_row.update({
        "SECTION_IDNAME": section.get("idName"),
    })
    # the field inside is either datVl or txtVl
    for f in list(field):
        if f.tag[-5:] == "datVl":
            new_row.update({
                "DATVL_SECTION_DATA": f.text,
                "DATVL_SECTION_CHGFLAG": f.get("chgFlag")
            })
        if f.tag[-5:] == "txtVl":
            new_row.update({
                "TXTVL_SECTION_DATA": f.text,
            })
    return new_row


rows = []

# iterate through sections
for section in sections:
    # iterate through individual fields        
            for field in row.findall("xmlns:fld", namespaces={"xmlns": namespace}):
                new_row = add_new_row()
                new_row.update({
                    "TABLE_IDNAME": table.get("idName"),
                    "TABLE_CHGFLAG": table.get("chgFlag"),
                })
                rows.append(new_row)

# make a DataFrame
df = pd.DataFrame(rows)
# insert in database

分享到QQ

分享到微博