Code runs slowly parsing large xml files w/ python
I've got 2 very large xml files that hold different data for the same
place/building/room combinations. I am currently using python etree parse
on the first large file, then looping through it to extract the
place/building/room ids (along with other information). I then use those
ids to loop through the second large xml file (same structure as the
first), which I'm currently reading with lxml iterparse to find and extract
the Place element in the second file pertinent to the specific place from
the first file. Then it loops through that Place element to find the
pertinent data. It works, but it continues to get slower and slower as I
loop further and further into the first file.
I've done everything I can to clear() elements not pertinent in the
iterparse of the second large file, which helped, but I've got 5000 places
to loop through: the first 100 are processed very quickly (less than a
minute), then the next 400 take 30 minutes, and so on. After 15 hours I was
at about 4000 places and moving very slowly. I suspect the parsing of
one of the files is holding too much data.
Here's the simplified code (sorry I couldn't simplify it further) with
genericified xml.
# get_place_elem function
def get_place_elem(pplaceid, largefile2):
    """Return the first ``Place`` element of *largefile2* matching *pplaceid*.

    The file is streamed with ``ET.iterparse``; every ``Place`` that does not
    match is cleared and its already-processed preceding siblings are removed
    from the parent, so rejected places are not kept in memory.

    Each call starts a new ``iterparse`` pass at the beginning of
    *largefile2*, so the time needed to reach a place grows with its position
    in the file.

    NOTE(review): assumes ``ET`` is ``lxml.etree`` -- the ``tag=`` keyword,
    ``getprevious()`` and ``getparent()`` are used below; confirm the import.

    Args:
        pplaceid: Text of the ``PlaceIdentifier`` to look for.
        largefile2: Path (or file object) of the XML document to scan.

    Returns:
        The matching ``Place`` element, or ``None`` when no place matches.
    """
    for _event, place in ET.iterparse(largefile2, events=("end",), tag='Place'):
        for place_ident in place.findall('PlaceIdentification'):
            # findtext() yields None instead of raising when the
            # PlaceIdentifier child is absent.
            if place_ident.findtext('PlaceIdentifier') == pplaceid:
                return place
        # Not the place we are after: drop its content, then detach the
        # siblings that were parsed before it.
        place.clear()
        while place.getprevious() is not None:
            del place.getparent()[0]
    return None
# getfacdata function
def getfacdata(pplaceid, pbuildid, proomid, Place):
    """Look up the data of one room inside an already located ``Place``.

    Args:
        pplaceid: Identifier of the place (not consulted by the lookup
            itself; *Place* is already the matching element).
        pbuildid: ``Identifier`` text of the building to search in.
        proomid: ``Identifier`` text of the room to find.
        Place: ``Place`` element whose ``Building``/``Room`` children are
            searched.

    Returns:
        The data collected for the first matching room, or ``None`` when the
        building/room combination does not occur in *Place*.
    """
    for build in Place.findall('Building'):
        # Reset per building so a building without an identification can
        # neither reuse the previous building's id nor leave the name unbound.
        bid = None
        for build_ident in build.findall('BuildingIdentification'):
            bid = build_ident.findtext('Identifier')
        if bid != pbuildid:
            continue
        for room in build.findall('Room'):
            for room_ident in room.findall('RoomIdentification'):
                if room_ident.findtext('Identifier') == proomid:
                    # ... collect data from Room element ...
                    # ... do some simple math with if statements ...
                    return data  # list of 15 data values
    return None
main code
largefile1 = "largefile1.xml"
largefile2 = "largeFile2.xml"

# The first file is loaded completely; the second one is read on demand
# through get_place_elem().
ptree = ET.parse(largefile1)
proot = ptree.getroot()

placecnt = 0         # number of place ids read from largefile1
prevpplaceid = None  # place id whose largefile2 element is currently held
Place = None         # Place element from largefile2 for prevpplaceid, if any

# newline='' is the mode the csv module documents for files handed to
# csv.writer; the with-block closes the file even if a lookup raises.
with open('output.txt', 'w', newline='') as o:
    writer = csv.writer(o)  # one writer for the whole run
    for pPlace in proot.findall('.//Place'):
        # NOTE(review): this lookup uses 'cer:FacilityIdentification' with a
        # namespace map, while the other lookups use plain
        # 'PlaceIdentification' -- confirm which one the real files need.
        for pPlaceId in pPlace.findall('cer:FacilityIdentification',
                                       namespaces=namespaces):
            pplaceid = pPlaceId.find('PlaceIdentifier').text
            # NOTE(review): every place id read here is counted; no
            # `placeid` is assigned at this level to compare against.
            placecnt += 1
            # ... get some data

        for pBuild in pPlace.findall('Building'):
            for pBuildId in pBuild.findall('BuildingIdentification'):
                pbid = pBuildId.find('Identifier').text
                # cycle through each ProcessIdentification element of the
                # EmissionUnit element in the point inventory file and
                # assign the SCC code
                for pRoom in pBuild.findall('Room'):
                    for pRoomId in pRoom.findall('RoomIdentification'):
                        proomid = pRoomId.find('Identifier').text
                        if prevpplaceid != pplaceid:
                            # Release the previous place's element before
                            # fetching the next one from largefile2.
                            if Place is not None:
                                Place.clear()
                            Place = get_place_elem(pplaceid, largefile2)
                            prevpplaceid = pplaceid
                        data = getfacdata(pplaceid, pbid, proomid, Place)
                        # ... collect data from Room element ...
                        # ... do some simple math with if statements ...
                        writer.writerow((
                            # ... data from pRoom and from the 'data' list
                            # obtained from largefile2, in csv format ...
                        ))
                        break
        prevpplaceid = pplaceid
genericified xml
 <Payload>
<Place>
    <PlaceName>Place1</PlaceName>
    <PlaceStatusCode>OP</PlaceStatusCode>
    <PlaceStatusCodeYear>2011</PlaceStatusCodeYear>
    <PlaceComment/>
    <PlaceIdentification>
        <PlaceIdentifier>id001</PlaceIdentifier>
        <StateAndCountyFIPSCode>77702</StateAndCountyFIPSCode>
    </PlaceIdentification>
    <PlaceAddress>
        <LocationAddressText>111 Main</LocationAddressText>
        <SupplementalLocationText/>
        <LocalityName>City1</LocalityName>
        <LocationAddressStateCode>State1</LocationAddressStateCode>
        <LocationAddressPostalCode>12345</LocationAddressPostalCode>
        <LocationAddressCountryCode>USA</LocationAddressCountryCode>
    </PlaceAddress>
    <PlaceGeographicCoordinates>
        <LatitudeMeasure>88.888</LatitudeMeasure>
        <LongitudeMeasure>-99.999</LongitudeMeasure>
    </PlaceGeographicCoordinates>
    <Building>
        <BuildingDescription>Building1</BuildingDescription>
        <BuildingTypeCode>999</BuildingTypeCode>
        <BuildingIdentification>
            <Identifier>Building1</Identifier>
        </BuildingIdentification>
        <Room>
            <RoomIdentification>
                <Identifier>Room1</Identifier>
            </RoomIdentification>
            ... More data ...
        </Room>
        <Room>
            <RoomIdentification>
                <Identifier>Room2</Identifier>
            </RoomIdentification>
            ... More data ...
        </Room>
        ...
    </Building>
    <Building>
        <BuildingDescription>Building2</BuildingDescription>
        <BuildingTypeCode>999</BuildingTypeCode>
        <BuildingIdentification>
            <Identifier>Building2</Identifier>
        </BuildingIdentification>
        <Room>
            <RoomIdentification>
                <Identifier>Room1</Identifier>
            </RoomIdentification>
            ... More data ...
        </Room>
        <Room>
            <RoomIdentification>
                <Identifier>Room4</Identifier>
            </RoomIdentification>
            ... More data ...
        </Room>
        ...
    </Building>
    ...
</Place>
<Place>
    ...
</Place>
 
No comments:
Post a Comment