Parse XML in a streaming manner with lxml
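
The script below uses lxml's iterparse to walk an XML file element by element, so even very large documents can be processed with low memory use. For each matching "parent" element it collects the text of selected child elements, matching children by local name so their namespaces do not matter, and writes one CSV row per parent. The demo at the bottom generates a namespaced dummy file and runs three extraction cases against it.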

from lxml import etree
import csv
import os

def extract_xml_element_text_to_csv_streaming(
    xml_file_path,
    csv_file_path,
    tag_names=None,
    element_names_to_extract=None,  # child elements whose text is extracted
    wildcard_namespace_tags=True
):
    """
    Extracts text content from specified XML elements in a streaming fashion
    and writes them to a CSV file. Designed for low memory utilization with huge XML files.
    Handles XML namespaces for tags and multiple target tags.

    Args:
        xml_file_path (str): The path to the input XML file.
        csv_file_path (str): The path to the output CSV file.
        tag_names (str or list): The XML tag name(s) of the "parent" elements
            from which to extract text from their children.
            Can be a single string (e.g., 'person') or a list of strings
            (e.g., ['person', 'organization']).
            Defaults to ['person'] if None.
            Use '{uri}tag_name' for an explicit namespace, or rely on the wildcard.
        element_names_to_extract (list): A list of local names of child elements whose
            text content needs to be fetched (e.g., ['name', 'city', 'description']).
            If None, it defaults to ['name', 'city', 'school'].
            The function will attempt to find these elements regardless of their namespace.
        wildcard_namespace_tags (bool): If True and a tag in `tag_names` doesn't have an explicit
            '{uri}' prefix, it will be treated as `{*}`-prefixed,
            matching elements in any namespace or no namespace. Defaults to True.

    Returns:
        bool: True if the element texts were successfully extracted and written to CSV,
            False otherwise.
    """
    if not os.path.exists(xml_file_path):
        print(f"Error: XML file not found at '{xml_file_path}'")
        return False

    # Normalize tag_names input
    if tag_names is None:
        tag_names = ['person']
    elif isinstance(tag_names, str):
        tag_names = [tag_names]
    elif not isinstance(tag_names, list) or not all(isinstance(tag, str) for tag in tag_names):
        print("Error: 'tag_names' must be a string or a list of strings.")
        return False
    if not tag_names:
        print("Warning: 'tag_names' is empty. No elements will be processed.")
        return True

    # Prepare target tags for iterparse, considering wildcard namespaces for tags
    target_tags_for_iterparse = []
    for tag in tag_names:
        if '{' in tag and '}' in tag:  # Already an explicit QName
            target_tags_for_iterparse.append(tag)
        elif wildcard_namespace_tags:
            target_tags_for_iterparse.append(f"{{*}}{tag}")  # Match any namespace or no namespace
        else:
            target_tags_for_iterparse.append(tag)  # Assume no namespace if no wildcard and no explicit URI

    # Normalize element_names_to_extract input
    if element_names_to_extract is None:
        element_names_to_extract = ['name', 'city', 'school']
    elif not isinstance(element_names_to_extract, list) or not all(isinstance(elem_name, str) for elem_name in element_names_to_extract):
        print("Error: 'element_names_to_extract' must be a list of strings (local names).")
        return False
    if not element_names_to_extract:
        print("Warning: 'element_names_to_extract' is empty. No element texts will be extracted.")
        return True

    print(f"Starting extraction from '{xml_file_path}' to '{csv_file_path}' for tags: {tag_names}...")
    print(f"Targeting child element texts (local names): {element_names_to_extract}")

    try:
        with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
            csv_writer = csv.writer(csvfile)

            # Write the CSV header (using the requested element names for readability)
            csv_writer.writerow(element_names_to_extract)

            # Use iterparse for streaming parsing: only 'end' events for the target tags
            # are reported, so the whole document is never built as an in-memory tree.
            context = etree.iterparse(xml_file_path, events=('end',), tag=target_tags_for_iterparse)

            for event, parent_elem in context:
                row_data = []

                # Collect the text of each direct child, keyed by its local name, so the
                # children match regardless of their namespace.
                found_element_texts = {}
                for child_elem in parent_elem:
                    # Skip comments and processing instructions, whose .tag is not a string
                    if not isinstance(child_elem.tag, str):
                        continue
                    local_child_name = etree.QName(child_elem.tag).localname
                    # Store the text content, stripped of surrounding whitespace
                    found_element_texts[local_child_name] = (child_elem.text or '').strip()

                # Populate row_data in the order of the requested element names;
                # missing children become empty cells
                for desired_elem_local_name in element_names_to_extract:
                    row_data.append(found_element_texts.get(desired_elem_local_name, ''))

                csv_writer.writerow(row_data)

                # Crucial for memory management: clear the processed element, then drop its
                # already-processed (and now empty) preceding siblings from the root
                parent_elem.clear()
                while parent_elem.getprevious() is not None:
                    del parent_elem.getparent()[0]

        print(f"Successfully extracted data to '{csv_file_path}'")
        return True

    except FileNotFoundError:
        print(f"Error: Output CSV file path '{csv_file_path}' cannot be created.")
        return False
    except etree.XMLSyntaxError as e:
        print(f"Error: XML syntax error in '{xml_file_path}': {e}")
        return False
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return False

# --- Example Usage ---
if __name__ == "__main__":
    # Create a dummy XML file with namespaces and nested elements for text extraction
    dummy_xml_file_text_ns = "data_text_ns.xml"
    num_records = 50000  # For faster testing
    xml_content_text_ns = """<?xml version="1.0" encoding="UTF-8"?>
<root xmlns:ns="http://example.com/ns" xmlns:other="http://example.com/other">
"""
    for i in range(num_records // 2):
        xml_content_text_ns += f"""
    <ns:person id='{i+1}'>
        <name>Person {i}</name>
        <city ns:type="home">NewYork {i % 5}</city>
        <school>HighSchool {i % 3}</school>
        <ns:gender>Male</ns:gender>
        <description>This is a person record number {i}.</description>
    </ns:person>
"""
    for i in range(num_records // 2):
        xml_content_text_ns += f"""
    <other:organization org_id='{i+100000}'>
        <name>Org {i}</name>
        <city>London {i % 4}</city>
        <other:sector>IT Solutions</other:sector>
        <notes>Notes for organization {i}.</notes>
    </other:organization>
"""
    xml_content_text_ns += "</root>"

    with open(dummy_xml_file_text_ns, 'w', encoding='utf-8') as f:
        f.write(xml_content_text_ns)
    print(f"Dummy XML file with element text and namespaces generated: {dummy_xml_file_text_ns}")

    # --- Test Cases ---

    # Case 1: Extract text from 'person' elements (default wildcard for tags and element names)
    print("\n--- Case 1: Extracting text from 'person' elements ---")
    output_csv_text_1 = "persons_text_data.csv"
    extract_xml_element_text_to_csv_streaming(
        xml_file_path=dummy_xml_file_text_ns,
        csv_file_path=output_csv_text_1,
        tag_names='person',  # Will match <ns:person> due to wildcard_namespace_tags=True
        element_names_to_extract=['name', 'city', 'school', 'gender', 'description']  # 'gender' will match <ns:gender>
    )

    # Case 2: Extract text from 'organization' elements
    print("\n--- Case 2: Extracting text from 'organization' elements ---")
    output_csv_text_2 = "organizations_text_data.csv"
    extract_xml_element_text_to_csv_streaming(
        xml_file_path=dummy_xml_file_text_ns,
        csv_file_path=output_csv_text_2,
        tag_names='organization',  # Will match <other:organization>
        element_names_to_extract=['name', 'city', 'sector', 'notes']  # 'sector' will match <other:sector>
    )

    # Case 3: Extract text from multiple tags (person and organization)
    print("\n--- Case 3: Extracting text from 'person' and 'organization' (combined) ---")
    output_csv_text_3 = "combined_text_data.csv"
    extract_xml_element_text_to_csv_streaming(
        xml_file_path=dummy_xml_file_text_ns,
        csv_file_path=output_csv_text_3,
        tag_names=['person', 'organization'],
        element_names_to_extract=['name', 'city', 'school', 'gender', 'description', 'sector', 'notes']
    )
    # Expected: For 'person' rows, 'sector' and 'notes' will be empty.
    # For 'organization' rows, 'school', 'gender', 'description' will be empty.

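    # Optional sanity check (illustrative addition, not required by the extraction itself):
    # preview the first few rows of the combined CSV before the cleanup below removes it.
    if os.path.exists(output_csv_text_3):
        with open(output_csv_text_3, newline='', encoding='utf-8') as preview_file:
            for row_number, row in enumerate(csv.reader(preview_file)):
                if row_number > 3:
                    break
                print(row)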

    # Clean up dummy files
    print("\nCleaning up dummy files...")
    os.remove(dummy_xml_file_text_ns)
    if os.path.exists(output_csv_text_1): os.remove(output_csv_text_1)
    if os.path.exists(output_csv_text_2): os.remove(output_csv_text_2)
    if os.path.exists(output_csv_text_3): os.remove(output_csv_text_3)
    print("Cleanup complete.")
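
A note on explicit namespaces: if the wildcard matching is broader than you want (for example, when the same local name exists in more than one namespace), the '{uri}tag_name' form mentioned in the docstring can be passed instead. A minimal sketch, with illustrative file paths (the demo above deletes its dummy file during cleanup):

extract_xml_element_text_to_csv_streaming(
    xml_file_path="data_text_ns.xml",            # illustrative input path
    csv_file_path="persons_ns_only.csv",         # illustrative output path
    tag_names="{http://example.com/ns}person",   # explicit namespace, wildcard not used
    element_names_to_extract=["name", "city", "school"]
)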