0
# Core XML/HTML Processing
1
2
Comprehensive ElementTree-compatible API for XML and HTML document parsing, manipulation, and serialization. This module provides the foundation for all lxml functionality with full standards compliance, namespace support, and high-performance processing.
3
4
## Capabilities
5
6
### Document Parsing
7
8
Parse XML and HTML documents from strings, files, URLs, or file-like objects with configurable parsers and error handling.
9
10
```python { .api }
11
def parse(source, parser=None, base_url=None):
12
"""
13
Parse XML/HTML document from file, URL, or file-like object.
14
15
Args:
16
source: Filename/path, URL, or file-like object
17
parser: XMLParser or HTMLParser instance (optional)
18
base_url: Base URL for resolving relative references (optional)
19
20
Returns:
21
ElementTree: Parsed document tree
22
"""
23
24
def fromstring(text, parser=None, base_url=None):
25
"""
26
Parse XML/HTML document from string.
27
28
Args:
29
text: str or bytes containing XML/HTML content
30
parser: XMLParser or HTMLParser instance (optional)
31
base_url: Base URL for resolving relative references (optional)
32
33
Returns:
34
Element: Root element of parsed document
35
"""
36
37
def XML(text, parser=None, base_url=None):
38
"""
39
Parse an XML document or fragment from a string constant (no validation is performed unless the supplied parser enables it).
40
41
Args:
42
text: str or bytes containing XML content
43
parser: XMLParser instance (optional)
44
base_url: Base URL for resolving relative references (optional)
45
46
Returns:
47
Element: Root element of parsed XML
48
"""
49
50
def HTML(text, parser=None, base_url=None):
51
"""
52
Parse HTML string with lenient parsing.
53
54
Args:
55
text: str or bytes containing HTML content
56
parser: HTMLParser instance (optional)
57
base_url: Base URL for resolving relative references (optional)
58
59
Returns:
60
Element: Root element of parsed HTML
61
"""
62
```
63
64
### Incremental Parsing
65
66
Memory-efficient parsing for large documents using event-driven processing.
67
68
```python { .api }
69
def iterparse(source, events=None, tag=None, attribute_defaults=False,
70
dtd_validation=False, load_dtd=False, no_network=True,
71
remove_blank_text=False, remove_comments=False,
72
remove_pis=False, encoding=None, huge_tree=False,
73
schema=None):
74
"""
75
Incrementally parse XML document yielding (event, element) pairs.
76
77
Args:
78
source: File path, URL, or file-like object
79
events: tuple of events to report ('start', 'end', 'start-ns', 'end-ns')
80
tag: str or sequence of tag names to filter
81
82
Yields:
83
tuple: (event, element) pairs during parsing
84
"""
85
86
def iterwalk(element_or_tree, events=('end',), tag=None):
87
"""
88
Walk through existing element tree yielding events.
89
90
Args:
91
element_or_tree: Element or ElementTree to walk
92
events: tuple of events to report ('start', 'end')
93
tag: str or sequence of tag names to filter
94
95
Yields:
96
tuple: (event, element) pairs during traversal
97
"""
98
```
99
100
### Element Creation and Manipulation
101
102
Create and modify XML/HTML elements with full attribute and content support.
103
104
```python { .api }
105
class Element:
106
"""XML/HTML element with tag, attributes, text, and children."""
107
108
def __init__(self, tag, attrib=None, nsmap=None, **extra):
109
"""
110
Create new element.
111
112
Args:
113
tag: Element tag name (str or QName)
114
attrib: dict of attributes (optional)
115
nsmap: dict mapping namespace prefixes to URIs (optional)
116
**extra: Additional attributes as keyword arguments
117
"""
118
119
# Element properties
120
tag: str # Element tag name
121
text: str | None # Text content before first child
122
tail: str | None # Text content after element
123
attrib: dict[str, str] # Element attributes
124
nsmap: dict[str, str] # Namespace mapping
125
sourceline: int | None # Source line number (if available)
126
127
# Tree navigation
128
def find(self, path, namespaces=None):
129
"""Find first child element matching path."""
130
131
def findall(self, path, namespaces=None):
132
"""Find all child elements matching path."""
133
134
def iterfind(self, path, namespaces=None):
135
"""Iterate over child elements matching path."""
136
137
def findtext(self, path, default=None, namespaces=None):
138
"""Find text content of first matching child element."""
139
140
def xpath(self, _path, namespaces=None, extensions=None,
141
smart_strings=True, **_variables):
142
"""Evaluate XPath expression on element."""
143
144
# Tree modification
145
def append(self, element):
146
"""Add element as last child."""
147
148
def insert(self, index, element):
149
"""Insert element at specified position."""
150
151
def remove(self, element):
152
"""Remove child element."""
153
154
def clear(self):
155
"""Reset the element: remove all children and attributes, and set text and tail to None."""
156
157
# Attribute access
158
def get(self, key, default=None):
159
"""Get attribute value."""
160
161
def set(self, key, value):
162
"""Set attribute value."""
163
164
def keys(self):
165
"""Get attribute names."""
166
167
def values(self):
168
"""Get attribute values."""
169
170
def items(self):
171
"""Get (name, value) pairs for attributes."""
172
173
def SubElement(parent, tag, attrib=None, nsmap=None, **extra):
174
"""
175
Create child element and add to parent.
176
177
Args:
178
parent: Parent Element
179
tag: Child element tag name
180
attrib: dict of attributes (optional)
181
nsmap: dict of namespace mappings (optional)
182
**extra: Additional attributes
183
184
Returns:
185
Element: New child element
186
"""
187
```
188
189
### Document Trees
190
191
Manage complete XML/HTML documents with document-level operations.
192
193
```python { .api }
194
class ElementTree:
195
"""Document tree containing root element and document info."""
196
197
def __init__(self, element=None, file=None, parser=None):
198
"""
199
Create document tree.
200
201
Args:
202
element: Root element (optional)
203
file: File to parse (optional)
204
parser: Parser instance (optional)
205
"""
206
207
def getroot(self):
208
"""Get root element."""
209
210
def setroot(self, root):
211
"""Set root element."""
212
213
def parse(self, source, parser=None, base_url=None):
214
"""Parse document from source."""
215
216
def write(self, file, encoding=None, xml_declaration=None,
217
default_namespace=None, method="xml", pretty_print=False,
218
with_tail=True, standalone=None, compression=0,
219
exclusive=False, inclusive_ns_prefixes=None,
220
with_comments=True, strip_cdata=True):
221
"""Write document to file."""
222
223
def xpath(self, _path, namespaces=None, extensions=None,
224
smart_strings=True, **_variables):
225
"""Evaluate XPath expression on document."""
226
227
def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
228
"""Apply XSLT transformation."""
229
230
def relaxng(self, relaxng):
231
"""Validate against RelaxNG schema."""
232
233
def xmlschema(self, xmlschema):
234
"""Validate against XML Schema."""
235
236
def xinclude(self):
237
"""Process XInclude directives."""
238
239
@property
240
def docinfo(self):
241
"""Document information (encoding, version, etc.)."""
242
```
243
244
### Serialization
245
246
Convert elements and trees to strings or bytes with formatting options.
247
248
```python { .api }
249
def tostring(element_or_tree, encoding=None, method="xml",
250
xml_declaration=None, pretty_print=False, with_tail=True,
251
standalone=None, doctype=None, exclusive=False,
252
inclusive_ns_prefixes=None, with_comments=True,
253
strip_cdata=True):
254
"""
255
Serialize element or tree to string/bytes.
256
257
Args:
258
element_or_tree: Element or ElementTree to serialize
259
encoding: Output encoding; pass 'unicode' (or str) to get a str result, or an encoding name such as 'utf-8' to get bytes
260
method: Serialization method ('xml', 'html', 'text', 'c14n')
261
xml_declaration: Include XML declaration (bool or None for auto)
262
pretty_print: Format output with whitespace (bool)
263
with_tail: Include tail text (bool)
264
doctype: Document type declaration (str)
265
266
Returns:
267
str or bytes: Serialized document
268
"""
269
270
def tostringlist(element_or_tree, encoding=None, method="xml",
271
xml_declaration=None, pretty_print=False, with_tail=True,
272
standalone=None, doctype=None, exclusive=False,
273
inclusive_ns_prefixes=None, with_comments=True,
274
strip_cdata=True):
275
"""Serialize to list of strings/bytes."""
276
277
def tounicode(element_or_tree, method="xml", pretty_print=False,
278
with_tail=True, doctype=None):
279
"""Serialize to unicode string (deprecated; use tostring(..., encoding='unicode') instead)."""
280
281
def dump(elem):
282
"""Debug dump element structure to stdout."""
283
```
284
285
### Parser Configuration
286
287
Configurable parsers for different XML/HTML processing needs.
288
289
```python { .api }
290
class XMLParser:
291
"""Configurable XML parser with validation and processing options."""
292
293
def __init__(self, encoding=None, attribute_defaults=False,
294
dtd_validation=False, load_dtd=False, no_network=True,
295
ns_clean=False, recover=False, schema=None,
296
huge_tree=False, remove_blank_text=False,
297
resolve_entities=True, remove_comments=False,
298
remove_pis=False, strip_cdata=True, collect_ids=True,
299
target=None, compact=True):
300
"""
301
Create XML parser with specified options.
302
303
Args:
304
encoding: Character encoding override
305
attribute_defaults: Load default attributes from DTD
306
dtd_validation: Enable DTD validation
307
load_dtd: Load and parse DTD
308
no_network: Disable network access
309
recover: Enable error recovery
310
huge_tree: Support very large documents
311
remove_blank_text: Remove whitespace-only text nodes
312
remove_comments: Remove comment nodes
313
remove_pis: Remove processing instruction nodes
314
"""
315
316
class HTMLParser:
317
"""Lenient HTML parser with automatic error recovery."""
318
319
def __init__(self, encoding=None, remove_blank_text=False,
320
remove_comments=False, remove_pis=False,
321
strip_cdata=True, no_network=True, target=None,
322
schema=None, recover=True, compact=True):
323
"""Create HTML parser with specified options."""
324
325
def get_default_parser():
326
"""Get current default parser."""
327
328
def set_default_parser(parser):
329
"""Set global default parser."""
330
```
331
332
### Tree Manipulation Utilities
333
334
High-level functions for common tree modification operations.
335
336
```python { .api }
337
def cleanup_namespaces(tree_or_element):
338
"""Remove unused namespace declarations."""
339
340
def strip_attributes(tree_or_element, *attribute_names):
341
"""Remove specified attributes from all elements."""
342
343
def strip_elements(tree_or_element, *tag_names, with_tail=True):
344
"""Remove elements with specified tag names."""
345
346
def strip_tags(tree_or_element, *tag_names):
347
"""Remove tags but keep text content."""
348
349
def register_namespace(prefix, uri):
350
"""Register namespace prefix for serialization."""
351
```
352
353
### Node Type Classes
354
355
Specialized classes for different XML node types.
356
357
```python { .api }
358
class Comment:
359
"""XML comment node."""
360
def __init__(self, text=None): ...
361
362
class ProcessingInstruction:
363
"""XML processing instruction node."""
364
def __init__(self, target, text=None): ...
365
366
@property
367
def target(self) -> str: ...
368
369
class Entity:
370
"""XML entity reference node."""
371
def __init__(self, name): ...
372
373
@property
374
def name(self) -> str: ...
375
376
class CDATA:
377
"""XML CDATA section."""
378
def __init__(self, data): ...
379
380
# Factory functions
381
def Comment(text=None):
382
"""Create comment node."""
383
384
def ProcessingInstruction(target, text=None):
385
"""Create processing instruction node."""
386
387
PI = ProcessingInstruction # Alias
388
```
389
390
## Usage Examples
391
392
### Basic XML Processing
393
394
```python
395
from lxml import etree
396
397
# Parse XML document
398
xml_data = '''<?xml version="1.0"?>
399
<catalog>
400
<book id="1" category="fiction">
401
<title>The Great Gatsby</title>
402
<author>F. Scott Fitzgerald</author>
403
<year>1925</year>
404
<price currency="USD">12.99</price>
405
</book>
406
<book id="2" category="science">
407
<title>A Brief History of Time</title>
408
<author>Stephen Hawking</author>
409
<year>1988</year>
410
<price currency="USD">15.99</price>
411
</book>
412
</catalog>'''
413
414
root = etree.fromstring(xml_data)
415
416
# Navigate and query
417
books = root.findall('book')
418
fiction_books = root.xpath('//book[@category="fiction"]')
419
titles = root.xpath('//title/text()')
420
421
# Modify content
422
new_book = etree.SubElement(root, 'book', id="3", category="mystery")
423
etree.SubElement(new_book, 'title').text = "The Murder Mystery"
424
etree.SubElement(new_book, 'author').text = "Agatha Christie"
425
etree.SubElement(new_book, 'year').text = "1934"
426
price_elem = etree.SubElement(new_book, 'price', currency="USD")
427
price_elem.text = "11.99"
428
429
# Serialize with formatting
430
output = etree.tostring(root, pretty_print=True, encoding='unicode')
431
print(output)
432
```
433
434
### HTML Document Processing
435
436
```python
437
from lxml import etree
438
439
# Sample HTML document (parsed below with the lenient HTML parser, so it need not be well-formed XML)
440
html_data = '''<!DOCTYPE html>
441
<html>
442
<head>
443
<title>Sample Page</title>
444
<meta charset="UTF-8"/>
445
</head>
446
<body>
447
<h1>Welcome</h1>
448
<div class="content">
449
<p>This is a paragraph.</p>
450
<ul>
451
<li>Item 1</li>
452
<li>Item 2</li>
453
</ul>
454
</div>
455
</body>
456
</html>'''
457
458
# Use HTML parser for lenient parsing
459
parser = etree.HTMLParser()
460
doc = etree.fromstring(html_data, parser)
461
462
# Find elements
463
title = doc.find('.//title').text
464
content_div = doc.find('.//div[@class="content"]')
465
list_items = doc.xpath('//li/text()')
466
467
print(f"Title: {title}")
468
print(f"List items: {list_items}")
469
```
470
471
### Error Handling
472
473
```python
474
from lxml import etree
475
476
try:
477
# This will raise XMLSyntaxError due to unclosed tag
478
bad_xml = '<root><child></root>'
479
etree.fromstring(bad_xml)
480
except etree.XMLSyntaxError as e:
481
print(f"XML Error: {e}")
482
print(f"Line: {e.lineno}, Column: {e.offset}")
483
484
# Use recovery parser for malformed XML
485
try:
486
parser = etree.XMLParser(recover=True)
487
root = etree.fromstring(bad_xml, parser)
488
print("Recovered:", etree.tostring(root, encoding='unicode'))
489
except Exception as e:
490
print(f"Recovery failed: {e}")
491
```