0
# Utility Modules
1
2
This section covers additional lxml functionality: SAX interface compatibility, CSS selector support, element builders, XInclude processing, and namespace management. These modules provide specialized capabilities for integrating with other XML tools and for advanced XML processing workflows.
3
4
## Capabilities
5
6
### SAX Interface Compatibility
7
8
Bridge between lxml and Python's SAX (Simple API for XML) for integration with SAX-based applications.
9
10
```python { .api }
11
class ElementTreeContentHandler:
    """SAX ContentHandler that builds an lxml ElementTree from SAX events."""

    def __init__(self, makeelement=None):
        """
        Create SAX content handler for building ElementTree.

        Args:
            makeelement: Custom element factory function (optional)
        """

    @property
    def etree(self):
        """Built ElementTree, available after parsing completes.

        This is a read-only property: access it as ``handler.etree``,
        without calling it.
        """

    # SAX ContentHandler interface methods
    def setDocumentLocator(self, locator): ...
    def startDocument(self): ...
    def endDocument(self): ...
    def startPrefixMapping(self, prefix, uri): ...
    def endPrefixMapping(self, prefix): ...
    def startElement(self, name, attrs): ...
    def endElement(self, name): ...
    def startElementNS(self, name, qname, attrs): ...
    def endElementNS(self, name, qname): ...
    def characters(self, data): ...
    def ignorableWhitespace(self, whitespace): ...
    def processingInstruction(self, target, data): ...
    def skippedEntity(self, name): ...
39
40
class ElementTreeProducer:
    """Generate SAX events from an lxml Element or ElementTree."""

    def __init__(self, element_or_tree, content_handler):
        """
        Create SAX event producer.

        Args:
            element_or_tree: Element or ElementTree to process
            content_handler: SAX ContentHandler to receive events
        """

    def saxify(self):
        """Walk the tree and send its SAX events to the content handler."""
54
55
def saxify(element_or_tree, content_handler):
    """
    Generate SAX events from lxml tree.

    Elements are reported through the namespace-aware callbacks
    (``startElementNS`` / ``endElementNS``), so the handler must implement
    those rather than ``startElement`` / ``endElement``.

    Args:
        element_or_tree: Element or ElementTree to process
        content_handler: SAX ContentHandler to receive events
    """
63
64
class SaxError(LxmlError):
    """SAX processing error."""
66
```
67
68
### CSS Selectors
69
70
CSS selector support for finding elements using CSS syntax instead of XPath.
71
72
```python { .api }
73
class CSSSelector:
    """CSS selector that compiles to XPath for element matching."""

    def __init__(self, css, namespaces=None, translator='xml'):
        """
        Create CSS selector.

        Args:
            css: CSS selector string
            namespaces: dict mapping prefixes to namespace URIs
            translator: Selector translator ('xml' or 'html')
        """

    def __call__(self, element):
        """
        Find elements matching CSS selector.

        Args:
            element: Element or ElementTree to search

        Returns:
            list: Matching elements
        """

    @property
    def css(self):
        """CSS selector string this object was created from."""

    @property
    def path(self):
        """XPath expression the CSS selector was compiled to."""
104
105
class LxmlTranslator:
    """CSS to XPath translator with lxml-specific extensions."""

    def css_to_xpath(self, css, prefix='descendant-or-self::'):
        """Convert CSS selector to XPath expression.

        Args:
            css: CSS selector string
            prefix: XPath axis prefix prepended to the generated expression
        """
110
111
class LxmlHTMLTranslator(LxmlTranslator):
    """HTML-specific CSS to XPath translator."""
113
114
# CSS selector error classes
115
class SelectorError(Exception):
    """General CSS selector error; common base of the errors below.

    Catching this class handles both syntax and expression errors.
    """


class SelectorSyntaxError(SelectorError, SyntaxError):
    """CSS selector syntax error (the selector string could not be parsed)."""


class ExpressionError(SelectorError, RuntimeError):
    """CSS expression error (the selector cannot be translated to XPath)."""
123
```
124
125
### Element Builders
126
127
Factory classes for programmatically creating XML elements with fluent APIs.
128
129
```python { .api }
130
class ElementMaker:
    """Factory for creating XML elements with builder pattern."""

    # NOTE(review): lxml.builder.ElementMaker.__init__ appears to accept only
    # typemap, namespace, nsmap and makeelement; confirm that
    # **default_attributes exists before relying on it.
    def __init__(self, typemap=None, namespace=None, nsmap=None,
                 makeelement=None, **default_attributes):
        """
        Create element factory.

        Args:
            typemap: dict mapping Python types to conversion functions;
                each function is called as ``func(element, value)``
            namespace: Default namespace URI for created elements
            nsmap: Namespace prefix mapping
            makeelement: Custom element factory function
            **default_attributes: Default attributes for all elements
        """

    def __call__(self, tag, *children, **attributes):
        """
        Create element with tag, children, and attributes.

        Args:
            tag: Element tag name
            *children: Child elements, text, or other content
            **attributes: Element attributes

        Returns:
            Element: Created element with children and attributes
        """

    def __getattr__(self, tag):
        """Create element factory method for specific tag."""
161
162
# Default element maker instance
163
E = ElementMaker()
164
```
165
166
### XInclude Processing
167
168
XML Inclusions (XInclude) processing for modular XML documents.
169
170
```python { .api }
171
def include(elem, loader=None, base_url=None, max_depth=6):
    """
    Process XInclude directives in element tree.

    Args:
        elem: Element containing XInclude directives
        loader: Custom resource loader function
        base_url: Base URL for resolving relative hrefs
        max_depth: Maximum inclusion recursion depth

    Raises:
        FatalIncludeError: Fatal inclusion error
        LimitedRecursiveIncludeError: Recursion limit exceeded
    """
185
186
def default_loader(href, parse, encoding=None):
    """
    Default XInclude resource loader.

    Args:
        href: Resource URI to load
        parse: Parse mode ('xml' or 'text')
        encoding: Character encoding for text resources

    Returns:
        Element or str: Loaded resource content
    """
198
199
class FatalIncludeError(LxmlError):
    """Fatal XInclude processing error."""


class LimitedRecursiveIncludeError(FatalIncludeError):
    """XInclude recursion limit exceeded."""
204
205
# XInclude constants
206
DEFAULT_MAX_INCLUSION_DEPTH = 6
207
XINCLUDE_NAMESPACE = "http://www.w3.org/2001/XInclude"
208
```
209
210
### ElementPath Support
211
212
Simple XPath-like expressions for element tree navigation (similar to ElementTree).
213
214
```python { .api }
215
def find(element, path, namespaces=None):
    """
    Find first element matching simple path expression.

    Args:
        element: Element to search from
        path: Simple path expression (e.g., 'child/grandchild')
        namespaces: Namespace prefix mapping

    Returns:
        Element or None: First matching element
    """
227
228
def findall(element, path, namespaces=None):
    """
    Find all elements matching simple path expression.

    Args:
        element: Element to search from
        path: Simple path expression
        namespaces: Namespace prefix mapping

    Returns:
        list: All matching elements
    """
240
241
def iterfind(element, path, namespaces=None):
    """
    Iterate over elements matching simple path expression.

    Args:
        element: Element to search from
        path: Simple path expression
        namespaces: Namespace prefix mapping

    Yields:
        Element: Matching elements
    """
253
254
def findtext(element, path, default=None, namespaces=None):
    """
    Find text content of first element matching path.

    Args:
        element: Element to search from
        path: Simple path expression
        default: Default value if no match found
        namespaces: Namespace prefix mapping

    Returns:
        str or default: Text content or default value
    """
267
```
268
269
### Document Testing Utilities
270
271
Enhanced utilities for testing XML documents and doctests.
272
273
```python { .api }
274
# lxml.usedoctest - doctest support
275
# NOTE(review): lxml.doctestcompare.temp_install appears to take
# (html=False, del_module=None); confirm this signature against the
# installed lxml version.
def temp_install(modules=None, verbose=None):
    """Temporarily install lxml doctests."""
277
278
# lxml.doctestcompare - enhanced doctest comparison
279
class LXMLOutputChecker:
    """Enhanced output checker for XML doctests."""

    def check_output(self, want, got, optionflags):
        """Compare expected and actual XML output.

        Args:
            want: Expected output from the doctest
            got: Actual output produced by the example
            optionflags: doctest option flags in effect
        """
284
285
class LHTMLOutputChecker:
    """Enhanced output checker for HTML doctests."""
287
288
# Test options
289
PARSE_HTML = ...
290
PARSE_XML = ...
291
NOPARSE_MARKUP = ...
292
```
293
294
### Python Class Lookup
295
296
Custom element class assignment based on Python logic.
297
298
```python { .api }
299
# lxml.pyclasslookup - Python-based element class lookup
300
class PythonElementClassLookup:
    """Element class lookup using Python callback functions."""

    def __init__(self, fallback=None):
        """
        Create Python-based class lookup.

        Args:
            fallback: Fallback class lookup for unhandled cases
        """

    def lookup(self, doc, element):
        """
        Lookup element class based on document and element.

        Subclasses override this method to choose the class.

        Args:
            doc: Document containing element
            element: Element to assign class for

        Returns:
            type or None: Element class or None for default
        """
322
```
323
324
### Development Utilities
325
326
Helper functions for development and compilation workflows.
327
328
```python { .api }
329
def get_include():
    """
    Returns header include paths for compiling C code against lxml.

    Returns paths for lxml itself, libxml2, and libxslt headers when lxml
    was built with statically linked libraries.

    Returns:
        list: List of include directory paths
    """
339
```
340
341
## Usage Examples
342
343
### SAX Interface Integration
344
345
```python
346
from lxml import etree
347
from lxml.sax import ElementTreeContentHandler, saxify
348
from xml.sax import make_parser
349
import xml.sax.handler
350
351
# Build ElementTree from SAX events
352
class MyContentHandler(ElementTreeContentHandler):
    """Tree-building SAX handler that also records each element name seen."""

    def __init__(self, makeelement=None):
        # Forward the optional element factory so the subclass stays a
        # drop-in replacement for ElementTreeContentHandler.
        super().__init__(makeelement)
        # Element names in document order, one entry per start tag.
        self.elements_seen = []

    def startElement(self, name, attrs):
        # Let lxml add the element to the tree first, then record its name.
        super().startElement(name, attrs)
        self.elements_seen.append(name)
360
361
# Parse XML using SAX, build with lxml
362
xml_data = '''<?xml version="1.0"?>
363
<catalog>
364
<book id="1">
365
<title>Python Guide</title>
366
<author>John Doe</author>
367
</book>
368
<book id="2">
369
<title>XML Processing</title>
370
<author>Jane Smith</author>
371
</book>
372
</catalog>'''
373
374
handler = MyContentHandler()
parser = make_parser()
parser.setContentHandler(handler)

# Parse and get resulting ElementTree
from io import StringIO
parser.parse(StringIO(xml_data))
# ``etree`` is a property of ElementTreeContentHandler, not a method;
# calling it would try to call the ElementTree object and raise TypeError.
tree = handler.etree

print(f"Elements seen: {handler.elements_seen}")
print(f"Root tag: {tree.getroot().tag}")
385
386
# Generate SAX events from lxml tree
387
class LoggingHandler(xml.sax.handler.ContentHandler):
    """Print every element and text event it receives.

    lxml's ``saxify()`` reports elements through the namespace-aware
    callbacks (``startElementNS`` / ``endElementNS``), so those are
    implemented here. The plain ``startElement`` / ``endElement`` callbacks
    are kept for SAX producers that are not namespace-aware.
    """

    def startElement(self, name, attrs):
        print(f"Start: {name} {dict(attrs)}")

    def endElement(self, name):
        print(f"End: {name}")

    def startElementNS(self, name, qname, attrs):
        # ``name`` is a (namespace_uri, local_name) tuple and the attribute
        # keys are tuples as well; report qualified names for readability.
        attributes = {q: attrs.getValueByQName(q) for q in attrs.getQNames()}
        print(f"Start: {qname} {attributes}")

    def endElementNS(self, name, qname):
        print(f"End: {qname}")

    def characters(self, content):
        # Skip whitespace-only text such as indentation between elements.
        content = content.strip()
        if content:
            print(f"Text: {content}")
398
399
# Send lxml tree to SAX handler
400
root = etree.fromstring(xml_data)
401
logging_handler = LoggingHandler()
402
saxify(root, logging_handler)
403
```
404
405
### CSS Selectors
406
407
```python
408
from lxml import html
409
from lxml.cssselect import CSSSelector
410
411
# HTML document for CSS selection
412
html_content = '''
413
<html>
414
<head>
415
<title>CSS Selector Example</title>
416
</head>
417
<body>
418
<div id="header" class="main-header">
419
<h1>Welcome</h1>
420
<nav class="navigation">
421
<a href="/home" class="nav-link active">Home</a>
422
<a href="/about" class="nav-link">About</a>
423
<a href="/contact" class="nav-link">Contact</a>
424
</nav>
425
</div>
426
<div id="content" class="main-content">
427
<article class="post featured">
428
<h2>Featured Article</h2>
429
<p>This is a featured article.</p>
430
</article>
431
<article class="post">
432
<h2>Regular Article</h2>
433
<p>This is a regular article.</p>
434
</article>
435
</div>
436
<footer id="footer">
437
<p>© 2023 Example Site</p>
438
</footer>
439
</body>
440
</html>
441
'''
442
443
doc = html.fromstring(html_content)
444
445
# Create CSS selectors
446
header_selector = CSSSelector('#header')
nav_links_selector = CSSSelector('nav.navigation a.nav-link')
featured_post_selector = CSSSelector('article.post.featured')
all_headings_selector = CSSSelector('h1, h2, h3, h4, h5, h6')

# Use selectors to find elements
header = header_selector(doc)
print(f"Header element: {header[0].get('class') if header else 'Not found'}")

nav_links = nav_links_selector(doc)
print(f"Navigation links: {len(nav_links)}")
for link in nav_links:
    print(f"  {link.text}: {link.get('href')}")

featured = featured_post_selector(doc)
if featured:
    print(f"Featured article title: {featured[0].find('.//h2').text}")

headings = all_headings_selector(doc)
print("All headings:")
for heading in headings:
    print(f"  {heading.tag}: {heading.text}")

# Advanced CSS selectors
active_link_selector = CSSSelector('a.nav-link.active')
# ``p:first-child`` would match nothing here because every article starts
# with an <h2>; ``:first-of-type`` selects the first <p> of each article.
first_paragraph_selector = CSSSelector('article p:first-of-type')
not_featured_selector = CSSSelector('article.post:not(.featured)')

active_links = active_link_selector(doc)
print(f"Active navigation links: {len(active_links)}")

first_paragraphs = first_paragraph_selector(doc)
print(f"First paragraphs in articles: {len(first_paragraphs)}")

regular_posts = not_featured_selector(doc)
print(f"Regular (non-featured) posts: {len(regular_posts)}")
482
```
483
484
### Element Builders
485
486
```python
487
from lxml import etree
488
from lxml.builder import ElementMaker
489
490
# Create element maker with namespace
491
E = ElementMaker(namespace="http://example.com/catalog",
                 nsmap={None: "http://example.com/catalog"})

# Build XML structure using element maker: positional arguments become
# children or text, keyword arguments become attributes.
catalog = E.catalog(
    E.metadata(
        E.title("Book Catalog"),
        E.created("2023-12-07"),
        E.version("1.0")
    ),
    E.books(
        E.book(
            E.title("Python Programming"),
            E.author("John Smith"),
            E.isbn("978-0123456789"),
            E.price("29.99", currency="USD"),
            E.categories(
                E.category("Programming"),
                E.category("Python"),
                E.category("Computers")
            ),
            id="1",
            available="true"
        ),
        E.book(
            E.title("Web Development"),
            E.author("Jane Doe"),
            E.isbn("978-0987654321"),
            E.price("34.95", currency="USD"),
            E.categories(
                E.category("Web"),
                E.category("HTML"),
                E.category("CSS")
            ),
            id="2",
            available="false"
        )
    )
)

print("Generated XML:")
print(etree.tostring(catalog, pretty_print=True, encoding='unicode'))
533
534
# Custom element maker with type mapping
535
def format_price(value):
    """Format a numeric value as a dollar amount with two decimals.

    Args:
        value: Number, or anything ``float()`` accepts.

    Returns:
        str: The amount, e.g. ``"$19.99"``.
    """
    return f"${float(value):.2f}"
538
539
def bool_to_string(value):
    """Convert a truthy or falsy value to ``"yes"`` or ``"no"``."""
    return "yes" if value else "no"
542
543
# ElementMaker calls each typemap entry as ``converter(element, value)``.
# The formatters above take only the value, so adapt them here; the string
# they return is then added to the element as text.
custom_typemap = {
    float: lambda _elem, value: format_price(value),
    bool: lambda _elem, value: bool_to_string(value),
}

CustomE = ElementMaker(typemap=custom_typemap)

# Use custom element maker
product = CustomE.product(
    CustomE.name("Widget"),
    CustomE.price(19.99),      # Will be formatted as currency
    CustomE.available(True),   # Will be converted to "yes"
    CustomE.features(
        CustomE.feature("Lightweight"),
        CustomE.feature("Durable"),
        CustomE.feature("Affordable")
    )
)

print("\nCustom formatted XML:")
print(etree.tostring(product, pretty_print=True, encoding='unicode'))
564
```
565
566
### XInclude Processing
567
568
```python
569
from lxml import etree
570
from lxml.ElementInclude import include, default_loader
571
import tempfile
572
import os
573
574
# Create temporary files for XInclude example
575
temp_dir = tempfile.mkdtemp()
576
577
# Create included content files
578
header_content = '''<?xml version="1.0"?>
579
<header>
580
<title>Document Title</title>
581
<author>John Doe</author>
582
<date>2023-12-07</date>
583
</header>'''
584
585
footer_content = '''<?xml version="1.0"?>
586
<footer>
587
<copyright>© 2023 Example Corp</copyright>
588
<contact>contact@example.com</contact>
589
</footer>'''
590
591
# Write include files
592
header_file = os.path.join(temp_dir, 'header.xml')
footer_file = os.path.join(temp_dir, 'footer.xml')

# The XML declarations name no encoding, so the parser assumes UTF-8.
# Write UTF-8 explicitly; the platform default encoding could otherwise
# corrupt the non-ASCII copyright sign in the footer.
with open(header_file, 'w', encoding='utf-8') as f:
    f.write(header_content)

with open(footer_file, 'w', encoding='utf-8') as f:
    f.write(footer_content)
600
601
# Main document with XInclude directives
602
main_doc_content = f'''<?xml version="1.0"?>
603
<document xmlns:xi="http://www.w3.org/2001/XInclude">
604
<xi:include href="{header_file}"/>
605
606
<content>
607
<section>
608
<h1>Introduction</h1>
609
<p>This is the main content of the document.</p>
610
</section>
611
<section>
612
<h1>Details</h1>
613
<p>More detailed information goes here.</p>
614
</section>
615
</content>
616
617
<xi:include href="{footer_file}"/>
618
</document>'''
619
620
# Parse document with XInclude processing
621
root = etree.fromstring(main_doc_content)
622
print("Before XInclude processing:")
623
print(etree.tostring(root, pretty_print=True, encoding='unicode'))
624
625
# Process XInclude directives
626
include(root)
627
print("\nAfter XInclude processing:")
628
print(etree.tostring(root, pretty_print=True, encoding='unicode'))
629
630
# Custom loader for XInclude
631
def custom_loader(href, parse, encoding=None):
    """Custom XInclude loader with logging.

    Prints each requested resource, then delegates to ``default_loader``
    and returns whatever it returns.
    """
    print(f"Loading: {href} (parse={parse}, encoding={encoding})")
    return default_loader(href, parse, encoding)
635
636
# Use custom loader
637
root2 = etree.fromstring(main_doc_content)
638
include(root2, loader=custom_loader)
639
640
# Clean up temporary files
641
os.unlink(header_file)
642
os.unlink(footer_file)
643
os.rmdir(temp_dir)
644
```
645
646
### ElementPath Simple Queries
647
648
```python
649
from lxml import etree
650
from lxml._elementpath import find, findall, iterfind, findtext
651
652
# XML document for path queries
653
xml_data = '''<?xml version="1.0"?>
654
<library>
655
<section name="fiction">
656
<book id="1">
657
<title>The Great Gatsby</title>
658
<author>F. Scott Fitzgerald</author>
659
<metadata>
660
<genre>Classic Literature</genre>
661
<year>1925</year>
662
</metadata>
663
</book>
664
<book id="2">
665
<title>To Kill a Mockingbird</title>
666
<author>Harper Lee</author>
667
<metadata>
668
<genre>Classic Literature</genre>
669
<year>1960</year>
670
</metadata>
671
</book>
672
</section>
673
<section name="science">
674
<book id="3">
675
<title>A Brief History of Time</title>
676
<author>Stephen Hawking</author>
677
<metadata>
678
<genre>Science</genre>
679
<year>1988</year>
680
</metadata>
681
</book>
682
</section>
683
</library>'''
684
685
root = etree.fromstring(xml_data)
686
687
# Simple path queries (ElementTree-style)
688
fiction_section = find(root, 'section[@name="fiction"]')
# Compare with None: an element's truth value reflects whether it has
# children, not whether the search found it.
print(f"Fiction section: {fiction_section.get('name') if fiction_section is not None else 'Not found'}")

# Find all books in any section
all_books = findall(root, './/book')
print(f"Total books: {len(all_books)}")

# Find specific book by ID
book1 = find(root, './/book[@id="1"]')
if book1 is not None:
    title = findtext(book1, 'title')
    author = findtext(book1, 'author')
    print(f"Book 1: {title} by {author}")

# Iterate over books in fiction section
fiction_books = iterfind(root, 'section[@name="fiction"]/book')
print("Fiction books:")
for book in fiction_books:
    title = findtext(book, 'title')
    year = findtext(book, 'metadata/year')
    print(f"  {title} ({year})")

# Find text with default value
unknown_book = findtext(root, 'section/book[@id="999"]/title', 'Unknown Book')
print(f"Unknown book title: {unknown_book}")

# Complex paths
# ElementPath predicates only accept an attribute or a direct child tag.
# Nested paths and numeric comparisons need full XPath via Element.xpath().
classic_books = root.xpath('.//book[metadata/genre="Classic Literature"]')
print(f"Classic literature books: {len(classic_books)}")

recent_books = root.xpath('.//book[metadata/year > 1950]')
print(f"Books after 1950: {len(recent_books)}")
720
```
721
722
### Custom Element Classes
723
724
```python
725
from lxml import etree
726
727
# Define custom element classes
728
class BookElement(etree.ElementBase):
    """Custom element class for book elements."""

    @property
    def title(self):
        """Text of the ``title`` child, or None if there is no such child."""
        title_elem = self.find('title')
        return title_elem.text if title_elem is not None else None

    @property
    def author(self):
        """Text of the ``author`` child, or None if there is no such child."""
        author_elem = self.find('author')
        return author_elem.text if author_elem is not None else None

    @property
    def year(self):
        """Publication year as an integer, or None if missing or invalid."""
        year_elem = self.find('metadata/year')
        if year_elem is None:
            return None
        try:
            return int(year_elem.text)
        except (ValueError, TypeError):
            # TypeError covers an empty <year/> element, whose text is None.
            return None

    def is_classic(self):
        """Check if book is classic literature."""
        genre_elem = self.find('metadata/genre')
        return genre_elem is not None and genre_elem.text == 'Classic Literature'
758
759
class SectionElement(etree.ElementBase):
    """Custom element class for section elements."""

    @property
    def name(self):
        """Value of the ``name`` attribute, or 'Unnamed Section' if unset."""
        return self.get('name', 'Unnamed Section')

    def get_books(self):
        """Return the list of direct ``book`` children of this section."""
        return self.findall('book')

    def count_books(self):
        """Return the number of direct ``book`` children of this section."""
        return len(self.get_books())
774
775
# Create element class lookup
776
class CustomElementClassLookup(etree.PythonElementClassLookup):
    """Assign BookElement / SectionElement by tag name."""

    def lookup(self, document, element):
        """Return the custom class for ``element``, or None for the default."""
        if element.tag == 'book':
            return BookElement
        if element.tag == 'section':
            return SectionElement
        return None
783
784
# Set up parser with custom lookup
785
lookup = CustomElementClassLookup()
786
parser = etree.XMLParser()
787
parser.set_element_class_lookup(lookup)
788
789
# Parse with custom element classes
790
xml_data = '''<?xml version="1.0"?>
791
<library>
792
<section name="fiction">
793
<book id="1">
794
<title>The Great Gatsby</title>
795
<author>F. Scott Fitzgerald</author>
796
<metadata>
797
<genre>Classic Literature</genre>
798
<year>1925</year>
799
</metadata>
800
</book>
801
<book id="2">
802
<title>Modern Fiction</title>
803
<author>Contemporary Author</author>
804
<metadata>
805
<genre>Modern Literature</genre>
806
<year>2020</year>
807
</metadata>
808
</book>
809
</section>
810
</library>'''
811
812
root = etree.fromstring(xml_data, parser)

# Use custom element methods
fiction_section = root.find('section')
print(f"Section: {fiction_section.name}")
print(f"Books in section: {fiction_section.count_books()}")

for book in fiction_section.get_books():
    print(f"  {book.title} by {book.author} ({book.year})")
    print(f"    Is classic: {book.is_classic()}")
822
```