0
# XPath and XSLT Processing
1
2
Advanced XML querying and transformation capabilities: XPath 1.0 evaluation, XSLT 1.0 stylesheets, extension functions, namespace handling, and XML canonicalization. These features enable powerful XML processing workflows for data extraction, transformation, and analysis. Note that only XPath 1.0 is supported; XPath 2.0 constructs such as `for` expressions, `min()`, or `distinct-values()` are not available.
3
4
## Capabilities
5
6
### XPath Evaluation
7
8
Compile and evaluate XPath expressions with variables, extension functions, and namespace support.
9
10
```python { .api }
11
class XPath:
    """Compiled XPath expression for efficient repeated evaluation."""

    def __init__(self, path, namespaces=None, extensions=None,
                 regexp=True, smart_strings=True):
        """
        Compile XPath expression.

        Args:
            path: XPath expression string
            namespaces: dict mapping prefixes to namespace URIs
            extensions: dict mapping (namespace URI, function name) tuples
                to Python callables
            regexp: Enable EXSLT regular expression functions
            smart_strings: Return string results as "smart" strings
                (lxml.etree._ElementUnicodeResult) that remember their
                parent element via getparent(); pass False to get plain
                Python str objects instead
        """

    def __call__(self, _etree_or_element, **_variables):
        """
        Evaluate XPath on element or document.

        Args:
            _etree_or_element: Element or ElementTree to evaluate on
            **_variables: XPath variables as keyword arguments

        Returns:
            The XPath result; its type depends on the expression: a list
            for node-sets (elements, attribute values, text nodes), a
            float for numbers, a bool for booleans, or a string
        """

    @property
    def path(self):
        """XPath expression string."""
42
43
class XPathEvaluator:
    """XPath evaluation context with persistent variables and functions."""

    # NOTE(review): lxml's public keyword for this option appears to be
    # ``regexp`` (as on XPath and XSLT) rather than ``enable_regexp`` - confirm.
    def __init__(self, etree_or_element, namespaces=None, extensions=None,
                 enable_regexp=True, smart_strings=True):
        """
        Create XPath evaluator for specific element/document.

        Args:
            etree_or_element: Element or ElementTree to evaluate on
            namespaces: dict mapping prefixes to namespace URIs
            extensions: dict mapping (namespace URI, function name) tuples
                to Python callables
            enable_regexp: Enable EXSLT regular expression functions
            smart_strings: Return string results as "smart" strings
                (lxml.etree._ElementUnicodeResult) that remember their
                parent element via getparent(); pass False to get plain
                Python str objects instead
        """

    def __call__(self, _path, **_variables):
        """Evaluate XPath expression with variables."""

    def evaluate(self, _path, **_variables):
        """Evaluate XPath expression with variables."""

    def register_namespace(self, prefix, uri):
        """Register namespace prefix for this evaluator."""

    def register_namespaces(self, namespaces):
        """Register multiple namespace prefixes."""
70
71
class XPathDocumentEvaluator:
    """XPath evaluator that works in the context of a whole document."""

    def __init__(self, etree, namespaces=None, extensions=None,
                 enable_regexp=True, smart_strings=True):
        """Set up an evaluator bound to the document passed as ``etree``."""

    def __call__(self, _path, **_variables):
        """Run the given XPath expression against the document."""
80
81
# Element XPath methods
82
class Element:
    def xpath(self, _path, namespaces=None, extensions=None,
              smart_strings=True, **_variables):
        """Run an XPath expression with this element as the context node."""
86
```
87
88
### XSLT Transformation
89
90
Apply XSLT stylesheets to transform XML documents with parameters and extension functions.
91
92
```python { .api }
93
class XSLT:
    """Processor that applies an XSLT stylesheet to XML input."""

    def __init__(self, xslt_input, extensions=None, regexp=True,
                 access_control=None):
        """
        Build a processor from a stylesheet.

        Args:
            xslt_input: The stylesheet, given as Element, ElementTree, or file
            extensions: dict of extension function modules
            regexp: Whether EXSLT regular expression functions are enabled
            access_control: XSLTAccessControl instance that restricts what
                the stylesheet may access
        """

    def __call__(self, _input, profile_run=False, **kwargs):
        """
        Run the stylesheet on an XML document.

        Args:
            _input: The Element or ElementTree to be transformed
            profile_run: Turn on XSLT profiling for this run
            **kwargs: Stylesheet parameters, passed by keyword

        Returns:
            ElementTree: The result of the transformation
        """

    def apply(self, _input, **kwargs):
        """Run the transformation and hand back the result tree."""

    def transform(self, _input, **kwargs):
        """Transform a document (equivalent to calling the instance)."""

    @property
    def error_log(self):
        """Log of errors collected during XSLT processing."""

    @staticmethod
    def strparam(s):
        """Turn a Python string into an XSLT string parameter."""
134
135
class XSLTAccessControl:
    """Security access control for XSLT processing to prevent unauthorized file/network access."""

    DENY_ALL = None    # Deny all external access (most secure)
    DENY_WRITE = None  # Deny write operations but allow reads
    # NOTE(review): lxml appears to predefine only DENY_ALL and DENY_WRITE;
    # confirm that DENY_READ exists before relying on it.
    DENY_READ = None   # Deny read operations but allow writes (rarely used)

    def __init__(self, read_file=True, write_file=True, create_dir=True,
                 read_network=True, write_network=True):
        """
        Create access control configuration for XSLT security.

        Every permission defaults to True (access allowed); pass False to
        deny a kind of access, or use DENY_ALL / DENY_WRITE for the common
        restrictive configurations.

        Args:
            read_file: Allow XSLT to read files from filesystem
            write_file: Allow XSLT to write files to filesystem (security risk)
            create_dir: Allow XSLT to create directories (security risk)
            read_network: Allow XSLT to fetch resources over the network (security risk)
            write_network: Allow XSLT to send data over network (security risk)
        """
154
```
155
156
### XML Canonicalization
157
158
XML canonicalization (C14N) for consistent XML representation and digital signatures.
159
160
```python { .api }
161
def canonicalize(xml_input, out=None, from_file=False, **options):
    """
    Convert an XML document to its canonical C14N 2.0 form.

    Args:
        xml_input: XML string, Element, or ElementTree
        out: File or file-like object opened in text mode that receives the
            output through its write() method (optional)
        from_file: File path or file-like object to read the XML from,
            used instead of xml_input
        **options: C14N 2.0 options (the same as for C14NWriterTarget), including:
            - with_comments: bool - Keep comments (default False)
            - strip_text: bool - Strip whitespace around text content (default False)
            - rewrite_prefixes: bool - Replace namespace prefixes with generated ones (default False)
            - qname_aware_tags / qname_aware_attrs: names whose text or value contains QNames
            - exclude_attrs / exclude_tags: attribute or tag names to leave out of the output

    Returns:
        str: Canonicalized XML as text (nothing is returned if out is specified)

    Note:
        Exclusive canonicalization (the C14N 1.0 ``exclusive`` and
        ``inclusive_ns_prefixes`` options) is not available here; use
        ``etree.tostring(tree, method="c14n", exclusive=True)`` or
        ``ElementTree.write_c14n()`` for that.
    """
178
179
class C14NWriterTarget:
    """Parser target that emits canonical XML while the input is being parsed."""

    def __init__(self, write, **c14n_options):
        """
        Set up the C14N writer target.

        Args:
            write: Callable that is handed the canonicalized output
            **c14n_options: Options controlling the C14N canonicalization
        """
190
```
191
192
### Extension Functions
193
194
Create custom XPath and XSLT extension functions.
195
196
```python { .api }
197
class Extension:
    """Common base class from which XSLT extensions derive."""
199
200
class XSLTExtension:
    """Handler for an XSLT extension function."""
202
203
class FunctionNamespace:
    """Namespace that holds XPath extension functions."""

    def __init__(self, namespace_uri):
        """
        Set up a function namespace.

        Args:
            namespace_uri: URI of the namespace the extension functions live in
        """

    def __setitem__(self, function_name, function):
        """Register an extension function under the given name."""

    def __getitem__(self, function_name):
        """Look up a registered extension function by name."""

    def __delitem__(self, function_name):
        """Remove a registered extension function."""
222
```
223
224
### XPath Error Handling
225
226
Exception classes raised by XPath evaluation, XSLT processing, and XML canonicalization.
227
228
```python { .api }
229
class XPathError(LxmlError):
    """Root of the XPath error hierarchy."""


class XPathEvalError(XPathError):
    """Raised when evaluating an XPath expression fails."""


class XPathSyntaxError(XPathError):
    """Raised for a syntax error in an XPath expression."""


class XPathResultError(XPathError):
    """Raised for an XPath result type error."""


class XPathFunctionError(XPathError):
    """Raised when a call to an XPath function fails."""


class XSLTError(LxmlError):
    """Root of the XSLT error hierarchy."""


class XSLTParseError(XSLTError):
    """Raised when an XSLT stylesheet cannot be parsed."""


class XSLTApplyError(XSLTError):
    """Raised when applying an XSLT transformation fails."""


class XSLTSaveError(XSLTError):
    """Raised when saving an XSLT result fails."""


class XSLTExtensionError(XSLTError):
    """Raised for an error in an XSLT extension function."""


class C14NError(LxmlError):
    """Raised for an XML canonicalization error."""
261
```
262
263
## Usage Examples
264
265
### Basic XPath Queries
266
267
```python
268
from lxml import etree
269
270
# Sample XML document
271
xml_data = '''<?xml version="1.0"?>
272
<library xmlns:book="http://example.com/book">
273
<book:catalog>
274
<book:item id="1" category="fiction">
275
<book:title>The Great Gatsby</book:title>
276
<book:author>F. Scott Fitzgerald</book:author>
277
<book:year>1925</book:year>
278
<book:price currency="USD">12.99</book:price>
279
</book:item>
280
<book:item id="2" category="science">
281
<book:title>A Brief History of Time</book:title>
282
<book:author>Stephen Hawking</book:author>
283
<book:year>1988</book:year>
284
<book:price currency="USD">15.99</book:price>
285
</book:item>
286
<book:item id="3" category="fiction">
287
<book:title>To Kill a Mockingbird</book:title>
288
<book:author>Harper Lee</book:author>
289
<book:year>1960</book:year>
290
<book:price currency="USD">11.99</book:price>
291
</book:item>
292
</book:catalog>
293
</library>'''
294
295
root = etree.fromstring(xml_data)
296
297
# Define namespace mapping
298
namespaces = {'b': 'http://example.com/book'}
299
300
# Basic XPath queries
301
all_books = root.xpath('//b:item', namespaces=namespaces)
302
print(f"Found {len(all_books)} books")
303
304
fiction_books = root.xpath('//b:item[@category="fiction"]', namespaces=namespaces)
305
print(f"Fiction books: {len(fiction_books)}")
306
307
# Extract text content
308
titles = root.xpath('//b:title/text()', namespaces=namespaces)
309
print(f"Book titles: {titles}")
310
311
# Extract attributes
312
book_ids = root.xpath('//b:item/@id', namespaces=namespaces)
313
print(f"Book IDs: {book_ids}")
314
315
# Complex queries with predicates
316
expensive_books = root.xpath('//b:item[number(b:price) > 13]', namespaces=namespaces)
317
recent_books = root.xpath('//b:item[b:year > 1950]', namespaces=namespaces)
318
319
print(f"Expensive books: {len(expensive_books)}")
320
print(f"Recent books: {len(recent_books)}")
321
322
# XPath functions
323
# lxml implements XPath 1.0, which has no min(); instead select the item
# whose year is not greater than any other year in the document
oldest_book = root.xpath('//b:item[not(b:year > //b:year)]/b:title/text()', namespaces=namespaces)
324
print(f"Oldest book: {oldest_book[0] if oldest_book else 'None'}")
325
```
326
327
### Compiled XPath Expressions
328
329
```python
330
from lxml import etree
331
332
xml_data = '''
333
<products>
334
<product id="1" price="19.99" category="electronics">
335
<name>Widget</name>
336
<stock>15</stock>
337
</product>
338
<product id="2" price="29.99" category="electronics">
339
<name>Gadget</name>
340
<stock>8</stock>
341
</product>
342
<product id="3" price="9.99" category="books">
343
<name>Manual</name>
344
<stock>25</stock>
345
</product>
346
</products>
347
'''
348
349
root = etree.fromstring(xml_data)
350
351
# Compile XPath expressions for reuse
352
find_by_category = etree.XPath('//product[@category=$cat]')
353
find_by_price_range = etree.XPath('//product[number(@price) >= $min and number(@price) <= $max]')
354
count_in_stock = etree.XPath('sum(//product[@category=$cat]/stock)')
355
356
# Use compiled expressions with variables
357
electronics = find_by_category(root, cat='electronics')
358
print(f"Electronics products: {len(electronics)}")
359
360
affordable = find_by_price_range(root, min=10, max=25)
361
print(f"Affordable products: {len(affordable)}")
362
363
electronics_stock = count_in_stock(root, cat='electronics')
364
print(f"Total electronics in stock: {electronics_stock}")
365
366
# XPath evaluator for persistent context
367
evaluator = etree.XPathEvaluator(root)
368
evaluator.register_namespace('p', 'http://example.com/products')
369
370
# Evaluate multiple expressions with same context
371
product_count = evaluator('count(//product)')
372
avg_price = evaluator('sum(//product/@price) div count(//product)')
373
# distinct-values() is XPath 2.0 and not available in lxml (XPath 1.0 only),
# so select all category attributes and de-duplicate them in Python
categories = sorted(set(evaluator('//product/@category')))
374
375
print(f"Products: {product_count}, Average price: ${avg_price:.2f}")
376
```
377
378
### XSLT Transformations
379
380
```python
381
from lxml import etree
382
383
# XML data to transform
384
xml_data = '''<?xml version="1.0"?>
385
<catalog>
386
<book id="1">
387
<title>Python Programming</title>
388
<author>John Smith</author>
389
<year>2023</year>
390
<price>29.99</price>
391
</book>
392
<book id="2">
393
<title>Web Development</title>
394
<author>Jane Doe</author>
395
<year>2022</year>
396
<price>34.95</price>
397
</book>
398
</catalog>'''
399
400
# XSLT stylesheet
401
xslt_stylesheet = '''<?xml version="1.0"?>
402
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
403
<xsl:param name="format" select="'html'"/>
404
<xsl:param name="title" select="'Book Catalog'"/>
405
406
<xsl:template match="/">
407
<xsl:choose>
408
<xsl:when test="$format='html'">
409
<html>
410
<head><title><xsl:value-of select="$title"/></title></head>
411
<body>
412
<h1><xsl:value-of select="$title"/></h1>
413
<table border="1">
414
<tr>
415
<th>Title</th>
416
<th>Author</th>
417
<th>Year</th>
418
<th>Price</th>
419
</tr>
420
<xsl:for-each select="catalog/book">
421
<xsl:sort select="year" order="descending"/>
422
<tr>
423
<td><xsl:value-of select="title"/></td>
424
<td><xsl:value-of select="author"/></td>
425
<td><xsl:value-of select="year"/></td>
426
<td>$<xsl:value-of select="price"/></td>
427
</tr>
428
</xsl:for-each>
429
</table>
430
</body>
431
</html>
432
</xsl:when>
433
<xsl:otherwise>
434
<book-list>
435
<xsl:for-each select="catalog/book">
436
<item>
437
<xsl:value-of select="title"/> by <xsl:value-of select="author"/> (<xsl:value-of select="year"/>)
438
</item>
439
</xsl:for-each>
440
</book-list>
441
</xsl:otherwise>
442
</xsl:choose>
443
</xsl:template>
444
</xsl:stylesheet>'''
445
446
# Parse XML and XSLT
447
xml_doc = etree.fromstring(xml_data)
448
xslt_doc = etree.fromstring(xslt_stylesheet)
449
450
# Create XSLT processor
451
transform = etree.XSLT(xslt_doc)
452
453
# Transform with parameters
454
html_result = transform(xml_doc, format="'html'", title="'My Book Collection'")
455
print("HTML transformation:")
456
print(etree.tostring(html_result, pretty_print=True, encoding='unicode'))
457
458
# Transform with different parameters
459
text_result = transform(xml_doc, format="'text'")
460
print("\nText transformation:")
461
print(etree.tostring(text_result, pretty_print=True, encoding='unicode'))
462
463
# Check for transformation errors
464
if transform.error_log:
465
print("XSLT errors:")
466
for error in transform.error_log:
467
print(f" {error}")
468
```
469
470
### Extension Functions
471
472
```python
473
from lxml import etree
474
475
# Define custom extension functions
476
def custom_format_price(context, price_list, currency='USD'):
    """Format price with currency symbol.

    Args:
        context: XPath evaluation context supplied by the caller (unused).
        price_list: The price argument. A node-set arrives as a list of
            elements or strings; a string or number arrives as a plain value.
        currency: Currency code; codes without a known symbol are used
            verbatim as the prefix.

    Returns:
        str: The formatted price, e.g. "$29.99", or '' when the argument
        is empty.
    """
    if not price_list:
        return ''
    # Only index into real sequences: indexing a plain string argument would
    # pick its first character, and a number cannot be indexed at all.
    value = price_list[0] if isinstance(price_list, (list, tuple)) else price_list
    # Elements cannot be passed to float(); use their text content instead.
    if hasattr(value, 'itertext'):
        value = ''.join(value.itertext())
    price = float(value)
    symbols = {'USD': '$', 'EUR': '€', 'GBP': '£'}
    symbol = symbols.get(currency, currency)
    return f"{symbol}{price:.2f}"
484
485
def custom_word_count(context, text_list):
    """Count words in text.

    Args:
        context: XPath evaluation context supplied by the caller (unused).
        text_list: The text argument. A node-set arrives as a list of
            elements or strings; a string arrives as a plain value.

    Returns:
        int: Number of whitespace-separated words in the first item, or 0
        when the argument is empty.
    """
    if not text_list:
        return 0
    # Only index into real sequences: indexing a plain string argument would
    # pick its first character.
    value = text_list[0] if isinstance(text_list, (list, tuple)) else text_list
    # str() of an element yields its repr, not its content; count the words
    # of the element's text content instead.
    if hasattr(value, 'itertext'):
        value = ''.join(value.itertext())
    return len(str(value).split())
491
492
# Create extension namespace
493
ns = etree.FunctionNamespace('http://example.com/functions')
494
ns['format-price'] = custom_format_price
495
ns['word-count'] = custom_word_count
496
497
# XML with custom processing
498
xml_data = '''
499
<products>
500
<product>
501
<name>Programming Guide</name>
502
<description>A comprehensive guide to Python programming for beginners and experts</description>
503
<price>29.99</price>
504
</product>
505
<product>
506
<name>Quick Reference</name>
507
<description>Essential commands and functions</description>
508
<price>15.50</price>
509
</product>
510
</products>
511
'''
512
513
# XSLT using extension functions
514
xslt_with_extensions = '''<?xml version="1.0"?>
515
<xsl:stylesheet version="1.0"
516
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
517
xmlns:custom="http://example.com/functions">
518
519
<xsl:template match="/">
520
<product-report>
521
<xsl:for-each select="products/product">
522
<item>
523
<name><xsl:value-of select="name"/></name>
524
<formatted-price>
525
<xsl:value-of select="custom:format-price(price, 'USD')"/>
526
</formatted-price>
527
<description-length>
528
<xsl:value-of select="custom:word-count(description)"/> words
529
</description-length>
530
</item>
531
</xsl:for-each>
532
</product-report>
533
</xsl:template>
534
</xsl:stylesheet>
535
'''
536
537
# Transform using extensions
538
xml_doc = etree.fromstring(xml_data)
539
xslt_doc = etree.fromstring(xslt_with_extensions)
540
541
# Create transform with extensions enabled
542
extensions = {('http://example.com/functions', 'format-price'): custom_format_price,
543
('http://example.com/functions', 'word-count'): custom_word_count}
544
545
transform = etree.XSLT(xslt_doc, extensions=extensions)
546
result = transform(xml_doc)
547
548
print("Result with extension functions:")
549
print(etree.tostring(result, pretty_print=True, encoding='unicode'))
550
```
551
552
### XML Canonicalization
553
554
```python
555
from lxml import etree
556
557
# XML document with varying whitespace and attribute order
558
xml_data = '''<?xml version="1.0"?>
559
<root xmlns:a="http://example.com/a"
560
xmlns:b="http://example.com/b">
561
562
<element b:attr="value2" a:attr="value1" >
563
<child> text content </child>
564
<!-- This is a comment -->
565
<another-child/>
566
</element>
567
568
</root>'''
569
570
# Parse document
571
doc = etree.fromstring(xml_data)
572
573
# Basic canonicalization (C14N 2.0). canonicalize() returns text (str), not
# bytes, and leaves comments out unless with_comments=True is passed.
canonical_xml = etree.canonicalize(xml_data)
print("Canonical XML (default):")
print(canonical_xml)

# Canonicalization without comments
canonical_no_comments = etree.canonicalize(xml_data, with_comments=False)
print("\nCanonical XML (no comments):")
print(canonical_no_comments)

# Exclusive canonicalization is a C14N 1.0 feature that canonicalize() does
# not accept; it is available through tostring(), which returns bytes.
canonical_exclusive = etree.tostring(doc, method='c14n', exclusive=True)
print("\nExclusive canonical XML:")
print(canonical_exclusive.decode('utf-8'))

# Canonicalize to file (the output is text, so open the file in text mode)
with open('/tmp/canonical.xml', 'w', encoding='utf-8') as f:
    etree.canonicalize(xml_data, out=f)

# Using C14N writer target during parsing
output_parts = []
def write_canonical(data):
    output_parts.append(data)

target = etree.C14NWriterTarget(write_canonical, with_comments=False)
parser = etree.XMLParser(target=target)
etree.fromstring(xml_data, parser)

print("\nCanonical XML via writer target:")
# The writer target passes text chunks to the callback, so join them as str
print(''.join(output_parts))
603
```
604
605
### Advanced XPath with Namespaces
606
607
```python
608
from lxml import etree
609
610
# Complex XML with multiple namespaces
611
xml_data = '''<?xml version="1.0"?>
612
<root xmlns="http://example.com/default"
613
xmlns:meta="http://example.com/metadata"
614
xmlns:content="http://example.com/content">
615
616
<meta:info>
617
<meta:created>2023-12-07</meta:created>
618
<meta:author>John Doe</meta:author>
619
</meta:info>
620
621
<content:document>
622
<content:section id="intro">
623
<content:title>Introduction</content:title>
624
<content:paragraph>This is the introduction.</content:paragraph>
625
</content:section>
626
<content:section id="main">
627
<content:title>Main Content</content:title>
628
<content:paragraph>This is the main content.</content:paragraph>
629
<content:subsection>
630
<content:title>Subsection</content:title>
631
<content:paragraph>Subsection content.</content:paragraph>
632
</content:subsection>
633
</content:section>
634
</content:document>
635
636
</root>'''
637
638
root = etree.fromstring(xml_data)
639
640
# Define comprehensive namespace mappings
641
namespaces = {
642
'default': 'http://example.com/default',
643
'meta': 'http://example.com/metadata',
644
'content': 'http://example.com/content'
645
}
646
647
# Complex XPath queries with namespaces
648
author = root.xpath('//meta:author/text()', namespaces=namespaces)
649
print(f"Author: {author[0] if author else 'Unknown'}")
650
651
# Find all sections and subsections
652
sections = root.xpath('//content:section | //content:subsection', namespaces=namespaces)
653
print(f"Found {len(sections)} sections")
654
655
# Extract titles with context
656
titles_with_id = root.xpath('//content:section[@id]/content:title/text()', namespaces=namespaces)
657
for title in titles_with_id:
658
print(f"Section title: {title}")
659
660
# Count paragraphs in main section
661
main_paragraphs = root.xpath('count(//content:section[@id="main"]//content:paragraph)', namespaces=namespaces)
662
print(f"Paragraphs in main section: {main_paragraphs}")
663
664
# Build document outline. lxml implements XPath 1.0, which has no
# "for ... return" expression, so loop over the sections in Python and
# evaluate a compiled expression relative to each section.
find_sections = etree.XPath('//content:section', namespaces=namespaces)
outline_xpath = etree.XPath('concat(@id, ": ", content:title/text())', namespaces=namespaces)

outline = [outline_xpath(section) for section in find_sections(root)]
print("Document outline:")
for item in outline:
    print(f"  {item}")
674
```