Tessl Tile for pypi/lxml@6.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

etree-core.md html-processing.md index.md objectify-api.md utility-modules.md validation.md xpath-xslt.md

etree-core.mddocs/

0
# Core XML/HTML Processing
1

2
Comprehensive ElementTree-compatible API for XML and HTML document parsing, manipulation, and serialization. This module provides the foundation for all lxml functionality with full standards compliance, namespace support, and high-performance processing.
3

4
## Capabilities
5

6
### Document Parsing
7

8
Parse XML and HTML documents from strings, files, URLs, or file-like objects with configurable parsers and error handling.
9

10
```python { .api }
11
def parse(source, parser=None, base_url=None):
12
    """
13
    Parse XML/HTML document from file, URL, or file-like object.
14
    
15
    Args:
16
        source: Filename or file path, URL, or file-like object
17
        parser: XMLParser or HTMLParser instance (optional)
18
        base_url: Base URL for resolving relative references (optional)
19
    
20
    Returns:
21
        ElementTree: Parsed document tree
22
    """
23

24
def fromstring(text, parser=None, base_url=None):
25
    """
26
    Parse XML/HTML document from string.
27
    
28
    Args:
29
        text: str or bytes containing XML/HTML content
30
        parser: XMLParser or HTMLParser instance (optional)
31
        base_url: Base URL for resolving relative references (optional)
32
    
33
    Returns:
34
        Element: Root element of parsed document
35
    """
36

37
def XML(text, parser=None, base_url=None):
38
    """
39
    Parse XML document or fragment from a string constant (no validation unless the parser is configured for it).
40
    
41
    Args:
42
        text: str or bytes containing XML content
43
        parser: XMLParser instance (optional)
44
        base_url: Base URL for resolving relative references (optional)
45
    
46
    Returns:
47
        Element: Root element of parsed XML
48
    """
49

50
def HTML(text, parser=None, base_url=None):
51
    """
52
    Parse HTML string with lenient parsing.
53
    
54
    Args:
55
        text: str or bytes containing HTML content
56
        parser: HTMLParser instance (optional)
57
        base_url: Base URL for resolving relative references (optional)
58
    
59
    Returns:
60
        Element: Root element of parsed HTML
61
    """
62
```
63

64
### Incremental Parsing
65

66
Memory-efficient parsing for large documents using event-driven processing.
67

68
```python { .api }
69
def iterparse(source, events=None, tag=None, attribute_defaults=False, 
70
              dtd_validation=False, load_dtd=False, no_network=True, 
71
              remove_blank_text=False, remove_comments=False, 
72
              remove_pis=False, encoding=None, huge_tree=False, 
73
              schema=None):
74
    """
75
    Incrementally parse XML document yielding (event, element) pairs.
76
    
77
    Args:
78
        source: File path, URL, or file-like object
79
        events: tuple of events to report ('start', 'end', 'start-ns', 'end-ns', 'comment', 'pi')
80
        tag: str or sequence of tag names to filter
81
        
82
    Yields:
83
        tuple: (event, element) pairs during parsing
84
    """
85

86
def iterwalk(element_or_tree, events=('end',), tag=None):
87
    """
88
    Walk through existing element tree yielding events.
89
    
90
    Args:
91
        element_or_tree: Element or ElementTree to walk
92
        events: tuple of events to report ('start', 'end')
93
        tag: str or sequence of tag names to filter
94
        
95
    Yields:
96
        tuple: (event, element) pairs during traversal
97
    """
98
```
99

100
### Element Creation and Manipulation
101

102
Create and modify XML/HTML elements with full attribute and content support.
103

104
```python { .api }
105
class Element:
106
    """XML/HTML element with tag, attributes, text, and children."""
107
    
108
    def __init__(self, tag, attrib=None, nsmap=None, **extra):
109
        """
110
        Create new element.
111
        
112
        Args:
113
            tag: Element tag name (str or QName)
114
            attrib: dict of attributes (optional)
115
            nsmap: dict mapping namespace prefixes to URIs (optional)
116
            **extra: Additional attributes as keyword arguments
117
        """
118
    
119
    # Element properties
120
    tag: str                    # Element tag name
121
    text: str | None           # Text content before first child
122
    tail: str | None           # Text content after element
123
    attrib: dict[str, str]     # Element attributes
124
    nsmap: dict[str, str]      # Namespace mapping
125
    sourceline: int | None     # Source line number (if available)
126
    
127
    # Tree navigation
128
    def find(self, path, namespaces=None):
129
        """Find first child element matching path."""
130
    
131
    def findall(self, path, namespaces=None):
132
        """Find all child elements matching path."""
133
    
134
    def iterfind(self, path, namespaces=None):
135
        """Iterate over child elements matching path."""
136
    
137
    def findtext(self, path, default=None, namespaces=None):
138
        """Find text content of first matching child element."""
139
    
140
    def xpath(self, _path, namespaces=None, extensions=None, 
141
              smart_strings=True, **_variables):
142
        """Evaluate XPath expression on element."""
143
    
144
    # Tree modification
145
    def append(self, element):
146
        """Add element as last child."""
147
    
148
    def insert(self, index, element):
149
        """Insert element at specified position."""
150
    
151
    def remove(self, element):
152
        """Remove child element."""
153
    
154
    def clear(self):
155
        """Reset element: remove all children and attributes, and set text and tail to None."""
156
    
157
    # Attribute access
158
    def get(self, key, default=None):
159
        """Get attribute value."""
160
    
161
    def set(self, key, value):
162
        """Set attribute value."""
163
    
164
    def keys(self):
165
        """Get attribute names."""
166
    
167
    def values(self):
168
        """Get attribute values."""
169
    
170
    def items(self):
171
        """Get (name, value) pairs for attributes."""
172

173
def SubElement(parent, tag, attrib=None, nsmap=None, **extra):
174
    """
175
    Create child element and add to parent.
176
    
177
    Args:
178
        parent: Parent Element
179
        tag: Child element tag name
180
        attrib: dict of attributes (optional)
181
        nsmap: dict of namespace mappings (optional)
182
        **extra: Additional attributes
183
    
184
    Returns:
185
        Element: New child element
186
    """
187
```
188

189
### Document Trees
190

191
Manage complete XML/HTML documents with document-level operations.
192

193
```python { .api }
194
class ElementTree:
195
    """Document tree containing root element and document info."""
196
    
197
    def __init__(self, element=None, file=None, parser=None):
198
        """
199
        Create document tree.
200
        
201
        Args:
202
            element: Root element (optional)
203
            file: File to parse (optional)
204
            parser: Parser instance (optional)
205
        """
206
    
207
    def getroot(self):
208
        """Get root element."""
209
    
210
    def setroot(self, root):
211
        """Set root element."""
212
    
213
    def parse(self, source, parser=None, base_url=None):
214
        """Parse document from source."""
215
    
216
    def write(self, file, encoding=None, xml_declaration=None, 
217
              default_namespace=None, method="xml", pretty_print=False,
218
              with_tail=True, standalone=None, compression=0, 
219
              exclusive=False, inclusive_ns_prefixes=None, 
220
              with_comments=True, strip_cdata=True):
221
        """Write document to file."""
222
    
223
    def xpath(self, _path, namespaces=None, extensions=None, 
224
              smart_strings=True, **_variables):
225
        """Evaluate XPath expression on document."""
226
    
227
    def xslt(self, _xslt, extensions=None, access_control=None, **_kw):
228
        """Apply XSLT transformation."""
229
    
230
    def relaxng(self, relaxng):
231
        """Validate against RelaxNG schema."""
232
    
233
    def xmlschema(self, xmlschema):
234
        """Validate against XML Schema."""
235
    
236
    def xinclude(self):
237
        """Process XInclude directives."""
238

239
    @property 
240
    def docinfo(self):
241
        """Document information (encoding, version, etc.)."""
242
```
243

244
### Serialization
245

246
Convert elements and trees to strings or bytes with formatting options.
247

248
```python { .api }
249
def tostring(element_or_tree, encoding=None, method="xml", 
250
             xml_declaration=None, pretty_print=False, with_tail=True, 
251
             standalone=None, doctype=None, exclusive=False, 
252
             inclusive_ns_prefixes=None, with_comments=True, 
253
             strip_cdata=True):
254
    """
255
    Serialize element or tree to string/bytes.
256
    
257
    Args:
258
        element_or_tree: Element or ElementTree to serialize
259
        encoding: Output encoding ('unicode' for str, bytes encoding for bytes)
260
        method: Serialization method ('xml', 'html', 'text', 'c14n')
261
        xml_declaration: Include XML declaration (bool or None for auto)
262
        pretty_print: Format output with whitespace (bool)
263
        with_tail: Include tail text (bool)
264
        doctype: Document type declaration (str)
265
        
266
    Returns:
267
        str or bytes: Serialized document
268
    """
269

270
def tostringlist(element_or_tree, encoding=None, method="xml", 
271
                 xml_declaration=None, pretty_print=False, with_tail=True, 
272
                 standalone=None, doctype=None, exclusive=False, 
273
                 inclusive_ns_prefixes=None, with_comments=True, 
274
                 strip_cdata=True):
275
    """Serialize to list of strings/bytes."""
276

277
def tounicode(element_or_tree, method="xml", pretty_print=False, 
278
              with_tail=True, doctype=None):
279
    """Serialize to unicode string (deprecated; use tostring() with encoding='unicode')."""
280

281
def dump(elem):
282
    """Debug dump element structure to stdout."""
283
```
284

285
### Parser Configuration
286

287
Configurable parsers for different XML/HTML processing needs.
288

289
```python { .api }
290
class XMLParser:
291
    """Configurable XML parser with validation and processing options."""
292
    
293
    def __init__(self, encoding=None, attribute_defaults=False,
294
                 dtd_validation=False, load_dtd=False, no_network=True,
295
                 ns_clean=False, recover=False, schema=None,
296
                 huge_tree=False, remove_blank_text=False,
297
                 resolve_entities=True, remove_comments=False,
298
                 remove_pis=False, strip_cdata=True, collect_ids=True,
299
                 target=None, compact=True):
300
        """
301
        Create XML parser with specified options.
302
        
303
        Args:
304
            encoding: Character encoding override
305
            attribute_defaults: Load default attributes from DTD
306
            dtd_validation: Enable DTD validation
307
            load_dtd: Load and parse DTD
308
            no_network: Disable network access
309
            recover: Enable error recovery
310
            huge_tree: Support very large documents
311
            remove_blank_text: Remove whitespace-only text nodes
312
            remove_comments: Remove comment nodes
313
            remove_pis: Remove processing instruction nodes
314
        """
315

316
class HTMLParser:
317
    """Lenient HTML parser with automatic error recovery."""
318
    
319
    def __init__(self, encoding=None, remove_blank_text=False,
320
                 remove_comments=False, remove_pis=False, 
321
                 strip_cdata=True, no_network=True, target=None,
322
                 schema=None, recover=True, compact=True):
323
        """Create HTML parser with specified options."""
324

325
def get_default_parser():
326
    """Get current default parser."""
327

328
def set_default_parser(parser):
329
    """Set global default parser."""
330
```
331

332
### Tree Manipulation Utilities
333

334
High-level functions for common tree modification operations.
335

336
```python { .api }
337
def cleanup_namespaces(tree_or_element):
338
    """Remove unused namespace declarations."""
339

340
def strip_attributes(tree_or_element, *attribute_names):
341
    """Remove specified attributes from all elements."""
342

343
def strip_elements(tree_or_element, *tag_names, with_tail=True):
344
    """Remove elements with specified tag names."""
345

346
def strip_tags(tree_or_element, *tag_names):
347
    """Remove tags but keep text content."""
348

349
def register_namespace(prefix, uri):
350
    """Register namespace prefix for serialization."""
351
```
352

353
### Node Type Classes
354

355
Specialized classes for different XML node types.
356

357
```python { .api }
358
class Comment:
359
    """XML comment node."""
360
    def __init__(self, text=None): ...
361

362
class ProcessingInstruction:
363
    """XML processing instruction node."""
364
    def __init__(self, target, text=None): ...
365
    
366
    @property
367
    def target(self) -> str: ...
368

369
class Entity:
370
    """XML entity reference node."""
371
    def __init__(self, name): ...
372
    
373
    @property
374
    def name(self) -> str: ...
375

376
class CDATA:
377
    """XML CDATA section."""
378
    def __init__(self, data): ...
379

380
# Factory functions
381
def Comment(text=None):
382
    """Create comment node."""
383

384
def ProcessingInstruction(target, text=None):
385
    """Create processing instruction node."""
386
    
387
PI = ProcessingInstruction  # Alias
388
```
389

390
## Usage Examples
391

392
### Basic XML Processing
393

394
```python
395
from lxml import etree
396

397
# Parse XML document
398
xml_data = '''<?xml version="1.0"?>
399
<catalog>
400
    <book id="1" category="fiction">
401
        <title>The Great Gatsby</title>
402
        <author>F. Scott Fitzgerald</author>
403
        <year>1925</year>
404
        <price currency="USD">12.99</price>
405
    </book>
406
    <book id="2" category="science">
407
        <title>A Brief History of Time</title>
408
        <author>Stephen Hawking</author>
409
        <year>1988</year>
410
        <price currency="USD">15.99</price>
411
    </book>
412
</catalog>'''
413

414
root = etree.fromstring(xml_data)
415

416
# Navigate and query
417
books = root.findall('book')
418
fiction_books = root.xpath('//book[@category="fiction"]')
419
titles = root.xpath('//title/text()')
420

421
# Modify content
422
new_book = etree.SubElement(root, 'book', id="3", category="mystery")
423
etree.SubElement(new_book, 'title').text = "The Murder Mystery"
424
etree.SubElement(new_book, 'author').text = "Agatha Christie"
425
etree.SubElement(new_book, 'year').text = "1934"
426
price_elem = etree.SubElement(new_book, 'price', currency="USD")
427
price_elem.text = "11.99"
428

429
# Serialize with formatting
430
output = etree.tostring(root, pretty_print=True, encoding='unicode')
431
print(output)
432
```
433

434
### HTML Document Processing
435

436
```python
437
from lxml import etree
438

439
# Sample HTML document (parsed below with the lenient HTML parser)
440
html_data = '''<!DOCTYPE html>
441
<html>
442
<head>
443
    <title>Sample Page</title>
444
    <meta charset="UTF-8"/>
445
</head>
446
<body>
447
    <h1>Welcome</h1>
448
    <div class="content">
449
        <p>This is a paragraph.</p>
450
        <ul>
451
            <li>Item 1</li>
452
            <li>Item 2</li>
453
        </ul>
454
    </div>
455
</body>
456
</html>'''
457

458
# Use HTML parser for lenient parsing
459
parser = etree.HTMLParser()
460
doc = etree.fromstring(html_data, parser)
461

462
# Find elements
463
title = doc.find('.//title').text
464
content_div = doc.find('.//div[@class="content"]')
465
list_items = doc.xpath('//li/text()')
466

467
print(f"Title: {title}")
468
print(f"List items: {list_items}")
469
```
470

471
### Error Handling
472

473
```python
474
from lxml import etree
475

476
try:
477
    # This will raise XMLSyntaxError due to unclosed tag
478
    bad_xml = '<root><child></root>'
479
    etree.fromstring(bad_xml)
480
except etree.XMLSyntaxError as e:
481
    print(f"XML Error: {e}")
482
    print(f"Line: {e.lineno}, Column: {e.offset}")
483

484
# Use recovery parser for malformed XML
485
try:
486
    parser = etree.XMLParser(recover=True)
487
    root = etree.fromstring(bad_xml, parser)
488
    print("Recovered:", etree.tostring(root, encoding='unicode'))
489
except Exception as e:
490
    print(f"Recovery failed: {e}")
491
```

Version

Tile

Files

etree-core.md docs/

etree-core.mddocs/