Tessl Tile for pypi/lxml@6.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

etree-core.md html-processing.md index.md objectify-api.md utility-modules.md validation.md xpath-xslt.md

html-processing.mddocs/

0
# HTML Processing
1

2
Specialized HTML document processing with web-specific features including lenient parsing, form handling, link processing, CSS class manipulation, and HTML5 support. The html module provides a high-level interface optimized for working with HTML documents in web applications.
3

4
## Capabilities
5

6
### HTML Document Parsing
7

8
Parse HTML documents with lenient parsing that handles malformed HTML gracefully.
9

10
```python { .api }
11
def parse(filename_or_url, parser=None, base_url=None, **kwargs):
12
    """
13
    Parse HTML document from file or URL.
14
    
15
    Args:
16
        filename_or_url: Path to file or URL to parse
17
        parser: HTMLParser instance (optional)
18
        base_url: Base URL for resolving relative links (optional)
19
        **kwargs: Additional arguments passed to parser
20
    
21
    Returns:
22
        ElementTree: Parsed HTML document tree
23
    """
24

25
def document_fromstring(html, parser=None, ensure_head_body=False, base_url=None):
26
    """
27
    Parse complete HTML document from string.
28
    
29
    Args:
30
        html: str or bytes containing HTML content
31
        parser: HTMLParser instance (optional)
32
        ensure_head_body: Ensure document has <head> and <body> elements
33
        base_url: Base URL for resolving relative references
34
    
35
    Returns:
36
        Element: Root <html> element
37
    """
38

39
def fragment_fromstring(html, create_parent=False, tag=None, base_url=None, parser=None):
40
    """
41
    Parse HTML fragment from string.
42
    
43
    Args:
44
        html: str or bytes containing HTML fragment
45
        create_parent: Wrap fragment in a parent element (True wraps in <div>; a tag name string wraps in that tag)
46
        tag: Parent tag name if create_parent=True
47
        base_url: Base URL for resolving relative references
48
        parser: HTMLParser instance (optional)
49
    
50
    Returns:
51
        Element: Fragment root element or parent element
52
    """
53

54
def fragments_fromstring(html, no_leading_text=False, base_url=None, parser=None):
55
    """
56
    Parse HTML string into list of elements and text.
57
    
58
    Args:
59
        html: str or bytes containing HTML fragments
60
        no_leading_text: If true, raise ParserError when the input starts with text, so the result contains only elements
61
        base_url: Base URL for resolving relative references
62
        parser: HTMLParser instance (optional)
63
    
64
    Returns:
65
        list: Parsed elements; the first item may be a string holding any leading text
66
    """
67

68
def fromstring(html, base_url=None, parser=None):
69
    """
70
    Intelligently parse HTML as document or fragment.
71
    
72
    Args:
73
        html: str or bytes containing HTML content
74
        base_url: Base URL for resolving relative references
75
        parser: HTMLParser instance (optional)
76
    
77
    Returns:
78
        Element: Root element (document or fragment)
79
    """
80
```
81

82
### HTML Element Classes
83

84
HTML-specific element classes with web functionality.
85

86
```python { .api }
87
class HtmlElement:
88
    """Base HTML element class with HTML-specific methods."""
89
    
90
    # CSS class manipulation
91
    def get_class(self):
92
        """Get CSS classes as set-like object."""
93
    
94
    def set_class(self, classes):
95
        """Set CSS classes from string or iterable."""
96
    
97
    classes = property(get_class, set_class)
98
    
99
    # Link processing
100
    def make_links_absolute(self, base_url=None, resolve_base_href=True):
101
        """Make all relative links absolute."""
102
    
103
    def resolve_base_href(self, handle_failures=True):
104
        """Apply base href to relative links."""
105
    
106
    def iterlinks(self):
107
        """Iterate over all links in element."""
108
    
109
    def rewrite_links(self, link_repl_func, resolve_base_href=True, base_href=None):
110
        """Rewrite links using callback function."""
111
    
112
    # Content extraction
113
    def text_content(self):
114
        """Return the text of this element and all its descendants, without markup (whitespace is not normalized)."""
115
    
116
    def drop_tree(self):
117
        """Remove element and children from document."""
118
    
119
    def drop_tag(self):
120
        """Remove element tag but keep children."""
121
    
122
    # Document-level properties (forms and body of the document)
123
    @property
124
    def forms(self):
125
        """List of form elements in document."""
126
    
127
    @property
128
    def body(self):
129
        """Document body element (for document root)."""
130

131
class HtmlComment(HtmlElement):
132
    """HTML comment element."""
133

134
class HtmlEntity(HtmlElement):  
135
    """HTML entity element."""
136

137
class HtmlProcessingInstruction(HtmlElement):
138
    """HTML processing instruction element."""
139
```
140

141
### Form Handling
142

143
Specialized classes for working with HTML forms and form elements.
144

145
```python { .api }
146
class FormElement(HtmlElement):
147
    """HTML form element with submission capabilities."""
148
    
149
    @property
150
    def inputs(self):
151
        """Dictionary-like access to form inputs."""
152
    
153
    @property  
154
    def fields(self):
155
        """Dict-like mapping of form field names to their current values (use `inputs` to get the elements)."""
156
    
157
    @property
158
    def action(self):
159
        """Form action URL."""
160
    
161
    @property
162
    def method(self):
163
        """Form submission method (GET/POST)."""
164
    
165
    def form_values(self):
166
        """Get list of (name, value) pairs for form submission."""
167
    
168
    def _name_values(self):
169
        """Internal method for getting form data."""
170

171
class InputElement(HtmlElement):
172
    """HTML input element."""
173
    
174
    @property
175
    def name(self):
176
        """Input name attribute."""
177
    
178
    @property
179
    def value(self):
180
        """Input value."""
181
    
182
    @value.setter
183
    def value(self, value):
184
        """Set input value."""
185
    
186
    @property
187
    def type(self):
188
        """Input type (text, password, checkbox, etc.)."""
189
    
190
    @property
191
    def checked(self):
192
        """Checked state for checkbox/radio inputs."""
193
    
194
    @checked.setter
195
    def checked(self, checked):
196
        """Set checked state."""
197

198
class SelectElement(HtmlElement):
199
    """HTML select element."""
200
    
201
    @property
202
    def value(self):
203
        """Selected value(s)."""
204
    
205
    @value.setter  
206
    def value(self, value):
207
        """Set selected value(s)."""
208
    
209
    @property
210
    def value_options(self):
211
        """List of possible values."""
212
    
213
    @property
214
    def multiple(self):
215
        """Multiple selection enabled."""
216

217
class TextareaElement(HtmlElement):
218
    """HTML textarea element."""
219
    
220
    @property
221
    def value(self):
222
        """Textarea content."""
223
    
224
    @value.setter
225
    def value(self, value):
226
        """Set textarea content."""
227

228
class LabelElement(HtmlElement):
229
    """HTML label element."""
230
    
231
    @property
232
    def for_element(self):
233
        """Associated form element."""
234
```
235

236
### Link Processing
237

238
Functions for processing and manipulating links in HTML documents.
239

240
```python { .api }
241
def make_links_absolute(element, base_url=None, resolve_base_href=True, handle_failures=True):
242
    """
243
    Convert relative links to absolute URLs.
244
    
245
    Args:
246
        element: HTML element or document
247
        base_url: Base URL for resolving relative links
248
        resolve_base_href: Process <base href> elements first
249
        handle_failures: Continue on URL resolution errors
250
    """
251

252
def resolve_base_href(element, handle_failures=True):
253
    """
254
    Apply <base href> elements to relative links.
255
    
256
    Args:
257
        element: HTML element or document
258
        handle_failures: Continue on URL resolution errors
259
    """
260

261
def iterlinks(element):
262
    """
263
    Iterate over all links in HTML element.
264
    
265
    Args:
266
        element: HTML element or document
267
    
268
    Yields:
269
        tuple: (element, attribute, link, pos) for each link
270
    """
271

272
def rewrite_links(element, link_repl_func, resolve_base_href=True, base_href=None):
273
    """
274
    Rewrite links using callback function.
275
    
276
    Args:
277
        element: HTML element or document
278
        link_repl_func: Function to transform URLs
279
        resolve_base_href: Process <base href> elements first
280
        base_href: Override base URL
281
    """
282

283
def find_rel_links(element, rel):
284
    """
285
    Find links with specified rel attribute.
286
    
287
    Args:
288
        element: HTML element or document
289
        rel: rel attribute value to match
290
    
291
    Returns:
292
        list: Elements with matching rel attribute
293
    """
294

295
def find_class(element, class_name):
296
    """
297
    Find elements with specified CSS class.
298
    
299
    Args:
300
        element: HTML element or document
301
        class_name: CSS class name to match
302
    
303
    Returns:
304
        list: Elements with matching class
305
    """
306
```
307

308
### CSS Class Management
309

310
Utility classes for managing CSS classes on HTML elements.
311

312
```python { .api }
313
class Classes:
314
    """Set-like interface for CSS classes."""
315
    
316
    def __init__(self, element):
317
        """Create class manager for element."""
318
    
319
    def add(self, *classes):
320
        """Add CSS classes."""
321
    
322
    def discard(self, class_name):
323
        """Remove CSS class if present."""
324
    
325
    def remove(self, class_name):
326
        """Remove CSS class (raises KeyError if not present)."""
327
    
328
    def update(self, classes):
329
        """Add multiple classes from iterable."""
330
    
331
    def clear(self):
332
        """Remove all classes."""
333
    
334
    def __contains__(self, class_name):
335
        """Test if class is present."""
336
    
337
    def __iter__(self):
338
        """Iterate over classes."""
339
    
340
    def __len__(self):
341
        """Number of classes."""
342
```
343

344
### HTML Serialization
345

346
Convert HTML elements and documents to strings with HTML-specific formatting.
347

348
```python { .api }
349
def tostring(doc, pretty_print=False, include_meta_content_type=False, 
350
             encoding=None, method="html", with_tail=True, doctype=None):
351
    """
352
    Serialize HTML element or document to string.
353
    
354
    Args:
355
        doc: HTML element or document
356
        pretty_print: Format output with whitespace
357
        include_meta_content_type: Keep an existing <meta http-equiv="Content-Type"> tag (it is stripped from the output by default)
358
        encoding: Output encoding ('unicode' for str)
359
        method: Serialization method (usually 'html')
360
        with_tail: Include tail text
361
        doctype: Document type declaration
362
    
363
    Returns:
364
        str or bytes: Serialized HTML
365
    """
366
```
367

368
### Form Submission
369

370
Submit HTML forms programmatically.
371

372
```python { .api }
373
def submit_form(form, extra_values=None, open_http=None):
374
    """
375
    Submit HTML form and return response.
376
    
377
    Args:
378
        form: FormElement to submit
379
        extra_values: Additional form values as dict
380
        open_http: Function to handle HTTP request
381
    
382
    Returns:
383
        Response from form submission
384
    """
385
```
386

387
### Utility Functions
388

389
Additional HTML processing utilities.
390

391
```python { .api }
392
def Element(tag, attrib=None, nsmap=None, **extra):
393
    """
394
    Create HTML element.
395
    
396
    Args:
397
        tag: Element tag name
398
        attrib: Attribute dictionary
399
        nsmap: Namespace mapping (rarely used for HTML)
400
        **extra: Additional attributes
401
    
402
    Returns:  
403
        HtmlElement: New HTML element
404
    """
405

406
def open_in_browser(doc, encoding=None):
407
    """
408
    Open HTML document in web browser.
409
    
410
    Args:
411
        doc: HTML element or document
412
        encoding: Character encoding for temporary file
413
    """
414
```
415

416
### Sub-modules
417

418
Additional HTML processing functionality in sub-modules.
419

420
```python { .api }
421
# HTML definitions and constants
422
import lxml.html.defs
423

424
# HTML element builder
425
import lxml.html.builder
426

427
# HTML document comparison and diffing  
428
import lxml.html.diff
429

430
# Form filling utilities
431
import lxml.html.formfill
432

433
# HTML cleaning and sanitization (since lxml 5.2 this requires the separate lxml_html_clean package)
434
import lxml.html.clean
435

436
# BeautifulSoup compatibility
437
import lxml.html.soupparser
438

439
# HTML5 parsing (requires html5lib)
440
import lxml.html.html5parser
441
```
442

443
## Usage Examples
444

445
### Basic HTML Processing
446

447
```python
448
from lxml import html
449

450
# Parse HTML document
451
html_content = '''
452
<!DOCTYPE html>
453
<html>
454
<head>
455
    <title>Sample Page</title>
456
    <base href="https://example.com/">
457
</head>
458
<body>
459
    <div class="header">
460
        <h1>Welcome</h1>
461
        <nav>
462
            <a href="/home">Home</a>
463
            <a href="/about">About</a>
464
            <a href="contact.html">Contact</a>
465
        </nav>
466
    </div>
467
    <div class="content main-content">
468
        <p>This is the main content.</p>
469
        <img src="images/logo.png" alt="Logo">
470
    </div>
471
</body>
472
</html>
473
'''
474

475
doc = html.fromstring(html_content)
476

477
# Find elements by CSS class
478
header = html.find_class(doc, 'header')[0]
479
content_divs = html.find_class(doc, 'content')
480

481
# Work with CSS classes
482
content_div = content_divs[0]
483
print(content_div.classes)  # {'content', 'main-content'}
484
content_div.classes.add('highlighted')
485
content_div.classes.discard('main-content')
486

487
# Process links
488
html.make_links_absolute(doc, base_url='https://mysite.com')
489
for element, attribute, link, pos in html.iterlinks(doc):
490
    print(f"{element.tag}.{attribute}: {link}")
491

492
# Get text content
493
title = doc.find('.//title').text_content()
494
print(f"Page title: {title}")
495
```
496

497
### Form Processing
498

499
```python
500
from lxml import html
501

502
# HTML with form
503
form_html = '''
504
<html>
505
<body>
506
    <form action="/login" method="post">
507
        <input type="text" name="username" value="john">
508
        <input type="password" name="password" value="">
509
        <input type="checkbox" name="remember" checked>
510
        <select name="role">
511
            <option value="user">User</option>
512
            <option value="admin" selected>Admin</option>
513
        </select>
514
        <textarea name="comments">Default text</textarea>
515
        <button type="submit">Login</button>
516
    </form>
517
</body>
518
</html>
519
'''
520

521
doc = html.fromstring(form_html)
522
form = doc.forms[0]
523

524
# Access form properties
525
print(f"Action: {form.action}")
526
print(f"Method: {form.method}")
527

528
# Work with form fields
529
print("Form fields:")
530
for name, element in form.fields.items():
531
    if hasattr(element, 'value'):
532
        print(f"  {name}: {element.value}")
533
    elif hasattr(element, 'checked'):
534
        print(f"  {name}: {'checked' if element.checked else 'unchecked'}")
535

536
# Modify form values
537
form.inputs['username'].value = 'alice'
538
form.inputs['password'].value = 'secret123'
539
form.inputs['remember'].checked = False
540
form.inputs['role'].value = 'user'
541

542
# Get form data for submission
543
form_data = form.form_values()
544
print("Form data:", dict(form_data))
545
```
546

547
### Link Manipulation
548

549
```python
550
from lxml import html
551

552
html_content = '''
553
<div>
554
    <a href="/internal">Internal Link</a>
555
    <a href="http://external.com">External Link</a>
556
    <img src="images/photo.jpg" alt="Photo">
557
    <link rel="stylesheet" href="styles/main.css">
558
</div>
559
'''
560

561
doc = html.fragment_fromstring(html_content)
562

563
# Make links absolute
564
html.make_links_absolute(doc, base_url='https://mysite.com')
565

566
# Rewrite specific links
567
def rewrite_image_links(url):
568
    if url.endswith(('.jpg', '.png', '.gif')):
569
        return f"https://cdn.mysite.com/{url.lstrip('/')}"
570
    return url
571

572
html.rewrite_links(doc, rewrite_image_links)
573

574
# Find specific link types
575
stylesheets = html.find_rel_links(doc, 'stylesheet')
576
for link in stylesheets:
577
    print(f"Stylesheet: {link.get('href')}")
578

579
print(html.tostring(doc, encoding='unicode'))
580
```
581

582
### Content Extraction and Modification
583

584
```python
585
from lxml import html
586

587
html_content = '''
588
<article>
589
    <h1>Article Title</h1>
590
    <div class="meta">
591
        <span class="author">John Doe</span>
592
        <span class="date">2023-12-07</span>
593
    </div>
594
    <div class="content">
595
        <p>First paragraph with <a href="link1.html">a link</a>.</p>
596
        <p>Second paragraph with <strong>bold text</strong>.</p>
597
        <div class="sidebar">Sidebar content</div>
598
    </div>
599
</article>
600
'''
601

602
doc = html.fromstring(html_content)
603

604
# Extract text content
605
title = doc.find('.//h1').text_content()
606
author = html.find_class(doc, 'author')[0].text_content()
607
content_text = html.find_class(doc, 'content')[0].text_content()
608

609
print(f"Title: {title}")
610
print(f"Author: {author}")
611
print(f"Content: {content_text[:100]}...")
612

613
# Remove unwanted elements
614
sidebar = html.find_class(doc, 'sidebar')[0]
615
sidebar.drop_tree()  # Remove element and children
616

617
# Remove tags but keep content
618
for strong in doc.xpath('.//strong'):
619
    strong.drop_tag()  # Remove <strong> tags but keep text
620

621
print(html.tostring(doc, pretty_print=True, encoding='unicode'))
622
```
623

624
### CSS Class Management
625

626
```python
627
from lxml import html
628

629
html_content = '<div class="content main highlighted"></div>'
630
element = html.fragment_fromstring(html_content)
631

632
# Work with classes as a set
633
classes = element.classes
634
print(f"Initial classes: {set(classes)}")
635

636
# Add and remove classes
637
classes.add('active')
638
classes.discard('highlighted')
639
classes.update(['responsive', 'mobile-friendly'])
640

641
print(f"Final classes: {set(classes)}")
642
print(f"Has 'active': {'active' in classes}")
643
print(f"Number of classes: {len(classes)}")
644

645
# Convert back to HTML
646
print(html.tostring(element, encoding='unicode'))
647
```

Version

Tile

Files

html-processing.md docs/

html-processing.mddocs/