0
# HTML Processing
1
2
The `lxml.html` module provides specialized HTML document processing with web-specific features: lenient parsing, form handling, link processing, CSS class manipulation, and HTML5 support. It offers a high-level interface optimized for working with HTML documents in web applications.
3
4
## Capabilities
5
6
### HTML Document Parsing
7
8
Parse HTML documents with lenient parsing that handles malformed HTML gracefully.
9
10
```python { .api }
11
def parse(filename_or_url, parser=None, base_url=None, **kwargs):
12
"""
13
Parse HTML document from file or URL.
14
15
Args:
16
filename_or_url: Path to file or URL to parse
17
parser: HTMLParser instance (optional)
18
base_url: Base URL for resolving relative links (optional)
19
**kwargs: Additional arguments passed to parser
20
21
Returns:
22
ElementTree: Parsed HTML document tree
23
"""
24
25
def document_fromstring(html, parser=None, ensure_head_body=False, base_url=None):
26
"""
27
Parse complete HTML document from string.
28
29
Args:
30
html: str or bytes containing HTML content
31
parser: HTMLParser instance (optional)
32
ensure_head_body: Ensure document has <head> and <body> elements
33
base_url: Base URL for resolving relative references
34
35
Returns:
36
Element: Root <html> element
37
"""
38
39
def fragment_fromstring(html, create_parent=False, tag=None, base_url=None, parser=None):
40
"""
41
Parse HTML fragment from string.
42
43
Args:
44
html: str or bytes containing HTML fragment
45
create_parent: Wrap fragment in parent element
46
tag: Parent tag name if create_parent=True
47
base_url: Base URL for resolving relative references
48
parser: HTMLParser instance (optional)
49
50
Returns:
51
Element: Fragment root element or parent element
52
"""
53
54
def fragments_fromstring(html, no_leading_text=False, base_url=None, parser=None):
55
"""
56
Parse HTML string into list of elements and text.
57
58
Args:
59
html: str or bytes containing HTML fragments
60
no_leading_text: If true, raise a ParserError when there is text before the first element, so the result contains only elements
61
base_url: Base URL for resolving relative references
62
parser: HTMLParser instance (optional)
63
64
Returns:
65
list: Elements and text strings from parsed content
66
"""
67
68
def fromstring(html, base_url=None, parser=None):
69
"""
70
Intelligently parse HTML as document or fragment.
71
72
Args:
73
html: str or bytes containing HTML content
74
base_url: Base URL for resolving relative references
75
parser: HTMLParser instance (optional)
76
77
Returns:
78
Element: Root element (document or fragment)
79
"""
80
```
81
82
### HTML Element Classes
83
84
HTML-specific element classes with web functionality.
85
86
```python { .api }
87
class HtmlElement:
88
"""Base HTML element class with HTML-specific methods."""
89
90
# CSS class manipulation
91
def get_class(self):
92
"""Get CSS classes as set-like object."""
93
94
def set_class(self, classes):
95
"""Set CSS classes from string or iterable."""
96
97
classes = property(get_class, set_class)
98
99
# Link processing
100
def make_links_absolute(self, base_url=None, resolve_base_href=True):
101
"""Make all relative links absolute."""
102
103
def resolve_base_href(self, handle_failures=None):
104
"""Apply base href to relative links."""
105
106
def iterlinks(self):
107
"""Iterate over all links in element."""
108
109
def rewrite_links(self, link_repl_func, resolve_base_href=True, base_href=None):
110
"""Rewrite links using callback function."""
111
112
# Content extraction
113
def text_content(self):
114
"""Get the text content of the element and its descendants, with no markup (whitespace is not normalized)."""
115
116
def drop_tree(self):
117
"""Remove element and children from document."""
118
119
def drop_tag(self):
120
"""Remove element tag but keep children."""
121
122
# Form-related methods (for form elements)
123
@property
124
def forms(self):
125
"""List of form elements in document."""
126
127
@property
128
def body(self):
129
"""Document body element (for document root)."""
130
131
class HtmlComment(HtmlElement):
132
"""HTML comment element."""
133
134
class HtmlEntity(HtmlElement):
135
"""HTML entity element."""
136
137
class HtmlProcessingInstruction(HtmlElement):
138
"""HTML processing instruction element."""
139
```
140
141
### Form Handling
142
143
Specialized classes for working with HTML forms and form elements.
144
145
```python { .api }
146
class FormElement(HtmlElement):
147
"""HTML form element with submission capabilities."""
148
149
@property
150
def inputs(self):
151
"""Dictionary-like access to form inputs."""
152
153
@property
154
def fields(self):
155
"""Dictionary-like mapping of form field names to their current values (use `inputs` to get the elements)."""
156
157
@property
158
def action(self):
159
"""Form action URL."""
160
161
@property
162
def method(self):
163
"""Form submission method (GET/POST)."""
164
165
def form_values(self):
166
"""Get list of (name, value) pairs for form submission."""
167
168
def _name_values(self):
169
"""Internal method for getting form data."""
170
171
class InputElement(HtmlElement):
172
"""HTML input element."""
173
174
@property
175
def name(self):
176
"""Input name attribute."""
177
178
@property
179
def value(self):
180
"""Input value."""
181
182
@value.setter
183
def value(self, value):
184
"""Set input value."""
185
186
@property
187
def type(self):
188
"""Input type (text, password, checkbox, etc.)."""
189
190
@property
191
def checked(self):
192
"""Checked state for checkbox/radio inputs."""
193
194
@checked.setter
195
def checked(self, checked):
196
"""Set checked state."""
197
198
class SelectElement(HtmlElement):
199
"""HTML select element."""
200
201
@property
202
def value(self):
203
"""Selected value(s)."""
204
205
@value.setter
206
def value(self, value):
207
"""Set selected value(s)."""
208
209
@property
210
def value_options(self):
211
"""List of possible values."""
212
213
@property
214
def multiple(self):
215
"""Multiple selection enabled."""
216
217
class TextareaElement(HtmlElement):
218
"""HTML textarea element."""
219
220
@property
221
def value(self):
222
"""Textarea content."""
223
224
@value.setter
225
def value(self, value):
226
"""Set textarea content."""
227
228
class LabelElement(HtmlElement):
229
"""HTML label element."""
230
231
@property
232
def for_element(self):
233
"""Associated form element."""
234
```
235
236
### Link Processing
237
238
Functions for processing and manipulating links in HTML documents.
239
240
```python { .api }
241
def make_links_absolute(element, base_url=None, resolve_base_href=True, handle_failures=None):
242
"""
243
Convert relative links to absolute URLs.
244
245
Args:
246
element: HTML element or document
247
base_url: Base URL for resolving relative links
248
resolve_base_href: Process <base href> elements first
249
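handle_failures: What to do when a URL cannot be processed: None (default) aborts with an error, 'ignore' leaves the URL unchanged, 'discard' removes it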
250
"""
251
252
def resolve_base_href(element, handle_failures=None):
253
"""
254
Apply <base href> elements to relative links.
255
256
Args:
257
element: HTML element or document
258
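handle_failures: What to do when a URL cannot be processed: None (default) aborts with an error, 'ignore' leaves the URL unchanged, 'discard' removes it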
259
"""
260
261
def iterlinks(element):
262
"""
263
Iterate over all links in HTML element.
264
265
Args:
266
element: HTML element or document
267
268
Yields:
269
tuple: (element, attribute, link, pos) for each link
270
"""
271
272
def rewrite_links(element, link_repl_func, resolve_base_href=True, base_href=None):
273
"""
274
Rewrite links using callback function.
275
276
Args:
277
element: HTML element or document
278
link_repl_func: Function to transform URLs
279
resolve_base_href: Process <base href> elements first
280
base_href: Override base URL
281
"""
282
283
def find_rel_links(element, rel):
284
"""
285
Find <a> elements whose rel attribute matches the given value (case-insensitive).
286
287
Args:
288
element: HTML element or document
289
rel: rel attribute value to match
290
291
Returns:
292
list: Elements with matching rel attribute
293
"""
294
295
def find_class(element, class_name):
296
"""
297
Find elements with specified CSS class.
298
299
Args:
300
element: HTML element or document
301
class_name: CSS class name to match
302
303
Returns:
304
list: Elements with matching class
305
"""
306
```
307
308
### CSS Class Management
309
310
Utility classes for managing CSS classes on HTML elements.
311
312
```python { .api }
313
class Classes:
314
"""Set-like interface for CSS classes."""
315
316
def __init__(self, element):
317
"""Create class manager for element."""
318
319
def add(self, *classes):
320
"""Add CSS classes."""
321
322
def discard(self, class_name):
323
"""Remove CSS class if present."""
324
325
def remove(self, class_name):
326
"""Remove CSS class (raises KeyError if not present)."""
327
328
def update(self, classes):
329
"""Add multiple classes from iterable."""
330
331
def clear(self):
332
"""Remove all classes."""
333
334
def __contains__(self, class_name):
335
"""Test if class is present."""
336
337
def __iter__(self):
338
"""Iterate over classes."""
339
340
def __len__(self):
341
"""Number of classes."""
342
```
343
344
### HTML Serialization
345
346
Convert HTML elements and documents to strings with HTML-specific formatting.
347
348
```python { .api }
349
def tostring(doc, pretty_print=False, include_meta_content_type=False,
350
encoding=None, method="html", with_tail=True, doctype=None):
351
"""
352
Serialize HTML element or document to string.
353
354
Args:
355
doc: HTML element or document
356
pretty_print: Format output with whitespace
357
include_meta_content_type: Keep an existing <meta http-equiv="Content-Type"> tag in the output (it is stripped when False)
358
encoding: Output encoding ('unicode' for str)
359
method: Serialization method (usually 'html')
360
with_tail: Include tail text
361
doctype: Document type declaration
362
363
Returns:
364
str or bytes: Serialized HTML
365
"""
366
```
367
368
### Form Submission
369
370
Submit HTML forms programmatically.
371
372
```python { .api }
373
def submit_form(form, extra_values=None, open_http=None):
374
"""
375
Submit HTML form and return response.
376
377
Args:
378
form: FormElement to submit
379
extra_values: Additional form values as dict
380
open_http: Function to handle HTTP request
381
382
Returns:
383
Response from form submission
384
"""
385
```
386
387
### Utility Functions
388
389
Additional HTML processing utilities.
390
391
```python { .api }
392
def Element(tag, attrib=None, nsmap=None, **extra):
393
"""
394
Create HTML element.
395
396
Args:
397
tag: Element tag name
398
attrib: Attribute dictionary
399
nsmap: Namespace mapping (rarely used for HTML)
400
**extra: Additional attributes
401
402
Returns:
403
HtmlElement: New HTML element
404
"""
405
406
def open_in_browser(doc, encoding=None):
407
"""
408
Open HTML document in web browser.
409
410
Args:
411
doc: HTML element or document
412
encoding: Character encoding for temporary file
413
"""
414
```
415
416
### Sub-modules
417
418
Additional HTML processing functionality in sub-modules.
419
420
```python { .api }
421
# HTML definitions and constants
422
import lxml.html.defs
423
424
# HTML element builder
425
import lxml.html.builder
426
427
# HTML document comparison and diffing
428
import lxml.html.diff
429
430
# Form filling utilities
431
import lxml.html.formfill
432
433
# HTML cleaning and sanitization (since lxml 5.2 this requires the separate lxml_html_clean package)
434
import lxml.html.clean
435
436
# BeautifulSoup compatibility
437
import lxml.html.soupparser
438
439
# HTML5 parsing (requires html5lib)
440
import lxml.html.html5parser
441
```
442
443
## Usage Examples
444
445
### Basic HTML Processing
446
447
```python
448
from lxml import html
449
450
# Parse HTML document
451
html_content = '''
452
<!DOCTYPE html>
453
<html>
454
<head>
455
<title>Sample Page</title>
456
<base href="https://example.com/">
457
</head>
458
<body>
459
<div class="header">
460
<h1>Welcome</h1>
461
<nav>
462
<a href="/home">Home</a>
463
<a href="/about">About</a>
464
<a href="contact.html">Contact</a>
465
</nav>
466
</div>
467
<div class="content main-content">
468
<p>This is the main content.</p>
469
<img src="images/logo.png" alt="Logo">
470
</div>
471
</body>
472
</html>
473
'''
474
475
doc = html.fromstring(html_content)
476
477
# Find elements by CSS class
478
header = html.find_class(doc, 'header')[0]
479
content_divs = html.find_class(doc, 'content')
480
481
# Work with CSS classes
482
content_div = content_divs[0]
483
print(set(content_div.classes)) # {'content', 'main-content'}
484
content_div.classes.add('highlighted')
485
content_div.classes.discard('main-content')
486
487
# Process links
488
html.make_links_absolute(doc, base_url='https://mysite.com')
489
for element, attribute, link, pos in html.iterlinks(doc):
490
print(f"{element.tag}.{attribute}: {link}")
491
492
# Get text content
493
title = doc.find('.//title').text_content()
494
print(f"Page title: {title}")
495
```
496
497
### Form Processing
498
499
```python
500
from lxml import html
501
502
# HTML with form
503
form_html = '''
504
<html>
505
<body>
506
<form action="/login" method="post">
507
<input type="text" name="username" value="john">
508
<input type="password" name="password" value="">
509
<input type="checkbox" name="remember" checked>
510
<select name="role">
511
<option value="user">User</option>
512
<option value="admin" selected>Admin</option>
513
</select>
514
<textarea name="comments">Default text</textarea>
515
<button type="submit">Login</button>
516
</form>
517
</body>
518
</html>
519
'''
520
521
doc = html.fromstring(form_html)
522
form = doc.forms[0]
523
524
# Access form properties
525
print(f"Action: {form.action}")
526
print(f"Method: {form.method}")
527
528
# Work with form fields
529
print("Form fields:")
530
for name, element in form.inputs.items():
531
if hasattr(element, 'value'):
532
print(f" {name}: {element.value}")
533
elif hasattr(element, 'checked'):
534
print(f" {name}: {'checked' if element.checked else 'unchecked'}")
535
536
# Modify form values
537
form.inputs['username'].value = 'alice'
538
form.inputs['password'].value = 'secret123'
539
form.inputs['remember'].checked = False
540
form.inputs['role'].value = 'user'
541
542
# Get form data for submission
543
form_data = form.form_values()
544
print("Form data:", dict(form_data))
545
```
546
547
### Link Manipulation
548
549
```python
550
from lxml import html
551
552
html_content = '''
553
<div>
554
<a href="/internal">Internal Link</a>
555
<a href="http://external.com">External Link</a>
556
<img src="images/photo.jpg" alt="Photo">
557
<link rel="stylesheet" href="styles/main.css">
558
</div>
559
'''
560
561
doc = html.fragment_fromstring(html_content)
562
563
# Make links absolute
564
html.make_links_absolute(doc, base_url='https://mysite.com')
565
566
# Rewrite specific links
567
def rewrite_image_links(url):
568
if url.endswith(('.jpg', '.png', '.gif')):
569
return f"https://cdn.mysite.com/{url.lstrip('/')}"
570
return url
571
572
html.rewrite_links(doc, rewrite_image_links)
573
574
# Find specific link types
575
stylesheets = doc.xpath('.//link[@rel="stylesheet"]')  # find_rel_links() only matches <a> elements
576
for link in stylesheets:
577
print(f"Stylesheet: {link.get('href')}")
578
579
print(html.tostring(doc, encoding='unicode'))
580
```
581
582
### Content Extraction and Modification
583
584
```python
585
from lxml import html
586
587
html_content = '''
588
<article>
589
<h1>Article Title</h1>
590
<div class="meta">
591
<span class="author">John Doe</span>
592
<span class="date">2023-12-07</span>
593
</div>
594
<div class="content">
595
<p>First paragraph with <a href="link1.html">a link</a>.</p>
596
<p>Second paragraph with <strong>bold text</strong>.</p>
597
<div class="sidebar">Sidebar content</div>
598
</div>
599
</article>
600
'''
601
602
doc = html.fromstring(html_content)
603
604
# Extract text content
605
title = doc.find('.//h1').text_content()
606
author = html.find_class(doc, 'author')[0].text_content()
607
content_text = html.find_class(doc, 'content')[0].text_content()
608
609
print(f"Title: {title}")
610
print(f"Author: {author}")
611
print(f"Content: {content_text[:100]}...")
612
613
# Remove unwanted elements
614
sidebar = html.find_class(doc, 'sidebar')[0]
615
sidebar.drop_tree() # Remove element and children
616
617
# Remove tags but keep content
618
for strong in doc.xpath('.//strong'):
619
strong.drop_tag() # Remove <strong> tags but keep text
620
621
print(html.tostring(doc, pretty_print=True, encoding='unicode'))
622
```
623
624
### CSS Class Management
625
626
```python
627
from lxml import html
628
629
html_content = '<div class="content main highlighted"></div>'
630
element = html.fragment_fromstring(html_content)
631
632
# Work with classes as a set
633
classes = element.classes
634
print(f"Initial classes: {set(classes)}")
635
636
# Add and remove classes
637
classes.add('active')
638
classes.discard('highlighted')
639
classes.update(['responsive', 'mobile-friendly'])
640
641
print(f"Final classes: {set(classes)}")
642
print(f"Has 'active': {'active' in classes}")
643
print(f"Number of classes: {len(classes)}")
644
645
# Convert back to HTML
646
print(html.tostring(element, encoding='unicode'))
647
```