or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

etree-core.md, html-processing.md, index.md, objectify-api.md, utility-modules.md, validation.md, xpath-xslt.md

docs/html-processing.md

0

# HTML Processing

1

2

Specialized HTML document processing with web-specific features including lenient parsing, form handling, link processing, CSS class manipulation, and HTML5 support. The html module provides a high-level interface optimized for working with HTML documents in web applications.

3

4

## Capabilities

5

6

### HTML Document Parsing

7

8

Parse HTML documents with lenient parsing that handles malformed HTML gracefully.

9

10

```python { .api }

11

def parse(filename_or_url, parser=None, base_url=None, **kwargs):

12

"""

13

Parse HTML document from file or URL.

14

15

Args:

16

filename_or_url: Path to file or URL to parse

17

parser: HTMLParser instance (optional)

18

base_url: Base URL for resolving relative links (optional)

19

**kwargs: Additional arguments passed to parser

20

21

Returns:

22

ElementTree: Parsed HTML document tree

23

"""

24

25

def document_fromstring(html, parser=None, ensure_head_body=False, base_url=None):

26

"""

27

Parse complete HTML document from string.

28

29

Args:

30

html: str or bytes containing HTML content

31

parser: HTMLParser instance (optional)

32

ensure_head_body: Ensure document has <head> and <body> elements

33

base_url: Base URL for resolving relative references

34

35

Returns:

36

Element: Root <html> element

37

"""

38

39

def fragment_fromstring(html, create_parent=False, tag=None, base_url=None, parser=None):

40

"""

41

Parse HTML fragment from string.

42

43

Args:

44

html: str or bytes containing HTML fragment

45

create_parent: Wrap fragment in a parent element; True creates a <div>, a string is used as the parent tag name

46

tag: Parent tag name if create_parent=True

47

base_url: Base URL for resolving relative references

48

parser: HTMLParser instance (optional)

49

50

Returns:

51

Element: Fragment root element or parent element

52

"""

53

54

def fragments_fromstring(html, no_leading_text=False, base_url=None, parser=None):

55

"""

56

Parse HTML string into list of elements and text.

57

58

Args:

59

html: str or bytes containing HTML fragments

60

no_leading_text: If true, raise ParserError when non-whitespace text precedes the first element instead of returning it as the first item

61

base_url: Base URL for resolving relative references

62

parser: HTMLParser instance (optional)

63

64

Returns:

65

list: Parsed elements; the first item may be a string holding the leading text

66

"""

67

68

def fromstring(html, base_url=None, parser=None):

69

"""

70

Intelligently parse HTML as document or fragment.

71

72

Args:

73

html: str or bytes containing HTML content

74

base_url: Base URL for resolving relative references

75

parser: HTMLParser instance (optional)

76

77

Returns:

78

Element: Root element (document or fragment)

79

"""

80

```

81

82

### HTML Element Classes

83

84

HTML-specific element classes with web functionality.

85

86

```python { .api }

87

class HtmlElement:

88

"""Base HTML element class with HTML-specific methods."""

89

90

# CSS class manipulation

91

def get_class(self):

92

"""Get CSS classes as set-like object."""

93

94

def set_class(self, classes):

95

"""Set CSS classes from string or iterable."""

96

97

classes = property(get_class, set_class)

98

99

# Link processing

100

def make_links_absolute(self, base_url=None, resolve_base_href=True):

101

"""Make all relative links absolute."""

102

103

def resolve_base_href(self, handle_failures=True):

104

"""Apply base href to relative links."""

105

106

def iterlinks(self):

107

"""Iterate over all links in element."""

108

109

def rewrite_links(self, link_repl_func, resolve_base_href=True, base_href=None):

110

"""Rewrite links using callback function."""

111

112

# Content extraction

113

def text_content(self):

114

"""Get the text content of the element and all its descendants, with no markup."""

115

116

def drop_tree(self):

117

"""Remove element and children from document."""

118

119

def drop_tag(self):

120

"""Remove element tag but keep children."""

121

122

# Form-related methods (for form elements)

123

@property

124

def forms(self):

125

"""List of form elements in document."""

126

127

@property

128

def body(self):

129

"""Document body element (for document root)."""

130

131

class HtmlComment(HtmlElement):

132

"""HTML comment element."""

133

134

class HtmlEntity(HtmlElement):

135

"""HTML entity element."""

136

137

class HtmlProcessingInstruction(HtmlElement):

138

"""HTML processing instruction element."""

139

```

140

141

### Form Handling

142

143

Specialized classes for working with HTML forms and form elements.

144

145

```python { .api }

146

class FormElement(HtmlElement):

147

"""HTML form element with submission capabilities."""

148

149

@property

150

def inputs(self):

151

"""Dictionary-like access to form inputs."""

152

153

@property

154

def fields(self):

155

"""Dictionary-like mapping of form field names to their current values (assignable)."""

156

157

@property

158

def action(self):

159

"""Form action URL."""

160

161

@property

162

def method(self):

163

"""Form submission method (GET/POST)."""

164

165

def form_values(self):

166

"""Get list of (name, value) pairs for form submission."""

167

168

def _name_values(self):

169

"""Internal method for getting form data."""

170

171

class InputElement(HtmlElement):

172

"""HTML input element."""

173

174

@property

175

def name(self):

176

"""Input name attribute."""

177

178

@property

179

def value(self):

180

"""Input value."""

181

182

@value.setter

183

def value(self, value):

184

"""Set input value."""

185

186

@property

187

def type(self):

188

"""Input type (text, password, checkbox, etc.)."""

189

190

@property

191

def checked(self):

192

"""Checked state for checkbox/radio inputs."""

193

194

@checked.setter

195

def checked(self, checked):

196

"""Set checked state."""

197

198

class SelectElement(HtmlElement):

199

"""HTML select element."""

200

201

@property

202

def value(self):

203

"""Selected value(s)."""

204

205

@value.setter

206

def value(self, value):

207

"""Set selected value(s)."""

208

209

@property

210

def value_options(self):

211

"""List of possible values."""

212

213

@property

214

def multiple(self):

215

"""Multiple selection enabled."""

216

217

class TextareaElement(HtmlElement):

218

"""HTML textarea element."""

219

220

@property

221

def value(self):

222

"""Textarea content."""

223

224

@value.setter

225

def value(self, value):

226

"""Set textarea content."""

227

228

class LabelElement(HtmlElement):

229

"""HTML label element."""

230

231

@property

232

def for_element(self):

233

"""Associated form element."""

234

```

235

236

### Link Processing

237

238

Functions for processing and manipulating links in HTML documents.

239

240

```python { .api }

241

def make_links_absolute(element, base_url=None, resolve_base_href=True, handle_failures=True):

242

"""

243

Convert relative links to absolute URLs.

244

245

Args:

246

element: HTML element or document

247

base_url: Base URL for resolving relative links

248

resolve_base_href: Process <base href> elements first

249

handle_failures: How to treat links that fail to resolve: None aborts with the error, 'ignore' leaves the link unchanged, 'discard' removes it

250

"""

251

252

def resolve_base_href(element, handle_failures=True):

253

"""

254

Apply <base href> elements to relative links.

255

256

Args:

257

element: HTML element or document

258

handle_failures: How to treat links that fail to resolve: None aborts with the error, 'ignore' leaves the link unchanged, 'discard' removes it

259

"""

260

261

def iterlinks(element):

262

"""

263

Iterate over all links in HTML element.

264

265

Args:

266

element: HTML element or document

267

268

Yields:

269

tuple: (element, attribute, link, pos) for each link

270

"""

271

272

def rewrite_links(element, link_repl_func, resolve_base_href=True, base_href=None):

273

"""

274

Rewrite links using callback function.

275

276

Args:

277

element: HTML element or document

278

link_repl_func: Function to transform URLs

279

resolve_base_href: Process <base href> elements first

280

base_href: Override base URL

281

"""

282

283

def find_rel_links(element, rel):

284

"""

285

Find <a> elements with the specified rel attribute value.

286

287

Args:

288

element: HTML element or document

289

rel: rel attribute value to match

290

291

Returns:

292

list: <a> elements whose rel attribute matches

293

"""

294

295

def find_class(element, class_name):

296

"""

297

Find elements with specified CSS class.

298

299

Args:

300

element: HTML element or document

301

class_name: CSS class name to match

302

303

Returns:

304

list: Elements with matching class

305

"""

306

```

307

308

### CSS Class Management

309

310

Utility classes for managing CSS classes on HTML elements.

311

312

```python { .api }

313

class Classes:

314

"""Set-like interface for CSS classes."""

315

316

def __init__(self, element):

317

"""Create class manager for element."""

318

319

def add(self, *classes):

320

"""Add CSS classes."""

321

322

def discard(self, class_name):

323

"""Remove CSS class if present."""

324

325

def remove(self, class_name):

326

"""Remove CSS class (raises KeyError if not present)."""

327

328

def update(self, classes):

329

"""Add multiple classes from iterable."""

330

331

def clear(self):

332

"""Remove all classes."""

333

334

def __contains__(self, class_name):

335

"""Test if class is present."""

336

337

def __iter__(self):

338

"""Iterate over classes."""

339

340

def __len__(self):

341

"""Number of classes."""

342

```

343

344

### HTML Serialization

345

346

Convert HTML elements and documents to strings with HTML-specific formatting.

347

348

```python { .api }

349

def tostring(doc, pretty_print=False, include_meta_content_type=False,

350

encoding=None, method="html", with_tail=True, doctype=None):

351

"""

352

Serialize HTML element or document to string.

353

354

Args:

355

doc: HTML element or document

356

pretty_print: Format output with whitespace

357

include_meta_content_type: Add meta charset tag

358

encoding: Output encoding ('unicode' for str)

359

method: Serialization method (usually 'html')

360

with_tail: Include tail text

361

doctype: Document type declaration

362

363

Returns:

364

str or bytes: Serialized HTML

365

"""

366

```

367

368

### Form Submission

369

370

Submit HTML forms programmatically.

371

372

```python { .api }

373

def submit_form(form, extra_values=None, open_http=None):

374

"""

375

Submit HTML form and return response.

376

377

Args:

378

form: FormElement to submit

379

extra_values: Additional form values as dict

380

open_http: Function to handle HTTP request

381

382

Returns:

383

Response from form submission

384

"""

385

```

386

387

### Utility Functions

388

389

Additional HTML processing utilities.

390

391

```python { .api }

392

def Element(tag, attrib=None, nsmap=None, **extra):

393

"""

394

Create HTML element.

395

396

Args:

397

tag: Element tag name

398

attrib: Attribute dictionary

399

nsmap: Namespace mapping (rarely used for HTML)

400

**extra: Additional attributes

401

402

Returns:

403

HtmlElement: New HTML element

404

"""

405

406

def open_in_browser(doc, encoding=None):

407

"""

408

Open HTML document in web browser.

409

410

Args:

411

doc: HTML element or document

412

encoding: Character encoding for temporary file

413

"""

414

```

415

416

### Sub-modules

417

418

Additional HTML processing functionality in sub-modules.

419

420

```python { .api }

421

# HTML definitions and constants

422

import lxml.html.defs

423

424

# HTML element builder

425

import lxml.html.builder

426

427

# HTML document comparison and diffing

428

import lxml.html.diff

429

430

# Form filling utilities

431

import lxml.html.formfill

432

433

# HTML cleaning and sanitization (provided by the separate lxml_html_clean package since lxml 5.2)

434

import lxml.html.clean

435

436

# BeautifulSoup compatibility

437

import lxml.html.soupparser

438

439

# HTML5 parsing (requires html5lib)

440

import lxml.html.html5parser

441

```

442

443

## Usage Examples

444

445

### Basic HTML Processing

446

447

```python

448

from lxml import html

449

450

# Parse HTML document

451

html_content = '''

452

<!DOCTYPE html>

453

<html>

454

<head>

455

<title>Sample Page</title>

456

<base href="https://example.com/">

457

</head>

458

<body>

459

<div class="header">

460

<h1>Welcome</h1>

461

<nav>

462

<a href="/home">Home</a>

463

<a href="/about">About</a>

464

<a href="contact.html">Contact</a>

465

</nav>

466

</div>

467

<div class="content main-content">

468

<p>This is the main content.</p>

469

<img src="images/logo.png" alt="Logo">

470

</div>

471

</body>

472

</html>

473

'''

474

475

doc = html.fromstring(html_content)

476

477

# Find elements by CSS class

478

header = html.find_class(doc, 'header')[0]

479

content_divs = html.find_class(doc, 'content')

480

481

# Work with CSS classes

482

content_div = content_divs[0]

483

print(content_div.classes) # {'content', 'main-content'}

484

content_div.classes.add('highlighted')

485

content_div.classes.discard('main-content')

486

487

# Process links

488

doc.make_links_absolute('https://mysite.com')  # in-place; html.make_links_absolute() works on a copy and returns it

489

for element, attribute, link, pos in html.iterlinks(doc):

490

print(f"{element.tag}.{attribute}: {link}")

491

492

# Get text content

493

title = doc.find('.//title').text_content()

494

print(f"Page title: {title}")

495

```

496

497

### Form Processing

498

499

```python

500

from lxml import html

501

502

# HTML with form

503

form_html = '''

504

<html>

505

<body>

506

<form action="/login" method="post">

507

<input type="text" name="username" value="john">

508

<input type="password" name="password" value="">

509

<input type="checkbox" name="remember" checked>

510

<select name="role">

511

<option value="user">User</option>

512

<option value="admin" selected>Admin</option>

513

</select>

514

<textarea name="comments">Default text</textarea>

515

<button type="submit">Login</button>

516

</form>

517

</body>

518

</html>

519

'''

520

521

doc = html.fromstring(form_html)

522

form = doc.forms[0]

523

524

# Access form properties

525

print(f"Action: {form.action}")

526

print(f"Method: {form.method}")

527

528

# Work with form fields

529

print("Form fields:")

530

for name, element in form.fields.items():

531

if hasattr(element, 'value'):

532

print(f" {name}: {element.value}")

533

elif hasattr(element, 'checked'):

534

print(f" {name}: {'checked' if element.checked else 'unchecked'}")

535

536

# Modify form values

537

form.fields['username'] = 'alice'

538

form.fields['password'] = 'secret123'

539

form.inputs['remember'].checked = False

540

form.fields['role'] = 'user'

541

542

# Get form data for submission

543

form_data = form.form_values()

544

print("Form data:", dict(form_data))

545

```

546

547

### Link Manipulation

548

549

```python

550

from lxml import html

551

552

html_content = '''

553

<div>

554

<a href="/internal">Internal Link</a>

555

<a href="http://external.com">External Link</a>

556

<img src="images/photo.jpg" alt="Photo">

557

<link rel="stylesheet" href="styles/main.css">

558

</div>

559

'''

560

561

doc = html.fragment_fromstring(html_content)

562

563

# Make links absolute

564

doc.make_links_absolute('https://mysite.com')  # in-place; html.make_links_absolute() works on a copy and returns it

565

566

# Rewrite specific links

567

def rewrite_image_links(url):

568

if url.endswith(('.jpg', '.png', '.gif')):

569

return f"https://cdn.mysite.com/{url.lstrip('/')}"

570

return url

571

572

html.rewrite_links(doc, rewrite_image_links)

573

574

# Find <a> elements by rel value (find_rel_links only matches <a>, so the <link> element above is not returned)

575

stylesheets = html.find_rel_links(doc, 'stylesheet')

576

for link in stylesheets:

577

print(f"Stylesheet: {link.get('href')}")

578

579

print(html.tostring(doc, encoding='unicode'))

580

```

581

582

### Content Extraction and Modification

583

584

```python

585

from lxml import html

586

587

html_content = '''

588

<article>

589

<h1>Article Title</h1>

590

<div class="meta">

591

<span class="author">John Doe</span>

592

<span class="date">2023-12-07</span>

593

</div>

594

<div class="content">

595

<p>First paragraph with <a href="link1.html">a link</a>.</p>

596

<p>Second paragraph with <strong>bold text</strong>.</p>

597

<div class="sidebar">Sidebar content</div>

598

</div>

599

</article>

600

'''

601

602

doc = html.fromstring(html_content)

603

604

# Extract text content

605

title = doc.find('.//h1').text_content()

606

author = html.find_class(doc, 'author')[0].text_content()

607

content_text = html.find_class(doc, 'content')[0].text_content()

608

609

print(f"Title: {title}")

610

print(f"Author: {author}")

611

print(f"Content: {content_text[:100]}...")

612

613

# Remove unwanted elements

614

sidebar = html.find_class(doc, 'sidebar')[0]

615

sidebar.drop_tree() # Remove element and children

616

617

# Remove tags but keep content

618

for strong in doc.xpath('.//strong'):

619

strong.drop_tag() # Remove <strong> tags but keep text

620

621

print(html.tostring(doc, pretty_print=True, encoding='unicode'))

622

```

623

624

### CSS Class Management

625

626

```python

627

from lxml import html

628

629

html_content = '<div class="content main highlighted"></div>'

630

element = html.fragment_fromstring(html_content)

631

632

# Work with classes as a set

633

classes = element.classes

634

print(f"Initial classes: {set(classes)}")

635

636

# Add and remove classes

637

classes.add('active')

638

classes.discard('highlighted')

639

classes.update(['responsive', 'mobile-friendly'])

640

641

print(f"Final classes: {set(classes)}")

642

print(f"Has 'active': {'active' in classes}")

643

print(f"Number of classes: {len(classes)}")

644

645

# Convert back to HTML

646

print(html.tostring(element, encoding='unicode'))

647

```