or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

etree-core.md html-processing.md index.md objectify-api.md utility-modules.md validation.md xpath-xslt.md

etree-core.mddocs/

0

# Core XML/HTML Processing

1

2

Comprehensive ElementTree-compatible API for XML and HTML document parsing, manipulation, and serialization. This module provides the foundation for all lxml functionality with full standards compliance, namespace support, and high-performance processing.

3

4

## Capabilities

5

6

### Document Parsing

7

8

Parse XML and HTML documents from strings, files, URLs, or file-like objects with configurable parsers and error handling.

9

10

```python { .api }

11

def parse(source, parser=None, base_url=None):

12

"""

13

Parse XML/HTML document from file, URL, or file-like object.

14

15

Args:

16

source: Filename/path, URL, or file-like object

17

parser: XMLParser or HTMLParser instance (optional)

18

base_url: Base URL for resolving relative references (optional)

19

20

Returns:

21

ElementTree: Parsed document tree

22

"""

23

24

def fromstring(text, parser=None, base_url=None):

25

"""

26

Parse XML/HTML document from string.

27

28

Args:

29

text: str or bytes containing XML/HTML content

30

parser: XMLParser or HTMLParser instance (optional)

31

base_url: Base URL for resolving relative references (optional)

32

33

Returns:

34

Element: Root element of parsed document

35

"""

36

37

def XML(text, parser=None, base_url=None):

38

"""

39

Parse an XML document from a string constant using the default XML parser (no validation is performed unless the supplied parser is configured for it).

40

41

Args:

42

text: str or bytes containing XML content

43

parser: XMLParser instance (optional)

44

base_url: Base URL for resolving relative references (optional)

45

46

Returns:

47

Element: Root element of parsed XML

48

"""

49

50

def HTML(text, parser=None, base_url=None):

51

"""

52

Parse HTML string with lenient parsing.

53

54

Args:

55

text: str or bytes containing HTML content

56

parser: HTMLParser instance (optional)

57

base_url: Base URL for resolving relative references (optional)

58

59

Returns:

60

Element: Root element of parsed HTML

61

"""

62

```

63

64

### Incremental Parsing

65

66

Memory-efficient parsing for large documents using event-driven processing.

67

68

```python { .api }

69

def iterparse(source, events=None, tag=None, attribute_defaults=False,

70

dtd_validation=False, load_dtd=False, no_network=True,

71

remove_blank_text=False, remove_comments=False,

72

remove_pis=False, encoding=None, huge_tree=False,

73

schema=None):

74

"""

75

Incrementally parse XML document yielding (event, element) pairs.

76

77

Args:

78

source: File path, URL, or file-like object

79

events: tuple of events to report ('start', 'end', 'start-ns', 'end-ns')

80

tag: str or sequence of tag names to filter

81

82

Yields:

83

tuple: (event, element) pairs during parsing

84

"""

85

86

def iterwalk(element_or_tree, events=('end',), tag=None):

87

"""

88

Walk through existing element tree yielding events.

89

90

Args:

91

element_or_tree: Element or ElementTree to walk

92

events: tuple of events to report ('start', 'end')

93

tag: str or sequence of tag names to filter

94

95

Yields:

96

tuple: (event, element) pairs during traversal

97

"""

98

```

99

100

### Element Creation and Manipulation

101

102

Create and modify XML/HTML elements with full attribute and content support.

103

104

```python { .api }

105

class Element:

106

"""XML/HTML element with tag, attributes, text, and children."""

107

108

def __init__(self, tag, attrib=None, nsmap=None, **extra):

109

"""

110

Create new element.

111

112

Args:

113

tag: Element tag name (str or QName)

114

attrib: dict of attributes (optional)

115

nsmap: dict mapping namespace prefixes to URIs (optional)

116

**extra: Additional attributes as keyword arguments

117

"""

118

119

# Element properties

120

tag: str # Element tag name

121

text: str | None # Text content before first child

122

tail: str | None # Text content after element

123

attrib: dict[str, str] # Element attributes

124

nsmap: dict[str, str] # Namespace mapping

125

sourceline: int | None # Source line number (if available)

126

127

# Tree navigation

128

def find(self, path, namespaces=None):

129

"""Find first child element matching path."""

130

131

def findall(self, path, namespaces=None):

132

"""Find all child elements matching path."""

133

134

def iterfind(self, path, namespaces=None):

135

"""Iterate over child elements matching path."""

136

137

def findtext(self, path, default=None, namespaces=None):

138

"""Find text content of first matching child element."""

139

140

def xpath(self, _path, namespaces=None, extensions=None,

141

smart_strings=True, **_variables):

142

"""Evaluate XPath expression on element."""

143

144

# Tree modification

145

def append(self, element):

146

"""Add element as last child."""

147

148

def insert(self, index, element):

149

"""Insert element at specified position."""

150

151

def remove(self, element):

152

"""Remove child element."""

153

154

def clear(self):

155

"""Reset the element: remove all children and attributes and clear its text and tail."""

156

157

# Attribute access

158

def get(self, key, default=None):

159

"""Get attribute value."""

160

161

def set(self, key, value):

162

"""Set attribute value."""

163

164

def keys(self):

165

"""Get attribute names."""

166

167

def values(self):

168

"""Get attribute values."""

169

170

def items(self):

171

"""Get (name, value) pairs for attributes."""

172

173

def SubElement(parent, tag, attrib=None, nsmap=None, **extra):

174

"""

175

Create child element and add to parent.

176

177

Args:

178

parent: Parent Element

179

tag: Child element tag name

180

attrib: dict of attributes (optional)

181

nsmap: dict of namespace mappings (optional)

182

**extra: Additional attributes

183

184

Returns:

185

Element: New child element

186

"""

187

```

188

189

### Document Trees

190

191

Manage complete XML/HTML documents with document-level operations.

192

193

```python { .api }

194

class ElementTree:

195

"""Document tree containing root element and document info."""

196

197

def __init__(self, element=None, file=None, parser=None):

198

"""

199

Create document tree.

200

201

Args:

202

element: Root element (optional)

203

file: File to parse (optional)

204

parser: Parser instance (optional)

205

"""

206

207

def getroot(self):

208

"""Get root element."""

209

210

def setroot(self, root):

211

"""Set root element."""

212

213

def parse(self, source, parser=None, base_url=None):

214

"""Parse document from source."""

215

216

def write(self, file, encoding=None, xml_declaration=None,

217

default_namespace=None, method="xml", pretty_print=False,

218

with_tail=True, standalone=None, compression=0,

219

exclusive=False, inclusive_ns_prefixes=None,

220

with_comments=True, strip_cdata=True):

221

"""Write document to file."""

222

223

def xpath(self, _path, namespaces=None, extensions=None,

224

smart_strings=True, **_variables):

225

"""Evaluate XPath expression on document."""

226

227

def xslt(self, _xslt, extensions=None, access_control=None, **_kw):

228

"""Apply XSLT transformation."""

229

230

def relaxng(self, relaxng):

231

"""Validate against RelaxNG schema."""

232

233

def xmlschema(self, xmlschema):

234

"""Validate against XML Schema."""

235

236

def xinclude(self):

237

"""Process XInclude directives."""

238

239

@property

240

def docinfo(self):

241

"""Document information (encoding, version, etc.)."""

242

```

243

244

### Serialization

245

246

Convert elements and trees to strings or bytes with formatting options.

247

248

```python { .api }

249

def tostring(element_or_tree, encoding=None, method="xml",

250

xml_declaration=None, pretty_print=False, with_tail=True,

251

standalone=None, doctype=None, exclusive=False,

252

inclusive_ns_prefixes=None, with_comments=True,

253

strip_cdata=True):

254

"""

255

Serialize element or tree to string/bytes.

256

257

Args:

258

element_or_tree: Element or ElementTree to serialize

259

encoding: Output encoding; 'unicode' returns a str, any other encoding name (e.g. 'utf-8') returns bytes

260

method: Serialization method ('xml', 'html', 'text', 'c14n')

261

xml_declaration: Include XML declaration (bool or None for auto)

262

pretty_print: Format output with whitespace (bool)

263

with_tail: Include tail text (bool)

264

doctype: Document type declaration (str)

265

266

Returns:

267

str or bytes: Serialized document

268

"""

269

270

def tostringlist(element_or_tree, encoding=None, method="xml",

271

xml_declaration=None, pretty_print=False, with_tail=True,

272

standalone=None, doctype=None, exclusive=False,

273

inclusive_ns_prefixes=None, with_comments=True,

274

strip_cdata=True):

275

"""Serialize to list of strings/bytes."""

276

277

def tounicode(element_or_tree, method="xml", pretty_print=False,

278

with_tail=True, doctype=None):

279

"""Serialize to unicode string."""

280

281

def dump(elem):

282

"""Debug dump element structure to stdout."""

283

```

284

285

### Parser Configuration

286

287

Configurable parsers for different XML/HTML processing needs.

288

289

```python { .api }

290

class XMLParser:

291

"""Configurable XML parser with validation and processing options."""

292

293

def __init__(self, encoding=None, attribute_defaults=False,

294

dtd_validation=False, load_dtd=False, no_network=True,

295

ns_clean=False, recover=False, schema=None,

296

huge_tree=False, remove_blank_text=False,

297

resolve_entities=True, remove_comments=False,

298

remove_pis=False, strip_cdata=True, collect_ids=True,

299

target=None, compact=True):

300

"""

301

Create XML parser with specified options.

302

303

Args:

304

encoding: Character encoding override

305

attribute_defaults: Load default attributes from DTD

306

dtd_validation: Enable DTD validation

307

load_dtd: Load and parse DTD

308

no_network: Disable network access

309

recover: Enable error recovery

310

huge_tree: Support very large documents

311

remove_blank_text: Remove whitespace-only text nodes

312

remove_comments: Remove comment nodes

313

remove_pis: Remove processing instruction nodes

314

"""

315

316

class HTMLParser:

317

"""Lenient HTML parser with automatic error recovery."""

318

319

def __init__(self, encoding=None, remove_blank_text=False,

320

remove_comments=False, remove_pis=False,

321

strip_cdata=True, no_network=True, target=None,

322

schema=None, recover=True, compact=True):

323

"""Create HTML parser with specified options."""

324

325

def get_default_parser():

326

"""Get current default parser."""

327

328

def set_default_parser(parser):

329

"""Set global default parser."""

330

```

331

332

### Tree Manipulation Utilities

333

334

High-level functions for common tree modification operations.

335

336

```python { .api }

337

def cleanup_namespaces(tree_or_element):

338

"""Remove unused namespace declarations."""

339

340

def strip_attributes(tree_or_element, *attribute_names):

341

"""Remove specified attributes from all elements."""

342

343

def strip_elements(tree_or_element, *tag_names, with_tail=True):

344

"""Remove elements with specified tag names."""

345

346

def strip_tags(tree_or_element, *tag_names):

347

"""Remove tags but keep text content."""

348

349

def register_namespace(prefix, uri):

350

"""Register namespace prefix for serialization."""

351

```

352

353

### Node Type Classes

354

355

Specialized classes for different XML node types.

356

357

```python { .api }

358

class Comment:

359

"""XML comment node."""

360

def __init__(self, text=None): ...

361

362

class ProcessingInstruction:

363

"""XML processing instruction node."""

364

def __init__(self, target, text=None): ...

365

366

@property

367

def target(self) -> str: ...

368

369

class Entity:

370

"""XML entity reference node."""

371

def __init__(self, name): ...

372

373

@property

374

def name(self) -> str: ...

375

376

class CDATA:

377

"""XML CDATA section."""

378

def __init__(self, data): ...

379

380

# Factory functions

381

def Comment(text=None):

382

"""Create comment node."""

383

384

def ProcessingInstruction(target, text=None):

385

"""Create processing instruction node."""

386

387

PI = ProcessingInstruction # Alias

388

```

389

390

## Usage Examples

391

392

### Basic XML Processing

393

394

```python

395

from lxml import etree

396

397

# Parse XML document

398

xml_data = '''<?xml version="1.0"?>

399

<catalog>

400

<book id="1" category="fiction">

401

<title>The Great Gatsby</title>

402

<author>F. Scott Fitzgerald</author>

403

<year>1925</year>

404

<price currency="USD">12.99</price>

405

</book>

406

<book id="2" category="science">

407

<title>A Brief History of Time</title>

408

<author>Stephen Hawking</author>

409

<year>1988</year>

410

<price currency="USD">15.99</price>

411

</book>

412

</catalog>'''

413

414

root = etree.fromstring(xml_data)

415

416

# Navigate and query

417

books = root.findall('book')

418

fiction_books = root.xpath('//book[@category="fiction"]')

419

titles = root.xpath('//title/text()')

420

421

# Modify content

422

new_book = etree.SubElement(root, 'book', id="3", category="mystery")

423

etree.SubElement(new_book, 'title').text = "The Murder Mystery"

424

etree.SubElement(new_book, 'author').text = "Agatha Christie"

425

etree.SubElement(new_book, 'year').text = "1934"

426

price_elem = etree.SubElement(new_book, 'price', currency="USD")

427

price_elem.text = "11.99"

428

429

# Serialize with formatting

430

output = etree.tostring(root, pretty_print=True, encoding='unicode')

431

print(output)

432

```

433

434

### HTML Document Processing

435

436

```python

437

from lxml import etree

438

439

# Sample HTML document (an XML parser would require it to be well-formed)

440

html_data = '''<!DOCTYPE html>

441

<html>

442

<head>

443

<title>Sample Page</title>

444

<meta charset="UTF-8"/>

445

</head>

446

<body>

447

<h1>Welcome</h1>

448

<div class="content">

449

<p>This is a paragraph.</p>

450

<ul>

451

<li>Item 1</li>

452

<li>Item 2</li>

453

</ul>

454

</div>

455

</body>

456

</html>'''

457

458

# Use HTML parser for lenient parsing

459

parser = etree.HTMLParser()

460

doc = etree.fromstring(html_data, parser)

461

462

# Find elements

463

title = doc.find('.//title').text

464

content_div = doc.find('.//div[@class="content"]')

465

list_items = doc.xpath('//li/text()')

466

467

print(f"Title: {title}")

468

print(f"List items: {list_items}")

469

```

470

471

### Error Handling

472

473

```python

474

from lxml import etree

475

476

try:

477

# This will raise XMLSyntaxError due to unclosed tag

478

bad_xml = '<root><child></root>'

479

etree.fromstring(bad_xml)

480

except etree.XMLSyntaxError as e:

481

print(f"XML Error: {e}")

482

print(f"Line: {e.lineno}, Column: {e.offset}")

483

484

# Use recovery parser for malformed XML

485

try:

486

parser = etree.XMLParser(recover=True)

487

root = etree.fromstring(bad_xml, parser)

488

print("Recovered:", etree.tostring(root, encoding='unicode'))

489

except Exception as e:

490

print(f"Recovery failed: {e}")

491

```