or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

content.md · index.md · modification.md · navigation.md · output.md · parsing.md · search.md

docs/parsing.md

0

# Core Parsing

1

2

Primary BeautifulSoup class for parsing HTML and XML documents with configurable parser backends and automatic encoding detection. Handles malformed markup gracefully while providing access to the complete parse tree.

3

4

## Capabilities

5

6

### BeautifulSoup Parser

7

8

The main parsing class that converts HTML/XML markup into a navigable parse tree using pluggable parser backends.

9

10

```python { .api }

11

class BeautifulSoup(Tag):

12

def __init__(self, markup="", features=None, builder=None,

13

parse_only=None, from_encoding=None, **kwargs):

14

"""

15

Parse HTML/XML markup into a navigable tree structure.

16

17

Parameters:

18

- markup: str, bytes, or file-like object containing HTML/XML

19

- features: str or list, parser features ('html.parser', 'lxml', 'html5lib', 'xml')

20

- builder: TreeBuilder instance (alternative to features)

21

- parse_only: SoupStrainer to parse only matching elements

22

- from_encoding: str, character encoding to assume for markup

23

- **kwargs: deprecated arguments from BeautifulSoup 3.x

24

25

Examples:

26

- BeautifulSoup(html_string, 'html.parser')

27

- BeautifulSoup(xml_string, 'lxml-xml')

28

- BeautifulSoup(markup, 'html5lib')

29

"""

30

```

31

32

Usage Examples:

33

34

```python

35

# Parse HTML with different parsers

36

from bs4 import BeautifulSoup

37

38

html = '<html><body><p>Hello</p></body></html>'

39

40

# Built-in HTML parser (slower but always available)

41

soup = BeautifulSoup(html, 'html.parser')

42

43

# lxml parser (faster, requires lxml package)

44

soup = BeautifulSoup(html, 'lxml')

45

46

# html5lib parser (most lenient, handles HTML5)

47

soup = BeautifulSoup(html, 'html5lib')

48

49

# XML parsing with lxml

50

xml = '<?xml version="1.0"?><root><item>data</item></root>'

51

soup = BeautifulSoup(xml, 'xml') # or 'lxml-xml'

52

53

# Parse from file

54

with open('document.html', 'r') as f:

55

soup = BeautifulSoup(f, 'html.parser')

56

57

# Parse with encoding specification

58

soup = BeautifulSoup(markup_bytes, 'html.parser', from_encoding='utf-8')

59

```

60

61

### Element Creation

62

63

Create new tags and strings that are associated with the soup object and can be inserted into the parse tree.

64

65

```python { .api }

66

def new_tag(self, name, namespace=None, nsprefix=None, **attrs):

67

"""

68

Create a new Tag associated with this soup.

69

70

Parameters:

71

- name: str, tag name

72

- namespace: str, XML namespace URI

73

- nsprefix: str, XML namespace prefix

74

- **attrs: tag attributes as keyword arguments

75

76

Returns:

77

Tag instance ready for insertion into parse tree

78

"""

79

80

def new_string(self, s, subclass=NavigableString):

81

"""

82

Create a new NavigableString associated with this soup.

83

84

Parameters:

85

- s: str, string content

86

- subclass: NavigableString subclass (Comment, CData, etc.)

87

88

Returns:

89

NavigableString instance ready for insertion

90

"""

91

92

def decode(self, pretty_print=False, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):

93

"""

94

Render the entire soup as Unicode string.

95

96

Parameters:

97

- pretty_print: bool - format with indentation (default: False)

98

- eventual_encoding: str - encoding for XML declaration if XML (default: "utf-8")

99

- formatter: str or function - entity formatting ("minimal", "html", "xml")

100

101

Returns:

102

str - Complete document as Unicode string

103

104

Note: BeautifulSoup.decode() differs from Tag.decode(): its first positional parameter is pretty_print, whereas Tag.decode() takes indent_level first

105

"""

106

```

107

108

Usage Examples:

109

110

```python

111

from bs4 import BeautifulSoup, Comment

112

113

soup = BeautifulSoup('<html><body></body></html>', 'html.parser')

114

115

# Create new tag with attributes

116

new_div = soup.new_tag('div', id='main', **{'class': 'container'})  # new_tag does not translate class_ to class

117

new_div.string = 'Content here'

118

119

# Create with namespace (XML)

120

new_item = soup.new_tag('item', namespace='http://example.com/ns')

121

122

# Create navigable string

123

new_text = soup.new_string('Some text content')

124

125

# Create comment

126

new_comment = soup.new_string('This is a comment', Comment)

127

128

# Insert into tree

129

soup.body.append(new_div)

130

soup.body.append(new_comment)

131

```

132

133

### Parsing Options

134

135

Control parsing behavior with features, filters, and encoding options.

136

137

```python { .api }

138

# Parser features (can be combined)

139

features = [

140

'html.parser', # Built-in Python HTML parser

141

'lxml', # lxml HTML parser (fast)

142

'lxml-xml', # lxml XML parser

143

'xml', # XML parsing mode (alias for lxml-xml)

'html5lib', # html5lib parser (lenient)

'html', # Generic HTML parsing mode

147

'fast', # Prefer faster parsers

148

'permissive' # Handle malformed markup

149

]

150

151

# Parse only specific elements

152

from bs4 import SoupStrainer

153

154

# Only parse div tags with class 'content'

155

parse_only = SoupStrainer('div', class_='content')

156

soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)

157

158

# Only parse links

159

parse_only = SoupStrainer('a')

160

soup = BeautifulSoup(markup, 'html.parser', parse_only=parse_only)

161

```

162

163

### Parser Information

164

165

Access information about the parser used and document characteristics.

166

167

```python { .api }

168

# Parser properties

169

soup.builder # TreeBuilder instance used

170

soup.is_xml # Boolean, True if XML parser was used

171

soup.original_encoding # Detected encoding of source markup

172

soup.declared_html_encoding # Encoding declared in HTML meta tags

173

soup.contains_replacement_characters # Whether encoding conversion lost data

174

```

175

176

### Error Handling

177

178

Handle parsing errors and invalid markup gracefully.

179

180

```python { .api }

181

class FeatureNotFound(ValueError):

182

"""Raised when requested parser features are not available"""

183

184

class ParserRejectedMarkup(Exception):

185

"""Raised when parser cannot handle the provided markup"""

186

```

187

188

Usage Examples:

189

190

```python

191

from bs4 import BeautifulSoup, FeatureNotFound

192

193

try:

194

# This will fail if lxml is not installed

195

soup = BeautifulSoup(markup, 'lxml')

196

except FeatureNotFound:

197

# Fall back to built-in parser

198

soup = BeautifulSoup(markup, 'html.parser')

199

200

# Handle malformed markup

201

malformed_html = '<html><body><p>Unclosed paragraph<div>Mixed nesting</body></html>'

202

soup = BeautifulSoup(malformed_html, 'html.parser') # Parses successfully

203

```

204

205

### Diagnostic Functions

206

207

Debug parsing issues and compare parser performance with diagnostic utilities.

208

209

```python { .api }

210

def diagnose(data):

211

"""

212

Comprehensive diagnostic suite for troubleshooting parsing issues.

213

214

Tests multiple parsers on the same data and shows results and errors.

215

Useful for tech support and debugging parser selection problems.

216

217

Parameters:

218

- data: str, bytes, file-like object, or filename to parse

219

220

Prints diagnostic information including:

221

- Beautiful Soup version and Python version

222

- Available parsers and their versions

223

- Parse results from each parser

224

- Exception traces for failed parsers

225

"""

226

227

def lxml_trace(data, html=True, **kwargs):

228

"""

229

Print lxml parsing events to see raw parser behavior.

230

231

Shows the underlying lxml events during parsing without Beautiful Soup.

232

233

Parameters:

234

- data: str - markup to parse

235

- html: bool - use HTML parser mode (default: True)

236

- **kwargs: additional lxml parser options

237

238

Prints events in format: "event, tag, text"

239

"""

240

241

def htmlparser_trace(data):

242

"""

243

Print HTMLParser events to see raw parser behavior.

244

245

Shows the underlying HTMLParser events during parsing without Beautiful Soup.

246

247

Parameters:

248

- data: str - markup to parse

249

250

Prints events like: "TAG START", "DATA", "TAG END"

251

"""

252

253

def benchmark_parsers(num_elements=100000):

254

"""

255

Basic performance benchmark comparing available parsers.

256

257

Generates a large invalid HTML document and times parsing with

258

different parser backends to compare performance.

259

260

Parameters:

261

- num_elements: int - size of generated test document

262

263

Prints timing results for each available parser

264

"""

265

266

def profile(num_elements=100000, parser="lxml"):

267

"""

268

Profile Beautiful Soup parsing performance in detail.

269

270

Uses cProfile to analyze where time is spent during parsing.

271

272

Parameters:

273

- num_elements: int - size of generated test document

274

- parser: str - parser to profile ("lxml", "html.parser", etc.)

275

276

Returns profile statistics for analysis

277

"""

278

```

279

280

Usage Examples:

281

282

```python

283

from bs4.diagnose import diagnose, lxml_trace, htmlparser_trace, benchmark_parsers

284

285

# Debug parsing problems

286

problematic_html = '<html><body><p>Malformed HTML...'

287

diagnose(problematic_html)

288

289

# Compare parser performance

290

benchmark_parsers(50000)

291

292

# See raw parser events

293

lxml_trace('<p>Hello <b>world</b></p>')

294

htmlparser_trace('<p>Hello <em>world</em></p>')

295

296

# Profile for performance optimization

297

from bs4.diagnose import profile

298

profile(100000, 'lxml')

299

```

300

301

### Builder and Parser Configuration

302

303

Advanced parser configuration and tree builder architecture for customizing parsing behavior.

304

305

```python { .api }

306

class TreeBuilder:

307

"""

308

Base class for parser backends that convert markup into Beautiful Soup trees.

309

310

Used internally by BeautifulSoup to abstract different parser implementations.

311

"""

312

features = [] # List of supported feature strings

313

is_xml = False # Whether this parser handles XML

314

preserve_whitespace_tags = set() # Tags that preserve whitespace

315

empty_element_tags = None # Tags that can be self-closing

316

cdata_list_attributes = {} # Attributes containing space-separated lists

317

318

class HTMLTreeBuilder(TreeBuilder):

319

"""

320

Base class for HTML-specific tree builders.

321

322

Defines HTML-specific parsing behavior and tag characteristics.

323

"""

324

preserve_whitespace_tags = {'pre', 'textarea'}

325

empty_element_tags = {'br', 'hr', 'input', 'img', 'meta', 'spacer', 'link', 'frame', 'base'}

326

327

class TreeBuilderRegistry:

328

"""

329

Registry for managing available parser backends.

330

331

Automatically selects appropriate parsers based on requested features.

332

"""

333

def register(self, treebuilder_class): ...

334

def lookup(self, *features): ...

335

336

# Parser feature constants

337

FAST = 'fast'

338

PERMISSIVE = 'permissive'

339

STRICT = 'strict'

340

XML = 'xml'

341

HTML = 'html'

342

HTML_5 = 'html5'

343

344

# Global parser registry

345

builder_registry = TreeBuilderRegistry()

346

```

347

348

### Encoding Detection and Processing

349

350

Handle character encoding detection and entity processing.

351

352

```python { .api }

353

class UnicodeDammit:

354

"""

355

Automatic character encoding detection and conversion to Unicode.

356

357

Handles encoding detection from HTML meta tags, XML declarations,

358

byte order marks, and statistical analysis of byte patterns.

359

"""

360

def __init__(self, markup, override_encodings=[], smart_quotes_to=None,
             is_html=False, exclude_encodings=[]): ...

362

363

@property

364

def unicode_markup(self): ... # Converted Unicode string

365

@property

366

def original_encoding(self): ... # Detected source encoding

367

368

class EntitySubstitution:

369

"""

370

HTML and XML entity encoding and decoding utilities.

371

372

Handles conversion between Unicode characters and HTML/XML entities.

373

"""

374

@classmethod

375

def substitute_html(cls, s): ... # Convert to HTML entities

376

@classmethod

377

def substitute_xml(cls, s): ... # Convert to XML entities

378

@classmethod

379

def quoted_attribute_value(cls, value): ... # Quote attribute values

380

381

class HTMLAwareEntitySubstitution(EntitySubstitution):

382

"""

383

Entity substitution that preserves script and style tag contents.

384

385

Avoids entity conversion in script and style tags where it would

386

break JavaScript or CSS code.

387

"""

388

cdata_containing_tags = {'script', 'style'}

389

preformatted_tags = {'pre'}

390

```

391

392

Usage Examples:

393

394

```python

395

from bs4.builder import builder_registry, FAST, PERMISSIVE

396

from bs4.dammit import UnicodeDammit, EntitySubstitution

397

398

# Check available parsers

399

available_parsers = []

400

for builder in builder_registry.builders:

401

available_parsers.append(builder.features)

402

print("Available parsers:", available_parsers)

403

404

# Manual encoding detection

405

raw_data = b'<html><meta charset="latin1"><body>Caf\xe9</body></html>'

406

dammit = UnicodeDammit(raw_data)

407

print("Detected encoding:", dammit.original_encoding)

408

print("Unicode markup:", dammit.unicode_markup)

409

410

# Entity handling

411

text_with_entities = "R&D <division> & \"innovation\""

412

html_entities = EntitySubstitution.substitute_html(text_with_entities)

413

xml_entities = EntitySubstitution.substitute_xml(text_with_entities)

414

print("HTML entities:", html_entities)

415

print("XML entities:", xml_entities)

416

417

# Parser feature lookup

418

fast_parser = builder_registry.lookup(FAST)

419

permissive_html_parser = builder_registry.lookup(PERMISSIVE, 'html')

420

```