or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

content.md · index.md · modification.md · navigation.md · output.md · parsing.md · search.md

docs/output.md

0

# Output and Serialization

1

2

Render parse tree elements as formatted HTML/XML with encoding control, pretty-printing, and entity substitution options. Beautiful Soup provides flexible output methods for converting parse trees back to markup strings with various formatting and encoding options.

3

4

## Capabilities

5

6

### Basic Output Methods

7

8

Convert elements to string representations with different encoding and formatting options.

9

10

```python { .api }

11

def __str__(self):

12

"""

13

Default string representation; returns the markup as a Unicode str (no byte encoding is applied).

14

15

Returns:

16

str - HTML/XML markup

17

"""

18

19

def __unicode__(self):

20

"""

21

Unicode string representation (Python 2 compatibility).

22

23

Returns:

24

unicode - HTML/XML markup

25

"""

26

27

def encode(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):

28

"""

29

Render element to bytes with specified encoding.

30

31

Parameters:

32

- encoding: str - character encoding (default: "utf-8")

33

- indent_level: int or None - indentation level for pretty printing

34

- formatter: str or function - entity formatting ("minimal", "html", "xml", or custom)

35

- errors: str - encoding error handling ("xmlcharrefreplace", "strict", etc.)

36

37

Returns:

38

bytes - encoded markup

39

"""

40

41

def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):

42

"""

43

Render element to Unicode string.

44

45

Parameters:

46

- indent_level: int or None - indentation level for pretty printing

47

- eventual_encoding: str - encoding for XML declaration (XML only)

48

- formatter: str or function - entity formatting

49

50

Returns:

51

str - Unicode markup

52

"""

53

```

54

55

Usage Examples:

56

57

```python

58

from bs4 import BeautifulSoup

59

60

html = '<div><p>Hello <em>world</em>!</p></div>'

61

soup = BeautifulSoup(html, 'html.parser')

62

63

div = soup.find('div')

64

65

# Basic string conversion

66

print(str(div)) # <div><p>Hello <em>world</em>!</p></div>

67

68

# Encode to bytes

69

utf8_bytes = div.encode('utf-8')

70

print(type(utf8_bytes)) # <class 'bytes'>

71

72

latin1_bytes = div.encode('latin-1')

73

ascii_bytes = div.encode('ascii', errors='xmlcharrefreplace')

74

75

# Decode to Unicode string

76

unicode_str = div.decode()

77

print(type(unicode_str)) # <class 'str'>

78

79

# With different encodings in XML

80

xml = '<?xml version="1.0"?><root><item>content</item></root>'

81

xml_soup = BeautifulSoup(xml, 'xml')

82

xml_output = xml_soup.decode(eventual_encoding='iso-8859-1')

83

print(xml_output) # Includes encoding declaration

84

```

85

86

### Pretty Printing

87

88

Format output with indentation and line breaks for human readability.

89

90

```python { .api }

91

def prettify(self, encoding=None, formatter="minimal"):

92

"""

93

Render with pretty formatting (indentation and line breaks).

94

95

Parameters:

96

- encoding: str or None - if specified, return bytes; if None, return str

97

- formatter: str or function - entity formatting

98

99

Returns:

100

str or bytes - formatted markup

101

"""

102

103

# Pretty printing uses these rules:

104

# - Each tag gets its own line

105

# - Child elements are indented

106

# - Each text string is placed on its own line (text is not re-wrapped)

107

# - Empty tags use minimal formatting

108

```

109

110

Usage Examples:

111

112

```python

113

html = '<html><head><title>Page</title></head><body><div class="content"><p>Paragraph 1</p><p>Paragraph 2</p></div></body></html>'

114

soup = BeautifulSoup(html, 'html.parser')

115

116

# Pretty print as string

117

pretty_str = soup.prettify()

118

print(pretty_str)

119

# Output:

120

# <html>

121

# <head>

122

# <title>

123

# Page

124

# </title>

125

# </head>

126

# <body>

127

# <div class="content">

128

# <p>

129

# Paragraph 1

130

# </p>

131

# <p>

132

# Paragraph 2

133

# </p>

134

# </div>

135

# </body>

136

# </html>

137

138

# Pretty print as bytes

139

pretty_bytes = soup.prettify(encoding='utf-8')

140

print(type(pretty_bytes)) # <class 'bytes'>

141

142

# Pretty print specific elements

143

div = soup.find('div')

144

print(div.prettify())

145

# <div class="content">

146

# <p>

147

# Paragraph 1

148

# </p>

149

# <p>

150

# Paragraph 2

151

# </p>

152

# </div>

153

```

154

155

### Content-Only Output

156

157

Render just the contents of elements without the container tags.

158

159

```python { .api }

160

def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):

161

"""

162

Render only the contents (children) as Unicode string.

163

164

Parameters:

165

- indent_level: int or None - indentation level

166

- eventual_encoding: str - encoding for XML declaration

167

- formatter: str or function - entity formatting

168

169

Returns:

170

str - contents as Unicode markup

171

"""

172

173

def encode_contents(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):

174

"""

175

Render only the contents (children) as bytes.

176

177

Parameters:

178

- encoding: str - character encoding

179

- indent_level: int or None - indentation level

180

- formatter: str or function - entity formatting

181

- errors: str - encoding error handling

182

183

Returns:

184

bytes - contents as encoded markup

185

"""

186

```

187

188

Usage Examples:

189

190

```python

191

html = '<div class="wrapper"><p>Content 1</p><p>Content 2</p></div>'

192

soup = BeautifulSoup(html, 'html.parser')

193

194

div = soup.find('div')

195

196

# Full element output

197

print(div.decode())

198

# <div class="wrapper"><p>Content 1</p><p>Content 2</p></div>

199

200

# Contents only (without wrapper div)

201

print(div.decode_contents())

202

# <p>Content 1</p><p>Content 2</p>

203

204

# Contents as bytes

205

contents_bytes = div.encode_contents('utf-8')

206

print(contents_bytes.decode('utf-8'))

207

# <p>Content 1</p><p>Content 2</p>

208

209

# Useful for template replacement

210

template = '<html><body>{content}</body></html>'

211

content = div.decode_contents()

212

final_html = template.format(content=content)

213

```

214

215

### Entity Formatting

216

217

Control how special characters and entities are handled in output.

218

219

```python { .api }

220

# Formatter options

221

formatters = {

222

"minimal": "Escape only <, >, & and quotes in attributes",

223

"html": "Use HTML entity names where possible",

224

"xml": "Use XML entities only (&lt;, &gt;, &amp;, &quot;, &apos;)",

225

None: "No entity substitution",

226

callable: "Custom formatter function"

227

}

228

229

# Custom formatter signature

230

def custom_formatter(string):

231

"""

232

Custom entity substitution function.

233

234

Parameters:

235

- string: str - string to format

236

237

Returns:

238

str - formatted string

239

"""

240

```

241

242

Usage Examples:

243

244

```python

245

from bs4 import BeautifulSoup

246

from bs4.dammit import EntitySubstitution

247

248

html = '<div title="Ben & Jerry\'s">Price: $5 < $10</div>'

249

soup = BeautifulSoup(html, 'html.parser')

250

div = soup.find('div')

251

252

# Minimal formatting (default)

253

print(div.encode(formatter="minimal").decode())

254

# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>

255

256

# HTML entity formatting

257

print(div.encode(formatter="html").decode())

258

# Uses HTML entity names where available

259

260

# XML entity formatting

261

print(div.encode(formatter="xml").decode())

262

# <div title="Ben &amp; Jerry&apos;s">Price: $5 &lt; $10</div>

263

264

# No entity substitution

265

print(div.encode(formatter=None).decode())

266

# <div title="Ben & Jerry's">Price: $5 < $10</div>

267

268

# Custom formatter

269

def quote_formatter(s):

270

return s.replace('"', '&quot;').replace("'", '&#x27;')

271

272

print(div.encode(formatter=quote_formatter).decode())

273

274

# Using EntitySubstitution directly

275

formatted = EntitySubstitution.substitute_html('Ben & Jerry\'s <script>')

276

print(formatted) # Ben &amp; Jerry's &lt;script&gt;

277

```

278

279

### Encoding Handling

280

281

Control character encoding in output with proper error handling.

282

283

```python { .api }

284

# Encoding options

285

encoding_options = [

286

"utf-8", # Unicode encoding (default)

287

"ascii", # ASCII with entity fallback

288

"latin-1", # ISO 8859-1

289

"cp1252", # Windows encoding

290

None # Return Unicode string

291

]

292

293

# Error handling modes

294

error_modes = [

295

"xmlcharrefreplace", # Replace with XML entities (default)

296

"strict", # Raise exception on encoding errors

297

"ignore", # Skip unencodable characters

298

"replace" # Replace with ? character

299

]

300

```

301

302

Usage Examples:

303

304

```python

305

html = '<div>Unicode: café, naïve, résumé</div>'

306

soup = BeautifulSoup(html, 'html.parser')

307

div = soup.find('div')

308

309

# UTF-8 encoding (handles all Unicode)

310

utf8 = div.encode('utf-8')

311

print(utf8.decode('utf-8')) # café, naïve, résumé

312

313

# ASCII with XML character references

314

ascii_xml = div.encode('ascii', errors='xmlcharrefreplace')

315

print(ascii_xml.decode('ascii')) # caf&#233;, na&#239;ve, r&#233;sum&#233;

316

317

# Latin-1 (handles some accented characters)

318

try:

319

latin1 = div.encode('latin-1')

320

print(latin1.decode('latin-1')) # café, naïve, résumé

321

except UnicodeEncodeError:

322

print("Some characters not encodable in Latin-1")

323

324

# Handle encoding errors

325

ascii_ignore = div.encode('ascii', errors='ignore')

326

print(ascii_ignore.decode('ascii')) # caf, nave, rsum

327

328

ascii_replace = div.encode('ascii', errors='replace')

329

print(ascii_replace.decode('ascii')) # caf?, na?ve, r?sum?

330

```

331

332

### XML Declaration Handling

333

334

Control XML declaration output for XML documents.

335

336

```python { .api }

337

# XML-specific output features

338

def decode(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING):

339

"""

340

For XML documents, includes <?xml version="1.0" encoding="..."?> declaration.

341

342

Parameters:

343

- eventual_encoding: str - encoding to declare in XML header

344

"""

345

346

# XML declaration is automatically added for:

347

# - BeautifulSoup objects parsed with XML parser

348

# - When is_xml property is True

349

```

350

351

Usage Examples:

352

353

```python

354

xml = '<root><item>content</item></root>'

355

356

# Parse as XML

357

xml_soup = BeautifulSoup(xml, 'xml')

358

print(xml_soup.decode())

359

# <?xml version="1.0" encoding="utf-8"?>

360

# <root><item>content</item></root>

361

362

# Specify encoding in declaration

363

print(xml_soup.decode(eventual_encoding='iso-8859-1'))

364

# <?xml version="1.0" encoding="iso-8859-1"?>

365

# <root><item>content</item></root>

366

367

# Parse as HTML (no XML declaration)

368

html_soup = BeautifulSoup(xml, 'html.parser')

369

print(html_soup.decode())

370

# <root><item>content</item></root>

371

```

372

373

### Output Utilities

374

375

Helper functions and patterns for common output scenarios.

376

377

```python { .api }

378

# Common output patterns

379

380

def save_to_file(soup, filename, encoding='utf-8'):
    """
    Save soup to a text file, keeping the declared and actual encodings in sync.

    Parameters:
    - soup: object providing decode(eventual_encoding=...) (e.g. a BeautifulSoup object)
    - filename: str - path of the file to create or overwrite
    - encoding: str - encoding used to write the file (default: "utf-8")

    Returns:
    None
    """
    # Pass the target encoding through so that, for XML documents, the
    # <?xml ... encoding="..."?> declaration names the encoding the file is
    # actually written in instead of the decode() default.
    # Render before opening the file so a failure here does not truncate an
    # existing file.
    markup = soup.decode(eventual_encoding=encoding)
    with open(filename, 'w', encoding=encoding) as f:
        f.write(markup)

384

385

def get_text_content(element, separator=' '):
    """
    Join the element's stripped_strings into a single string.

    Parameters:
    - element: object exposing a stripped_strings iterable of str
    - separator: str - text placed between consecutive strings (default: " ")

    Returns:
    str - the joined strings
    """
    text_pieces = element.stripped_strings
    return separator.join(text_pieces)

388

389

def minify_html(soup):
    """
    Remove extra whitespace from HTML.

    Line-break whitespace between adjacent tags is dropped; every other run
    of whitespace (including a newline inside text) is collapsed to a single
    space, so words split across lines are not fused together.

    Parameters:
    - soup: object whose str() is the markup to minify (a plain str also works)

    Returns:
    str - markup with redundant whitespace removed

    Note: whitespace is collapsed everywhere, including inside elements such
    as <pre> where it may be significant.
    """
    import re  # local import: this snippet has no import block of its own

    markup = str(soup)
    # Indentation/newlines between tags carry no content: remove them entirely.
    markup = re.sub(r'>\s*\n\s*<', '><', markup)
    # Anything left (spaces, tabs, newlines inside text) becomes one space.
    return re.sub(r'\s+', ' ', markup).strip()

392

```

393

394

Usage Examples:

395

396

```python

397

import os

398

399

html = '''

400

<html>

401

<head>

402

<title>Sample Page</title>

403

</head>

404

<body>

405

<h1>Main Title</h1>

406

<p>Content paragraph with <em>emphasis</em>.</p>

407

</body>

408

</html>

409

'''

410

411

soup = BeautifulSoup(html, 'html.parser')

412

413

# Save formatted HTML to file

414

with open('output.html', 'w', encoding='utf-8') as f:

415

f.write(soup.prettify())

416

417

# Save minified HTML

418

minified = str(soup).replace('\n', '').replace(' ', ' ')

419

with open('minified.html', 'w', encoding='utf-8') as f:

420

f.write(minified)

421

422

# Extract and save text content only

423

text_content = soup.get_text('\n', strip=True)

424

with open('content.txt', 'w', encoding='utf-8') as f:

425

f.write(text_content)

426

427

# Convert to different encodings

428

for encoding in ['utf-8', 'latin-1', 'ascii']:

429

try:

430

filename = f'output_{encoding}.html'

431

with open(filename, 'wb') as f:

432

f.write(soup.encode(encoding))

433

print(f"Saved {filename}")

434

except UnicodeEncodeError as e:

435

print(f"Cannot encode as {encoding}: {e}")

436

437

# Clean up files

438

for f in ['output.html', 'minified.html', 'content.txt']:

439

if os.path.exists(f):

440

os.remove(f)

441

```