0
# Output and Serialization
1
2
Render parse tree elements as formatted HTML/XML with encoding control, pretty-printing, and entity substitution options. Beautiful Soup provides flexible output methods for converting parse trees back to markup strings with various formatting and encoding options.
3
4
## Capabilities
5
6
### Basic Output Methods
7
8
Convert elements to string representations with different encoding and formatting options.
9
10
```python { .api }
11
def __str__(self):
12
"""
13
Default string representation using UTF-8 encoding.
14
15
Returns:
16
str - HTML/XML markup
17
"""
18
19
def __unicode__(self):
20
"""
21
Unicode string representation (Python 2 compatibility).
22
23
Returns:
24
unicode - HTML/XML markup
25
"""
26
27
def encode(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):
28
"""
29
Render element to bytes with specified encoding.
30
31
Parameters:
32
- encoding: str - character encoding (default: "utf-8")
33
- indent_level: int or None - indentation level for pretty printing
34
- formatter: str or function - entity formatting ("minimal", "html", "xml", or custom)
35
- errors: str - encoding error handling ("xmlcharrefreplace", "strict", etc.)
36
37
Returns:
38
bytes - encoded markup
39
"""
40
41
def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
42
"""
43
Render element to Unicode string.
44
45
Parameters:
46
- indent_level: int or None - indentation level for pretty printing
47
- eventual_encoding: str - encoding for XML declaration (XML only)
48
- formatter: str or function - entity formatting
49
50
Returns:
51
str - Unicode markup
52
"""
53
```
54
55
Usage Examples:
56
57
```python
58
from bs4 import BeautifulSoup
59
60
html = '<div><p>Hello <em>world</em>!</p></div>'
61
soup = BeautifulSoup(html, 'html.parser')
62
63
div = soup.find('div')
64
65
# Basic string conversion
66
print(str(div)) # <div><p>Hello <em>world</em>!</p></div>
67
68
# Encode to bytes
69
utf8_bytes = div.encode('utf-8')
70
print(type(utf8_bytes)) # <class 'bytes'>
71
72
latin1_bytes = div.encode('latin-1')
73
ascii_bytes = div.encode('ascii', errors='xmlcharrefreplace')
74
75
# Decode to Unicode string
76
unicode_str = div.decode()
77
print(type(unicode_str)) # <class 'str'>
78
79
# With different encodings in XML
80
xml = '<?xml version="1.0"?><root><item>content</item></root>'
81
xml_soup = BeautifulSoup(xml, 'xml')
82
xml_output = xml_soup.decode(eventual_encoding='iso-8859-1')
83
print(xml_output) # Includes encoding declaration
84
```
85
86
### Pretty Printing
87
88
Format output with indentation and line breaks for human readability.
89
90
```python { .api }
91
def prettify(self, encoding=None, formatter="minimal"):
92
"""
93
Render with pretty formatting (indentation and line breaks).
94
95
Parameters:
96
- encoding: str or None - if specified, return bytes; if None, return str
97
- formatter: str or function - entity formatting
98
99
Returns:
100
str or bytes - formatted markup
101
"""
102
103
# Pretty printing uses these rules:
104
# - Each tag gets its own line
105
# - Child elements are indented
106
# - Each text string is placed on its own line, indented under its parent tag
107
# - Empty tags use minimal formatting
108
```
109
110
Usage Examples:
111
112
```python
113
html = '<html><head><title>Page</title></head><body><div class="content"><p>Paragraph 1</p><p>Paragraph 2</p></div></body></html>'
114
soup = BeautifulSoup(html, 'html.parser')
115
116
# Pretty print as string
117
pretty_str = soup.prettify()
118
print(pretty_str)
119
# Output:
120
# <html>
121
# <head>
122
# <title>
123
# Page
124
# </title>
125
# </head>
126
# <body>
127
# <div class="content">
128
# <p>
129
# Paragraph 1
130
# </p>
131
# <p>
132
# Paragraph 2
133
# </p>
134
# </div>
135
# </body>
136
# </html>
137
138
# Pretty print as bytes
139
pretty_bytes = soup.prettify(encoding='utf-8')
140
print(type(pretty_bytes)) # <class 'bytes'>
141
142
# Pretty print specific elements
143
div = soup.find('div')
144
print(div.prettify())
145
# <div class="content">
146
# <p>
147
# Paragraph 1
148
# </p>
149
# <p>
150
# Paragraph 2
151
# </p>
152
# </div>
153
```
154
155
### Content-Only Output
156
157
Render just the contents of elements without the container tags.
158
159
```python { .api }
160
def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
161
"""
162
Render only the contents (children) as Unicode string.
163
164
Parameters:
165
- indent_level: int or None - indentation level
166
- eventual_encoding: str - encoding for XML declaration
167
- formatter: str or function - entity formatting
168
169
Returns:
170
str - contents as Unicode markup
171
"""
172
173
def encode_contents(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):
174
"""
175
Render only the contents (children) as bytes.
176
177
Parameters:
178
- encoding: str - character encoding
179
- indent_level: int or None - indentation level
180
- formatter: str or function - entity formatting
181
- errors: str - encoding error handling
182
183
Returns:
184
bytes - contents as encoded markup
185
"""
186
```
187
188
Usage Examples:
189
190
```python
191
html = '<div class="wrapper"><p>Content 1</p><p>Content 2</p></div>'
192
soup = BeautifulSoup(html, 'html.parser')
193
194
div = soup.find('div')
195
196
# Full element output
197
print(div.decode())
198
# <div class="wrapper"><p>Content 1</p><p>Content 2</p></div>
199
200
# Contents only (without wrapper div)
201
print(div.decode_contents())
202
# <p>Content 1</p><p>Content 2</p>
203
204
# Contents as bytes
205
contents_bytes = div.encode_contents('utf-8')
206
print(contents_bytes.decode('utf-8'))
207
# <p>Content 1</p><p>Content 2</p>
208
209
# Useful for template replacement
210
template = '<html><body>{content}</body></html>'
211
content = div.decode_contents()
212
final_html = template.format(content=content)
213
```
214
215
### Entity Formatting
216
217
Control how special characters and entities are handled in output.
218
219
```python { .api }
220
# Formatter options
221
formatters = {
222
"minimal": "Escape only <, >, & and quotes in attributes",
223
"html": "Use HTML entity names where possible",
224
"xml": "Use XML entities only (&lt;, &gt;, &amp;, &quot;, &apos;)",
225
None: "No entity substitution",
226
callable: "Custom formatter function"
227
}
228
229
# Custom formatter signature
230
def custom_formatter(string):
231
"""
232
Custom entity substitution function.
233
234
Parameters:
235
- string: str - string to format
236
237
Returns:
238
str - formatted string
239
"""
240
```
241
242
Usage Examples:
243
244
```python
245
from bs4 import BeautifulSoup
246
from bs4.dammit import EntitySubstitution
247
248
html = '<div title="Ben & Jerry\'s">Price: $5 < $10</div>'
249
soup = BeautifulSoup(html, 'html.parser')
250
div = soup.find('div')
251
252
# Minimal formatting (default)
253
print(div.encode(formatter="minimal").decode())
254
# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>
255
256
# HTML entity formatting
257
print(div.encode(formatter="html").decode())
258
# Uses HTML entity names where available
259
260
# XML entity formatting
261
print(div.encode(formatter="xml").decode())
262
# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>
263
264
# No entity substitution
265
print(div.encode(formatter=None).decode())
266
# <div title="Ben & Jerry's">Price: $5 < $10</div>
267
268
# Custom formatter
269
def quote_formatter(s):
    """
    Custom formatter that replaces quote characters with character references.

    Parameters:
    - s: str - string to format

    Returns:
    str - the string with " replaced by &quot; and ' replaced by &#x27;
    """
    # The replacement targets must be character references; replacing a quote
    # with the same literal quote would be a no-op.
    return s.replace('"', '&quot;').replace("'", '&#x27;')
271
272
print(div.encode(formatter=quote_formatter).decode())
273
274
# Using EntitySubstitution directly
275
formatted = EntitySubstitution.substitute_html('Ben & Jerry\'s <script>')
276
print(formatted) # Ben &amp; Jerry's &lt;script&gt;
277
```
278
279
### Encoding Handling
280
281
Control character encoding in output with proper error handling.
282
283
```python { .api }
284
# Encoding options
285
encoding_options = [
286
"utf-8", # Unicode encoding (default)
287
"ascii", # ASCII with entity fallback
288
"latin-1", # ISO 8859-1
289
"cp1252", # Windows encoding
290
None # Return Unicode string
291
]
292
293
# Error handling modes
294
error_modes = [
295
"xmlcharrefreplace", # Replace with XML entities (default)
296
"strict", # Raise exception on encoding errors
297
"ignore", # Skip unencodable characters
298
"replace" # Replace with ? character
299
]
300
```
301
302
Usage Examples:
303
304
```python
305
html = '<div>Unicode: café, naïve, résumé</div>'
306
soup = BeautifulSoup(html, 'html.parser')
307
div = soup.find('div')
308
309
# UTF-8 encoding (handles all Unicode)
310
utf8 = div.encode('utf-8')
311
print(utf8.decode('utf-8')) # café, naïve, résumé
312
313
# ASCII with XML character references
314
ascii_xml = div.encode('ascii', errors='xmlcharrefreplace')
315
print(ascii_xml.decode('ascii')) # caf&#233;, na&#239;ve, r&#233;sum&#233;
316
317
# Latin-1 (handles some accented characters)
318
try:
319
latin1 = div.encode('latin-1')
320
print(latin1.decode('latin-1')) # café, naïve, résumé
321
except UnicodeEncodeError:
322
print("Some characters not encodable in Latin-1")
323
324
# Handle encoding errors
325
ascii_ignore = div.encode('ascii', errors='ignore')
326
print(ascii_ignore.decode('ascii')) # caf, nave, rsum
327
328
ascii_replace = div.encode('ascii', errors='replace')
329
print(ascii_replace.decode('ascii')) # caf?, na?ve, r?sum?
330
```
331
332
### XML Declaration Handling
333
334
Control XML declaration output for XML documents.
335
336
```python { .api }
337
# XML-specific output features
338
def decode(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING):
339
"""
340
For XML documents, includes <?xml version="1.0" encoding="..."?> declaration.
341
342
Parameters:
343
- eventual_encoding: str - encoding to declare in XML header
344
"""
345
346
# XML declaration is automatically added for:
347
# - BeautifulSoup objects parsed with XML parser
348
# - When is_xml property is True
349
```
350
351
Usage Examples:
352
353
```python
354
xml = '<root><item>content</item></root>'
355
356
# Parse as XML
357
xml_soup = BeautifulSoup(xml, 'xml')
358
print(xml_soup.decode())
359
# <?xml version="1.0" encoding="utf-8"?>
360
# <root><item>content</item></root>
361
362
# Specify encoding in declaration
363
print(xml_soup.decode(eventual_encoding='iso-8859-1'))
364
# <?xml version="1.0" encoding="iso-8859-1"?>
365
# <root><item>content</item></root>
366
367
# Parse as HTML (no XML declaration)
368
html_soup = BeautifulSoup(xml, 'html.parser')
369
print(html_soup.decode())
370
# <root><item>content</item></root>
371
```
372
373
### Output Utilities
374
375
Helper functions and patterns for common output scenarios.
376
377
```python { .api }
378
# Common output patterns
379
380
def save_to_file(soup, filename, encoding='utf-8'):
    """
    Save soup to a text file, keeping the declared encoding in sync with the file.

    Parameters:
    - soup: object with a decode(eventual_encoding=...) method (document or tag)
    - filename: path of the file to create or overwrite
    - encoding: str - character encoding used to write the file (default: "utf-8")

    Raises:
    UnicodeEncodeError - if the markup contains characters that the chosen
    encoding cannot represent (the file is opened with strict error handling)
    """
    # Pass the target encoding to decode() so that an XML declaration in the
    # markup names the same encoding the file is actually written in.
    # Rendering happens before the file is opened, so a rendering failure
    # does not leave a truncated file behind.
    markup = soup.decode(eventual_encoding=encoding)
    with open(filename, 'w', encoding=encoding) as f:
        f.write(markup)
384
385
def get_text_content(element, separator=' '):
    """
    Extract clean text content.

    Parameters:
    - element: object exposing a stripped_strings iterable of text fragments
    - separator: str - string placed between consecutive fragments (default: " ")

    Returns:
    str - the fragments from element.stripped_strings joined by separator
    """
    return separator.join(element.stripped_strings)
388
389
def minify_html(soup):
    """
    Remove extra whitespace from HTML.

    Every run of whitespace (spaces, tabs, newlines) in str(soup) is collapsed
    to a single space, and leading/trailing whitespace is dropped.

    Parameters:
    - soup: object whose str() is the markup to minify

    Returns:
    str - markup with whitespace runs collapsed

    Note: whitespace is collapsed everywhere, including inside elements where
    it is significant (for example <pre> or <textarea>).
    """
    # Collapsing to one space (rather than deleting newlines outright) keeps
    # words that were separated only by a line break from being fused.
    return ' '.join(str(soup).split())
392
```
393
394
Usage Examples:
395
396
```python
397
import os
398
399
html = '''
400
<html>
401
<head>
402
<title>Sample Page</title>
403
</head>
404
<body>
405
<h1>Main Title</h1>
406
<p>Content paragraph with <em>emphasis</em>.</p>
407
</body>
408
</html>
409
'''
410
411
soup = BeautifulSoup(html, 'html.parser')
412
413
# Save formatted HTML to file
414
with open('output.html', 'w', encoding='utf-8') as f:
415
f.write(soup.prettify())
416
417
# Save minified HTML: collapse every run of whitespace to a single space so
# that words separated only by a line break are not fused together
minified = ' '.join(str(soup).split())
with open('minified.html', 'w', encoding='utf-8') as f:
    f.write(minified)
421
422
# Extract and save text content only
423
text_content = soup.get_text('\n', strip=True)
424
with open('content.txt', 'w', encoding='utf-8') as f:
425
f.write(text_content)
426
427
# Convert to different encodings
428
for encoding in ['utf-8', 'latin-1', 'ascii']:
429
try:
430
filename = f'output_{encoding}.html'
431
with open(filename, 'wb') as f:
432
f.write(soup.encode(encoding))
433
print(f"Saved {filename}")
434
except UnicodeEncodeError as e:
435
print(f"Cannot encode as {encoding}: {e}")
436
437
# Clean up files
438
for f in ['output.html', 'minified.html', 'content.txt']:
439
if os.path.exists(f):
440
os.remove(f)
441
```