Tessl Tile for pypi/beautifulsoup4@4.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

content.md index.md modification.md output.md parsing.md search.md

docs/output.md

0
# Output and Serialization
1

2
Render parse tree elements back to HTML/XML markup, either as Unicode strings or as encoded bytes. Beautiful Soup's output methods give you control over character encoding, pretty-printing, and entity substitution.
3

4
## Capabilities
5

6
### Basic Output Methods
7

8
Convert elements to string representations with different encoding and formatting options.
9

10
```python { .api }
11
def __str__(self):
12
    """
13
    Default string representation (a UTF-8 encoded bytestring on Python 2; a Unicode string on Python 3).
14
    
15
    Returns:
16
    str - HTML/XML markup
17
    """
18

19
def __unicode__(self):
20
    """
21
    Unicode string representation (Python 2 compatibility).
22
    
23
    Returns:
24
    unicode - HTML/XML markup
25
    """
26

27
def encode(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):
28
    """
29
    Render element to bytes with specified encoding.
30
    
31
    Parameters:
32
    - encoding: str - character encoding (default: "utf-8")
33
    - indent_level: int or None - indentation level for pretty printing
34
    - formatter: str or function - entity formatting ("minimal", "html", "xml", or custom)
35
    - errors: str - encoding error handling ("xmlcharrefreplace", "strict", etc.)
36
    
37
    Returns:
38
    bytes - encoded markup
39
    """
40

41
def decode(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
42
    """
43
    Render element to Unicode string.
44
    
45
    Parameters:
46
    - indent_level: int or None - indentation level for pretty printing  
47
    - eventual_encoding: str - encoding for XML declaration (XML only)
48
    - formatter: str or function - entity formatting
49
    
50
    Returns:
51
    str - Unicode markup
52
    """
53
```
54

55
Usage Examples:
56

57
```python
58
from bs4 import BeautifulSoup
59

60
html = '<div><p>Hello <em>world</em>!</p></div>'
61
soup = BeautifulSoup(html, 'html.parser')
62

63
div = soup.find('div')
64

65
# Basic string conversion
66
print(str(div))  # <div><p>Hello <em>world</em>!</p></div>
67

68
# Encode to bytes
69
utf8_bytes = div.encode('utf-8')
70
print(type(utf8_bytes))  # <class 'bytes'>
71

72
latin1_bytes = div.encode('latin-1')
73
ascii_bytes = div.encode('ascii', errors='xmlcharrefreplace')
74

75
# Decode to Unicode string  
76
unicode_str = div.decode()
77
print(type(unicode_str))  # <class 'str'>
78

79
# With different encodings in XML
80
xml = '<?xml version="1.0"?><root><item>content</item></root>'
81
xml_soup = BeautifulSoup(xml, 'xml')
82
xml_output = xml_soup.decode(eventual_encoding='iso-8859-1')
83
print(xml_output)  # Includes encoding declaration
84
```
85

86
### Pretty Printing
87

88
Format output with indentation and line breaks for human readability.
89

90
```python { .api }
91
def prettify(self, encoding=None, formatter="minimal"):
92
    """
93
    Render with pretty formatting (indentation and line breaks).
94
    
95
    Parameters:
96
    - encoding: str or None - if specified, return bytes; if None, return str
97
    - formatter: str or function - entity formatting
98
    
99
    Returns:
100
    str or bytes - formatted markup
101
    """
102

103
# Pretty printing uses these rules:
104
# - Each tag gets its own line
105
# - Child elements are indented
106
# - Each text string gets its own line, indented under its parent tag (text is not re-wrapped)
107
# - Empty tags use minimal formatting
108
```
109

110
Usage Examples:
111

112
```python
113
html = '<html><head><title>Page</title></head><body><div class="content"><p>Paragraph 1</p><p>Paragraph 2</p></div></body></html>'
114
soup = BeautifulSoup(html, 'html.parser')
115

116
# Pretty print as string
117
pretty_str = soup.prettify()
118
print(pretty_str)
119
# Output:
120
# <html>
121
#  <head>
122
#   <title>
123
#    Page
124
#   </title>
125
#  </head>
126
#  <body>
127
#   <div class="content">
128
#    <p>
129
#     Paragraph 1
130
#    </p>
131
#    <p>
132
#     Paragraph 2
133
#    </p>
134
#   </div>
135
#  </body>
136
# </html>
137

138
# Pretty print as bytes
139
pretty_bytes = soup.prettify(encoding='utf-8')
140
print(type(pretty_bytes))  # <class 'bytes'>
141

142
# Pretty print specific elements
143
div = soup.find('div')
144
print(div.prettify())
145
# <div class="content">
146
#  <p>
147
#   Paragraph 1
148
#  </p>
149
#  <p>
150
#   Paragraph 2
151
#  </p>
152
# </div>
153
```
154

155
### Content-Only Output
156

157
Render just the contents of elements without the container tags.
158

159
```python { .api }
160
def decode_contents(self, indent_level=None, eventual_encoding=DEFAULT_OUTPUT_ENCODING, formatter="minimal"):
161
    """
162
    Render only the contents (children) as Unicode string.
163
    
164
    Parameters:
165
    - indent_level: int or None - indentation level
166
    - eventual_encoding: str - encoding for XML declaration  
167
    - formatter: str or function - entity formatting
168
    
169
    Returns:
170
    str - contents as Unicode markup
171
    """
172

173
def encode_contents(self, encoding="utf-8", indent_level=None, formatter="minimal", errors="xmlcharrefreplace"):
174
    """
175
    Render only the contents (children) as bytes.
176
    
177
    Parameters:
178
    - encoding: str - character encoding
179
    - indent_level: int or None - indentation level
180
    - formatter: str or function - entity formatting
181
    - errors: str - encoding error handling
182
    
183
    Returns:  
184
    bytes - contents as encoded markup
185
    """
186
```
187

188
Usage Examples:
189

190
```python
191
html = '<div class="wrapper"><p>Content 1</p><p>Content 2</p></div>'
192
soup = BeautifulSoup(html, 'html.parser')
193

194
div = soup.find('div')
195

196
# Full element output
197
print(div.decode())
198
# <div class="wrapper"><p>Content 1</p><p>Content 2</p></div>
199

200
# Contents only (without wrapper div)
201
print(div.decode_contents())
202
# <p>Content 1</p><p>Content 2</p>
203

204
# Contents as bytes
205
contents_bytes = div.encode_contents('utf-8')
206
print(contents_bytes.decode('utf-8'))
207
# <p>Content 1</p><p>Content 2</p>
208

209
# Useful for template replacement
210
template = '<html><body>{content}</body></html>'
211
content = div.decode_contents()
212
final_html = template.format(content=content)
213
```
214

215
### Entity Formatting
216

217
Control how special characters and entities are handled in output.
218

219
```python { .api }
220
# Formatter options
221
formatters = {
222
    "minimal": "Escape only <, >, & and quotes in attributes",
223
    "html": "Use HTML entity names where possible", 
224
    "xml": "Use XML entities only (&lt;, &gt;, &amp;, &quot;, &apos;)",
225
    None: "No entity substitution",
226
    callable: "Custom formatter function"
227
}
228

229
# Custom formatter signature
230
def custom_formatter(string):
231
    """
232
    Custom entity substitution function.
233
    
234
    Parameters:
235
    - string: str - string to format
236
    
237
    Returns:
238
    str - formatted string
239
    """
240
```
241

242
Usage Examples:
243

244
```python
245
from bs4 import BeautifulSoup
246
from bs4.dammit import EntitySubstitution
247

248
html = '<div title="Ben & Jerry\'s">Price: $5 < $10</div>'
249
soup = BeautifulSoup(html, 'html.parser')
250
div = soup.find('div')
251

252
# Minimal formatting (default)
253
print(div.encode(formatter="minimal").decode())
254
# <div title="Ben &amp; Jerry's">Price: $5 &lt; $10</div>
255

256
# HTML entity formatting
257
print(div.encode(formatter="html").decode())
258
# Uses HTML entity names where available
259

260
# XML entity formatting  
261
print(div.encode(formatter="xml").decode())
262
# <div title="Ben &amp; Jerry&apos;s">Price: $5 &lt; $10</div>
263

264
# No entity substitution
265
print(div.encode(formatter=None).decode())
266
# <div title="Ben & Jerry's">Price: $5 < $10</div>
267

268
# Custom formatter
269
def quote_formatter(s):
270
    return s.replace('"', '&quot;').replace("'", '&#x27;')
271

272
print(div.encode(formatter=quote_formatter).decode())
273

274
# Using EntitySubstitution directly
275
formatted = EntitySubstitution.substitute_html('Ben & Jerry\'s <script>')
276
print(formatted)  # Ben &amp; Jerry's &lt;script&gt;
277
```
278

279
### Encoding Handling
280

281
Control character encoding in output with proper error handling.
282

283
```python { .api }
284
# Encoding options
285
encoding_options = [
286
    "utf-8",      # Unicode encoding (default)
287
    "ascii",      # ASCII with entity fallback
288
    "latin-1",    # ISO 8859-1
289
    "cp1252",     # Windows encoding
290
    None          # Return Unicode string
291
]
292

293
# Error handling modes
294
error_modes = [
295
    "xmlcharrefreplace",  # Replace with XML entities (default)
296
    "strict",             # Raise exception on encoding errors
297
    "ignore",             # Skip unencodable characters
298
    "replace"             # Replace with ? character
299
]
300
```
301

302
Usage Examples:
303

304
```python
305
html = '<div>Unicode: café, naïve, résumé</div>'
306
soup = BeautifulSoup(html, 'html.parser')
307
div = soup.find('div')
308

309
# UTF-8 encoding (handles all Unicode)
310
utf8 = div.encode('utf-8')
311
print(utf8.decode('utf-8'))  # café, naïve, résumé
312

313
# ASCII with XML character references
314
ascii_xml = div.encode('ascii', errors='xmlcharrefreplace')
315
print(ascii_xml.decode('ascii'))  # caf&#233;, na&#239;ve, r&#233;sum&#233;
316

317
# Latin-1 (handles some accented characters)
318
try:
319
    latin1 = div.encode('latin-1')
320
    print(latin1.decode('latin-1'))  # café, naïve, résumé
321
except UnicodeEncodeError:
322
    print("Some characters not encodable in Latin-1")
323

324
# Handle encoding errors
325
ascii_ignore = div.encode('ascii', errors='ignore')
326
print(ascii_ignore.decode('ascii'))  # caf, nave, rsum
327

328
ascii_replace = div.encode('ascii', errors='replace')  
329
print(ascii_replace.decode('ascii'))  # caf?, na?ve, r?sum?
330
```
331

332
### XML Declaration Handling
333

334
Control XML declaration output for XML documents.
335

336
```python { .api }
337
# XML-specific output features
338
def decode(self, eventual_encoding=DEFAULT_OUTPUT_ENCODING):
339
    """
340
    For XML documents, includes <?xml version="1.0" encoding="..."?> declaration.
341
    
342
    Parameters:
343
    - eventual_encoding: str - encoding to declare in XML header
344
    """
345

346
# XML declaration is automatically added for:
347
# - BeautifulSoup objects parsed with XML parser
348
# - When is_xml property is True
349
```
350

351
Usage Examples:
352

353
```python
354
xml = '<root><item>content</item></root>'
355

356
# Parse as XML
357
xml_soup = BeautifulSoup(xml, 'xml')
358
print(xml_soup.decode())
359
# <?xml version="1.0" encoding="utf-8"?>
360
# <root><item>content</item></root>
361

362
# Specify encoding in declaration
363
print(xml_soup.decode(eventual_encoding='iso-8859-1'))
364
# <?xml version="1.0" encoding="iso-8859-1"?>
365
# <root><item>content</item></root>
366

367
# Parse as HTML (no XML declaration)
368
html_soup = BeautifulSoup(xml, 'html.parser')
369
print(html_soup.decode())
370
# <root><item>content</item></root>
371
```
372

373
### Output Utilities
374

375
Helper functions and patterns for common output scenarios.
376

377
```python { .api }
378
# Common output patterns
379

380
def save_to_file(soup, filename, encoding='utf-8'):
381
    """Save soup to file with proper encoding"""
382
    with open(filename, 'w', encoding=encoding) as f:
383
        f.write(soup.decode())
384

385
def get_text_content(element, separator=' '):
386
    """Extract clean text content"""
387
    return separator.join(element.stripped_strings)
388

389
def minify_html(soup):
390
    """Remove extra whitespace from HTML"""
391
    return str(soup).replace('\n', '').replace('  ', ' ')
392
```
393

394
Usage Examples:
395

396
```python
397
import os
398

399
html = '''
400
<html>
401
  <head>
402
    <title>Sample Page</title>
403
  </head>
404
  <body>
405
    <h1>Main Title</h1>
406
    <p>Content paragraph with <em>emphasis</em>.</p>
407
  </body>
408
</html>
409
'''
410

411
soup = BeautifulSoup(html, 'html.parser')
412

413
# Save formatted HTML to file
414
with open('output.html', 'w', encoding='utf-8') as f:
415
    f.write(soup.prettify())
416

417
# Save minified HTML  
418
minified = str(soup).replace('\n', '').replace('  ', ' ')
419
with open('minified.html', 'w', encoding='utf-8') as f:
420
    f.write(minified)
421

422
# Extract and save text content only
423
text_content = soup.get_text('\n', strip=True)
424
with open('content.txt', 'w', encoding='utf-8') as f:
425
    f.write(text_content)
426

427
# Convert to different encodings
428
for encoding in ['utf-8', 'latin-1', 'ascii']:
429
    try:
430
        filename = f'output_{encoding}.html'
431
        with open(filename, 'wb') as f:
432
            f.write(soup.encode(encoding))
433
        print(f"Saved {filename}")
434
    except UnicodeEncodeError as e:
435
        print(f"Cannot encode as {encoding}: {e}")
436

437
# Clean up files
438
for f in ['output.html', 'minified.html', 'content.txt']:
439
    if os.path.exists(f):
440
        os.remove(f)
441
```

Version

Tile

Files

docs/output.md

docs/output.md