# Tree Modification

Modify the parse tree by inserting, removing, replacing elements and their attributes with automatic relationship maintenance. Beautiful Soup ensures tree integrity when making changes and provides memory management through explicit cleanup methods.

## Capabilities

### Element Removal

Remove elements from the parse tree with optional memory cleanup.

```python { .api }
def extract(self):
    """
    Remove this element from the tree and return it.

    The element can be reinserted elsewhere. All parent/sibling
    relationships are updated automatically.

    Returns:
        The extracted element (self)
    """

def decompose(self):
    """
    Recursively destroy this element and its children to free memory.

    Use when you're done with an element and want to reclaim memory.
    The element becomes unusable after decomposition.

    Returns:
        None
    """

def clear(self, decompose=False):
    """
    Remove all children from this element.

    Parameters:
    - decompose: bool - if True, decompose children to free memory

    Returns:
        None
    """
```

Usage Examples:

```python
from bs4 import BeautifulSoup

html = '''
<div class="container">
    <p>Keep this paragraph</p>
    <div class="unwanted">Remove this div</div>
    <p>Keep this paragraph too</p>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract element for reuse elsewhere
unwanted = soup.find('div', class_='unwanted')
extracted = unwanted.extract()  # Removed from tree but still usable

# Can insert extracted element elsewhere
new_location = soup.new_tag('section')
new_location.append(extracted)

# Decompose to free memory permanently
ad_elements = soup.find_all('div', class_='advertisement')
for ad in ad_elements:
    ad.decompose()  # Memory freed, element unusable

# Clear all children
container = soup.find('div', class_='container')
container.clear()  # Now empty div

# Clear with memory cleanup
container.clear(decompose=True)  # Children decomposed
```
### Element Replacement

Replace elements in the parse tree with new content.

```python { .api }
def replace_with(self, *args):
    """
    Replace this element with one or more new elements.

    Parameters:
    - *args: PageElement instances or strings to replace with

    Returns:
        The replaced element (self)
    """

def wrap(self, wrap_inside):
    """
    Wrap this element inside another element.

    Parameters:
    - wrap_inside: PageElement (usually Tag) to wrap this element in

    Returns:
        The wrapping element
    """

def unwrap(self):
    """
    Replace this element with its children.

    Useful for removing a wrapper tag but keeping its contents.
    Only works on Tag elements with children.

    Returns:
        The unwrapped element (self)
    """
```

Usage Examples:

```python
html = '''
<div>
    <p>Old paragraph</p>
    <span>Text to wrap</span>
    <em>Remove emphasis but keep text</em>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Replace element with new content
old_p = soup.find('p')
new_p = soup.new_tag('p', class_='updated')
new_p.string = 'New paragraph content'
old_p.replace_with(new_p)

# Replace with multiple elements
span = soup.find('span')
new_strong = soup.new_tag('strong')
new_strong.string = 'Bold text'
new_text = soup.new_string(' and plain text')
span.replace_with(new_strong, new_text)

# Wrap element in new tag
text_span = soup.find('span')  # If it still exists
wrapper_div = soup.new_tag('div', class_='wrapper')
if text_span:
    text_span.wrap(wrapper_div)

# Unwrap element (remove tag but keep contents)
em = soup.find('em')
if em:
    em.unwrap()  # <em>Remove emphasis but keep text</em> becomes just the text
```
### Element Insertion

Insert new elements at specific positions in the parse tree.

```python { .api }
def insert(self, position, new_child):
    """
    Insert a new child at the specified position.

    Parameters:
    - position: int - index position (0 = first child)
    - new_child: PageElement or string to insert

    Returns:
        None
    """

def insert_before(self, *args):
    """
    Insert one or more elements immediately before this element.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
        None
    """

def insert_after(self, *args):
    """
    Insert one or more elements immediately after this element.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
        None
    """

def append(self, tag):
    """
    Add an element as the last child of this element.

    Parameters:
    - tag: PageElement or string to append

    Returns:
        None
    """

def extend(self, tags):
    """
    Add multiple elements as children of this element.

    Parameters:
    - tags: iterable of PageElement instances or strings

    Returns:
        None
    """

def index(self, element):
    """
    Find the index of a child element by identity.

    Avoids issues with tag.contents.index(element) when there are
    equal elements, using identity comparison instead of value.

    Parameters:
    - element: PageElement - child element to find

    Returns:
        int - index of element in contents list

    Raises:
        ValueError if element is not a child of this element
    """
```

Usage Examples:

```python
html = '<div><p>Existing paragraph</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
existing_p = soup.find('p')

# Insert at specific position
new_h1 = soup.new_tag('h1')
new_h1.string = 'Title'
div.insert(0, new_h1)  # Insert as first child

# Insert before/after existing elements
before_p = soup.new_tag('p', class_='intro')
before_p.string = 'Introduction'
existing_p.insert_before(before_p)

after_p = soup.new_tag('p', class_='conclusion')
after_p.string = 'Conclusion'
existing_p.insert_after(after_p)

# Append to end
footer = soup.new_tag('footer')
footer.string = 'Footer content'
div.append(footer)

# Extend with multiple elements
new_elements = []
for i in range(3):
    item = soup.new_tag('span', class_='item')
    item.string = f'Item {i+1}'
    new_elements.append(item)

div.extend(new_elements)

# Insert text content
div.insert(1, 'Some plain text')
existing_p.insert_after('Text after paragraph')

# Find element index
p_index = div.index(existing_p)
print(f"Paragraph is at index {p_index}")

# Safer than contents.index() for duplicate elements
h1_index = div.index(new_h1)  # Uses identity, not equality
```
### Attribute Modification

Modify element attributes using dictionary-like operations.

```python { .api }
def __getitem__(self, key):
    """Get attribute value like a dictionary"""

def __setitem__(self, key, value):
    """Set attribute value like a dictionary"""

def __delitem__(self, key):
    """Delete attribute like a dictionary"""

def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
        Attribute value or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
        bool
    """

@property
def attrs(self):
    """
    Dictionary of all attributes.

    Returns:
        dict - can be modified directly
    """
```

Usage Examples:

```python
html = '<div class="container" id="main">Content</div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')

# Get attributes
print(div['class'])  # ['container']
print(div['id'])  # 'main'
print(div.get('data-value', 'default'))  # 'default'

# Set attributes
div['class'] = ['container', 'updated']
div['data-value'] = '123'
div['title'] = 'Tooltip text'

# Delete attributes
del div['id']

# Check attribute existence
if div.has_attr('class'):
    print('Has class attribute')

# Modify attrs dictionary directly
div.attrs['style'] = 'color: red;'
div.attrs.update({'data-count': '5', 'role': 'main'})

# Special handling for class attribute (list vs string)
div['class'] = 'single-class'  # Becomes ['single-class']
div['class'] = ['a', 'b', 'c']  # Stays as list
```
### Content Modification

Modify the text content and children of elements.

```python { .api }
@property
def string(self):
    """
    Get/set the string content of this element.

    Get: Returns single NavigableString if element has only one string child,
    otherwise None
    Set: Replaces all children with a single NavigableString

    Returns:
        NavigableString or None
    """

@string.setter
def string(self, value):
    """Replace all children with a single string"""

def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - join multiple text pieces with this separator
    - strip: bool - strip whitespace from each text piece
    - types: tuple - which NavigableString types to include

    Returns:
        str - concatenated text content
    """
```

Usage Examples:

```python
html = '''
<div>
    <p>Original text</p>
    <span>More <em>emphasis</em> text</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Modify string content
p = soup.find('p')
p.string = 'Updated text'  # Replaces all children

# Get text content
span = soup.find('span')
print(span.get_text())  # 'More emphasis text'
print(span.get_text(' | '))  # 'More | emphasis | text'
print(span.get_text(strip=True))  # Strips whitespace

# Work with mixed content
div = soup.find('div')
all_text = div.get_text(' ')
print(all_text)  # All text from div and descendants

# Preserve only certain text types
from bs4 import NavigableString, Comment
text_only = div.get_text(types=(NavigableString,))  # Excludes comments
```
### Memory Management

Control memory usage when working with large documents.

```python { .api }
def decompose(self):
    """
    Recursively destroy this element and free memory.

    After decomposition, the element and its children become
    unusable. Use when processing large documents to prevent
    memory accumulation.
    """

# Context manager for automatic cleanup
class SoupProcessor:
    def __init__(self, markup, parser):
        self.soup = BeautifulSoup(markup, parser)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.soup.decompose()
```

Usage Examples:

```python
# Manual memory management
large_html = get_large_html_document()
soup = BeautifulSoup(large_html, 'lxml')

# Process elements and clean up as you go
for section in soup.find_all('section', class_='processed'):
    process_section(section)
    section.decompose()  # Free memory immediately

# Clean up entire soup when done
soup.decompose()

# Context manager pattern for automatic cleanup
with SoupProcessor(html_content, 'html.parser') as soup:
    results = extract_data(soup)
# soup automatically decomposed when exiting context
```