# Tree Modification

Modify the parse tree by inserting, removing, replacing elements and their attributes with automatic relationship maintenance. Beautiful Soup ensures tree integrity when making changes and provides memory management through explicit cleanup methods.

## Capabilities

### Element Removal

Remove elements from the parse tree with optional memory cleanup.

```python { .api }
def extract(self):
    """
    Remove this element from the tree and return it.

    The element can be reinserted elsewhere. All parent/sibling
    relationships are updated automatically.

    Returns:
        The extracted element (self)
    """

def decompose(self):
    """
    Recursively destroy this element and its children to free memory.

    Use when you're done with an element and want to reclaim memory.
    The element becomes unusable after decomposition.

    Returns:
        None
    """

def clear(self, decompose=False):
    """
    Remove all children from this element.

    Parameters:
    - decompose: bool - if True, decompose children to free memory

    Returns:
        None
    """
```

Usage Examples:

```python
from bs4 import BeautifulSoup

html = '''
<div class="container">
    <p>Keep this paragraph</p>
    <div class="unwanted">Remove this div</div>
    <p>Keep this paragraph too</p>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract element for reuse elsewhere
unwanted = soup.find('div', class_='unwanted')
extracted = unwanted.extract()  # Removed from tree but still usable

# Can insert extracted element elsewhere
new_location = soup.new_tag('section')
new_location.append(extracted)

# Decompose to free memory permanently
ad_elements = soup.find_all('div', class_='advertisement')
for ad in ad_elements:
    ad.decompose()  # Memory freed, element unusable

# Clear all children
container = soup.find('div', class_='container')
container.clear()  # Now empty div

# Clear with memory cleanup
container.clear(decompose=True)  # Children decomposed
```
### Element Replacement

Replace elements in the parse tree with new content.

```python { .api }
def replace_with(self, *args):
    """
    Replace this element with one or more new elements.

    Parameters:
    - *args: PageElement instances or strings to replace with

    Returns:
        The replaced element (self)
    """

def wrap(self, wrap_inside):
    """
    Wrap this element inside another element.

    Parameters:
    - wrap_inside: PageElement (usually Tag) to wrap this element in

    Returns:
        The wrapping element
    """

def unwrap(self):
    """
    Replace this element with its children.

    Useful for removing a wrapper tag but keeping its contents.
    Only works on Tag elements with children.

    Returns:
        The unwrapped element (self)
    """
```

Usage Examples:

```python
html = '''
<div>
    <p>Old paragraph</p>
    <span>Text to wrap</span>
    <em>Remove emphasis but keep text</em>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Replace element with new content
old_p = soup.find('p')
new_p = soup.new_tag('p', class_='updated')
new_p.string = 'New paragraph content'
old_p.replace_with(new_p)

# Replace with multiple elements
span = soup.find('span')
new_strong = soup.new_tag('strong')
new_strong.string = 'Bold text'
new_text = soup.new_string(' and plain text')
span.replace_with(new_strong, new_text)

# Wrap element in new tag
text_span = soup.find('span')  # If it still exists
wrapper_div = soup.new_tag('div', class_='wrapper')
if text_span:
    text_span.wrap(wrapper_div)

# Unwrap element (remove tag but keep contents)
em = soup.find('em')
if em:
    em.unwrap()  # <em>Remove emphasis but keep text</em> becomes just the text
```
### Element Insertion

Insert new elements at specific positions in the parse tree.

```python { .api }
def insert(self, position, new_child):
    """
    Insert a new child at the specified position.

    Parameters:
    - position: int - index position (0 = first child)
    - new_child: PageElement or string to insert

    Returns:
        None
    """

def insert_before(self, *args):
    """
    Insert one or more elements immediately before this element.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
        None
    """

def insert_after(self, *args):
    """
    Insert one or more elements immediately after this element.

    Parameters:
    - *args: PageElement instances or strings to insert

    Returns:
        None
    """

def append(self, tag):
    """
    Add an element as the last child of this element.

    Parameters:
    - tag: PageElement or string to append

    Returns:
        None
    """

def extend(self, tags):
    """
    Add multiple elements as children of this element.

    Parameters:
    - tags: iterable of PageElement instances or strings

    Returns:
        None
    """

def index(self, element):
    """
    Find the index of a child element by identity.

    Avoids issues with tag.contents.index(element) when there are
    equal elements, using identity comparison instead of value.

    Parameters:
    - element: PageElement - child element to find

    Returns:
        int - index of element in contents list

    Raises:
        ValueError if element is not a child of this element
    """
```

Usage Examples:

```python
html = '<div><p>Existing paragraph</p></div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
existing_p = soup.find('p')

# Insert at specific position
new_h1 = soup.new_tag('h1')
new_h1.string = 'Title'
div.insert(0, new_h1)  # Insert as first child

# Insert before/after existing elements
before_p = soup.new_tag('p', class_='intro')
before_p.string = 'Introduction'
existing_p.insert_before(before_p)

after_p = soup.new_tag('p', class_='conclusion')
after_p.string = 'Conclusion'
existing_p.insert_after(after_p)

# Append to end
footer = soup.new_tag('footer')
footer.string = 'Footer content'
div.append(footer)

# Extend with multiple elements
new_elements = []
for i in range(3):
    item = soup.new_tag('span', class_='item')
    item.string = f'Item {i+1}'
    new_elements.append(item)

div.extend(new_elements)

# Insert text content
div.insert(1, 'Some plain text')
existing_p.insert_after('Text after paragraph')

# Find element index
p_index = div.index(existing_p)
print(f"Paragraph is at index {p_index}")

# Safer than contents.index() for duplicate elements
h1_index = div.index(new_h1)  # Uses identity, not equality
```
### Attribute Modification

Modify element attributes using dictionary-like operations.

```python { .api }
def __getitem__(self, key):
    """Get attribute value like a dictionary"""

def __setitem__(self, key, value):
    """Set attribute value like a dictionary"""

def __delitem__(self, key):
    """Delete attribute like a dictionary"""

def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
        Attribute value or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
        bool
    """

@property
def attrs(self):
    """
    Dictionary of all attributes.

    Returns:
        dict - can be modified directly
    """
```

Usage Examples:

```python
html = '<div class="container" id="main">Content</div>'
soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')

# Get attributes
print(div['class'])  # ['container']
print(div['id'])  # 'main'
print(div.get('data-value', 'default'))  # 'default'

# Set attributes
div['class'] = ['container', 'updated']
div['data-value'] = '123'
div['title'] = 'Tooltip text'

# Delete attributes
del div['id']

# Check attribute existence
if div.has_attr('class'):
    print('Has class attribute')

# Modify attrs dictionary directly
div.attrs['style'] = 'color: red;'
div.attrs.update({'data-count': '5', 'role': 'main'})

# Special handling for class attribute (list vs string)
div['class'] = 'single-class'  # Becomes ['single-class']
div['class'] = ['a', 'b', 'c']  # Stays as list
```
### Content Modification

Modify the text content and children of elements.

```python { .api }
@property
def string(self):
    """
    Get/set the string content of this element.

    Get: Returns single NavigableString if element has only one string child,
    otherwise None
    Set: Replaces all children with a single NavigableString

    Returns:
        NavigableString or None
    """

@string.setter
def string(self, value):
    """Replace all children with a single string"""

def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - join multiple text pieces with this separator
    - strip: bool - strip whitespace from each text piece
    - types: tuple - which NavigableString types to include

    Returns:
        str - concatenated text content
    """
```

Usage Examples:

```python
html = '''
<div>
    <p>Original text</p>
    <span>More <em>emphasis</em> text</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Modify string content
p = soup.find('p')
p.string = 'Updated text'  # Replaces all children

# Get text content
span = soup.find('span')
print(span.get_text())  # 'More emphasis text'
print(span.get_text(' | '))  # 'More | emphasis | text'
print(span.get_text(strip=True))  # Strips whitespace

# Work with mixed content
div = soup.find('div')
all_text = div.get_text(' ')
print(all_text)  # All text from div and descendants

# Preserve only certain text types
from bs4 import NavigableString, Comment
text_only = div.get_text(types=(NavigableString,))  # Excludes comments
```
### Memory Management

Control memory usage when working with large documents.

```python { .api }
def decompose(self):
    """
    Recursively destroy this element and free memory.

    After decomposition, the element and its children become
    unusable. Use when processing large documents to prevent
    memory accumulation.
    """

# Context manager for automatic cleanup
class SoupProcessor:
    def __init__(self, markup, parser):
        self.soup = BeautifulSoup(markup, parser)

    def __enter__(self):
        return self.soup

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.soup.decompose()
```

Usage Examples:

```python
# Manual memory management
large_html = get_large_html_document()
soup = BeautifulSoup(large_html, 'lxml')

# Process elements and clean up as you go
for section in soup.find_all('section', class_='processed'):
    process_section(section)
    section.decompose()  # Free memory immediately

# Clean up entire soup when done
soup.decompose()

# Context manager pattern for automatic cleanup
with SoupProcessor(html_content, 'html.parser') as soup:
    results = extract_data(soup)
# soup automatically decomposed when exiting context
```