# Content Extraction

Extract text content, attribute values, and formatted output from parse tree elements with flexible filtering and formatting options. Beautiful Soup provides multiple ways to access and extract different types of content from HTML/XML documents.

## Capabilities

### Text Content Extraction

Extract text content from elements with various filtering and formatting options.

```python { .api }
def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - string to join text pieces (default: "")
    - strip: bool - strip whitespace from each piece (default: False)
    - types: tuple - NavigableString types to include (default: (NavigableString,))

    Returns:
    str - concatenated text content
    """

@property
def text(self):
    """
    All text content concatenated without separators.

    Equivalent to get_text().

    Returns:
    str
    """

@property
def string(self):
    """
    The single NavigableString child, or None if multiple children.

    Returns string content only if element has exactly one string child.

    Returns:
    NavigableString or None
    """

@property
def strings(self):
    """
    Generator yielding all NavigableString descendants.

    Yields:
    NavigableString instances in document order
    """

@property
def stripped_strings(self):
    """
    Generator yielding all non-empty NavigableString descendants with whitespace stripped.

    Yields:
    str - stripped string content (empty strings excluded)
    """
```
Usage Examples:

```python
from bs4 import BeautifulSoup, Comment

html = '''
<div class="article">
    <h1>Article Title</h1>
    <!-- This is a comment -->
    <p>First paragraph with <em>emphasis</em> and <strong>bold</strong> text.</p>
    <p> Second paragraph with extra whitespace. </p>
    <script>console.log('script content');</script>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
article = soup.find('div', class_='article')

# Basic text extraction
all_text = article.get_text()
print(all_text)  # All text concatenated

# Text with separators
spaced_text = article.get_text(' ')
line_separated = article.get_text('\n')
print(spaced_text)     # Words separated by spaces
print(line_separated)  # Elements separated by newlines

# Stripped text (removes extra whitespace)
clean_text = article.get_text(' ', strip=True)
print(clean_text)  # Clean, properly spaced text

# Include different string types
from bs4 import NavigableString, Comment, CData

# Default - only NavigableString (excludes comments, scripts, etc.)
text_only = article.get_text(types=(NavigableString,))

# Include comments
with_comments = article.get_text(types=(NavigableString, Comment))

# Direct property access
print(article.text)  # Same as get_text()

# Single string access
title = soup.find('h1')
print(title.string)  # "Article Title" (single string child)

paragraph = soup.find('p')
print(paragraph.string)  # None (has multiple children including tags)

# Iterate over all strings
for string in article.strings:
    print(repr(string))  # Shows all text nodes including whitespace

# Iterate over stripped strings (non-empty only)
for string in article.stripped_strings:
    print(repr(string))  # Clean text content only
```
### Attribute Access

Access and manipulate element attributes with dictionary-like interface.

```python { .api }
def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
    Attribute value (str or list for class), or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
    bool
    """

def __getitem__(self, key):
    """
    Get attribute value using dictionary syntax.

    Parameters:
    - key: str - attribute name

    Returns:
    Attribute value

    Raises:
    KeyError if attribute doesn't exist
    """

@property
def attrs(self):
    """
    Dictionary of all element attributes.

    Returns:
    dict - attribute name/value pairs
    """
```
Usage Examples:

```python
html = '''
<div id="main" class="container highlight" data-value="123" title="Main container">
    <a href="https://example.com" target="_blank" rel="noopener">Link</a>
    <img src="image.jpg" alt="Description" width="100" height="200">
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
link = soup.find('a')
img = soup.find('img')

# Get attributes with default
print(div.get('id'))                      # 'main'
print(div.get('data-value'))              # '123'
print(div.get('nonexistent', 'default'))  # 'default'

# Dictionary-style access
print(div['id'])     # 'main'
print(link['href'])  # 'https://example.com'

# Check attribute existence
if div.has_attr('class'):
    print('Div has class attribute')

if not img.has_attr('alt'):
    print('Image missing alt text')

# Access all attributes
print(div.attrs)
# {'id': 'main', 'class': ['container', 'highlight'],
#  'data-value': '123', 'title': 'Main container'}

# Special handling for class attribute (always a list)
print(div['class'])        # ['container', 'highlight']
print(type(div['class']))  # <class 'list'>

# Iterate over attributes
for attr_name, attr_value in div.attrs.items():
    print(f'{attr_name}: {attr_value}')
```
### Content Type Detection

Identify and work with different types of content within elements.

```python { .api }
# Content type checking (the built-in isinstance is used against these types)
def isinstance(obj, class_or_tuple):
    """Check if object is instance of NavigableString subclass"""

# NavigableString types
class NavigableString(str):
    """Regular text content"""

class Comment(NavigableString):
    """HTML/XML comments"""

class CData(NavigableString):
    """CDATA sections"""

class ProcessingInstruction(NavigableString):
    """XML processing instructions"""

class Doctype(NavigableString):
    """DOCTYPE declarations"""
```
Usage Examples:

```python
from bs4 import BeautifulSoup, NavigableString, Comment, CData

html = '''
<div>
    Regular text
    <!-- This is a comment -->
    <![CDATA[This is CDATA]]>
    <?xml version="1.0"?>
    <p>Paragraph text</p>
</div>
'''

soup = BeautifulSoup(html, 'lxml')  # lxml better for mixed content
div = soup.find('div')

# Iterate and identify content types
for content in div.contents:
    if isinstance(content, Comment):
        print(f"Comment: {content}")
    elif isinstance(content, CData):
        print(f"CDATA: {content}")
    elif isinstance(content, NavigableString):
        if content.strip():  # Skip empty whitespace
            print(f"Text: {content.strip()}")
    elif hasattr(content, 'name'):  # It's a Tag
        print(f"Tag: {content.name}")

# Filter by content type
comments = [c for c in div.contents if isinstance(c, Comment)]
text_nodes = [c for c in div.strings if isinstance(c, NavigableString)]
```
### Data Extraction Patterns

Common patterns for extracting structured data from HTML documents.

```python { .api }
# Common extraction patterns

def extract_links(soup):
    """Extract all links with href and text"""

def extract_images(soup):
    """Extract image sources and alt text"""

def extract_tables(soup):
    """Extract table data as list of dictionaries"""

def extract_forms(soup):
    """Extract form fields and actions"""
```
Usage Examples:

```python
html = '''
<div class="content">
    <h2>Product List</h2>
    <ul class="products">
        <li data-id="1" data-price="29.99">
            <a href="/product/1">Widget A</a>
            <span class="price">$29.99</span>
        </li>
        <li data-id="2" data-price="39.99">
            <a href="/product/2">Widget B</a>
            <span class="price">$39.99</span>
        </li>
    </ul>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract structured product data
products = []
for item in soup.find_all('li', {'data-id': True}):
    product = {
        'id': item.get('data-id'),
        'price': item.get('data-price'),
        'name': item.find('a').get_text().strip(),
        'url': item.find('a').get('href'),
        'price_text': item.find('span', class_='price').get_text()
    }
    products.append(product)

print(products)
# [{'id': '1', 'price': '29.99', 'name': 'Widget A',
#   'url': '/product/1', 'price_text': '$29.99'}, ...]

# Extract all links
links = []
for link in soup.find_all('a', href=True):
    links.append({
        'url': link['href'],
        'text': link.get_text().strip(),
        'title': link.get('title', '')
    })

# Extract metadata
metadata = {}
for meta in soup.find_all('meta'):
    name = meta.get('name') or meta.get('property') or meta.get('http-equiv')
    content = meta.get('content')
    if name and content:
        metadata[name] = content
```
### Text Processing Utilities

Helper functions for cleaning and processing extracted text content.

```python { .api }
import re

def clean_text(text):
    """Remove extra whitespace and normalize text"""
    return re.sub(r'\s+', ' ', text.strip())

def extract_numbers(text):
    """Extract numeric values from text"""
    return re.findall(r'\d+\.?\d*', text)

def extract_emails(text):
    """Extract email addresses from text"""
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

def extract_urls(text):
    """Extract URLs from text"""
    return re.findall(r'https?://[^\s<>"]+', text)
```
Usage Examples:

```python
html = '''
<div class="contact">
    Contact us at support@example.com or visit
    https://example.com/contact for more info.

    Phone: 555-123-4567
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
contact_div = soup.find('div', class_='contact')

# Extract and clean text
raw_text = contact_div.get_text()
clean_text = re.sub(r'\s+', ' ', raw_text.strip())
print(clean_text)

# Extract specific data patterns
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', clean_text)
urls = re.findall(r'https?://[^\s<>"]+', clean_text)
phones = re.findall(r'\d{3}-\d{3}-\d{4}', clean_text)

print(f"Emails: {emails}")  # ['support@example.com']
print(f"URLs: {urls}")      # ['https://example.com/contact']
print(f"Phones: {phones}")  # ['555-123-4567']
```