# Element Search

Find elements using tag names, attributes, text content, CSS selectors, and custom matching functions. Beautiful Soup provides flexible search capabilities with both single and multiple result options, supporting various criteria types for precise element selection.

## Capabilities

### Basic Search Methods

Find elements in the parse tree using tag names, attributes, and text content.

```python { .api }
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs):
    """
    Find the first element matching the given criteria.

    Parameters:
    - name: str, list, regex, callable, or True - tag name filter
    - attrs: dict - attribute filters
    - recursive: bool - search descendants (True) or direct children only (False)
    - text: str, list, regex, callable, or True - text content filter
    - **kwargs: attribute filters as keyword arguments

    Returns:
    PageElement or None if no match found
    """

def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs):
    """
    Find all elements matching the given criteria.

    Parameters:
    - name: str, list, regex, callable, or True - tag name filter
    - attrs: dict - attribute filters
    - recursive: bool - search descendants (True) or direct children only (False)
    - text: str, list, regex, callable, or True - text content filter
    - limit: int - maximum number of results to return
    - **kwargs: attribute filters as keyword arguments

    Returns:
    ResultSet (list-like) of matching PageElement instances
    """

# Convenience method - equivalent to find_all
def __call__(self, *args, **kwargs):
    """Equivalent to find_all(*args, **kwargs)"""
```

Usage Examples:

```python
from bs4 import BeautifulSoup
import re

html = '''
<html>
<body>
<div class="container">
<p id="intro">Introduction text</p>
<p class="content">Main content</p>
<a href="http://example.com">External link</a>
<a href="/internal">Internal link</a>
</div>
</body>
</html>
'''

soup = BeautifulSoup(html, 'html.parser')

# Find by tag name
first_p = soup.find('p')
all_ps = soup.find_all('p')

# Find by attributes
intro = soup.find('p', id='intro')
content = soup.find('p', class_='content')
external_link = soup.find('a', href='http://example.com')

# Find with attribute dictionary
intro = soup.find('p', attrs={'id': 'intro'})

# Find by multiple attributes
# (no results in this example, but shows syntax)
result = soup.find('p', {'class': 'content', 'id': 'special'})

# Find with regex patterns
external_links = soup.find_all('a', href=re.compile(r'^http'))
internal_links = soup.find_all('a', href=re.compile(r'^/'))

# Find with callable
def has_class(tag):
    return tag.has_attr('class')

elements_with_class = soup.find_all(has_class)

# Limit results
first_two_links = soup.find_all('a', limit=2)

# Search direct children only
container = soup.find('div', class_='container')
direct_children = container.find_all('p', recursive=False)
```

### CSS Selector Search

Use CSS selector syntax for complex element selection.

```python { .api }
def select(self, selector):
    """
    Find all elements matching a CSS selector.

    Parameters:
    - selector: str - CSS selector string

    Returns:
    ResultSet of matching elements

    Supported selectors:
    - Tag names: 'p', 'div'
    - IDs: '#myid'
    - Classes: '.myclass'
    - Attributes: '[href]', '[href="value"]'
    - Pseudo-classes: ':first-child', ':nth-of-type(n)'
    - Combinators: 'div > p', 'div p', 'div + p', 'div ~ p'
    """

# Note: select_one() method was added in later versions of Beautiful Soup
# For version 4.3.2, use select(selector)[0] or select(selector)[:1] for first match
```

Usage Examples:

```python
html = '''
<div class="container">
<h1 id="title">Page Title</h1>
<div class="content">
<p class="intro">Introduction</p>
<p>Regular paragraph</p>
<ul>
<li>First item</li>
<li class="special">Second item</li>
</ul>
</div>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Basic selectors
title_list = soup.select('#title')  # Returns list, use [0] for first element
title = title_list[0] if title_list else None
intro_list = soup.select('.intro')
intro = intro_list[0] if intro_list else None
all_paragraphs = soup.select('p')  # All p tags

# Attribute selectors
elements_with_class = soup.select('[class]')  # Has class attribute
special_items = soup.select('[class="special"]')  # Specific class value

# Descendant combinators
content_paragraphs = soup.select('div.content p')  # p descendants of div.content
direct_children = soup.select('div.content > p')  # p direct children of div.content

# Sibling combinators
after_intro = soup.select('p.intro + p')  # p immediately after p.intro
all_after_intro = soup.select('p.intro ~ p')  # all p siblings after p.intro

# Pseudo-classes
first_li = soup.select('li:first-child')
second_li = soup.select('li:nth-of-type(2)')
last_p = soup.select('p:last-of-type')

# Complex selectors
special_in_content = soup.select('div.content .special')
nested_selection = soup.select('div.container > div > ul > li.special')
```

### Directional Search Methods

Search in specific directions from the current element.

```python { .api }
def find_next(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the next element in document order matching criteria.

    Returns:
    PageElement or None
    """

def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all following elements in document order matching criteria.

    Returns:
    ResultSet of matching elements
    """

def find_previous(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the previous element in document order matching criteria.

    Returns:
    PageElement or None
    """

def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all preceding elements in document order matching criteria.

    Returns:
    ResultSet of matching elements
    """

def find_next_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the next sibling element matching criteria.

    Returns:
    PageElement or None
    """

def find_next_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all following sibling elements matching criteria.

    Returns:
    ResultSet of matching elements
    """

def find_previous_sibling(self, name=None, attrs={}, text=None, **kwargs):
    """
    Find the previous sibling element matching criteria.

    Returns:
    PageElement or None
    """

def find_previous_siblings(self, name=None, attrs={}, text=None, limit=None, **kwargs):
    """
    Find all preceding sibling elements matching criteria.

    Returns:
    ResultSet of matching elements
    """

def find_parent(self, name=None, attrs={}, **kwargs):
    """
    Find the parent element matching criteria.

    Returns:
    PageElement or None
    """

def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
    """
    Find all ancestor elements matching criteria.

    Returns:
    ResultSet of matching elements
    """
```

Usage Examples:

```python
html = '''
<article>
<h1>Title</h1>
<p>First paragraph</p>
<div class="sidebar">Sidebar content</div>
<p>Second paragraph</p>
<footer>Footer</footer>
</article>
'''

soup = BeautifulSoup(html, 'html.parser')
first_p = soup.find('p')

# Find next elements
next_div = first_p.find_next('div')  # sidebar div
next_p = first_p.find_next('p')  # second paragraph
all_following = first_p.find_all_next()  # all elements after first p

# Find previous elements
h1 = first_p.find_previous('h1')  # title
all_preceding = first_p.find_all_previous()  # tags before first p: h1, then article

# Find siblings
next_sibling_div = first_p.find_next_sibling('div')  # sidebar
all_next_siblings = first_p.find_next_siblings()  # div, p, footer

# Find parents
article = first_p.find_parent('article')
all_parents = first_p.find_parents()  # article, then document root
```

### Advanced Search Patterns

Complex search criteria using callables, regular expressions, and custom matching logic.

```python { .api }
# Search criteria types
SearchCriteria = Union[
    str,         # Exact match
    list,        # Match any item in list
    re.Pattern,  # Regex pattern match
    callable,    # Custom function returning bool
    True,        # Match any (for text: any non-empty string)
    None         # No filter (match all)
]
```

Usage Examples:

```python
import re
from bs4 import BeautifulSoup

html = '''
<div>
<p class="intro summary">Introduction</p>
<p class="content">Main content</p>
<a href="mailto:user@example.com">Email</a>
<a href="http://example.com">Website</a>
<span data-value="123">Data span</span>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# List matching - multiple values
paragraphs = soup.find_all('p', class_=['intro', 'content'])

# Regex matching
email_links = soup.find_all('a', href=re.compile(r'^mailto:'))
data_elements = soup.find_all(attrs={'data-value': re.compile(r'\d+')})

# Callable matching
def has_multiple_classes(tag):
    return tag.has_attr('class') and len(tag['class']) > 1

multi_class_elements = soup.find_all(has_multiple_classes)

def is_external_link(tag):
    return (tag.name == 'a' and
            tag.has_attr('href') and
            tag['href'].startswith('http'))

external_links = soup.find_all(is_external_link)

# Text content search
elements_with_text = soup.find_all(text=True)  # All text nodes
intro_text = soup.find_all(text=re.compile(r'Intro'))  # Text containing 'Intro'

# Complex combined criteria
def complex_criteria(tag):
    return (tag.name in ['p', 'div'] and
            tag.has_attr('class') and
            'content' in tag.get('class', []))

matching_elements = soup.find_all(complex_criteria)
```

### Search Utilities

Helper classes and functions for search operations.

```python { .api }
class SoupStrainer:
    """Encapsulates search criteria for filtering elements during parsing."""

    def __init__(self, name=None, attrs={}, text=None, **kwargs):
        """
        Create search criteria for parsing or post-parse filtering.

        Parameters same as find() method
        """

    def search(self, markup):
        """Test if element matches criteria"""

    def search_tag(self, markup_name, markup_attrs):
        """Test if tag matches criteria"""

class ResultSet(list):
    """List subclass that tracks the search criteria used to generate results."""

    @property
    def source(self):
        """The SoupStrainer that generated these results"""
```

Usage Examples:

```python
from bs4 import BeautifulSoup, SoupStrainer

# Use SoupStrainer to limit parsing
only_links = SoupStrainer('a')
soup = BeautifulSoup(html, 'html.parser', parse_only=only_links)

# ResultSet provides search context
results = soup.find_all('p')
print(type(results))  # <class 'bs4.element.ResultSet'>
print(results.source)  # Shows the SoupStrainer used
```

### Backward Compatibility

Legacy search methods from BeautifulSoup 3.x.

```python { .api }
# BeautifulSoup 3.x compatibility
def findAll(self, *args, **kwargs):  # Use find_all instead
    """Deprecated: use find_all"""

def findNext(self, *args, **kwargs):  # Use find_next instead
    """Deprecated: use find_next"""
```