# Content Extraction

Extract text content, attribute values, and formatted output from parse tree elements with flexible filtering and formatting options. Beautiful Soup provides multiple ways to access and extract different types of content from HTML/XML documents.

## Capabilities

### Text Content Extraction

Extract text content from elements with various filtering and formatting options.

```python { .api }
def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - string to join text pieces (default: "")
    - strip: bool - strip whitespace from each piece (default: False)
    - types: tuple - NavigableString types to include (default: (NavigableString,))

    Returns:
    str - concatenated text content
    """

@property
def text(self):
    """
    All text content concatenated without separators.

    Equivalent to get_text().

    Returns:
    str
    """

@property
def string(self):
    """
    The single NavigableString child, or None if multiple children.

    Returns string content only if element has exactly one string child.

    Returns:
    NavigableString or None
    """

@property
def strings(self):
    """
    Generator yielding all NavigableString descendants.

    Yields:
    NavigableString instances in document order
    """

@property
def stripped_strings(self):
    """
    Generator yielding all non-empty NavigableString descendants with whitespace stripped.

    Yields:
    str - stripped string content (empty strings excluded)
    """
```
Usage Examples:

```python
from bs4 import BeautifulSoup, Comment

html = '''
<div class="article">
    <h1>Article Title</h1>
    <!-- This is a comment -->
    <p>First paragraph with <em>emphasis</em> and <strong>bold</strong> text.</p>
    <p> Second paragraph with extra whitespace. </p>
    <script>console.log('script content');</script>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
article = soup.find('div', class_='article')

# Basic text extraction
all_text = article.get_text()
print(all_text)  # All text concatenated

# Text with separators
spaced_text = article.get_text(' ')
line_separated = article.get_text('\n')
print(spaced_text)     # Words separated by spaces
print(line_separated)  # Elements separated by newlines

# Stripped text (removes extra whitespace)
clean_text = article.get_text(' ', strip=True)
print(clean_text)  # Clean, properly spaced text

# Include different string types
from bs4 import NavigableString, Comment, CData

# Default - only NavigableString (excludes comments, scripts, etc.)
text_only = article.get_text(types=(NavigableString,))

# Include comments
with_comments = article.get_text(types=(NavigableString, Comment))

# Direct property access
print(article.text)  # Same as get_text()

# Single string access
title = soup.find('h1')
print(title.string)  # "Article Title" (single string child)

paragraph = soup.find('p')
print(paragraph.string)  # None (has multiple children including tags)

# Iterate over all strings
for string in article.strings:
    print(repr(string))  # Shows all text nodes including whitespace

# Iterate over stripped strings (non-empty only)
for string in article.stripped_strings:
    print(repr(string))  # Clean text content only
```
### Attribute Access

Access and manipulate element attributes with dictionary-like interface.

```python { .api }
def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
    Attribute value (str or list for class), or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
    bool
    """

def __getitem__(self, key):
    """
    Get attribute value using dictionary syntax.

    Parameters:
    - key: str - attribute name

    Returns:
    Attribute value

    Raises:
    KeyError if attribute doesn't exist
    """

@property
def attrs(self):
    """
    Dictionary of all element attributes.

    Returns:
    dict - attribute name/value pairs
    """
```
Usage Examples:

```python
html = '''
<div id="main" class="container highlight" data-value="123" title="Main container">
    <a href="https://example.com" target="_blank" rel="noopener">Link</a>
    <img src="image.jpg" alt="Description" width="100" height="200">
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
link = soup.find('a')
img = soup.find('img')

# Get attributes with default
print(div.get('id'))                      # 'main'
print(div.get('data-value'))              # '123'
print(div.get('nonexistent', 'default'))  # 'default'

# Dictionary-style access
print(div['id'])     # 'main'
print(link['href'])  # 'https://example.com'

# Check attribute existence
if div.has_attr('class'):
    print('Div has class attribute')

if not img.has_attr('alt'):
    print('Image missing alt text')

# Access all attributes
print(div.attrs)
# {'id': 'main', 'class': ['container', 'highlight'],
#  'data-value': '123', 'title': 'Main container'}

# Special handling for class attribute (always a list)
print(div['class'])        # ['container', 'highlight']
print(type(div['class']))  # <class 'list'>

# Iterate over attributes
for attr_name, attr_value in div.attrs.items():
    print(f'{attr_name}: {attr_value}')
```
### Content Type Detection

Identify and work with different types of content within elements.

```python { .api }
# Content type checking (the built-in isinstance is used against these types)
def isinstance(obj, class_or_tuple):
    """Check if object is instance of NavigableString subclass"""

# NavigableString types
class NavigableString(str):
    """Regular text content"""

class Comment(NavigableString):
    """HTML/XML comments"""

class CData(NavigableString):
    """CDATA sections"""

class ProcessingInstruction(NavigableString):
    """XML processing instructions"""

class Doctype(NavigableString):
    """DOCTYPE declarations"""
```
Usage Examples:

```python
from bs4 import BeautifulSoup, NavigableString, Comment, CData

html = '''
<div>
    Regular text
    <!-- This is a comment -->
    <![CDATA[This is CDATA]]>
    <?xml version="1.0"?>
    <p>Paragraph text</p>
</div>
'''

soup = BeautifulSoup(html, 'lxml')  # lxml better for mixed content
div = soup.find('div')

# Iterate and identify content types
for content in div.contents:
    if isinstance(content, Comment):
        print(f"Comment: {content}")
    elif isinstance(content, CData):
        print(f"CDATA: {content}")
    elif isinstance(content, NavigableString):
        if content.strip():  # Skip empty whitespace
            print(f"Text: {content.strip()}")
    elif hasattr(content, 'name'):  # It's a Tag
        print(f"Tag: {content.name}")

# Filter by content type
comments = [c for c in div.contents if isinstance(c, Comment)]
text_nodes = [c for c in div.strings if isinstance(c, NavigableString)]
```
### Data Extraction Patterns

Common patterns for extracting structured data from HTML documents.

```python { .api }
# Common extraction patterns

def extract_links(soup):
    """Extract all links with href and text"""

def extract_images(soup):
    """Extract image sources and alt text"""

def extract_tables(soup):
    """Extract table data as list of dictionaries"""

def extract_forms(soup):
    """Extract form fields and actions"""
```
Usage Examples:

```python
html = '''
<div class="content">
    <h2>Product List</h2>
    <ul class="products">
        <li data-id="1" data-price="29.99">
            <a href="/product/1">Widget A</a>
            <span class="price">$29.99</span>
        </li>
        <li data-id="2" data-price="39.99">
            <a href="/product/2">Widget B</a>
            <span class="price">$39.99</span>
        </li>
    </ul>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract structured product data
products = []
for item in soup.find_all('li', {'data-id': True}):
    product = {
        'id': item.get('data-id'),
        'price': item.get('data-price'),
        'name': item.find('a').get_text().strip(),
        'url': item.find('a').get('href'),
        'price_text': item.find('span', class_='price').get_text()
    }
    products.append(product)

print(products)
# [{'id': '1', 'price': '29.99', 'name': 'Widget A',
#   'url': '/product/1', 'price_text': '$29.99'}, ...]

# Extract all links
links = []
for link in soup.find_all('a', href=True):
    links.append({
        'url': link['href'],
        'text': link.get_text().strip(),
        'title': link.get('title', '')
    })

# Extract metadata
metadata = {}
for meta in soup.find_all('meta'):
    name = meta.get('name') or meta.get('property') or meta.get('http-equiv')
    content = meta.get('content')
    if name and content:
        metadata[name] = content
```
### Text Processing Utilities

Helper functions for cleaning and processing extracted text content.

```python { .api }
import re

def clean_text(text):
    """Remove extra whitespace and normalize text"""
    return re.sub(r'\s+', ' ', text.strip())

def extract_numbers(text):
    """Extract numeric values from text"""
    return re.findall(r'\d+\.?\d*', text)

def extract_emails(text):
    """Extract email addresses from text"""
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

def extract_urls(text):
    """Extract URLs from text"""
    return re.findall(r'https?://[^\s<>"]+', text)
```
Usage Examples:

```python
html = '''
<div class="contact">
    Contact us at support@example.com or visit
    https://example.com/contact for more info.

    Phone: 555-123-4567
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
contact_div = soup.find('div', class_='contact')

# Extract and clean text
raw_text = contact_div.get_text()
clean_text = re.sub(r'\s+', ' ', raw_text.strip())
print(clean_text)

# Extract specific data patterns
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', clean_text)
urls = re.findall(r'https?://[^\s<>"]+', clean_text)
phones = re.findall(r'\d{3}-\d{3}-\d{4}', clean_text)

print(f"Emails: {emails}")  # ['support@example.com']
print(f"URLs: {urls}")      # ['https://example.com/contact']
print(f"Phones: {phones}")  # ['555-123-4567']
```