# Content Extraction

Extract text content, attribute values, and formatted output from parse tree elements with flexible filtering and formatting options. Beautiful Soup provides multiple ways to access and extract different types of content from HTML/XML documents.

## Capabilities

### Text Content Extraction

Extract text content from elements with various filtering and formatting options.

```python { .api }
def get_text(self, separator="", strip=False, types=(NavigableString,)):
    """
    Extract all text content from this element and its descendants.

    Parameters:
    - separator: str - string to join text pieces (default: "")
    - strip: bool - strip whitespace from each piece (default: False)
    - types: tuple - NavigableString types to include (default: (NavigableString,))

    Returns:
    str - concatenated text content
    """

@property
def text(self):
    """
    All text content concatenated without separators.

    Equivalent to get_text()

    Returns:
    str
    """

@property
def string(self):
    """
    The single NavigableString child, or None if multiple children.

    Returns string content only if element has exactly one string child.

    Returns:
    NavigableString or None
    """

@property
def strings(self):
    """
    Generator yielding all NavigableString descendants.

    Yields:
    NavigableString instances in document order
    """

@property
def stripped_strings(self):
    """
    Generator yielding all non-empty NavigableString descendants with whitespace stripped.

    Yields:
    str - stripped string content (empty strings excluded)
    """
```

Usage Examples:

```python
from bs4 import BeautifulSoup, Comment

html = '''
<div class="article">
    <h1>Article Title</h1>
    <!-- This is a comment -->
    <p>First paragraph with <em>emphasis</em> and <strong>bold</strong> text.</p>
    <p> Second paragraph with extra whitespace. </p>
    <script>console.log('script content');</script>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
article = soup.find('div', class_='article')

# Basic text extraction
all_text = article.get_text()
print(all_text)  # All text concatenated

# Text with separators
spaced_text = article.get_text(' ')
line_separated = article.get_text('\n')
print(spaced_text)     # Words separated by spaces
print(line_separated)  # Elements separated by newlines

# Stripped text (removes extra whitespace)
clean_text = article.get_text(' ', strip=True)
print(clean_text)  # Clean, properly spaced text

# Include different string types
from bs4 import NavigableString, Comment, CData

# Default - only NavigableString (excludes comments, scripts, etc.)
text_only = article.get_text(types=(NavigableString,))

# Include comments
with_comments = article.get_text(types=(NavigableString, Comment))

# Direct property access
print(article.text)  # Same as get_text()

# Single string access
title = soup.find('h1')
print(title.string)  # "Article Title" (single string child)

paragraph = soup.find('p')
print(paragraph.string)  # None (has multiple children including tags)

# Iterate over all strings
for string in article.strings:
    print(repr(string))  # Shows all text nodes including whitespace

# Iterate over stripped strings (non-empty only)
for string in article.stripped_strings:
    print(repr(string))  # Clean text content only
```
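
As a practical note, `get_text(' ', strip=True)` and joining `stripped_strings` yourself should give the same result in current Beautiful Soup releases, so the choice between them is mostly stylistic. A small sketch reusing `article` from above:

```python
# Two routes to the same clean, space-separated text
via_get_text = article.get_text(' ', strip=True)
via_join = ' '.join(article.stripped_strings)
print(via_get_text == via_join)  # Expected: True
```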

### Attribute Access

Access and manipulate element attributes with a dictionary-like interface.

```python { .api }
def get(self, key, default=None):
    """
    Get attribute value with optional default.

    Parameters:
    - key: str - attribute name
    - default: value to return if attribute doesn't exist

    Returns:
    Attribute value (str or list for class), or default
    """

def has_attr(self, key):
    """
    Check if element has the specified attribute.

    Parameters:
    - key: str - attribute name

    Returns:
    bool
    """

def __getitem__(self, key):
    """
    Get attribute value using dictionary syntax.

    Parameters:
    - key: str - attribute name

    Returns:
    Attribute value

    Raises:
    KeyError if attribute doesn't exist
    """

@property
def attrs(self):
    """
    Dictionary of all element attributes.

    Returns:
    dict - attribute name/value pairs
    """
```

Usage Examples:

```python
from bs4 import BeautifulSoup

html = '''
<div id="main" class="container highlight" data-value="123" title="Main container">
    <a href="https://example.com" target="_blank" rel="noopener">Link</a>
    <img src="image.jpg" alt="Description" width="100" height="200">
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

div = soup.find('div')
link = soup.find('a')
img = soup.find('img')

# Get attributes with default
print(div.get('id'))                      # 'main'
print(div.get('data-value'))              # '123'
print(div.get('nonexistent', 'default'))  # 'default'

# Dictionary-style access
print(div['id'])     # 'main'
print(link['href'])  # 'https://example.com'

# Check attribute existence
if div.has_attr('class'):
    print('Div has class attribute')

if not img.has_attr('alt'):
    print('Image missing alt text')

# Access all attributes
print(div.attrs)
# {'id': 'main', 'class': ['container', 'highlight'],
#  'data-value': '123', 'title': 'Main container'}

# Special handling for class attribute (always a list)
print(div['class'])        # ['container', 'highlight']
print(type(div['class']))  # <class 'list'>

# Iterate over attributes
for attr_name, attr_value in div.attrs.items():
    print(f'{attr_name}: {attr_value}')
```
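
A common way to combine `has_attr` and `get` is auditing a document for missing attributes. A small sketch (the alt-text audit is our example, not a Beautiful Soup feature):

```python
# Flag images with missing or empty alt text
for image in soup.find_all('img'):
    if not image.has_attr('alt') or not image['alt'].strip():
        print(f"Missing alt text: {image.get('src', '(no src)')}")
```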

### Content Type Detection

Identify and work with different types of content within elements.

```python { .api }
# Content type checking is done with the built-in isinstance(),
# tested against the NavigableString subclasses below.

# NavigableString types
class NavigableString(str):
    """Regular text content"""

class Comment(NavigableString):
    """HTML/XML comments"""

class CData(NavigableString):
    """CDATA sections"""

class ProcessingInstruction(NavigableString):
    """XML processing instructions"""

class Doctype(NavigableString):
    """DOCTYPE declarations"""
```

Usage Examples:

```python
from bs4 import BeautifulSoup, NavigableString, Comment, CData

html = '''
<div>
    Regular text
    <!-- This is a comment -->
    <![CDATA[This is CDATA]]>
    <?xml version="1.0"?>
    <p>Paragraph text</p>
</div>
'''

soup = BeautifulSoup(html, 'lxml')  # lxml handles mixed content better
div = soup.find('div')

# Iterate and identify content types
for content in div.contents:
    if isinstance(content, Comment):
        print(f"Comment: {content}")
    elif isinstance(content, CData):
        print(f"CDATA: {content}")
    elif isinstance(content, NavigableString):
        if content.strip():  # Skip empty whitespace
            print(f"Text: {content.strip()}")
    elif hasattr(content, 'name'):  # It's a Tag
        print(f"Tag: {content.name}")

# Filter by content type
comments = [c for c in div.contents if isinstance(c, Comment)]
# Comment and CData are NavigableString subclasses, so an isinstance()
# check would keep them too; compare exact types to keep plain text only
text_nodes = [s for s in div.strings if type(s) is NavigableString]
```
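
To find special string types anywhere in a document rather than among one tag's direct children, `find_all` accepts a function as its `string` argument. A short sketch that finds and removes every comment:

```python
# string= matches text nodes; the lambda restricts matches to Comment instances
for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
    comment.extract()  # detach the comment from the tree
```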

### Data Extraction Patterns

Common patterns for extracting structured data from HTML documents.

```python { .api }
# Common extraction patterns

def extract_links(soup):
    """Extract all links with href and text"""

def extract_images(soup):
    """Extract image sources and alt text"""

def extract_tables(soup):
    """Extract table data as list of dictionaries"""

def extract_forms(soup):
    """Extract form fields and actions"""
```

Usage Examples:

```python
from bs4 import BeautifulSoup

html = '''
<div class="content">
    <h2>Product List</h2>
    <ul class="products">
        <li data-id="1" data-price="29.99">
            <a href="/product/1">Widget A</a>
            <span class="price">$29.99</span>
        </li>
        <li data-id="2" data-price="39.99">
            <a href="/product/2">Widget B</a>
            <span class="price">$39.99</span>
        </li>
    </ul>
</div>
'''

soup = BeautifulSoup(html, 'html.parser')

# Extract structured product data
products = []
for item in soup.find_all('li', {'data-id': True}):
    product = {
        'id': item.get('data-id'),
        'price': item.get('data-price'),
        'name': item.find('a').get_text().strip(),
        'url': item.find('a').get('href'),
        'price_text': item.find('span', class_='price').get_text()
    }
    products.append(product)

print(products)
# [{'id': '1', 'price': '29.99', 'name': 'Widget A',
#   'url': '/product/1', 'price_text': '$29.99'}, ...]

# Extract all links
links = []
for link in soup.find_all('a', href=True):
    links.append({
        'url': link['href'],
        'text': link.get_text().strip(),
        'title': link.get('title', '')
    })

# Extract metadata (no <meta> tags in this snippet, but the pattern is common)
metadata = {}
for meta in soup.find_all('meta'):
    name = meta.get('name') or meta.get('property') or meta.get('http-equiv')
    content = meta.get('content')
    if name and content:
        metadata[name] = content
```
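
The `extract_tables` stub from the API block above could be filled in along these lines; a minimal sketch assuming the first row of each table holds its headers:

```python
def extract_tables(soup):
    """Extract each table as a list of row dicts keyed by header text."""
    tables = []
    for table in soup.find_all('table'):
        rows = table.find_all('tr')
        if not rows:
            continue
        # Header cells from the first row; fall back to <td> if no <th>
        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]
        table_data = []
        for row in rows[1:]:
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]
            table_data.append(dict(zip(headers, cells)))
        tables.append(table_data)
    return tables
```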

### Text Processing Utilities

Helper functions for cleaning and processing extracted text content.

```python { .api }
import re

def clean_text(text):
    """Remove extra whitespace and normalize text"""
    return re.sub(r'\s+', ' ', text.strip())

def extract_numbers(text):
    """Extract numeric values from text"""
    return re.findall(r'\d+\.?\d*', text)

def extract_emails(text):
    """Extract email addresses from text"""
    return re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)

def extract_urls(text):
    """Extract URLs from text"""
    return re.findall(r'https?://[^\s<>"]+', text)
```

Usage Examples:

```python
import re
from bs4 import BeautifulSoup

html = '''
<div class="contact">
    Contact us at support@example.com or visit
    https://example.com/contact for more info.

    Phone: 555-123-4567
</div>
'''

soup = BeautifulSoup(html, 'html.parser')
contact_div = soup.find('div', class_='contact')

# Extract and normalize text
raw_text = contact_div.get_text()
cleaned = re.sub(r'\s+', ' ', raw_text.strip())
print(cleaned)

# Extract specific data patterns
emails = re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', cleaned)
urls = re.findall(r'https?://[^\s<>"]+', cleaned)
phones = re.findall(r'\d{3}-\d{3}-\d{4}', cleaned)

print(f"Emails: {emails}")  # ['support@example.com']
print(f"URLs: {urls}")      # ['https://example.com/contact']
print(f"Phones: {phones}")  # ['555-123-4567']
```
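
With the helper functions from the API block above in scope, the same extraction reads more cleanly:

```python
# Assumes clean_text, extract_emails, extract_urls, extract_numbers
# from the API block are defined in this module
text = clean_text(contact_div.get_text())
print(extract_emails(text))   # ['support@example.com']
print(extract_urls(text))     # ['https://example.com/contact']
print(extract_numbers(text))  # ['555', '123', '4567']
```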