or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

archives.md · cloud-storage.md · configuration.md · data-formats.md · directory-management.md · file-operations.md · index.md · module-class.md · nltk-integration.md · web-scraping.md

docs/web-scraping.md

0

# Web Scraping

1

2

PyStow provides built-in support for downloading and parsing web content using BeautifulSoup. This capability allows you to cache web pages and extract structured data from HTML content.

3

4

## Web Content Functions

5

6

### HTML Parsing with BeautifulSoup

7

8

```python { .api }

9

def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:

10

"""Ensure a webpage is downloaded and parsed with BeautifulSoup.

11

12

Args:

13

key: The name of the module. No funny characters. The envvar <key>_HOME where

14

key is uppercased is checked first before using the default home directory.

15

subkeys: A sequence of additional strings to join. If none are given,

16

returns the directory for this module.

17

url: The URL to download.

18

name: Overrides the name of the file at the end of the URL, if given.

19

Also useful for URLs that don't have proper filenames with extensions.

20

version: The optional version, or no-argument callable that returns an

21

optional version. This is prepended before the subkeys.

22

force: Should the download be done again, even if the path already

23

exists? Defaults to false.

24

download_kwargs: Keyword arguments to pass through to pystow.utils.download.

25

beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup

26

27

Returns:

28

A BeautifulSoup object

29

30

Note:

31

If you don't need to cache, consider using pystow.utils.get_soup instead.

32

"""

33

```

34

35

## Usage Examples

36

37

### Basic Web Scraping

38

39

```python

40

import pystow

41

42

# Download and parse HTML page

43

soup = pystow.ensure_soup(

44

"myapp", "scraped_data",

45

url="https://example.com/data-table"

46

)

47

48

# Extract data from the page

49

table = soup.find('table', {'class': 'data-table'})

50

rows = table.find_all('tr')

51

52

# Process the data

53

data = []

54

for row in rows[1:]: # Skip header row

55

cells = row.find_all('td')

56

data.append([cell.get_text().strip() for cell in cells])

57

```

58

59

### Advanced HTML Parsing

60

61

```python

62

import pystow

63

64

# Download with custom parser and caching

65

soup = pystow.ensure_soup(

66

"myapp", "articles",

67

url="https://news.example.com/article/123",

68

name="article_123.html",

69

beautiful_soup_kwargs={

70

"features": "lxml", # Use lxml parser

71

"from_encoding": "utf-8"

72

}

73

)

74

75

# Extract structured data

76

article_data = {

77

"title": soup.find('h1').get_text().strip(),

78

"author": soup.find('span', {'class': 'author'}).get_text().strip(),

79

"content": soup.find('div', {'class': 'article-content'}).get_text().strip(),

80

"tags": [tag.get_text() for tag in soup.find_all('span', {'class': 'tag'})]

81

}

82

83

# Save extracted data

84

pystow.dump_json(

85

"myapp", "processed",

86

name="article_123.json",

87

obj=article_data

88

)

89

```

90

91

### Web Scraping with Version Management

92

93

```python

94

import pystow

95

from datetime import datetime

96

97

def get_scrape_timestamp():

98

"""Generate timestamp for version control"""

99

return datetime.now().strftime("%Y%m%d_%H%M")

100

101

# Version-aware web scraping

102

soup = pystow.ensure_soup(

103

"myapp", "daily_data",

104

url="https://example.com/live-data",

105

version=get_scrape_timestamp,

106

force=True # Always fetch latest version

107

)

108

109

# Extract time-sensitive data

110

live_data = {

111

"timestamp": datetime.now().isoformat(),

112

"metrics": {

113

metric.get('name'): metric.get_text()

114

for metric in soup.find_all('div', {'class': 'metric'})

115

}

116

}

117

118

# Save with timestamp

119

pystow.dump_json(

120

"myapp", "live_metrics",

121

name=f"metrics_{get_scrape_timestamp()}.json",

122

obj=live_data

123

)

124

```

125

126

### Module-Based Web Scraping

127

128

```python

129

import pystow

130

131

# Create module for web scraping

132

scraper_module = pystow.module("webscraper")

133

134

# Scrape multiple pages

135

pages_to_scrape = [

136

"https://example.com/page1",

137

"https://example.com/page2",

138

"https://example.com/page3"

139

]

140

141

scraped_data = []

142

for i, url in enumerate(pages_to_scrape):

143

soup = scraper_module.ensure_soup(

144

"raw_pages",

145

url=url,

146

name=f"page_{i+1}.html"

147

)

148

149

# Extract data from each page

150

page_data = {

151

"url": url,

152

"title": soup.find('title').get_text().strip(),

153

"links": [a.get('href') for a in soup.find_all('a', href=True)],

154

"images": [img.get('src') for img in soup.find_all('img', src=True)]

155

}

156

scraped_data.append(page_data)

157

158

# Save aggregated data

159

scraper_module.dump_json(

160

"processed",

161

name="all_pages_data.json",

162

obj=scraped_data

163

)

164

```

165

166

### Error Handling and Robust Scraping

167

168

```python

169

import pystow

170

import requests

171

from bs4 import BeautifulSoup

172

173

def safe_scrape_page(url, module_name, page_name):

174

"""Safely scrape a page with error handling"""

175

try:

176

soup = pystow.ensure_soup(

177

module_name, "scraped",

178

url=url,

179

name=f"{page_name}.html",

180

download_kwargs={

181

"timeout": 30,

182

"headers": {

183

"User-Agent": "Mozilla/5.0 (compatible; PyStow/1.0)"

184

}

185

},

186

beautiful_soup_kwargs={

187

"features": "html.parser"

188

}

189

)

190

191

# Validate the soup object

192

if soup.find('title') is None:

193

print(f"Warning: No title found in {url}")

194

return None

195

196

return soup

197

198

except requests.exceptions.RequestException as e:

199

print(f"Network error scraping {url}: {e}")

200

return None

201

except Exception as e:

202

print(f"Error parsing {url}: {e}")

203

return None

204

205

# Use the safe scraper

206

soup = safe_scrape_page(

207

"https://example.com/complex-page",

208

"myapp",

209

"complex_page"

210

)

211

212

if soup:

213

# Extract data safely

214

title = soup.find('title')

215

page_title = title.get_text().strip() if title else "No title"

216

print(f"Successfully scraped: {page_title}")

217

```

218

219

### Scraping Configuration

220

221

```python

222

import pystow

223

224

# Configure scraping behavior

225

def scrape_with_config(url, config_module="scraping"):

226

"""Scrape using configuration settings"""

227

228

# Get configuration

229

user_agent = pystow.get_config(

230

config_module, "user_agent",

231

default="PyStow-Scraper/1.0"

232

)

233

timeout = pystow.get_config(

234

config_module, "timeout",

235

dtype=int, default=30

236

)

237

parser = pystow.get_config(

238

config_module, "parser",

239

default="html.parser"

240

)

241

242

# Scrape with configuration

243

soup = pystow.ensure_soup(

244

"configured_scraping",

245

url=url,

246

download_kwargs={

247

"timeout": timeout,

248

"headers": {"User-Agent": user_agent}

249

},

250

beautiful_soup_kwargs={

251

"features": parser

252

}

253

)

254

255

return soup

256

257

# Set up configuration

258

pystow.write_config("scraping", "user_agent", "MyApp/2.0")

259

pystow.write_config("scraping", "timeout", "60")

260

pystow.write_config("scraping", "parser", "lxml")

261

262

# Use configured scraper

263

soup = scrape_with_config("https://example.com/data")

264

```

265

266

### Data Extraction Pipelines

267

268

```python

269

import pystow

270

271

def extract_product_data(product_urls):

272

"""Extract product data from multiple URLs"""

273

274

module = pystow.module("ecommerce_scraper")

275

products = []

276

277

for i, url in enumerate(product_urls):

278

try:

279

# Scrape product page

280

soup = module.ensure_soup(

281

"products",

282

url=url,

283

name=f"product_{i+1}.html"

284

)

285

286

# Extract product information

287

product = {

288

"url": url,

289

"name": soup.find('h1', {'class': 'product-title'}).get_text().strip(),

290

"price": soup.find('span', {'class': 'price'}).get_text().strip(),

291

"description": soup.find('div', {'class': 'description'}).get_text().strip(),

292

"images": [img.get('src') for img in soup.find_all('img', {'class': 'product-image'})],

293

"availability": soup.find('span', {'class': 'stock'}).get_text().strip()

294

}

295

products.append(product)

296

297

except Exception as e:

298

print(f"Error processing {url}: {e}")

299

continue

300

301

# Save extracted data

302

module.dump_json(

303

"extracted",

304

name="products_data.json",

305

obj=products

306

)

307

308

return products

309

310

# Use the pipeline

311

product_urls = [

312

"https://store.example.com/product/1",

313

"https://store.example.com/product/2",

314

"https://store.example.com/product/3"

315

]

316

317

extracted_products = extract_product_data(product_urls)

318

print(f"Extracted data for {len(extracted_products)} products")

319

```