# Web Scraping

PyStow provides built-in support for downloading and parsing web content using BeautifulSoup. This capability allows you to cache web pages and extract structured data from HTML content.

## Web Content Functions

### HTML Parsing with BeautifulSoup
```python { .api }
def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:
    """Ensure a webpage is downloaded and parsed with BeautifulSoup.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup.

    Returns:
        A BeautifulSoup object.

    Note:
        If you don't need to cache, consider using pystow.utils.get_soup instead.
    """
```
## Usage Examples

### Basic Web Scraping

```python
import pystow

# Download and parse HTML page
soup = pystow.ensure_soup(
    "myapp", "scraped_data",
    url="https://example.com/data-table"
)

# Extract data from the page
table = soup.find('table', {'class': 'data-table'})
rows = table.find_all('tr')

# Process the data
data = []
for row in rows[1:]:  # Skip header row
    cells = row.find_all('td')
    data.append([cell.get_text().strip() for cell in cells])
```
### Advanced HTML Parsing

```python
import pystow

# Download with custom parser and caching
soup = pystow.ensure_soup(
    "myapp", "articles",
    url="https://news.example.com/article/123",
    name="article_123.html",
    beautiful_soup_kwargs={
        "features": "lxml",  # Use lxml parser
        "from_encoding": "utf-8"
    }
)

# Extract structured data
article_data = {
    "title": soup.find('h1').get_text().strip(),
    "author": soup.find('span', {'class': 'author'}).get_text().strip(),
    "content": soup.find('div', {'class': 'article-content'}).get_text().strip(),
    "tags": [tag.get_text() for tag in soup.find_all('span', {'class': 'tag'})]
}

# Save extracted data
pystow.dump_json(
    "myapp", "processed",
    name="article_123.json",
    obj=article_data
)
```
### Web Scraping with Version Management

```python
import pystow
from datetime import datetime

def get_scrape_timestamp():
    """Generate timestamp for version control"""
    return datetime.now().strftime("%Y%m%d_%H%M")

# Version-aware web scraping
soup = pystow.ensure_soup(
    "myapp", "daily_data",
    url="https://example.com/live-data",
    version=get_scrape_timestamp,
    force=True  # Always fetch latest version
)

# Extract time-sensitive data
live_data = {
    "timestamp": datetime.now().isoformat(),
    "metrics": {
        metric.get('name'): metric.get_text()
        for metric in soup.find_all('div', {'class': 'metric'})
    }
}

# Save with timestamp
pystow.dump_json(
    "myapp", "live_metrics",
    name=f"metrics_{get_scrape_timestamp()}.json",
    obj=live_data
)
```
### Module-Based Web Scraping

```python
import pystow

# Create module for web scraping
scraper_module = pystow.module("webscraper")

# Scrape multiple pages
pages_to_scrape = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

scraped_data = []
for i, url in enumerate(pages_to_scrape):
    soup = scraper_module.ensure_soup(
        "raw_pages",
        url=url,
        name=f"page_{i+1}.html"
    )

    # Extract data from each page
    page_data = {
        "url": url,
        "title": soup.find('title').get_text().strip(),
        "links": [a.get('href') for a in soup.find_all('a', href=True)],
        "images": [img.get('src') for img in soup.find_all('img', src=True)]
    }
    scraped_data.append(page_data)

# Save aggregated data
scraper_module.dump_json(
    "processed",
    name="all_pages_data.json",
    obj=scraped_data
)
```
### Error Handling and Robust Scraping

```python
import pystow
import requests
from bs4 import BeautifulSoup

def safe_scrape_page(url, module_name, page_name):
    """Safely scrape a page with error handling"""
    try:
        soup = pystow.ensure_soup(
            module_name, "scraped",
            url=url,
            name=f"{page_name}.html",
            download_kwargs={
                "timeout": 30,
                "headers": {
                    "User-Agent": "Mozilla/5.0 (compatible; PyStow/1.0)"
                }
            },
            beautiful_soup_kwargs={
                "features": "html.parser"
            }
        )

        # Validate the soup object
        if soup.find('title') is None:
            print(f"Warning: No title found in {url}")
            return None

        return soup

    except requests.exceptions.RequestException as e:
        print(f"Network error scraping {url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return None

# Use the safe scraper
soup = safe_scrape_page(
    "https://example.com/complex-page",
    "myapp",
    "complex_page"
)

if soup:
    # Extract data safely
    title = soup.find('title')
    page_title = title.get_text().strip() if title else "No title"
    print(f"Successfully scraped: {page_title}")
```
### Scraping Configuration

```python
import pystow

# Configure scraping behavior
def scrape_with_config(url, config_module="scraping"):
    """Scrape using configuration settings"""

    # Get configuration
    user_agent = pystow.get_config(
        config_module, "user_agent",
        default="PyStow-Scraper/1.0"
    )
    timeout = pystow.get_config(
        config_module, "timeout",
        dtype=int, default=30
    )
    parser = pystow.get_config(
        config_module, "parser",
        default="html.parser"
    )

    # Scrape with configuration
    soup = pystow.ensure_soup(
        "configured_scraping",
        url=url,
        download_kwargs={
            "timeout": timeout,
            "headers": {"User-Agent": user_agent}
        },
        beautiful_soup_kwargs={
            "features": parser
        }
    )

    return soup

# Set up configuration
pystow.write_config("scraping", "user_agent", "MyApp/2.0")
pystow.write_config("scraping", "timeout", "60")
pystow.write_config("scraping", "parser", "lxml")

# Use configured scraper
soup = scrape_with_config("https://example.com/data")
```
### Data Extraction Pipelines

```python
import pystow

def extract_product_data(product_urls):
    """Extract product data from multiple URLs"""

    module = pystow.module("ecommerce_scraper")
    products = []

    for i, url in enumerate(product_urls):
        try:
            # Scrape product page
            soup = module.ensure_soup(
                "products",
                url=url,
                name=f"product_{i+1}.html"
            )

            # Extract product information
            product = {
                "url": url,
                "name": soup.find('h1', {'class': 'product-title'}).get_text().strip(),
                "price": soup.find('span', {'class': 'price'}).get_text().strip(),
                "description": soup.find('div', {'class': 'description'}).get_text().strip(),
                "images": [img.get('src') for img in soup.find_all('img', {'class': 'product-image'})],
                "availability": soup.find('span', {'class': 'stock'}).get_text().strip()
            }
            products.append(product)

        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue

    # Save extracted data
    module.dump_json(
        "extracted",
        name="products_data.json",
        obj=products
    )

    return products

# Use the pipeline
product_urls = [
    "https://store.example.com/product/1",
    "https://store.example.com/product/2",
    "https://store.example.com/product/3"
]

extracted_products = extract_product_data(product_urls)
print(f"Extracted data for {len(extracted_products)} products")
```