# Web Scraping

PyStow provides built-in support for downloading and parsing web content using BeautifulSoup. This capability allows you to cache web pages and extract structured data from HTML content.

## Web Content Functions

### HTML Parsing with BeautifulSoup
```python { .api }
def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:
    """Ensure a webpage is downloaded and parsed with BeautifulSoup.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup.

    Returns:
        A BeautifulSoup object.

    Note:
        If you don't need to cache, consider using pystow.utils.get_soup instead.
    """
```
## Usage Examples

### Basic Web Scraping

```python
import pystow

# Download and parse HTML page
soup = pystow.ensure_soup(
    "myapp", "scraped_data",
    url="https://example.com/data-table"
)

# Extract data from the page
table = soup.find('table', {'class': 'data-table'})
rows = table.find_all('tr')

# Process the data
data = []
for row in rows[1:]:  # Skip header row
    cells = row.find_all('td')
    data.append([cell.get_text().strip() for cell in cells])
```
### Advanced HTML Parsing

```python
import pystow

# Download with custom parser and caching
soup = pystow.ensure_soup(
    "myapp", "articles",
    url="https://news.example.com/article/123",
    name="article_123.html",
    beautiful_soup_kwargs={
        "features": "lxml",  # Use lxml parser
        "from_encoding": "utf-8"
    }
)

# Extract structured data
article_data = {
    "title": soup.find('h1').get_text().strip(),
    "author": soup.find('span', {'class': 'author'}).get_text().strip(),
    "content": soup.find('div', {'class': 'article-content'}).get_text().strip(),
    "tags": [tag.get_text() for tag in soup.find_all('span', {'class': 'tag'})]
}

# Save extracted data
pystow.dump_json(
    "myapp", "processed",
    name="article_123.json",
    obj=article_data
)
```
### Web Scraping with Version Management

```python
import pystow
from datetime import datetime

def get_scrape_timestamp():
    """Generate timestamp for version control"""
    return datetime.now().strftime("%Y%m%d_%H%M")

# Version-aware web scraping
soup = pystow.ensure_soup(
    "myapp", "daily_data",
    url="https://example.com/live-data",
    version=get_scrape_timestamp,
    force=True  # Always fetch latest version
)

# Extract time-sensitive data
live_data = {
    "timestamp": datetime.now().isoformat(),
    "metrics": {
        metric.get('name'): metric.get_text()
        for metric in soup.find_all('div', {'class': 'metric'})
    }
}

# Save with timestamp
pystow.dump_json(
    "myapp", "live_metrics",
    name=f"metrics_{get_scrape_timestamp()}.json",
    obj=live_data
)
```
### Module-Based Web Scraping

```python
import pystow

# Create module for web scraping
scraper_module = pystow.module("webscraper")

# Scrape multiple pages
pages_to_scrape = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

scraped_data = []
for i, url in enumerate(pages_to_scrape):
    soup = scraper_module.ensure_soup(
        "raw_pages",
        url=url,
        name=f"page_{i+1}.html"
    )

    # Extract data from each page
    page_data = {
        "url": url,
        "title": soup.find('title').get_text().strip(),
        "links": [a.get('href') for a in soup.find_all('a', href=True)],
        "images": [img.get('src') for img in soup.find_all('img', src=True)]
    }
    scraped_data.append(page_data)

# Save aggregated data
scraper_module.dump_json(
    "processed",
    name="all_pages_data.json",
    obj=scraped_data
)
```
### Error Handling and Robust Scraping

```python
import pystow
import requests
from bs4 import BeautifulSoup

def safe_scrape_page(url, module_name, page_name):
    """Safely scrape a page with error handling"""
    try:
        soup = pystow.ensure_soup(
            module_name, "scraped",
            url=url,
            name=f"{page_name}.html",
            download_kwargs={
                "timeout": 30,
                "headers": {
                    "User-Agent": "Mozilla/5.0 (compatible; PyStow/1.0)"
                }
            },
            beautiful_soup_kwargs={
                "features": "html.parser"
            }
        )

        # Validate the soup object
        if soup.find('title') is None:
            print(f"Warning: No title found in {url}")
            return None

        return soup

    except requests.exceptions.RequestException as e:
        print(f"Network error scraping {url}: {e}")
        return None
    except Exception as e:
        print(f"Error parsing {url}: {e}")
        return None

# Use the safe scraper
soup = safe_scrape_page(
    "https://example.com/complex-page",
    "myapp",
    "complex_page"
)

if soup:
    # Extract data safely
    title = soup.find('title')
    page_title = title.get_text().strip() if title else "No title"
    print(f"Successfully scraped: {page_title}")
```
### Scraping Configuration

```python
import pystow

# Configure scraping behavior
def scrape_with_config(url, config_module="scraping"):
    """Scrape using configuration settings"""

    # Get configuration
    user_agent = pystow.get_config(
        config_module, "user_agent",
        default="PyStow-Scraper/1.0"
    )
    timeout = pystow.get_config(
        config_module, "timeout",
        dtype=int, default=30
    )
    parser = pystow.get_config(
        config_module, "parser",
        default="html.parser"
    )

    # Scrape with configuration
    soup = pystow.ensure_soup(
        "configured_scraping",
        url=url,
        download_kwargs={
            "timeout": timeout,
            "headers": {"User-Agent": user_agent}
        },
        beautiful_soup_kwargs={
            "features": parser
        }
    )

    return soup

# Set up configuration
pystow.write_config("scraping", "user_agent", "MyApp/2.0")
pystow.write_config("scraping", "timeout", "60")
pystow.write_config("scraping", "parser", "lxml")

# Use configured scraper
soup = scrape_with_config("https://example.com/data")
```
### Data Extraction Pipelines

```python
import pystow

def extract_product_data(product_urls):
    """Extract product data from multiple URLs"""

    module = pystow.module("ecommerce_scraper")
    products = []

    for i, url in enumerate(product_urls):
        try:
            # Scrape product page
            soup = module.ensure_soup(
                "products",
                url=url,
                name=f"product_{i+1}.html"
            )

            # Extract product information
            product = {
                "url": url,
                "name": soup.find('h1', {'class': 'product-title'}).get_text().strip(),
                "price": soup.find('span', {'class': 'price'}).get_text().strip(),
                "description": soup.find('div', {'class': 'description'}).get_text().strip(),
                "images": [img.get('src') for img in soup.find_all('img', {'class': 'product-image'})],
                "availability": soup.find('span', {'class': 'stock'}).get_text().strip()
            }
            products.append(product)

        except Exception as e:
            print(f"Error processing {url}: {e}")
            continue

    # Save extracted data
    module.dump_json(
        "extracted",
        name="products_data.json",
        obj=products
    )

    return products

# Use the pipeline
product_urls = [
    "https://store.example.com/product/1",
    "https://store.example.com/product/2",
    "https://store.example.com/product/3"
]

extracted_products = extract_product_data(product_urls)
print(f"Extracted data for {len(extracted_products)} products")
```