Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.
—
PyStow provides built-in support for downloading and parsing web content using BeautifulSoup. This capability allows you to cache web pages and extract structured data from HTML content.
def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:
"""Ensure a webpage is downloaded and parsed with BeautifulSoup.
Args:
key: The name of the module. No funny characters. The envvar <key>_HOME where
key is uppercased is checked first before using the default home directory.
subkeys: A sequence of additional strings to join. If none are given,
returns the directory for this module.
url: The URL to download.
name: Overrides the name of the file at the end of the URL, if given.
Also useful for URLs that don't have proper filenames with extensions.
version: The optional version, or no-argument callable that returns an
optional version. This is prepended before the subkeys.
force: Should the download be done again, even if the path already
exists? Defaults to false.
download_kwargs: Keyword arguments to pass through to pystow.utils.download.
beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup
Returns:
A BeautifulSoup object
Note:
If you don't need to cache, consider using pystow.utils.get_soup instead.
"""import pystow
# Download and parse an HTML page, caching the raw file under the
# "myapp" module's "scraped_data" subdirectory.
soup = pystow.ensure_soup(
    "myapp", "scraped_data",
    url="https://example.com/data-table"
)

# Locate the data table on the page.
# NOTE(review): assumes the page contains a <table class="data-table"> —
# soup.find() returns None otherwise and find_all() would then raise.
table = soup.find('table', {'class': 'data-table'})
rows = table.find_all('tr')

# Collect the stripped cell text of every row, skipping the header row.
data = []
for row in rows[1:]:
    cells = row.find_all('td')
    data.append([cell.get_text().strip() for cell in cells])

import pystow
# Download with a custom parser and an explicit cached file name.
soup = pystow.ensure_soup(
    "myapp", "articles",
    url="https://news.example.com/article/123",
    name="article_123.html",
    beautiful_soup_kwargs={
        "features": "lxml",  # Use lxml parser
        "from_encoding": "utf-8"
    }
)

# Extract structured fields from the article markup.
# NOTE(review): assumes the h1/author/content elements all exist —
# find() returns None for a missing element and get_text() would raise.
article_data = {
    "title": soup.find('h1').get_text().strip(),
    "author": soup.find('span', {'class': 'author'}).get_text().strip(),
    "content": soup.find('div', {'class': 'article-content'}).get_text().strip(),
    "tags": [tag.get_text() for tag in soup.find_all('span', {'class': 'tag'})]
}

# Persist the extracted fields next to the cached page.
pystow.dump_json(
    "myapp", "processed",
    name="article_123.json",
    obj=article_data
)

import pystow
from datetime import datetime
def get_scrape_timestamp():
    """Return the current local time as a YYYYMMDD_HHMM version string."""
    return format(datetime.now(), "%Y%m%d_%H%M")
# Version-aware web scraping: the no-argument callable is used as the
# version directory, and force=True re-downloads on every run.
soup = pystow.ensure_soup(
    "myapp", "daily_data",
    url="https://example.com/live-data",
    version=get_scrape_timestamp,
    force=True  # Always fetch latest version
)

# Extract time-sensitive metrics, keyed by each element's name attribute.
live_data = {
    "timestamp": datetime.now().isoformat(),
    "metrics": {
        metric.get('name'): metric.get_text()
        for metric in soup.find_all('div', {'class': 'metric'})
    }
}

# Save under a timestamped file name.
pystow.dump_json(
    "myapp", "live_metrics",
    name=f"metrics_{get_scrape_timestamp()}.json",
    obj=live_data
)

import pystow
# Create a dedicated pystow module for web scraping.
scraper_module = pystow.module("webscraper")

# Scrape multiple pages, caching each under a stable local name.
pages_to_scrape = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

scraped_data = []
for i, url in enumerate(pages_to_scrape):
    soup = scraper_module.ensure_soup(
        "raw_pages",
        url=url,
        name=f"page_{i+1}.html"
    )
    # Collect the title plus every link href and image src on the page.
    # NOTE(review): assumes a <title> tag exists — find() returns None
    # otherwise and get_text() would raise.
    page_data = {
        "url": url,
        "title": soup.find('title').get_text().strip(),
        "links": [a.get('href') for a in soup.find_all('a', href=True)],
        "images": [img.get('src') for img in soup.find_all('img', src=True)]
    }
    scraped_data.append(page_data)

# Save the aggregated results inside the same module.
scraper_module.dump_json(
    "processed",
    name="all_pages_data.json",
    obj=scraped_data
)

import pystow
import requests
from bs4 import BeautifulSoup
def safe_scrape_page(url, module_name, page_name):
    """Download and parse *url*, returning None on any failure.

    Network problems and parse errors are reported via print() rather
    than propagated, so callers can treat None as "page unavailable".
    """
    fetch_options = {
        "timeout": 30,
        "headers": {
            "User-Agent": "Mozilla/5.0 (compatible; PyStow/1.0)"
        }
    }
    try:
        soup = pystow.ensure_soup(
            module_name, "scraped",
            url=url,
            name=f"{page_name}.html",
            download_kwargs=fetch_options,
            beautiful_soup_kwargs={"features": "html.parser"}
        )
        # Sanity-check the parsed document before handing it back.
        if soup.find('title') is None:
            print(f"Warning: No title found in {url}")
            return None
        return soup
    except requests.exceptions.RequestException as exc:
        print(f"Network error scraping {url}: {exc}")
        return None
    except Exception as exc:
        print(f"Error parsing {url}: {exc}")
        return None
# Use the safe scraper on a single page.
soup = safe_scrape_page(
    "https://example.com/complex-page",
    "myapp",
    "complex_page"
)

if soup:
    # Extract data defensively: the title tag may still be absent.
    title = soup.find('title')
    page_title = title.get_text().strip() if title else "No title"
    print(f"Successfully scraped: {page_title}")

import pystow
# Configure scraping behavior through pystow's config machinery.
def scrape_with_config(url, config_module="scraping"):
    """Scrape *url* using settings stored under *config_module*."""
    # Pull each setting, falling back to a sensible default.
    user_agent = pystow.get_config(
        config_module, "user_agent",
        default="PyStow-Scraper/1.0"
    )
    timeout = pystow.get_config(
        config_module, "timeout",
        dtype=int, default=30
    )
    parser = pystow.get_config(
        config_module, "parser",
        default="html.parser"
    )
    # Download (with caching) and parse using the configured values.
    return pystow.ensure_soup(
        "configured_scraping",
        url=url,
        download_kwargs={
            "timeout": timeout,
            "headers": {"User-Agent": user_agent}
        },
        beautiful_soup_kwargs={
            "features": parser
        }
    )
# Set up configuration values (pystow stores them as strings).
pystow.write_config("scraping", "user_agent", "MyApp/2.0")
pystow.write_config("scraping", "timeout", "60")
pystow.write_config("scraping", "parser", "lxml")

# Use the configured scraper.
soup = scrape_with_config("https://example.com/data")

import pystow
def extract_product_data(product_urls):
    """Scrape each product page, extract its fields, and persist the lot.

    Pages that fail to download or parse are reported and skipped, so the
    returned list may be shorter than *product_urls*.
    """
    module = pystow.module("ecommerce_scraper")
    products = []
    for index, url in enumerate(product_urls):
        try:
            # Download (cached) and parse the product page.
            page = module.ensure_soup(
                "products",
                url=url,
                name=f"product_{index+1}.html"
            )
            # Pull out the product fields by their CSS classes.
            products.append({
                "url": url,
                "name": page.find('h1', {'class': 'product-title'}).get_text().strip(),
                "price": page.find('span', {'class': 'price'}).get_text().strip(),
                "description": page.find('div', {'class': 'description'}).get_text().strip(),
                "images": [img.get('src') for img in page.find_all('img', {'class': 'product-image'})],
                "availability": page.find('span', {'class': 'stock'}).get_text().strip()
            })
        except Exception as exc:
            print(f"Error processing {url}: {exc}")
            continue
    # Persist everything that was successfully extracted.
    module.dump_json(
        "extracted",
        name="products_data.json",
        obj=products
    )
    return products
# Run the extraction pipeline over a fixed set of product pages.
product_urls = [
    "https://store.example.com/product/1",
    "https://store.example.com/product/2",
    "https://store.example.com/product/3",
]
extracted_products = extract_product_data(product_urls)
print(f"Extracted data for {len(extracted_products)} products")Install with Tessl CLI
npx tessl i tessl/pypi-pystow