CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pystow

Easily pick a place to store data for your Python code with standardized directory management, caching, and data format support.

Pending
Overview
Eval results
Files

web-scraping.mddocs/

Web Scraping

PyStow provides built-in support for downloading and parsing web content using BeautifulSoup. This capability allows you to cache web pages and extract structured data from HTML content.

Web Content Functions

HTML Parsing with BeautifulSoup

def ensure_soup(key: str, *subkeys: str, url: str, name: str | None = None, version: VersionHint = None, force: bool = False, download_kwargs: Mapping[str, Any] | None = None, beautiful_soup_kwargs: Mapping[str, Any] | None = None) -> bs4.BeautifulSoup:
    """Ensure a webpage is downloaded and parsed with BeautifulSoup.

    Args:
        key: The name of the module. No funny characters. The envvar <key>_HOME where
            key is uppercased is checked first before using the default home directory.
        subkeys: A sequence of additional strings to join. If none are given,
            returns the directory for this module.
        url: The URL to download.
        name: Overrides the name of the file at the end of the URL, if given.
            Also useful for URLs that don't have proper filenames with extensions.
        version: The optional version, or no-argument callable that returns an
            optional version. This is prepended before the subkeys.
        force: Should the download be done again, even if the path already
            exists? Defaults to false.
        download_kwargs: Keyword arguments to pass through to pystow.utils.download.
        beautiful_soup_kwargs: Additional keyword arguments passed to BeautifulSoup.

    Returns:
        A BeautifulSoup object parsed from the cached download.

    Note:
        If you don't need to cache, consider using pystow.utils.get_soup instead.
    """

Usage Examples

Basic Web Scraping

import pystow

# Download and parse HTML page (cached under the "myapp" module directory)
soup = pystow.ensure_soup(
    "myapp", "scraped_data",
    url="https://example.com/data-table"
)

# Extract data from the page. Guard against a missing table:
# soup.find() returns None on no match, and calling find_all on
# None would raise AttributeError.
table = soup.find('table', {'class': 'data-table'})
rows = table.find_all('tr') if table is not None else []

# Process the data: collect the stripped text of each cell per row
data = []
for row in rows[1:]:  # Skip header row
    cells = row.find_all('td')
    data.append([cell.get_text().strip() for cell in cells])

Advanced HTML Parsing

import pystow

# Download with custom parser and caching
soup = pystow.ensure_soup(
    "myapp", "articles",
    url="https://news.example.com/article/123",
    name="article_123.html",
    beautiful_soup_kwargs={
        "features": "lxml",  # Use lxml parser
        "from_encoding": "utf-8"
    }
)

# Pull each field out of the parsed document, then assemble the record.
title_text = soup.find('h1').get_text().strip()
author_text = soup.find('span', {'class': 'author'}).get_text().strip()
body_text = soup.find('div', {'class': 'article-content'}).get_text().strip()
tag_texts = [tag.get_text() for tag in soup.find_all('span', {'class': 'tag'})]

article_data = {
    "title": title_text,
    "author": author_text,
    "content": body_text,
    "tags": tag_texts
}

# Persist the structured record next to the cached HTML
pystow.dump_json(
    "myapp", "processed",
    name="article_123.json",
    obj=article_data
)

Web Scraping with Version Management

import pystow
from datetime import datetime

def get_scrape_timestamp():
    """Return the current wall-clock time as a YYYYMMDD_HHMM version string."""
    return f"{datetime.now():%Y%m%d_%H%M}"

# Version-aware web scraping
soup = pystow.ensure_soup(
    "myapp", "daily_data",
    url="https://example.com/live-data",
    version=get_scrape_timestamp,
    force=True  # Always fetch latest version
)

# Extract time-sensitive data: record when we looked, then read every
# metric element off the page into a name -> text mapping.
captured_at = datetime.now().isoformat()
metric_values = {
    metric.get('name'): metric.get_text()
    for metric in soup.find_all('div', {'class': 'metric'})
}
live_data = {
    "timestamp": captured_at,
    "metrics": metric_values
}

# Save with timestamp
pystow.dump_json(
    "myapp", "live_metrics",
    name=f"metrics_{get_scrape_timestamp()}.json",
    obj=live_data
)

Module-Based Web Scraping

import pystow

# Create module for web scraping
scraper_module = pystow.module("webscraper")

# Scrape multiple pages
pages_to_scrape = [
    "https://example.com/page1",
    "https://example.com/page2", 
    "https://example.com/page3"
]

scraped_data = []
for i, url in enumerate(pages_to_scrape):
    soup = scraper_module.ensure_soup(
        "raw_pages",
        url=url,
        name=f"page_{i+1}.html"
    )
    
    # soup.find('title') returns None for pages without a <title> tag;
    # guard it so one malformed page does not abort the whole crawl.
    title_tag = soup.find('title')

    # Extract data from each page
    page_data = {
        "url": url,
        "title": title_tag.get_text().strip() if title_tag is not None else None,
        "links": [a.get('href') for a in soup.find_all('a', href=True)],
        "images": [img.get('src') for img in soup.find_all('img', src=True)]
    }
    scraped_data.append(page_data)

# Save aggregated data
scraper_module.dump_json(
    "processed",
    name="all_pages_data.json",
    obj=scraped_data
)

Error Handling and Robust Scraping

import pystow
import requests
from bs4 import BeautifulSoup

def safe_scrape_page(url, module_name, page_name):
    """Download and parse *url*, returning a BeautifulSoup object or None.

    Network failures and parse errors are reported on stdout rather than
    propagating, so a batch of scrapes can continue past a bad page.
    """
    try:
        parsed = pystow.ensure_soup(
            module_name, "scraped",
            url=url,
            name=f"{page_name}.html",
            download_kwargs={
                "timeout": 30,
                "headers": {
                    "User-Agent": "Mozilla/5.0 (compatible; PyStow/1.0)"
                }
            },
            beautiful_soup_kwargs={
                "features": "html.parser"
            }
        )

        # Treat a page with no <title> element as a failed scrape.
        if parsed.find('title') is None:
            print(f"Warning: No title found in {url}")
            return None
        return parsed

    except requests.exceptions.RequestException as exc:
        print(f"Network error scraping {url}: {exc}")
        return None
    except Exception as exc:
        print(f"Error parsing {url}: {exc}")
        return None

# Use the safe scraper
soup = safe_scrape_page(
    "https://example.com/complex-page",
    "myapp",
    "complex_page"
)

if soup:
    # Extract data safely: fall back to a placeholder when <title> is absent
    title = soup.find('title')
    if title:
        page_title = title.get_text().strip()
    else:
        page_title = "No title"
    print(f"Successfully scraped: {page_title}")

Scraping Configuration

import pystow

# Configure scraping behavior
def scrape_with_config(url, config_module="scraping"):
    """Scrape *url* with settings read from pystow's configuration system."""

    # Resolve each scraper setting from the config module, with defaults.
    agent = pystow.get_config(
        config_module, "user_agent",
        default="PyStow-Scraper/1.0"
    )
    wait_seconds = pystow.get_config(
        config_module, "timeout",
        dtype=int, default=30
    )
    html_parser = pystow.get_config(
        config_module, "parser",
        default="html.parser"
    )

    # Download (with caching) and parse using the resolved settings.
    return pystow.ensure_soup(
        "configured_scraping",
        url=url,
        download_kwargs={
            "timeout": wait_seconds,
            "headers": {"User-Agent": agent}
        },
        beautiful_soup_kwargs={
            "features": html_parser
        }
    )

# Set up configuration values that scrape_with_config will read back.
pystow.write_config("scraping", "user_agent", "MyApp/2.0")
# Config values are stored as text; readers coerce with dtype=int.
pystow.write_config("scraping", "timeout", "60")
pystow.write_config("scraping", "parser", "lxml")

# Use configured scraper
soup = scrape_with_config("https://example.com/data")

Data Extraction Pipelines

import pystow

def extract_product_data(product_urls):
    """Scrape each product page and collect its structured fields.

    Pages that fail to download or parse are skipped with a message;
    the remaining records are saved as JSON and returned.
    """
    module = pystow.module("ecommerce_scraper")
    products = []

    for index, page_url in enumerate(product_urls):
        try:
            # Scrape product page (cached under the module directory)
            soup = module.ensure_soup(
                "products",
                url=page_url,
                name=f"product_{index+1}.html"
            )

            # Extract product information from the parsed page
            record = {
                "url": page_url,
                "name": soup.find('h1', {'class': 'product-title'}).get_text().strip(),
                "price": soup.find('span', {'class': 'price'}).get_text().strip(),
                "description": soup.find('div', {'class': 'description'}).get_text().strip(),
                "images": [img.get('src') for img in soup.find_all('img', {'class': 'product-image'})],
                "availability": soup.find('span', {'class': 'stock'}).get_text().strip()
            }
        except Exception as exc:
            print(f"Error processing {page_url}: {exc}")
            continue
        products.append(record)

    # Save extracted data
    module.dump_json(
        "extracted",
        name="products_data.json",
        obj=products
    )

    return products

# Use the pipeline on a small batch of product pages.
product_urls = [
    "https://store.example.com/product/1",
    "https://store.example.com/product/2",
    "https://store.example.com/product/3"
]

# Failed pages are skipped inside the pipeline, so the count may be < 3.
extracted_products = extract_product_data(product_urls)
print(f"Extracted data for {len(extracted_products)} products")

Install with Tessl CLI

npx tessl i tessl/pypi-pystow

docs

archives.md

cloud-storage.md

configuration.md

data-formats.md

directory-management.md

file-operations.md

index.md

module-class.md

nltk-integration.md

web-scraping.md

tile.json