CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-crawlee

A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass

Overview
Eval results
Files

configuration.mddocs/

Configuration

Global configuration management and request routing systems for fine-tuned control over crawling behavior. Configuration components provide centralized settings management, environment variable integration, proxy management, and request routing capabilities.

Capabilities

Global Configuration

Centralized configuration system with environment variable support and default value management.

class Configuration:
    def __init__(self, **settings): ...

    def get(self, key: str, default: Any = None) -> Any:
        """Get configuration value with optional default."""

    def set(self, key: str, value: Any) -> None:
        """Set configuration value."""

    def get_bool(self, key: str, default: bool = False) -> bool:
        """Get boolean configuration value."""

    def get_int(self, key: str, default: int = 0) -> int:
        """Get integer configuration value."""

    def get_float(self, key: str, default: float = 0.0) -> float:
        """Get float configuration value."""

    @property
    def storage_dir(self) -> str:
        """Default storage directory path."""

    @property
    def max_pool_size(self) -> int:
        """Default maximum pool size."""

    @property
    def request_handler_timeout(self) -> int:
        """Default request handler timeout in seconds."""

Request Router

Request routing system for directing requests to appropriate handlers based on labels and patterns.

class Router:
    def __init__(self): ...

    def default_handler(self, handler: RequestHandler) -> RequestHandler:
        """
        Register default request handler.

        Args:
            handler: Handler function for requests

        Returns:
            The registered handler
        """

    def route(
        self,
        label: str,
        handler: RequestHandler,
        *,
        method: HttpMethod | None = None
    ) -> RequestHandler:
        """
        Register handler for specific request label.

        Args:
            label: Request label to match
            handler: Handler function
            method: Optional HTTP method filter

        Returns:
            The registered handler
        """

    def error_handler(self, handler: ErrorRequestHandler) -> ErrorRequestHandler:
        """
        Register error handler for failed requests.

        Args:
            handler: Error handler function

        Returns:
            The registered handler
        """

    def get_handler(self, request: Request) -> RequestHandler | None:
        """Get appropriate handler for request."""

    def get_error_handler(self) -> ErrorRequestHandler | None:
        """Get registered error handler."""

Proxy Configuration

Proxy server configuration and rotation system for enhanced anonymity and geo-targeting.

class ProxyConfiguration:
    def __init__(
        self,
        proxy_urls: list[str] | None = None,
        *,
        new_url_function: Callable[[], str] | None = None,
        country_code: str | None = None,
        session_id: str | None = None
    ): ...

    async def new_proxy_info(
        self,
        *,
        session_id: str | None = None,
        request: Request | None = None
    ) -> ProxyInfo | None:
        """
        Get new proxy for request.

        Args:
            session_id: Session identifier for proxy affinity
            request: Request being processed

        Returns:
            ProxyInfo object or None if no proxy needed
        """

    def new_url(self) -> str | None:
        """Generate new proxy URL using configured strategy."""

    @property
    def proxy_urls(self) -> list[str]: ...

    @property
    def country_code(self) -> str | None: ...

class ProxyInfo:
    def __init__(
        self,
        *,
        url: str,
        hostname: str | None = None,
        port: int | None = None,
        username: str | None = None,
        password: str | None = None,
        country_code: str | None = None,
        session_id: str | None = None
    ): ...

    @property
    def url(self) -> str:
        """Full proxy URL."""

    @property
    def hostname(self) -> str | None:
        """Proxy hostname."""

    @property
    def port(self) -> int | None:
        """Proxy port number."""

    @property
    def username(self) -> str | None:
        """Proxy authentication username."""

    @property
    def password(self) -> str | None:
        """Proxy authentication password."""

    @property
    def country_code(self) -> str | None:
        """ISO country code for proxy location."""

    @property
    def session_id(self) -> str | None:
        """Session identifier for proxy affinity."""

Handler Types

Type definitions for request handlers and error handlers used with the Router.

RequestHandler = Callable[[BasicCrawlingContext], Awaitable[None]]

ErrorRequestHandler = Callable[
    [BasicCrawlingContext, Exception], Awaitable[None]
]

Usage Examples

Global Configuration

from crawlee.configuration import Configuration
import os

# Create configuration with defaults
config = Configuration(
    storage_dir='./crawlee_storage',
    max_concurrent_requests=10,
    request_timeout=30
)

# Environment variables override defaults
# CRAWLEE_STORAGE_DIR, CRAWLEE_MAX_CONCURRENT_REQUESTS, etc.
os.environ['CRAWLEE_STORAGE_DIR'] = '/tmp/my_crawls'

# Get configuration values
storage_dir = config.get('storage_dir')
print(f"Storage directory: {storage_dir}")  # /tmp/my_crawls

# Type-specific getters
max_requests = config.get_int('max_concurrent_requests', 5)
enable_logging = config.get_bool('enable_logging', True)

# Set values programmatically
config.set('user_agent', 'My Custom Bot 1.0')

Request Routing

import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.router import Router

async def main():
    crawler = BeautifulSoupCrawler()

    # Use the crawler's built-in router
    router = crawler.router

    # Default handler for unlabeled requests
    @router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing default: {context.request.url}")

        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'type': 'default'
        }

        await context.push_data(data)

    # Handler for product pages
    @router.route('product')
    async def product_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing product: {context.request.url}")

        # Extract product-specific data
        name = context.soup.select_one('.product-name')
        price = context.soup.select_one('.price')

        data = {
            'url': context.request.url,
            'name': name.get_text().strip() if name else None,
            'price': price.get_text().strip() if price else None,
            'type': 'product'
        }

        await context.push_data(data)

        # Enqueue related products
        await context.enqueue_links(
            selector='.related-product a',
            label='product'
        )

    # Handler for category pages
    @router.route('category')
    async def category_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing category: {context.request.url}")

        # Extract category info
        category_name = context.soup.select_one('h1')

        data = {
            'url': context.request.url,
            'category': category_name.get_text().strip() if category_name else None,
            'type': 'category'
        }

        await context.push_data(data)

        # Enqueue product links with product label
        await context.enqueue_links(
            selector='.product-link',
            label='product'
        )

    # Error handler for all failed requests
    @router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        context.log.error(f"Error processing {context.request.url}: {error}")

        # Log error details
        await context.push_data({
            'url': context.request.url,
            'error': str(error),
            'type': 'error'
        })

    # Start crawling with labeled requests
    from crawlee import Request

    requests = [
        Request('https://store.example.com/', label='category'),
        Request('https://store.example.com/products/item1', label='product'),
        'https://store.example.com/about',  # No label = default handler
    ]

    await crawler.run(requests)

asyncio.run(main())

Proxy Configuration

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration

async def main():
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://user:pass@proxy2:8080',
        'http://proxy3:8080',
        'socks5://socks-proxy:1080'
    ])

    # Create crawler with proxy configuration
    crawler = HttpCrawler(
        proxy_configuration=proxy_config,
        max_requests_per_crawl=20
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Each request may use different proxy
        proxy_info = context.proxy_info
        if proxy_info:
            context.log.info(f"Using proxy: {proxy_info.hostname}:{proxy_info.port}")
            if proxy_info.country_code:
                context.log.info(f"Proxy country: {proxy_info.country_code}")

        data = {
            'url': context.request.url,
            'status': context.response.status_code,
            'proxy_used': proxy_info.url if proxy_info else None
        }

        await context.push_data(data)

    await crawler.run(['https://httpbin.org/ip'] * 10)

asyncio.run(main())

Custom Proxy Function

import asyncio
import random
from crawlee.proxy_configuration import ProxyConfiguration

def generate_proxy_url() -> str:
    """Custom function to generate proxy URLs dynamically."""
    proxy_providers = [
        'proxy-pool-1.example.com:8080',
        'proxy-pool-2.example.com:8080',
        'proxy-pool-3.example.com:8080'
    ]

    selected = random.choice(proxy_providers)
    return f"http://user:pass@{selected}"

async def main():
    # Use custom proxy generation function
    proxy_config = ProxyConfiguration(
        new_url_function=generate_proxy_url
    )

    # Test proxy generation
    for i in range(5):
        proxy_info = await proxy_config.new_proxy_info()
        print(f"Generated proxy {i+1}: {proxy_info.url}")

asyncio.run(main())

Environment-Based Configuration

import os
from crawlee.configuration import Configuration

# Set environment variables
os.environ.update({
    'CRAWLEE_STORAGE_DIR': './data',
    'CRAWLEE_MAX_CONCURRENT_REQUESTS': '20',
    'CRAWLEE_REQUEST_TIMEOUT': '60',
    'CRAWLEE_ENABLE_PROXY': 'true',
    'CRAWLEE_LOG_LEVEL': 'DEBUG'
})

# Configuration automatically reads from environment
config = Configuration()

print(f"Storage dir: {config.storage_dir}")
print(f"Max concurrent: {config.get_int('max_concurrent_requests')}")
print(f"Request timeout: {config.get_int('request_timeout')}")
print(f"Proxy enabled: {config.get_bool('enable_proxy')}")
print(f"Log level: {config.get('log_level')}")

# Override with custom values
config.set('custom_setting', 'my_value')
print(f"Custom setting: {config.get('custom_setting')}")

Install with Tessl CLI

npx tessl i tessl/pypi-crawlee

docs

browser-automation.md

cli-tools.md

configuration.md

core-types.md

crawlers.md

error-handling.md

events.md

fingerprinting.md

http-clients.md

index.md

request-management.md

sessions.md

statistics.md

storage.md

tile.json