A comprehensive web scraping and browser automation library for Python, with human-like behavior and bot-protection bypass.
Global configuration management and request routing systems for fine-tuned control over crawling behavior. Configuration components provide centralized settings management, environment variable integration, proxy management, and request routing capabilities.
Centralized configuration system with environment variable support and default value management.
from typing import Any


class Configuration:
    """Centralized configuration store with typed getters, a setter, and defaults.

    Values may be supplied as keyword arguments at construction time; typed
    accessors (`get_bool`, `get_int`, `get_float`) coerce stored values.
    """

    def __init__(self, **settings): ...

    def get(self, key: str, default: Any = None) -> Any:
        """Get configuration value with optional default."""

    def set(self, key: str, value: Any) -> None:
        """Set configuration value."""

    def get_bool(self, key: str, default: bool = False) -> bool:
        """Get boolean configuration value."""

    def get_int(self, key: str, default: int = 0) -> int:
        """Get integer configuration value."""

    def get_float(self, key: str, default: float = 0.0) -> float:
        """Get float configuration value."""

    @property
    def storage_dir(self) -> str:
        """Default storage directory path."""

    @property
    def max_pool_size(self) -> int:
        """Default maximum pool size."""

    @property
    def request_handler_timeout(self) -> int:
        """Default request handler timeout in seconds."""


# Request routing system for directing requests to appropriate handlers based on labels and patterns.
class Router:
    """Routes requests to registered handlers by label.

    Handlers are registered via decorator-style methods; each registration
    method returns the handler it was given. Handler/request types are
    defined later in this file, so annotations use quoted forward references.
    """

    def __init__(self): ...

    def default_handler(self, handler: "RequestHandler") -> "RequestHandler":
        """
        Register default request handler.

        Args:
            handler: Handler function for requests

        Returns:
            The registered handler
        """

    def route(
        self,
        label: str,
        handler: "RequestHandler",
        *,
        method: "HttpMethod | None" = None,
    ) -> "RequestHandler":
        """
        Register handler for specific request label.

        Args:
            label: Request label to match
            handler: Handler function
            method: Optional HTTP method filter

        Returns:
            The registered handler
        """

    def error_handler(self, handler: "ErrorRequestHandler") -> "ErrorRequestHandler":
        """
        Register error handler for failed requests.

        Args:
            handler: Error handler function

        Returns:
            The registered handler
        """

    def get_handler(self, request: "Request") -> "RequestHandler | None":
        """Get appropriate handler for request."""

    def get_error_handler(self) -> "ErrorRequestHandler | None":
        """Get registered error handler."""


# Proxy server configuration and rotation system for enhanced anonymity and geo-targeting.
class ProxyConfiguration:
def __init__(
self,
proxy_urls: list[str] | None = None,
*,
new_url_function: Callable[[], str] | None = None,
country_code: str | None = None,
session_id: str | None = None
): ...
async def new_proxy_info(
self,
*,
session_id: str | None = None,
request: Request | None = None
) -> ProxyInfo | None:
"""
Get new proxy for request.
Args:
session_id: Session identifier for proxy affinity
request: Request being processed
Returns:
ProxyInfo object or None if no proxy needed
"""
def new_url(self) -> str | None:
"""Generate new proxy URL using configured strategy."""
@property
def proxy_urls(self) -> list[str]: ...
@property
def country_code(self) -> str | None: ...class ProxyInfo:
def __init__(
self,
*,
url: str,
hostname: str | None = None,
port: int | None = None,
username: str | None = None,
password: str | None = None,
country_code: str | None = None,
session_id: str | None = None
): ...
@property
def url(self) -> str:
"""Full proxy URL."""
@property
def hostname(self) -> str | None:
"""Proxy hostname."""
@property
def port(self) -> int | None:
"""Proxy port number."""
@property
def username(self) -> str | None:
"""Proxy authentication username."""
@property
def password(self) -> str | None:
"""Proxy authentication password."""
@property
def country_code(self) -> str | None:
"""ISO country code for proxy location."""
@property
def session_id(self) -> str | None:
"""Session identifier for proxy affinity."""Type definitions for request handlers and error handlers used with the Router.
from collections.abc import Awaitable, Callable

# Handler coroutine invoked for each request with its crawling context.
# BasicCrawlingContext is a crawlee-provided type, referenced as a forward-ref string.
RequestHandler = Callable[["BasicCrawlingContext"], Awaitable[None]]

# Handler coroutine invoked for a failed request with its context and the raised exception.
ErrorRequestHandler = Callable[
    ["BasicCrawlingContext", Exception], Awaitable[None]
]

from crawlee.configuration import Configuration
import os

# Create configuration with defaults
config = Configuration(
    storage_dir='./crawlee_storage',
    max_concurrent_requests=10,
    request_timeout=30
)

# Environment variables override defaults
# (CRAWLEE_STORAGE_DIR, CRAWLEE_MAX_CONCURRENT_REQUESTS, etc.)
os.environ['CRAWLEE_STORAGE_DIR'] = '/tmp/my_crawls'

# Get configuration values
storage_dir = config.get('storage_dir')
print(f"Storage directory: {storage_dir}")  # /tmp/my_crawls

# Type-specific getters (second argument is the fallback default)
max_requests = config.get_int('max_concurrent_requests', 5)
enable_logging = config.get_bool('enable_logging', True)

# Set values programmatically
config.set('user_agent', 'My Custom Bot 1.0')

import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.router import Router


async def main():
    crawler = BeautifulSoupCrawler()

    # Use the crawler's built-in router
    router = crawler.router

    # Default handler for unlabeled requests
    @router.default_handler
    async def default_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing default: {context.request.url}")
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
            'type': 'default'
        }
        await context.push_data(data)

    # Handler for product pages
    @router.route('product')
    async def product_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing product: {context.request.url}")
        # Extract product-specific data
        name = context.soup.select_one('.product-name')
        price = context.soup.select_one('.price')
        data = {
            'url': context.request.url,
            'name': name.get_text().strip() if name else None,
            'price': price.get_text().strip() if price else None,
            'type': 'product'
        }
        await context.push_data(data)
        # Enqueue related products
        await context.enqueue_links(
            selector='.related-product a',
            label='product'
        )

    # Handler for category pages
    @router.route('category')
    async def category_handler(context: BeautifulSoupCrawlingContext):
        context.log.info(f"Processing category: {context.request.url}")
        # Extract category info
        category_name = context.soup.select_one('h1')
        data = {
            'url': context.request.url,
            'category': category_name.get_text().strip() if category_name else None,
            'type': 'category'
        }
        await context.push_data(data)
        # Enqueue product links with product label
        await context.enqueue_links(
            selector='.product-link',
            label='product'
        )

    # Error handler for all failed requests
    @router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        context.log.error(f"Error processing {context.request.url}: {error}")
        # Log error details
        await context.push_data({
            'url': context.request.url,
            'error': str(error),
            'type': 'error'
        })

    # Start crawling with labeled requests
    from crawlee import Request
    requests = [
        Request('https://store.example.com/', label='category'),
        Request('https://store.example.com/products/item1', label='product'),
        'https://store.example.com/about',  # No label = default handler
    ]
    await crawler.run(requests)


asyncio.run(main())

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.proxy_configuration import ProxyConfiguration


async def main():
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://user:pass@proxy2:8080',
        'http://proxy3:8080',
        'socks5://socks-proxy:1080'
    ])

    # Create crawler with proxy configuration
    crawler = HttpCrawler(
        proxy_configuration=proxy_config,
        max_requests_per_crawl=20
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        # Each request may use a different proxy
        proxy_info = context.proxy_info
        if proxy_info:
            context.log.info(f"Using proxy: {proxy_info.hostname}:{proxy_info.port}")
            if proxy_info.country_code:
                context.log.info(f"Proxy country: {proxy_info.country_code}")
        data = {
            'url': context.request.url,
            'status': context.response.status_code,
            'proxy_used': proxy_info.url if proxy_info else None
        }
        await context.push_data(data)

    await crawler.run(['https://httpbin.org/ip'] * 10)


asyncio.run(main())

import asyncio
import random
from crawlee.proxy_configuration import ProxyConfiguration


def generate_proxy_url() -> str:
    """Custom function to generate proxy URLs dynamically."""
    proxy_providers = [
        'proxy-pool-1.example.com:8080',
        'proxy-pool-2.example.com:8080',
        'proxy-pool-3.example.com:8080'
    ]
    selected = random.choice(proxy_providers)
    return f"http://user:pass@{selected}"


async def main():
    # Use the custom proxy generation function instead of a static URL list
    proxy_config = ProxyConfiguration(
        new_url_function=generate_proxy_url
    )

    # Test proxy generation
    for i in range(5):
        proxy_info = await proxy_config.new_proxy_info()
        print(f"Generated proxy {i+1}: {proxy_info.url}")


asyncio.run(main())

import os
from crawlee.configuration import Configuration
# Set environment variables
os.environ.update({
'CRAWLEE_STORAGE_DIR': './data',
'CRAWLEE_MAX_CONCURRENT_REQUESTS': '20',
'CRAWLEE_REQUEST_TIMEOUT': '60',
'CRAWLEE_ENABLE_PROXY': 'true',
'CRAWLEE_LOG_LEVEL': 'DEBUG'
})
# Configuration automatically reads from environment
config = Configuration()
print(f"Storage dir: {config.storage_dir}")
print(f"Max concurrent: {config.get_int('max_concurrent_requests')}")
print(f"Request timeout: {config.get_int('request_timeout')}")
print(f"Proxy enabled: {config.get_bool('enable_proxy')}")
print(f"Log level: {config.get('log_level')}")
# Override with custom values
config.set('custom_setting', 'my_value')
print(f"Custom setting: {config.get('custom_setting')}")Install with Tessl CLI
npx tessl i tessl/pypi-crawlee