A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
```bash
npx @tessl/cli install tessl/pypi-crawlee@0.6.0
```

Crawlee is a comprehensive web scraping and browser automation library for Python, designed to help developers build reliable scrapers that appear human-like and bypass modern bot protections. It provides end-to-end crawling and scraping capabilities: tools to crawl the web for links, scrape data, and persistently store it in machine-readable formats.
Install with `pip install 'crawlee[all]'` for the full feature set, or `pip install crawlee` for the core package only.

```python
import crawlee
from crawlee import Request, service_locator
```

Common patterns for crawlers:
```python
from crawlee.crawlers import (
    BasicCrawler,
    HttpCrawler,
    BeautifulSoupCrawler,
    ParselCrawler,
    PlaywrightCrawler,
    AdaptivePlaywrightCrawler,
)
```

For specific functionality:
```python
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
from crawlee.sessions import Session, SessionPool
from crawlee.http_clients import HttpxHttpClient, CurlImpersonateHttpClient
from crawlee import ConcurrencySettings, HttpHeaders, EnqueueStrategy
```

A minimal end-to-end example:
```python
import asyncio

from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        max_requests_per_crawl=10,
    )

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract data from the page
        data = {
            'url': context.request.url,
            'title': context.soup.title.string if context.soup.title else None,
        }

        # Push data to storage
        await context.push_data(data)

        # Enqueue all links found on the page
        await context.enqueue_links()

    # Run the crawler
    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```
Crawlee follows a modular architecture with a clear separation of concerns. This design enables it to handle everything from simple HTTP scraping to complex browser automation while maintaining human-like behavior patterns.
Essential types and request management functionality that forms the foundation of all crawling operations.
```python
class Request:
    @classmethod
    def from_url(cls, url: str, **options) -> Request: ...

class ConcurrencySettings:
    def __init__(
        self,
        min_concurrency: int = 1,
        max_concurrency: int = 200,
        max_tasks_per_minute: float = float('inf'),
        desired_concurrency: int | None = None,
    ): ...

class HttpHeaders(Mapping[str, str]):
    def __init__(self, headers: dict[str, str] | None = None): ...

service_locator: ServiceLocator
```
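For example, requests can be constructed up front and concurrency throttled when a crawler is created. A minimal sketch based on the signatures above; the `headers` option on `Request.from_url()` and the `concurrency_settings` crawler keyword are assumptions:

```python
from crawlee import ConcurrencySettings, HttpHeaders, Request
from crawlee.crawlers import HttpCrawler

# Build a request with custom headers (passed through **options; assumed supported).
request = Request.from_url(
    'https://example.com/products',
    headers=HttpHeaders({'Accept-Language': 'en-US'}),
)

# Keep between 2 and 10 concurrent tasks, capped at 120 requests per minute.
settings = ConcurrencySettings(
    min_concurrency=2,
    max_concurrency=10,
    max_tasks_per_minute=120,
)
crawler = HttpCrawler(concurrency_settings=settings)  # keyword assumed
```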
Specialized crawler implementations for different scraping needs, from HTTP-only to full browser automation with intelligent adaptation between modes.

```python
class BasicCrawler:
    def __init__(self, **options): ...
    async def run(self, requests: list[str | Request]): ...

class BeautifulSoupCrawler(AbstractHttpCrawler):
    def __init__(self, **options): ...

class PlaywrightCrawler:
    def __init__(self, **options): ...

class AdaptivePlaywrightCrawler:
    def __init__(self, **options): ...
```
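`PlaywrightCrawler` exposes the live Playwright page on its crawling context, so handlers can interact with the rendered page. A sketch, assuming the Playwright extra is installed:

```python
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(max_requests_per_crawl=5)

    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        # context.page is the Playwright page; any browser interaction works here.
        title = await context.page.title()
        await context.push_data({'url': context.request.url, 'title': title})

    await crawler.run(['https://example.com'])


if __name__ == '__main__':
    asyncio.run(main())
```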
Persistent storage solutions for structured data, key-value pairs, and request queue management with built-in export capabilities. Like the rest of the library, the storage APIs are asynchronous:

```python
class Dataset:
    async def push_data(self, data: dict | list[dict]): ...
    async def export_to(self, format: str, path: str): ...

class KeyValueStore:
    async def set_value(self, key: str, value: Any): ...
    async def get_value(self, key: str): ...

class RequestQueue:
    async def add_request(self, request: Request): ...
    async def fetch_next_request(self) -> Request | None: ...
```
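Storages are typically obtained through an async `open()` factory before use. A sketch assuming that factory; the names `'products'` and `'state'` are illustrative:

```python
import asyncio

from crawlee.storages import Dataset, KeyValueStore


async def main() -> None:
    # Open (or create) named storages; the open() factory is assumed here.
    dataset = await Dataset.open(name='products')
    store = await KeyValueStore.open(name='state')

    await dataset.push_data({'sku': 'A-1', 'price': 9.99})
    await store.set_value('last_run', '2024-01-01')
    print(await store.get_value('last_run'))


if __name__ == '__main__':
    asyncio.run(main())
```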
Pluggable HTTP client implementations supporting different libraries and browser impersonation for enhanced anti-detection capabilities.

```python
class HttpxHttpClient(HttpClient):
    def __init__(self, **options): ...

class CurlImpersonateHttpClient(HttpClient):
    def __init__(self, **options): ...

class HttpResponse:
    status_code: int
    headers: HttpHeaders
    text: str
    content: bytes
```
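A client instance is handed to a crawler at construction time. A sketch; the `http_client` keyword and the `impersonate` option (forwarded to the underlying curl library) are assumptions:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.http_clients import CurlImpersonateHttpClient

# Impersonate a recent Chrome build at the TLS and header level (option assumed).
client = CurlImpersonateHttpClient(impersonate='chrome124')

# Crawler keyword assumed; all requests are then issued through this client.
crawler = HttpCrawler(http_client=client)
```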
Session and cookie management with rotation capabilities for maintaining state across requests and avoiding detection.

```python
class Session:
    def __init__(self, session_pool: SessionPool): ...

class SessionPool:
    def __init__(self, max_pool_size: int = 1000): ...
    async def get_session(self) -> Session: ...

class SessionCookies:
    def add_cookie(self, cookie: CookieParam): ...
```
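Rotation is usually driven by the crawler rather than by calling the pool directly. A sketch assuming the `use_session_pool` and `session_pool` crawler keywords:

```python
from crawlee.crawlers import HttpCrawler
from crawlee.sessions import SessionPool

# Rotate among at most 50 sessions; blocked sessions are retired and replaced.
crawler = HttpCrawler(
    use_session_pool=True,  # keyword assumed
    session_pool=SessionPool(max_pool_size=50),
)
```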
Optional Playwright integration for full browser automation with support for JavaScript-heavy sites and complex user interactions.

```python
class BrowserPool:
    def __init__(self, **options): ...

class PlaywrightBrowserController:
    def __init__(self, **options): ...
```
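Browser behavior is commonly configured through the crawler constructor rather than the pool itself. A sketch, with the `headless` and `browser_type` keywords assumed:

```python
from crawlee.crawlers import PlaywrightCrawler

# Launch Firefox with no visible window (keyword names assumed).
crawler = PlaywrightCrawler(
    headless=True,
    browser_type='firefox',
)
```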
Browser fingerprint generation and header randomization for enhanced stealth capabilities and bot protection bypass.

```python
class FingerprintGenerator:
    def generate_fingerprint(self) -> dict: ...

class HeaderGenerator:
    def get_headers(self, **options: HeaderGeneratorOptions) -> HttpHeaders: ...

class DefaultFingerprintGenerator(FingerprintGenerator):
    def __init__(self, **options): ...
```
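A fingerprint generator can be attached to a browser-based crawler. A sketch in which both the `fingerprint_generator` keyword and the `crawlee.fingerprint_suite` module path are assumptions:

```python
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite import DefaultFingerprintGenerator  # module path assumed

# Each new browser context receives a consistent generated fingerprint.
crawler = PlaywrightCrawler(
    fingerprint_generator=DefaultFingerprintGenerator(),  # keyword assumed
)
```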
Global configuration management and request routing systems for fine-tuned control over crawling behavior.

```python
class Configuration:
    def __init__(self, **settings): ...

class Router:
    def default_handler(self, handler): ...
    def route(self, label: str, handler): ...

class ProxyConfiguration:
    def __init__(self, proxy_urls: list[str]): ...
```
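Label-based routing lets different page types go to different handlers. A sketch; the `router.handler('DETAIL')` decorator form and the `selector`/`label` options of `enqueue_links()` are assumptions consistent with the usage shown earlier:

```python
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

crawler = BeautifulSoupCrawler()


@crawler.router.default_handler
async def listing_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Send product links to the handler registered under the 'DETAIL' label.
    await context.enqueue_links(selector='a.product', label='DETAIL')


@crawler.router.handler('DETAIL')
async def detail_handler(context: BeautifulSoupCrawlingContext) -> None:
    await context.push_data({'url': context.request.url})
```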
Performance monitoring and statistics collection for tracking crawling progress and system resource usage.

```python
class Statistics:
    def __init__(self): ...
    def get_state(self) -> StatisticsState: ...

class FinalStatistics:
    requests_finished: int
    requests_failed: int
    retry_histogram: list[int]
```
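`FinalStatistics` summarizes a finished crawl. A sketch assuming `crawler.run()` returns it, as the fields above suggest:

```python
import asyncio

from crawlee.crawlers import HttpCrawler


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context) -> None:
        pass  # no-op handler; we only care about the run summary here

    # run() is assumed to return the FinalStatistics summary for the crawl.
    stats = await crawler.run(['https://example.com'])
    print(f'finished={stats.requests_finished} failed={stats.requests_failed}')


if __name__ == '__main__':
    asyncio.run(main())
```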
Comprehensive exception hierarchy for handling various crawling scenarios and failure modes.

```python
class HttpStatusCodeError(Exception): ...
class ProxyError(Exception): ...
class SessionError(Exception): ...
class RequestHandlerError(Exception): ...
```
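Raising `SessionError` from a handler is the usual way to flag a blocked session so the pool rotates it. A sketch assuming the exceptions live in `crawlee.errors`:

```python
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.errors import SessionError  # module path assumed

crawler = BeautifulSoupCrawler()


@crawler.router.default_handler
async def handler(context: BeautifulSoupCrawlingContext) -> None:
    # Retire the current session when the target serves a captcha page.
    if context.soup.select_one('#captcha') is not None:
        raise SessionError('Blocked by captcha')
    await context.push_data({'url': context.request.url})
```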
Advanced request lifecycle management with support for static lists, dynamic queues, and tandem operations.

```python
class RequestList:
    def __init__(self, requests: list[str | Request]): ...

class RequestManager:
    def __init__(self, **options): ...

class RequestManagerTandem:
    def __init__(self, request_list: RequestList, request_queue: RequestQueue): ...
```
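A tandem pairs a static, read-only list with a writable queue, so handlers can still enqueue newly discovered links. A sketch; the `crawlee.request_loaders` module path and the `request_manager` crawler keyword are assumptions:

```python
import asyncio

from crawlee.crawlers import HttpCrawler
from crawlee.request_loaders import RequestList, RequestManagerTandem  # path assumed
from crawlee.storages import RequestQueue


async def main() -> None:
    start_urls = RequestList(['https://example.com/a', 'https://example.com/b'])
    queue = await RequestQueue.open()

    # Reads drain the static list first; new requests land in the queue.
    tandem = RequestManagerTandem(start_urls, queue)
    crawler = HttpCrawler(request_manager=tandem)  # keyword assumed


if __name__ == '__main__':
    asyncio.run(main())
```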
Event-driven architecture for hooking into crawler lifecycle events and implementing custom behaviors.

```python
class EventManager:
    def emit(self, event: Event, data: EventData): ...
    def on(self, event: Event, listener: EventListener): ...

class LocalEventManager(EventManager): ...
```
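Listeners attach to the event manager obtained from the service locator. A sketch in which the `crawlee.events` module path and the `PERSIST_STATE` enum member are assumptions:

```python
from crawlee import service_locator
from crawlee.events import Event  # module path and enum member assumed


def on_persist_state(event_data) -> None:
    # Invoked periodically so components can checkpoint their state.
    print('persist requested:', event_data)


event_manager = service_locator.get_event_manager()
event_manager.on(event=Event.PERSIST_STATE, listener=on_persist_state)
```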
Command-line interface for project scaffolding and development workflow automation.

```bash
# Scaffold a new project from a template
crawlee create my-project

# Print the installed version
crawlee --version
```