Crawlee — a comprehensive web scraping and browser automation library for Python, with human-like behavior and bot-protection bypass.
Specialized crawler implementations for different scraping needs, from simple HTTP requests to full browser automation. Crawlee provides a unified interface across different crawler types while offering specialized capabilities for specific use cases.
class BasicCrawler:
    """Foundation crawler providing core functionality including autoscaling,
    session management, and request lifecycle handling.

    All other crawlers extend from this base implementation.
    """

    def __init__(
        self,
        *,
        max_requests_per_crawl: int | None = None,
        max_request_retries: int = 3,
        request_handler_timeout: timedelta | None = None,
        session_pool: SessionPool | None = None,
        use_session_pool: bool = True,
        retry_on_blocked: bool = True,
        statistics: Statistics | None = None,
        # NOTE(review): **options is annotated with the options class itself;
        # presumably this stands for typing.Unpack[BasicCrawlerOptions] — confirm.
        **options: BasicCrawlerOptions
    ) -> None: ...

    # Run the crawl over the given requests and return the final statistics.
    async def run(self, requests: list[str | Request]) -> FinalStatistics: ...

    # Enqueue additional requests for the crawl.
    async def add_requests(
        self,
        requests: list[str | Request],
        **kwargs
    ) -> None: ...

    @property
    def router(self) -> Router: ...

    @property
    def stats(self) -> Statistics: ...


# HTTP-based crawler for web scraping using configurable HTTP clients.
# Ideal for sites that don't require JavaScript execution.
class HttpCrawler(AbstractHttpCrawler):
    """HTTP-based crawler for web scraping using configurable HTTP clients.

    Ideal for sites that don't require JavaScript execution.
    """

    def __init__(
        self,
        *,
        http_client: HttpClient | None = None,
        ignore_http_error_status_codes: list[int] | None = None,
        **options
    ) -> None: ...


# HTML parsing crawler using BeautifulSoup for content extraction. Combines
# HTTP requests with powerful CSS selector and BeautifulSoup parsing
# capabilities.
# The enum is declared before the crawler because it is used as an eagerly
# evaluated default value in BeautifulSoupCrawler.__init__.
class BeautifulSoupParserType(str, Enum):
    """Supported BeautifulSoup backend parsers."""

    HTML_PARSER = "html.parser"
    LXML = "lxml"
    HTML5LIB = "html5lib"


class BeautifulSoupCrawler(AbstractHttpCrawler):
    """HTML parsing crawler using BeautifulSoup for content extraction.

    Combines HTTP requests with powerful CSS selector and BeautifulSoup
    parsing capabilities.
    """

    def __init__(
        self,
        *,
        parser_type: BeautifulSoupParserType = BeautifulSoupParserType.HTML_PARSER,
        **options
    ) -> None: ...


# CSS selector and XPath-based crawler using the Parsel library for
# structured data extraction from HTML and XML documents.
class ParselCrawler(AbstractHttpCrawler):
    """CSS selector and XPath-based crawler using the Parsel library for
    structured data extraction from HTML and XML documents.
    """

    def __init__(self, **options) -> None: ...


# Full browser automation crawler using Playwright for JavaScript-heavy
# sites and complex user interactions. Supports headless and headful modes.
class PlaywrightCrawler:
    """Full browser automation crawler using Playwright for JavaScript-heavy
    sites and complex user interactions.

    Supports headless and headful modes.
    """

    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        browser_pool: BrowserPool | None = None,
        headless: bool = True,
        **options
    ) -> None: ...


# Intelligent crawler that automatically decides between HTTP and browser
# modes based on page requirements using machine learning prediction.
class AdaptivePlaywrightCrawler:
    """Intelligent crawler that automatically decides between HTTP and browser
    modes based on page requirements using machine learning prediction.
    """

    def __init__(
        self,
        *,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        **options
    ) -> None: ...


class RenderingType(str, Enum):
    """How a page's content is rendered (client-side, server-side, or both)."""

    CLIENT_SIDE_ONLY = "client_side_only"
    SERVER_SIDE_ONLY = "server_side_only"
    CLIENT_SERVER_SIDE = "client_server_side"


class RenderingTypePrediction:
    """Prediction result pairing a rendering type with its probability."""

    rendering_type: RenderingType
    probability: float


class RenderingTypePredictor:
    """Predicts the rendering type required to crawl a given URL."""

    def predict(self, url: str) -> RenderingTypePrediction: ...


# Base context available in all crawler request handlers with core
# functionality for data extraction and request management.
class BasicCrawlingContext:
    """Base context available in all crawler request handlers with core
    functionality for data extraction and request management.
    """

    request: Request
    session: Session
    log: Logger

    # Store extracted data (a single record or a list of records).
    async def push_data(self, data: dict | list[dict]) -> None: ...

    # Discover links on the current page and enqueue them for crawling.
    async def enqueue_links(
        self,
        *,
        selector: str = "a[href]",
        base_url: str | None = None,
        **kwargs
    ) -> None: ...

    async def add_requests(self, requests: list[str | Request]) -> None: ...

    async def get_key_value_store(self, name: str | None = None) -> KeyValueStore: ...

    # Fixed: original annotated with builtin `any` instead of typing.Any.
    async def use_state(self, default_value: Any = None) -> Any: ...


# Context for HTTP-based crawlers providing access to response data and
# HTTP-specific functionality.
class HttpCrawlingContext(BasicCrawlingContext):
    """Context for HTTP-based crawlers providing access to response data and
    HTTP-specific functionality.
    """

    response: HttpResponse

    @property
    def body(self) -> str: ...

    @property
    def content_type(self) -> str | None: ...

    @property
    def encoding(self) -> str: ...


# Context with BeautifulSoup parsed HTML content and CSS selector
# capabilities for easy data extraction.
class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext):
    """Context with BeautifulSoup parsed HTML content and CSS selector
    capabilities for easy data extraction.
    """

    soup: BeautifulSoup

    def css(self, selector: str) -> list: ...

    def xpath(self, xpath: str) -> list: ...


# Context with Parsel selector objects for advanced CSS and XPath-based data
# extraction from HTML and XML.
class ParselCrawlingContext(ParsedHttpCrawlingContext):
    """Context with Parsel selector objects for advanced CSS and XPath-based
    data extraction from HTML and XML.
    """

    selector: Selector

    def css(self, selector: str) -> SelectorList: ...

    def xpath(self, xpath: str) -> SelectorList: ...


# Context with Playwright page objects for full browser automation and
# JavaScript interaction capabilities.
class PlaywrightCrawlingContext(BasicCrawlingContext):
    """Context with Playwright page objects for full browser automation and
    JavaScript interaction capabilities.
    """

    page: Page
    response: Response | None

    # Scroll the page to trigger lazy loading, optionally clicking a
    # "load more" button or waiting for a selector between scrolls.
    async def infinite_scroll(
        self,
        *,
        max_scroll_height: int | None = None,
        button_selector: str | None = None,
        wait_for_selector: str | None = None,
    ) -> None: ...

    async def save_snapshot(self, *, key: str | None = None) -> None: ...


# Context available before page navigation in Playwright crawlers for setting
# up page configuration and listeners.
class PlaywrightPreNavCrawlingContext:
    """Context available before page navigation in Playwright crawlers for
    setting up page configuration and listeners.
    """

    page: Page
    request: Request
    session: Session
    log: Logger


# Configuration options for customizing crawler behavior, performance, and
# resource management.
class BasicCrawlerOptions:
    """Configuration options for customizing crawler behavior, performance,
    and resource management.
    """

    request_provider: RequestProvider | None = None
    request_handler: RequestHandler | None = None
    failed_request_handler: ErrorHandler | None = None
    max_requests_per_crawl: int | None = None
    max_request_retries: int = 3
    request_handler_timeout: timedelta | None = None
    navigation_timeout: timedelta | None = None
    session_pool: SessionPool | None = None
    use_session_pool: bool = True
    statistics: Statistics | None = None
    event_manager: EventManager | None = None


# Middleware pipeline system for processing crawling contexts with support
# for initialization, processing, and cleanup phases.
class ContextPipeline:
    """Middleware pipeline system for processing crawling contexts with
    support for initialization, processing, and cleanup phases.
    """

    def __init__(self) -> None: ...

    # Register a middleware callable in the pipeline.
    def use(self, middleware: Callable) -> None: ...

    # Run the registered middlewares over the given crawling context.
    async def compose(self, context: BasicCrawlingContext) -> None: ...


# Result object containing response data and metadata from HTTP-based
# crawling operations.
class HttpCrawlingResult:
    """Result object containing response data and metadata from HTTP-based
    crawling operations.
    """

    http_response: HttpResponse
    encoding: str | None = None


# Base parser interface for implementing custom response parsing in
# HTTP-based crawlers.
class AbstractHttpParser:
    """Base parser interface for implementing custom response parsing in
    HTTP-based crawlers.
    """

    # Turn a raw HTTP crawling context into a parsed one.
    async def parse(
        self,
        crawling_context: HttpCrawlingContext
    ) -> ParsedHttpCrawlingContext: ...


import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    """Crawl example.com with the HTTP crawler and store basic page stats."""
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url}')
        data = {
            'url': context.request.url,
            'status': context.response.status_code,
            'length': len(context.body),
        }
        await context.push_data(data)

    await crawler.run(['https://example.com'])


asyncio.run(main())
import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    """Crawl example.com with a headless browser and store the page title."""
    crawler = PlaywrightCrawler(headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Wait until network activity settles before reading the title.
        await context.page.wait_for_load_state('networkidle')
        title = await context.page.title()
        data = {
            'url': context.request.url,
            'title': title,
        }
        await context.push_data(data)

    await crawler.run(['https://example.com'])


asyncio.run(main())
import asyncio
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext


async def main() -> None:
    """Crawl example.com with the adaptive crawler, which picks HTTP or
    browser mode per page.
    """
    crawler = AdaptivePlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext) -> None:
        # Context automatically switches between HTTP and browser modes
        # based on page rendering requirements.
        if hasattr(context, 'page'):
            # Browser mode - page requires JavaScript.
            title = await context.page.title()
        else:
            # HTTP mode - static content.
            title = context.soup.title.string if context.soup.title else None
        data = {
            'url': context.request.url,
            'title': title,
        }
        await context.push_data(data)

    await crawler.run(['https://example.com'])


asyncio.run(main())
# Install with Tessl CLI
npx tessl i tessl/pypi-crawlee