
tessl/pypi-crawlee

A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass

docs/crawlers.md

Crawlers

Specialized crawler implementations for different scraping needs, from simple HTTP requests to full browser automation. Crawlee provides a unified interface across different crawler types while offering specialized capabilities for specific use cases.

Capabilities

Basic Crawler

Foundation crawler providing core functionality, including autoscaling, session management, and request lifecycle handling. All other crawlers extend this base implementation.

class BasicCrawler:
    def __init__(
        self,
        *,
        max_requests_per_crawl: int | None = None,
        max_request_retries: int = 3,
        request_handler_timeout: timedelta | None = None,
        session_pool: SessionPool | None = None,
        use_session_pool: bool = True,
        retry_on_blocked: bool = True,
        statistics: Statistics | None = None,
        **options: BasicCrawlerOptions
    ): ...

    async def run(self, requests: list[str | Request]) -> FinalStatistics: ...

    async def add_requests(
        self,
        requests: list[str | Request],
        **kwargs
    ) -> None: ...

    @property
    def router(self) -> Router: ...

    @property
    def stats(self) -> Statistics: ...
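As a minimal sketch of how the base crawler is wired up (the handler body, options, and URL are illustrative; assumes Crawlee is installed when actually run):

```python
import asyncio

async def main() -> None:
    # Import kept inside the coroutine so the sketch stays importable
    # even in environments where Crawlee is not installed.
    from crawlee.crawlers import BasicCrawler, BasicCrawlingContext

    crawler = BasicCrawler(
        max_requests_per_crawl=10,  # stop after 10 requests
        max_request_retries=2,      # retry failed requests twice
    )

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext) -> None:
        # BasicCrawler performs no fetching or parsing of its own;
        # the handler works with the request, session, and storage helpers.
        context.log.info(f'Visiting {context.request.url}')
        await context.push_data({'url': context.request.url})

    stats = await crawler.run(['https://example.com'])
    print(stats)

# asyncio.run(main())  # uncomment to run against the network
```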

HTTP Crawler

HTTP-based crawler for web scraping using configurable HTTP clients. Ideal for sites that don't require JavaScript execution.

class HttpCrawler(AbstractHttpCrawler):
    def __init__(
        self,
        *,
        http_client: HttpClient | None = None,
        ignore_http_error_status_codes: list[int] | None = None,
        **options
    ): ...

BeautifulSoup Crawler

HTML parsing crawler using BeautifulSoup for content extraction. Combines plain HTTP requests with BeautifulSoup's parsing and CSS selector capabilities.

class BeautifulSoupCrawler(AbstractHttpCrawler):
    def __init__(
        self,
        *,
        parser_type: BeautifulSoupParserType = BeautifulSoupParserType.HTML_PARSER,
        **options
    ): ...

class BeautifulSoupParserType(str, Enum):
    HTML_PARSER = "html.parser"
    LXML = "lxml"
    HTML5LIB = "html5lib"
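A sketch of extracting data from the parsed soup (selectors and URL are illustrative; assumes Crawlee is installed when actually run):

```python
import asyncio

async def main() -> None:
    # Import kept inside the coroutine so the sketch stays importable
    # without the library present.
    from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

    # The default 'html.parser' needs no extra dependency; pass
    # parser_type=BeautifulSoupParserType.LXML for speed if lxml is installed.
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        title = context.soup.title.string if context.soup.title else None
        headings = [h.get_text(strip=True) for h in context.soup.find_all('h1')]
        await context.push_data({
            'url': context.request.url,
            'title': title,
            'headings': headings,
        })

    await crawler.run(['https://example.com'])

# asyncio.run(main())  # network access required
```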

Parsel Crawler

CSS selector and XPath-based crawler using the Parsel library for structured data extraction from HTML and XML documents.

class ParselCrawler(AbstractHttpCrawler):
    def __init__(self, **options): ...
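A sketch of XPath-based extraction through the Parsel selector (expressions and URL are illustrative; assumes Crawlee is installed when actually run):

```python
import asyncio

async def main() -> None:
    from crawlee.crawlers import ParselCrawler, ParselCrawlingContext

    crawler = ParselCrawler()

    @crawler.router.default_handler
    async def handler(context: ParselCrawlingContext) -> None:
        # Parsel exposes both CSS and XPath on the same selector object.
        title = context.selector.xpath('//title/text()').get()
        links = context.selector.css('a::attr(href)').getall()
        await context.push_data({
            'url': context.request.url,
            'title': title,
            'link_count': len(links),
        })

    await crawler.run(['https://example.com'])

# asyncio.run(main())  # network access required
```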

Playwright Crawler

Full browser automation crawler using Playwright for JavaScript-heavy sites and complex user interactions. Supports headless and headful modes.

class PlaywrightCrawler:
    def __init__(
        self,
        *,
        browser_type: Literal["chromium", "firefox", "webkit"] = "chromium",
        browser_pool: BrowserPool | None = None,
        headless: bool = True,
        **options
    ): ...

Adaptive Playwright Crawler

Intelligent crawler that automatically chooses between plain HTTP and full browser crawling, predicting each page's rendering requirements with a machine learning model.

class AdaptivePlaywrightCrawler:
    def __init__(
        self,
        *,
        rendering_type_predictor: RenderingTypePredictor | None = None,
        **options
    ): ...

class RenderingType(str, Enum):
    CLIENT_SIDE_ONLY = "client_side_only"
    SERVER_SIDE_ONLY = "server_side_only"
    CLIENT_SERVER_SIDE = "client_server_side"

class RenderingTypePrediction:
    rendering_type: RenderingType
    probability: float

class RenderingTypePredictor:
    def predict(self, url: str) -> RenderingTypePrediction: ...
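The predictor interface can be implemented with custom heuristics. A minimal illustrative sketch, using local stand-ins for the types shown above (the real classes live in Crawlee, and the URL heuristic here is entirely made up):

```python
from dataclasses import dataclass
from enum import Enum


class RenderingType(str, Enum):
    # Local stand-in mirroring the enum above.
    CLIENT_SIDE_ONLY = "client_side_only"
    SERVER_SIDE_ONLY = "server_side_only"
    CLIENT_SERVER_SIDE = "client_server_side"


@dataclass
class RenderingTypePrediction:
    rendering_type: RenderingType
    probability: float


class KeywordPredictor:
    """Illustrative predictor: guesses the rendering type from URL keywords."""

    SPA_HINTS = ('/app/', '#/', 'dashboard')

    def predict(self, url: str) -> RenderingTypePrediction:
        if any(hint in url for hint in self.SPA_HINTS):
            return RenderingTypePrediction(RenderingType.CLIENT_SIDE_ONLY, 0.8)
        return RenderingTypePrediction(RenderingType.SERVER_SIDE_ONLY, 0.6)
```

A predictor like this can be passed as `rendering_type_predictor` to steer the adaptive crawler toward or away from browser mode.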

Crawling Contexts

Basic Crawling Context

Base context available in all crawler request handlers with core functionality for data extraction and request management.

class BasicCrawlingContext:
    request: Request
    session: Session
    log: Logger

    async def push_data(self, data: dict | list[dict]) -> None: ...
    async def enqueue_links(
        self,
        *,
        selector: str = "a[href]",
        base_url: str | None = None,
        **kwargs
    ) -> None: ...
    async def add_requests(self, requests: list[str | Request]) -> None: ...
    async def get_key_value_store(self, name: str | None = None) -> KeyValueStore: ...
    async def use_state(self, default_value: Any = None) -> Any: ...
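A sketch of link discovery with enqueue_links inside a handler (the selector and URL are illustrative; assumes Crawlee is installed when actually run):

```python
import asyncio

async def main() -> None:
    from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

    crawler = BeautifulSoupCrawler(max_requests_per_crawl=20)

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext) -> None:
        await context.push_data({'url': context.request.url})
        # Discover matching links on the page and enqueue them for crawling.
        await context.enqueue_links(selector='a[href^="/docs"]')

    await crawler.run(['https://example.com'])

# asyncio.run(main())  # network access required
```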

HTTP Crawling Context

Context for HTTP-based crawlers providing access to response data and HTTP-specific functionality.

class HttpCrawlingContext(BasicCrawlingContext):
    response: HttpResponse

    @property
    def body(self) -> str: ...

    @property
    def content_type(self) -> str | None: ...

    @property
    def encoding(self) -> str: ...

BeautifulSoup Crawling Context

Context with BeautifulSoup parsed HTML content and CSS selector capabilities for easy data extraction.

class BeautifulSoupCrawlingContext(ParsedHttpCrawlingContext):
    soup: BeautifulSoup

    def css(self, selector: str) -> list: ...
    def xpath(self, xpath: str) -> list: ...

Parsel Crawling Context

Context with Parsel selector objects for advanced CSS and XPath-based data extraction from HTML and XML.

class ParselCrawlingContext(ParsedHttpCrawlingContext):
    selector: Selector

    def css(self, selector: str) -> SelectorList: ...
    def xpath(self, xpath: str) -> SelectorList: ...

Playwright Crawling Context

Context with Playwright page objects for full browser automation and JavaScript interaction capabilities.

class PlaywrightCrawlingContext(BasicCrawlingContext):
    page: Page
    response: Response | None

    async def infinite_scroll(
        self,
        *,
        max_scroll_height: int | None = None,
        button_selector: str | None = None,
        wait_for_selector: str | None = None,
    ) -> None: ...

    async def save_snapshot(self, *, key: str | None = None) -> None: ...
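A sketch of handling an infinitely scrolling page with this context (the selector, snapshot key, and URL are illustrative; requires Playwright browsers at runtime):

```python
import asyncio

async def main() -> None:
    from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

    crawler = PlaywrightCrawler(headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext) -> None:
        # Keep scrolling until no new content loads, then count items
        # and store a snapshot of the fully loaded page.
        await context.infinite_scroll()
        items = await context.page.locator('.feed-item').count()
        await context.push_data({'url': context.request.url, 'items': items})
        await context.save_snapshot(key='feed-page')

    await crawler.run(['https://example.com/feed'])

# asyncio.run(main())  # requires installed browsers and network access
```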

Playwright Pre-Navigation Context

Context available before page navigation in Playwright crawlers for setting up page configuration and listeners.

class PlaywrightPreNavCrawlingContext:
    page: Page
    request: Request
    session: Session
    log: Logger

Crawler Configuration

Basic Crawler Options

Configuration options for customizing crawler behavior, performance, and resource management.

class BasicCrawlerOptions:
    request_provider: RequestProvider | None = None
    request_handler: RequestHandler | None = None
    failed_request_handler: ErrorHandler | None = None
    max_requests_per_crawl: int | None = None
    max_request_retries: int = 3
    request_handler_timeout: timedelta | None = None
    navigation_timeout: timedelta | None = None
    session_pool: SessionPool | None = None
    use_session_pool: bool = True
    statistics: Statistics | None = None
    event_manager: EventManager | None = None

Context Pipeline

Middleware pipeline system for processing crawling contexts with support for initialization, processing, and cleanup phases.

class ContextPipeline:
    def __init__(self): ...

    def use(self, middleware: Callable) -> None: ...

    async def compose(self, context: BasicCrawlingContext) -> None: ...
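The middleware idea can be sketched in plain Python with async generators: each middleware runs its setup, yields to the rest of the pipeline, then runs its cleanup. This is a local stand-in to illustrate the phases, not Crawlee's implementation:

```python
import asyncio
from typing import AsyncGenerator, Callable

Middleware = Callable[[dict], AsyncGenerator[None, None]]


class MiniPipeline:
    """Toy pipeline: setup runs outside-in, cleanup runs inside-out."""

    def __init__(self) -> None:
        self._middlewares: list[Middleware] = []

    def use(self, middleware: Middleware) -> None:
        self._middlewares.append(middleware)

    async def compose(self, context: dict) -> None:
        generators = []
        for mw in self._middlewares:
            gen = mw(context)
            await gen.__anext__()      # run the setup phase
            generators.append(gen)
        context['processed'] = True    # stand-in for the request handler step
        for gen in reversed(generators):
            try:
                await gen.__anext__()  # resume past the yield: cleanup phase
            except StopAsyncIteration:
                pass


async def logging_mw(context: dict) -> AsyncGenerator[None, None]:
    context.setdefault('trace', []).append('setup')
    yield
    context['trace'].append('cleanup')


async def demo() -> dict:
    pipeline = MiniPipeline()
    pipeline.use(logging_mw)
    ctx: dict = {}
    await pipeline.compose(ctx)
    return ctx

result = asyncio.run(demo())
```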

HTTP Crawling Result

Result object containing response data and metadata from HTTP-based crawling operations.

class HttpCrawlingResult:
    http_response: HttpResponse
    encoding: str | None = None

Abstract HTTP Parser

Base parser interface for implementing custom response parsing in HTTP-based crawlers.

class AbstractHttpParser:
    async def parse(
        self,
        crawling_context: HttpCrawlingContext
    ) -> ParsedHttpCrawlingContext: ...
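A custom parse step can be sketched with the standard library alone. Here a stand-in parser collects links with html.parser; the context classes below are simple stand-ins, not Crawlee's actual context types:

```python
import asyncio
from dataclasses import dataclass, field
from html.parser import HTMLParser


class LinkCollector(HTMLParser):
    """Collects href values from anchor tags."""

    def __init__(self) -> None:
        super().__init__()
        self.links: list[str] = []

    def handle_starttag(self, tag: str, attrs) -> None:
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)


@dataclass
class FakeHttpContext:
    # Stand-in for HttpCrawlingContext: only the body is needed here.
    body: str


@dataclass
class ParsedContext:
    # Stand-in for ParsedHttpCrawlingContext.
    body: str
    links: list[str] = field(default_factory=list)


class LinkParser:
    """Illustrative parser following the AbstractHttpParser shape."""

    async def parse(self, crawling_context: FakeHttpContext) -> ParsedContext:
        collector = LinkCollector()
        collector.feed(crawling_context.body)
        return ParsedContext(body=crawling_context.body, links=collector.links)


html = '<a href="/docs">Docs</a><a href="/blog">Blog</a>'
parsed = asyncio.run(LinkParser().parse(FakeHttpContext(body=html)))
```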

Usage Examples

Basic HTTP Scraping

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

async def main():
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        context.log.info(f'Processing {context.request.url}')

        data = {
            'url': context.request.url,
            'status': context.response.status_code,
            'length': len(context.body)
        }

        await context.push_data(data)

    await crawler.run(['https://example.com'])

asyncio.run(main())

Browser Automation with Playwright

import asyncio
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext

async def main():
    crawler = PlaywrightCrawler(headless=True)

    @crawler.router.default_handler
    async def handler(context: PlaywrightCrawlingContext):
        await context.page.wait_for_load_state('networkidle')

        title = await context.page.title()

        data = {
            'url': context.request.url,
            'title': title
        }

        await context.push_data(data)

    await crawler.run(['https://example.com'])

asyncio.run(main())

Adaptive Crawling

import asyncio
from crawlee.crawlers import AdaptivePlaywrightCrawler, AdaptivePlaywrightCrawlingContext

async def main():
    crawler = AdaptivePlaywrightCrawler()

    @crawler.router.default_handler
    async def handler(context: AdaptivePlaywrightCrawlingContext):
        # Context automatically switches between HTTP and browser modes
        # based on page rendering requirements

        if hasattr(context, 'page'):
            # Browser mode - page requires JavaScript
            title = await context.page.title()
        else:
            # HTTP mode - static content
            title = context.soup.title.string if context.soup.title else None

        data = {
            'url': context.request.url,
            'title': title
        }

        await context.push_data(data)

    await crawler.run(['https://example.com'])

asyncio.run(main())

Install with Tessl CLI

npx tessl i tessl/pypi-crawlee
