A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Pluggable HTTP client implementations supporting different libraries and browser impersonation for enhanced anti-detection capabilities. HTTP clients handle the actual network communication while providing consistent interfaces for different underlying implementations.
class HttpClient:
    """Abstract base class defining the interface for all HTTP client implementations in Crawlee."""

    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
    ) -> HttpCrawlingResult:
        """Perform HTTP request crawling.

        Args:
            request: Request to process.
            session: Session for state management.
            proxy_info: Proxy configuration.
            statistics: Statistics collector.

        Returns:
            HttpCrawlingResult with response data.
        """

    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = "GET",
        headers: dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        **kwargs,
    ) -> HttpResponse:
        """Send direct HTTP request.

        Args:
            url: Target URL.
            method: HTTP method.
            headers: Request headers.
            payload: Request body.
            **kwargs: Additional options forwarded to the underlying implementation.

        Returns:
            HttpResponse object.
        """

# HTTP client implementation using the httpx library with support for HTTP/2, connection pooling, and async operations.
class HttpxHttpClient(HttpClient):
    """HTTP client backed by the httpx library (HTTP/2, connection pooling, async operations)."""

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **httpx_kwargs,
    ): ...

    @property
    def client(self) -> httpx.AsyncClient:
        """Access underlying httpx client."""

# HTTP client using curl-cffi for browser impersonation and advanced anti-detection capabilities.
class CurlImpersonateHttpClient(HttpClient):
    """HTTP client built on curl-cffi that impersonates real browsers for anti-detection."""

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        impersonate: str = "chrome",
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **curl_cffi_kwargs,
    ): ...

    @property
    def impersonate(self) -> str:
        """Browser impersonation target."""

# Response object containing response data, headers, and metadata from HTTP requests.
class HttpResponse:
    """Response object carrying the data, headers, and metadata of a completed HTTP request."""

    def __init__(
        self,
        *,
        url: str,
        status_code: int,
        headers: HttpHeaders,
        content: bytes,
        encoding: str | None = None,
    ): ...

    @property
    def url(self) -> str:
        """Final response URL (after redirects)."""

    @property
    def status_code(self) -> int:
        """HTTP status code."""

    @property
    def headers(self) -> HttpHeaders:
        """Response headers."""

    @property
    def content(self) -> bytes:
        """Raw response content."""

    @property
    def text(self) -> str:
        """Response content as string."""

    @property
    def encoding(self) -> str | None:
        """Character encoding of response."""

    @property
    def content_type(self) -> str | None:
        """MIME type from Content-Type header."""

    # NOTE: return annotation corrected from `any` (the builtin function) to `typing.Any`.
    def json(self) -> Any:
        """Parse response content as JSON.

        Returns:
            Parsed JSON data.

        Raises:
            JSONDecodeError: If content is not valid JSON.
        """

    @property
    def ok(self) -> bool:
        """True if status code indicates success (200-299)."""

    def raise_for_status(self) -> None:
        """Raise HTTPStatusError for bad response status codes.

        Raises:
            HttpStatusCodeError: For 4xx and 5xx status codes.
        """

# Result object containing both HTTP response data and additional crawling metadata.
class HttpCrawlingResult:
    """Result object pairing an HTTP response with additional crawling metadata."""

    def __init__(
        self,
        *,
        http_response: HttpResponse,
        encoding: str | None = None,
    ): ...

    @property
    def http_response(self) -> HttpResponse:
        """HTTP response object."""

    @property
    def encoding(self) -> str | None:
        """Character encoding override."""

# Common configuration options available across HTTP client implementations.
class HttpClientConfig:
    """Shared configuration options common to the HTTP client implementations."""

    # Keep cookies between requests that share a session.
    persist_cookies_per_session: bool = True
    # Extra status codes to treat as errors beyond the defaults.
    additional_http_error_status_codes: set[int] | None = None
    # Error status codes that should be ignored instead of raising.
    ignore_http_error_status_codes: set[int] | None = None
    # Request timeout in seconds.
    timeout: float = 30.0
    # Maximum number of redirects to follow.
    max_redirects: int = 10
    # Verify TLS certificates.
    verify_ssl: bool = True
    # Optional proxy URL applied to all requests.
    proxy_url: str | None = None

# Configuration for curl-cffi browser impersonation capabilities.
# Browser targets supported by curl-cffi impersonation.
ImpersonateTarget = Literal[
    "chrome",
    "chrome99",
    "chrome100",
    "chrome101",
    "chrome104",
    "chrome107",
    "chrome110",
    "chrome116",
    "firefox",
    "firefox99",
    "firefox102",
    "firefox109",
    "safari",
    "safari15_3",
    "safari15_5",
    "safari17_0",
    "safari17_2_1",
]

import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee import Request


async def main():
    client = HttpxHttpClient()

    # Send direct request
    response = await client.send_request(
        'https://api.example.com/data',
        method='GET',
        headers={'User-Agent': 'My Bot 1.0'},
    )

    print(f"Status: {response.status_code}")
    print(f"Content: {response.text}")

    # Process as JSON
    if response.content_type == 'application/json':
        data = response.json()
        print(f"JSON data: {data}")

    await client.close()


asyncio.run(main())

import asyncio
from crawlee.http_clients import CurlImpersonateHttpClient


async def main():
    # Impersonate Chrome browser
    client = CurlImpersonateHttpClient(
        impersonate='chrome116',
    )

    response = await client.send_request('https://example.com')

    print(f"Impersonating: {client.impersonate}")
    print(f"Response: {response.status_code}")

    # The request appears to come from Chrome 116
    print(f"User-Agent: {response.headers.get('user-agent', 'Not set')}")

    await client.close()


asyncio.run(main())

import asyncio
import httpx
from crawlee.http_clients import HttpxHttpClient


async def main():
    # Custom httpx configuration — extra kwargs are passed through to the underlying client.
    client = HttpxHttpClient(
        timeout=60.0,
        verify=False,  # Disable SSL verification
        limits=httpx.Limits(
            max_keepalive_connections=100,
            max_connections=200,
        ),
        ignore_http_error_status_codes={404, 503},
    )

    try:
        response = await client.send_request('https://example.com/may-not-exist')
        # Won't raise error for 404 due to ignore_http_error_status_codes
        print(f"Status: {response.status_code}")
    except Exception as e:
        print(f"Request failed: {e}")

    await client.close()


asyncio.run(main())

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient


async def main():
    # Configure crawler with custom HTTP client
    http_client = CurlImpersonateHttpClient(
        impersonate='safari17_0',
        persist_cookies_per_session=True,
    )

    crawler = HttpCrawler(
        http_client=http_client,
        max_requests_per_crawl=50,
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        response = context.response
        print(f"Crawled: {response.url}")
        print(f"Status: {response.status_code}")
        print(f"Content-Type: {response.content_type}")

        # Extract data based on content type
        if response.content_type and 'application/json' in response.content_type:
            data = response.json()
            await context.push_data(data)
        else:
            # Process HTML or other content
            data = {
                'url': response.url,
                'status': response.status_code,
                'title': 'Extracted from HTML',  # Add your extraction logic
            }
            await context.push_data(data)

    await crawler.run(['https://api.example.com/data'])


asyncio.run(main())

import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError
async def main():
client = HttpxHttpClient()
try:
response = await client.send_request('https://httpbin.org/status/500')
# Check if response is successful
if not response.ok:
print(f"Request failed with status: {response.status_code}")
# Or raise exception for bad status
response.raise_for_status()
except HttpStatusCodeError as e:
print(f"HTTP error occurred: {e}")
print(f"Status code: {e.status_code}")
except Exception as e:
print(f"Other error occurred: {e}")
finally:
await client.close()
asyncio.run(main())Install with Tessl CLI
npx tessl i tessl/pypi-crawlee