CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-crawlee

A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass

Overview
Eval results
Files

docs/error-handling.md

Error Handling

Comprehensive exception hierarchy for handling various crawling scenarios and failure modes. Crawlee provides specific exception types for different error conditions to enable precise error handling and recovery strategies.

Exception Hierarchy

HTTP Errors

Exceptions related to HTTP requests and responses.

class HttpStatusCodeError(Exception):
    """Raised when HTTP request returns error status code.

    Args:
        message: Human-readable description of the error.
        status_code: HTTP status code returned by the server.
        response: The HTTP response that triggered the error, if available.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int,
        response: HttpResponse | None = None
    ): ...

    @property
    def status_code(self) -> int:
        """HTTP status code that caused the error."""

    @property
    def response(self) -> HttpResponse | None:
        """HTTP response object if available."""
class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised by HTTP clients for error status codes.

    Specialization of HttpStatusCodeError; inherits ``status_code`` and
    ``response``. Catch this before the base class in ``except`` chains.
    """
    pass

Proxy Errors

Exceptions related to proxy configuration and connectivity.

class ProxyError(Exception):
    """Base class for proxy-related errors.

    Args:
        message: Human-readable description of the error.
        proxy_info: Details of the proxy involved, if known.
    """

    def __init__(
        self,
        message: str,
        *,
        proxy_info: ProxyInfo | None = None
    ): ...

    @property
    def proxy_info(self) -> ProxyInfo | None:
        """Proxy information associated with the error."""

Session Errors

Exceptions related to session management and state.

class SessionError(Exception):
    """Raised when session operations fail.

    Args:
        message: Human-readable description of the error.
        session_id: Identifier of the affected session, if known.
    """

    def __init__(
        self,
        message: str,
        *,
        session_id: str | None = None
    ): ...

    @property
    def session_id(self) -> str | None:
        """Session ID associated with the error."""

Request Handling Errors

Exceptions that occur during request processing and handler execution.

class RequestHandlerError(Exception):
    """Raised when request handler execution fails.

    Args:
        message: Human-readable description of the error.
        request: The request being processed when the handler failed, if known.
        original_exception: The underlying exception raised inside the handler,
            if any; preserved so callers can inspect the root cause.
    """

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        original_exception: Exception | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that was being processed when error occurred."""

    @property
    def original_exception(self) -> Exception | None:
        """Original exception that caused the handler error."""
class UserDefinedErrorHandlerError(Exception):
    """Wrapper for errors in user-defined error handlers.

    Args:
        message: Human-readable description of the error.
        original_exception: The exception raised inside the user handler.
            Required — unlike most errors here, there is always a cause.
    """

    def __init__(
        self,
        message: str,
        *,
        original_exception: Exception
    ): ...

    @property
    def original_exception(self) -> Exception:
        """Original exception that occurred in user handler."""

Request Queue Errors

Exceptions related to request queue operations and resource conflicts.

class RequestCollisionError(Exception):
    """Raised when request resource conflicts occur.

    Args:
        message: Human-readable description of the error.
        request: The incoming request that caused the collision, if known.
        conflicting_request: The already-enqueued request it collided with,
            if known.
    """

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        conflicting_request: Request | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that caused the collision."""

    @property
    def conflicting_request(self) -> Request | None:
        """Existing request that conflicts."""

Context Pipeline Errors

Exceptions related to context pipeline processing and middleware.

class ContextPipelineInitializationError(Exception):
    """Raised when context pipeline initialization fails.

    Args:
        message: Human-readable description of the error.
        pipeline_stage: Name of the pipeline stage that failed, if known.
    """

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where initialization failed."""
class ContextPipelineFinalizationError(Exception):
    """Raised when context pipeline finalization fails.

    Args:
        message: Human-readable description of the error.
        pipeline_stage: Name of the pipeline stage that failed, if known.
    """

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where finalization failed."""
class ContextPipelineInterruptedError(Exception):
    """Signal for interrupting context pipeline processing.

    Not an error in the usual sense: raising it from middleware stops the
    remaining pipeline stages for the current request.

    Args:
        message: Human-readable description of why the pipeline stopped.
        skip_to_error_handler: If True, route the request to the error
            handler instead of skipping it silently.
    """

    def __init__(
        self,
        message: str = "Context pipeline interrupted",
        *,
        skip_to_error_handler: bool = False
    ): ...

    @property
    def skip_to_error_handler(self) -> bool:
        """Whether to skip remaining pipeline and go to error handler."""

Service Container Errors

Exceptions related to service locator and dependency injection.

class ServiceConflictError(Exception):
    """Raised when service registration conflicts occur.

    Args:
        message: Human-readable description of the error.
        service_type: The service type whose registration conflicted, if known.
    """

    def __init__(
        self,
        message: str,
        *,
        service_type: type | None = None
    ): ...

    @property
    def service_type(self) -> type | None:
        """Service type that caused the conflict."""

Usage Examples

HTTP Error Handling

import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError, HttpClientStatusCodeError

async def main():
    """Fetch a URL that returns 404 and demonstrate layered HTTP error handling."""
    client = HttpxHttpClient()
    try:
        response = await client.send_request('https://httpbin.org/status/404')
    except HttpClientStatusCodeError as e:
        # Most specific subclass first; carries status_code and an optional response.
        print(f"HTTP client error: {e}")
        print(f"Status code: {e.status_code}")
        resp = e.response
        if resp:
            print(f"Response URL: {resp.url}")
            print(f"Response headers: {resp.headers}")
    except HttpStatusCodeError as e:
        # Base-class fallback for any other HTTP status error.
        print(f"General HTTP error: {e}")
        print(f"Status code: {e.status_code}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    finally:
        # Always release the client's connections.
        await client.close()

asyncio.run(main())

Crawler Error Handling

import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
# Bug fix: HttpStatusCodeError is referenced in error_handler below but was
# previously missing from this import, causing a NameError at runtime.
from crawlee.errors import HttpStatusCodeError, RequestHandlerError, SessionError

async def main():
    """Run a BeautifulSoup crawler with a handler and a centralized error handler."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext):
        try:
            # Main scraping logic
            title = context.soup.title.string if context.soup.title else "No title"

            data = {
                'url': context.request.url,
                'title': title
            }

            await context.push_data(data)

        except Exception as e:
            context.log.error(f"Error processing {context.request.url}: {e}")
            # Re-raise to trigger retry logic
            raise

    @crawler.router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        """Handle errors that occur during request processing."""

        if isinstance(error, SessionError):
            context.log.warning(f"Session error for {context.request.url}: {error}")
            # Rotate session
            context.session.mark_blocked()

        elif isinstance(error, RequestHandlerError):
            context.log.error(f"Handler error for {context.request.url}: {error}")
            if error.original_exception:
                context.log.error(f"Original cause: {error.original_exception}")

        elif isinstance(error, HttpStatusCodeError):
            if error.status_code in [403, 429]:
                context.log.warning(f"Rate limited or blocked: {error.status_code}")
                # Mark session as potentially blocked
                context.session.mark_blocked()
            else:
                context.log.error(f"HTTP error {error.status_code}: {error}")

        else:
            context.log.error(f"Unexpected error: {error}")

        # Log error for analysis
        await context.push_data({
            'url': context.request.url,
            'error_type': type(error).__name__,
            'error_message': str(error),
            'status': 'failed'
        })

    await crawler.run(['https://example.com'])

asyncio.run(main())

Proxy Error Handling

import asyncio
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

async def main():
    """Run an HttpCrawler behind a rotating proxy pool and surface proxy failures."""
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://proxy2:8080',
        'http://proxy3:8080'
    ])

    crawler = HttpCrawler(proxy_configuration=proxy_config)

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        try:
            # Process request normally
            await context.push_data({
                'url': context.request.url,
                'status': context.response.status_code
            })

        except ProxyError as e:
            context.log.error(f"Proxy error: {e}")
            if e.proxy_info:
                context.log.error(f"Failed proxy: {e.proxy_info.url}")

            # Request will be automatically retried with different proxy
            raise

    await crawler.run(['https://example.com'])

asyncio.run(main())

Context Pipeline Error Handling

import asyncio
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext, ContextPipeline
from crawlee.errors import (
    ContextPipelineInitializationError,
    ContextPipelineFinalizationError,
    ContextPipelineInterruptedError
)

async def authentication_middleware(context: BasicCrawlingContext):
    """Middleware for handling authentication.

    Authenticates the session when no auth cookie is present.

    Raises:
        ContextPipelineInitializationError: If authentication fails.
    """
    try:
        # Check if authentication is needed
        if not context.session.cookies.get_cookie('auth_token'):
            # Perform authentication
            await authenticate_session(context.session)

    except Exception as e:
        # Chain with `from e` so the original traceback survives as __cause__.
        raise ContextPipelineInitializationError(
            f"Authentication failed: {e}",
            pipeline_stage="authentication"
        ) from e

async def rate_limit_middleware(context: BasicCrawlingContext):
    """Middleware for rate limiting."""
    # Guard clause: most requests proceed untouched.
    if not should_skip_request(context.request):
        return
    # Skip this request — interrupt the pipeline without invoking the error handler.
    raise ContextPipelineInterruptedError(
        "Request skipped due to rate limiting",
        skip_to_error_handler=False
    )

async def cleanup_middleware(context: BasicCrawlingContext):
    """Cleanup middleware.

    Raises:
        ContextPipelineFinalizationError: If session cleanup fails.
    """
    try:
        # Perform cleanup operations
        await cleanup_session_data(context.session)

    except Exception as e:
        # Chain with `from e` so the original traceback survives as __cause__.
        raise ContextPipelineFinalizationError(
            f"Cleanup failed: {e}",
            pipeline_stage="cleanup"
        ) from e

async def main():
    """Compose a middleware pipeline and route its interruption/error signals."""
    crawler = BasicCrawler()

    # Configure pipeline with error-prone middleware
    pipeline = ContextPipeline()
    for middleware in (
        authentication_middleware,
        rate_limit_middleware,
        cleanup_middleware,
    ):
        pipeline.use(middleware)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext):
        try:
            await pipeline.compose(context)

            # Main request processing
            await context.push_data({'url': context.request.url})

        except ContextPipelineInterruptedError as e:
            if not e.skip_to_error_handler:
                # Deliberate skip: drop the request quietly.
                context.log.info(f"Request skipped: {e}")
                return
            context.log.warning(f"Pipeline interrupted: {e}")
            raise

        except (ContextPipelineInitializationError, ContextPipelineFinalizationError) as e:
            context.log.error(f"Pipeline error in {e.pipeline_stage}: {e}")
            raise

    await crawler.run(['https://example.com'])

# Helper functions (implement based on your needs)
async def authenticate_session(session): pass
def should_skip_request(request): return False
async def cleanup_session_data(session): pass

asyncio.run(main())

Service Container Error Handling

from crawlee import service_locator
from crawlee.errors import ServiceConflictError
from crawlee.http_clients import HttpxHttpClient

def setup_services():
    """Register an HTTP client and recover gracefully from a duplicate registration."""
    try:
        # Register HTTP client
        client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=client)

        # Try to register again (will cause conflict)
        service_locator.register(HttpxHttpClient, instance=HttpxHttpClient())

    except ServiceConflictError as e:
        print(f"Service conflict: {e}")
        print(f"Conflicting service type: {e.service_type}")

        # Use try_get to check if service exists
        if service_locator.try_get(HttpxHttpClient):
            print("Using existing HTTP client")
        else:
            print("No HTTP client registered")

setup_services()

Install with Tessl CLI

npx tessl i tessl/pypi-crawlee

docs

browser-automation.md

cli-tools.md

configuration.md

core-types.md

crawlers.md

error-handling.md

events.md

fingerprinting.md

http-clients.md

index.md

request-management.md

sessions.md

statistics.md

storage.md

tile.json