A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Comprehensive exception hierarchy for handling various crawling scenarios and failure modes. Crawlee provides specific exception types for different error conditions to enable precise error handling and recovery strategies.
Exceptions related to HTTP requests and responses.
class HttpStatusCodeError(Exception):
    """Raised when HTTP request returns error status code.

    Signature-only stub: the constructor body is elided (``...``).

    Args:
        message: Error message.
        status_code: HTTP status code that caused the error.
        response: HTTP response object if available.
    """

    def __init__(
        self,
        message: str,
        *,
        status_code: int,
        response: HttpResponse | None = None,
    ): ...

    @property
    def status_code(self) -> int:
        """HTTP status code that caused the error."""

    @property
    def response(self) -> HttpResponse | None:
        """HTTP response object if available."""


class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised by HTTP clients for error status codes."""

    pass


# Exceptions related to proxy configuration and connectivity.
class ProxyError(Exception):
    """Base class for proxy-related errors.

    Signature-only stub: the constructor body is elided (``...``).

    Args:
        message: Error message.
        proxy_info: Proxy information associated with the error, if any.
    """

    def __init__(
        self,
        message: str,
        *,
        proxy_info: ProxyInfo | None = None,
    ): ...

    @property
    def proxy_info(self) -> ProxyInfo | None:
        """Proxy information associated with the error."""


# Exceptions related to session management and state.
class SessionError(Exception):
"""Raised when session operations fail."""
def __init__(
self,
message: str,
*,
session_id: str | None = None
): ...
@property
def session_id(self) -> str | None:
"""Session ID associated with the error."""Exceptions that occur during request processing and handler execution.
class RequestHandlerError(Exception):
    """Raised when request handler execution fails.

    Signature-only stub: the constructor body is elided (``...``).

    Args:
        message: Error message.
        request: Request that was being processed when the error occurred.
        original_exception: Original exception that caused the handler error.
    """

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        original_exception: Exception | None = None,
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that was being processed when error occurred."""

    @property
    def original_exception(self) -> Exception | None:
        """Original exception that caused the handler error."""


class UserDefinedErrorHandlerError(Exception):
    """Wrapper for errors in user-defined error handlers.

    Signature-only stub: the constructor body is elided (``...``).

    Args:
        message: Error message.
        original_exception: Original exception that occurred in the user
            handler (required, unlike in ``RequestHandlerError``).
    """

    def __init__(
        self,
        message: str,
        *,
        original_exception: Exception,
    ): ...

    @property
    def original_exception(self) -> Exception:
        """Original exception that occurred in user handler."""


# Exceptions related to request queue operations and resource conflicts.
class RequestCollisionError(Exception):
    """Raised when request resource conflicts occur.

    Signature-only stub: the constructor body is elided (``...``).

    Args:
        message: Error message.
        request: Request that caused the collision.
        conflicting_request: Existing request that conflicts.
    """

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        conflicting_request: Request | None = None,
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that caused the collision."""

    @property
    def conflicting_request(self) -> Request | None:
        """Existing request that conflicts."""


# Exceptions related to context pipeline processing and middleware.
class ContextPipelineInitializationError(Exception):
"""Raised when context pipeline initialization fails."""
def __init__(
self,
message: str,
*,
pipeline_stage: str | None = None
): ...
@property
def pipeline_stage(self) -> str | None:
"""Pipeline stage where initialization failed."""class ContextPipelineFinalizationError(Exception):
"""Raised when context pipeline finalization fails."""
def __init__(
self,
message: str,
*,
pipeline_stage: str | None = None
): ...
@property
def pipeline_stage(self) -> str | None:
"""Pipeline stage where finalization failed."""class ContextPipelineInterruptedError(Exception):
"""Signal for interrupting context pipeline processing."""
def __init__(
self,
message: str = "Context pipeline interrupted",
*,
skip_to_error_handler: bool = False
): ...
@property
def skip_to_error_handler(self) -> bool:
"""Whether to skip remaining pipeline and go to error handler."""Exceptions related to service locator and dependency injection.
class ServiceConflictError(Exception):
"""Raised when service registration conflicts occur."""
def __init__(
self,
message: str,
*,
service_type: type | None = None
): ...
@property
def service_type(self) -> type | None:
"""Service type that caused the conflict."""import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError, HttpClientStatusCodeError
async def main():
    """Send one request and report HTTP status-code failures.

    The ``except`` clauses are ordered from most to least specific:
    ``HttpClientStatusCodeError`` subclasses ``HttpStatusCodeError``, so it
    must be caught first. The client is closed in ``finally`` whether or not
    the request succeeded.
    """
    client = HttpxHttpClient()

    try:
        response = await client.send_request('https://httpbin.org/status/404')
    except HttpClientStatusCodeError as e:
        print(f"HTTP client error: {e}")
        print(f"Status code: {e.status_code}")
        if e.response:
            print(f"Response URL: {e.response.url}")
            print(f"Response headers: {e.response.headers}")
    except HttpStatusCodeError as e:
        print(f"General HTTP error: {e}")
        print(f"Status code: {e.status_code}")
    except Exception as e:
        print(f"Unexpected error: {e}")
    finally:
        await client.close()


asyncio.run(main())


import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
# HttpStatusCodeError is used by the isinstance() dispatch in error_handler
# below, so it has to be imported alongside the other error types.
from crawlee.errors import HttpStatusCodeError, RequestHandlerError, SessionError


async def main():
    """Crawl one page and route failures through a typed error handler."""
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext):
        """Extract the page title and store it with the URL."""
        try:
            # Main scraping logic
            title = context.soup.title.string if context.soup.title else "No title"
            data = {
                'url': context.request.url,
                'title': title
            }
            await context.push_data(data)
        except Exception as e:
            context.log.error(f"Error processing {context.request.url}: {e}")
            # Re-raise to trigger retry logic
            raise

    # Error handlers are registered on the crawler itself; the router only
    # dispatches request handlers.
    @crawler.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        """Handle errors that occur during request processing."""
        if isinstance(error, SessionError):
            context.log.warning(f"Session error for {context.request.url}: {error}")
            # Rotate session
            # NOTE(review): confirm `mark_blocked()` exists on Session in the
            # installed crawlee version (`retire()` / `mark_bad()` may be the
            # intended calls).
            if context.session is not None:
                context.session.mark_blocked()
        elif isinstance(error, RequestHandlerError):
            context.log.error(f"Handler error for {context.request.url}: {error}")
            if error.original_exception:
                context.log.error(f"Original cause: {error.original_exception}")
        elif isinstance(error, HttpStatusCodeError):
            if error.status_code in [403, 429]:
                context.log.warning(f"Rate limited or blocked: {error.status_code}")
                # Mark session as potentially blocked
                if context.session is not None:
                    context.session.mark_blocked()
            else:
                context.log.error(f"HTTP error {error.status_code}: {error}")
        else:
            context.log.error(f"Unexpected error: {error}")

        # Log error for analysis
        await context.push_data({
            'url': context.request.url,
            'error_type': type(error).__name__,
            'error_message': str(error),
            'status': 'failed'
        })

    await crawler.run(['https://example.com'])


asyncio.run(main())


import asyncio
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError
async def main():
    """Crawl one page through rotating proxies and log proxy failures."""
    # Configure proxy rotation (ProxyConfiguration takes keyword arguments).
    proxy_config = ProxyConfiguration(
        proxy_urls=[
            'http://proxy1:8080',
            'http://proxy2:8080',
            'http://proxy3:8080',
        ]
    )

    crawler = HttpCrawler(
        proxy_configuration=proxy_config
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        """Store the URL and response status of each fetched page."""
        try:
            # Process request normally; the HTTP response is exposed on the
            # context as `http_response`.
            data = {
                'url': context.request.url,
                'status': context.http_response.status_code
            }
            await context.push_data(data)
        except ProxyError as e:
            context.log.error(f"Proxy error: {e}")
            if e.proxy_info:
                context.log.error(f"Failed proxy: {e.proxy_info.url}")
            # Request will be automatically retried with different proxy
            raise

    await crawler.run(['https://example.com'])


asyncio.run(main())


import asyncio
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext, ContextPipeline
from crawlee.errors import (
ContextPipelineInitializationError,
ContextPipelineFinalizationError,
ContextPipelineInterruptedError
)
async def authentication_middleware(context: BasicCrawlingContext):
    """Middleware for handling authentication.

    Authenticates the session when it carries no ``auth_token`` cookie.

    Raises:
        ContextPipelineInitializationError: If the cookie check or the
            authentication call fails; the original exception is chained
            as ``__cause__``.
    """
    try:
        # Check if authentication is needed
        if not context.session.cookies.get_cookie('auth_token'):
            # Perform authentication
            await authenticate_session(context.session)
    except Exception as e:
        # Chain explicitly so the traceback shows the real cause.
        raise ContextPipelineInitializationError(
            f"Authentication failed: {e}",
            pipeline_stage="authentication"
        ) from e
async def rate_limit_middleware(context: BasicCrawlingContext):
    """Middleware for rate limiting.

    Raises:
        ContextPipelineInterruptedError: When ``should_skip_request`` returns
            a truthy value for the request. ``skip_to_error_handler`` is
            ``False``, which the handler in this example treats as a skip
            rather than an error.
    """
    if should_skip_request(context.request):
        # Skip this request
        raise ContextPipelineInterruptedError(
            "Request skipped due to rate limiting",
            skip_to_error_handler=False
        )
async def cleanup_middleware(context: BasicCrawlingContext):
    """Cleanup middleware.

    Raises:
        ContextPipelineFinalizationError: If cleaning up the session data
            fails; the original exception is chained as ``__cause__``.
    """
    try:
        # Perform cleanup operations
        await cleanup_session_data(context.session)
    except Exception as e:
        # Chain explicitly so the traceback shows the real cause.
        raise ContextPipelineFinalizationError(
            f"Cleanup failed: {e}",
            pipeline_stage="cleanup"
        ) from e
async def main():
    """Run a crawler whose handler drives the middleware pipeline."""
    crawler = BasicCrawler()

    # Configure pipeline with error-prone middleware
    # NOTE(review): verify `ContextPipeline.use()` and awaiting
    # `compose(context)` against the installed crawlee version.
    pipeline = ContextPipeline()
    pipeline.use(authentication_middleware)
    pipeline.use(rate_limit_middleware)
    pipeline.use(cleanup_middleware)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext):
        """Run the pipeline, then store the URL; map pipeline errors to logs."""
        try:
            await pipeline.compose(context)

            # Main request processing
            await context.push_data({'url': context.request.url})
        except ContextPipelineInterruptedError as e:
            if e.skip_to_error_handler:
                # Propagate so the crawler's error handling takes over.
                context.log.warning(f"Pipeline interrupted: {e}")
                raise
            else:
                # A plain interruption means "skip this request".
                context.log.info(f"Request skipped: {e}")
                return
        except (ContextPipelineInitializationError, ContextPipelineFinalizationError) as e:
            context.log.error(f"Pipeline error in {e.pipeline_stage}: {e}")
            raise

    await crawler.run(['https://example.com'])
# Placeholder helpers for the example above (implement based on your needs).
async def authenticate_session(session):
    """No-op stand-in for authenticating ``session``."""
    return None


def should_skip_request(request):
    """Stand-in rate-limit check: never asks for ``request`` to be skipped."""
    return False


async def cleanup_session_data(session):
    """No-op stand-in for cleaning up data held by ``session``."""
    return None
# Run the context-pipeline example defined above.
asyncio.run(main())


from crawlee import service_locator
from crawlee.errors import ServiceConflictError
from crawlee.http_clients import HttpxHttpClient


def setup_services():
    """Register an HTTP client twice to show ServiceConflictError handling.

    NOTE(review): confirm that `service_locator.register()` and
    `service_locator.try_get()` exist in the installed crawlee version.
    """
    try:
        # Register HTTP client
        client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=client)

        # Try to register again (will cause conflict)
        another_client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=another_client)
    except ServiceConflictError as e:
        print(f"Service conflict: {e}")
        print(f"Conflicting service type: {e.service_type}")

    # Use try_get to check if service exists
    existing_client = service_locator.try_get(HttpxHttpClient)
    if existing_client:
        print("Using existing HTTP client")
    else:
        print("No HTTP client registered")


setup_services()

# Install with Tessl CLI
#   npx tessl i tessl/pypi-crawlee