A comprehensive web scraping and browser automation library for Python with human-like behavior and bot protection bypass
Pluggable HTTP client implementations supporting different libraries and browser impersonation for enhanced anti-detection capabilities. HTTP clients handle the actual network communication while providing consistent interfaces for different underlying implementations.
class HttpClient:
    """Abstract base class defining the interface for all HTTP client implementations in Crawlee."""

    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None,
    ) -> HttpCrawlingResult:
        """Perform HTTP request crawling.

        Args:
            request: Request to process.
            session: Session for state management.
            proxy_info: Proxy configuration.
            statistics: Statistics collector.

        Returns:
            HttpCrawlingResult with response data.
        """

    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = "GET",
        headers: dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        **kwargs,
    ) -> HttpResponse:
        """Send direct HTTP request.

        Args:
            url: Target URL.
            method: HTTP method.
            headers: Request headers.
            payload: Request body.
            **kwargs: Additional options forwarded to the underlying implementation.

        Returns:
            HttpResponse object.
        """

# HTTP client implementation using the httpx library with support for HTTP/2, connection pooling, and async operations.
class HttpxHttpClient(HttpClient):
    """HTTP client backed by the httpx library (HTTP/2, connection pooling, async operations)."""

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **httpx_kwargs,
    ): ...

    @property
    def client(self) -> httpx.AsyncClient:
        """Access underlying httpx client."""

# HTTP client using curl-cffi for browser impersonation and advanced anti-detection capabilities.
class CurlImpersonateHttpClient(HttpClient):
    """HTTP client built on curl-cffi that impersonates real browsers for anti-detection."""

    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        impersonate: str = "chrome",
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **curl_cffi_kwargs,
    ): ...

    @property
    def impersonate(self) -> str:
        """Browser impersonation target."""

# Response object containing response data, headers, and metadata from HTTP requests.
class HttpResponse:
    """Response object carrying the data, headers, and metadata of a completed HTTP request."""

    def __init__(
        self,
        *,
        url: str,
        status_code: int,
        headers: HttpHeaders,
        content: bytes,
        encoding: str | None = None,
    ): ...

    @property
    def url(self) -> str:
        """Final response URL (after redirects)."""

    @property
    def status_code(self) -> int:
        """HTTP status code."""

    @property
    def headers(self) -> HttpHeaders:
        """Response headers."""

    @property
    def content(self) -> bytes:
        """Raw response content."""

    @property
    def text(self) -> str:
        """Response content as string."""

    @property
    def encoding(self) -> str | None:
        """Character encoding of response."""

    @property
    def content_type(self) -> str | None:
        """MIME type from Content-Type header."""

    # NOTE: return annotation corrected from `any` (the builtin function) to `typing.Any`.
    def json(self) -> Any:
        """Parse response content as JSON.

        Returns:
            Parsed JSON data.

        Raises:
            JSONDecodeError: If content is not valid JSON.
        """

    @property
    def ok(self) -> bool:
        """True if status code indicates success (200-299)."""

    def raise_for_status(self) -> None:
        """Raise HTTPStatusError for bad response status codes.

        Raises:
            HttpStatusCodeError: For 4xx and 5xx status codes.
        """

# Result object containing both HTTP response data and additional crawling metadata.
class HttpCrawlingResult:
    """Result object pairing an HTTP response with additional crawling metadata."""

    def __init__(
        self,
        *,
        http_response: HttpResponse,
        encoding: str | None = None,
    ): ...

    @property
    def http_response(self) -> HttpResponse:
        """HTTP response object."""

    @property
    def encoding(self) -> str | None:
        """Character encoding override."""

# Common configuration options available across HTTP client implementations.
class HttpClientConfig:
    """Shared configuration options common to the HTTP client implementations."""

    # Keep cookies between requests that share a session.
    persist_cookies_per_session: bool = True
    # Extra status codes to treat as errors beyond the defaults.
    additional_http_error_status_codes: set[int] | None = None
    # Error status codes that should be ignored instead of raising.
    ignore_http_error_status_codes: set[int] | None = None
    # Request timeout in seconds.
    timeout: float = 30.0
    # Maximum number of redirects to follow.
    max_redirects: int = 10
    # Verify TLS certificates.
    verify_ssl: bool = True
    # Optional proxy URL applied to all requests.
    proxy_url: str | None = None

# Configuration for curl-cffi browser impersonation capabilities.
# Browser targets supported by curl-cffi impersonation.
ImpersonateTarget = Literal[
    "chrome",
    "chrome99",
    "chrome100",
    "chrome101",
    "chrome104",
    "chrome107",
    "chrome110",
    "chrome116",
    "firefox",
    "firefox99",
    "firefox102",
    "firefox109",
    "safari",
    "safari15_3",
    "safari15_5",
    "safari17_0",
    "safari17_2_1",
]

import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee import Request


async def main():
    client = HttpxHttpClient()

    # Send direct request
    response = await client.send_request(
        'https://api.example.com/data',
        method='GET',
        headers={'User-Agent': 'My Bot 1.0'},
    )

    print(f"Status: {response.status_code}")
    print(f"Content: {response.text}")

    # Process as JSON
    if response.content_type == 'application/json':
        data = response.json()
        print(f"JSON data: {data}")

    await client.close()


asyncio.run(main())

import asyncio
from crawlee.http_clients import CurlImpersonateHttpClient


async def main():
    # Impersonate Chrome browser
    client = CurlImpersonateHttpClient(
        impersonate='chrome116',
    )

    response = await client.send_request('https://example.com')

    print(f"Impersonating: {client.impersonate}")
    print(f"Response: {response.status_code}")

    # The request appears to come from Chrome 116
    print(f"User-Agent: {response.headers.get('user-agent', 'Not set')}")

    await client.close()


asyncio.run(main())

import asyncio
import httpx
from crawlee.http_clients import HttpxHttpClient


async def main():
    # Custom httpx configuration — extra kwargs are passed through to the underlying client.
    client = HttpxHttpClient(
        timeout=60.0,
        verify=False,  # Disable SSL verification
        limits=httpx.Limits(
            max_keepalive_connections=100,
            max_connections=200,
        ),
        ignore_http_error_status_codes={404, 503},
    )

    try:
        response = await client.send_request('https://example.com/may-not-exist')
        # Won't raise error for 404 due to ignore_http_error_status_codes
        print(f"Status: {response.status_code}")
    except Exception as e:
        print(f"Request failed: {e}")

    await client.close()


asyncio.run(main())

import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient


async def main():
    # Configure crawler with custom HTTP client
    http_client = CurlImpersonateHttpClient(
        impersonate='safari17_0',
        persist_cookies_per_session=True,
    )

    crawler = HttpCrawler(
        http_client=http_client,
        max_requests_per_crawl=50,
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        response = context.response
        print(f"Crawled: {response.url}")
        print(f"Status: {response.status_code}")
        print(f"Content-Type: {response.content_type}")

        # Extract data based on content type
        if response.content_type and 'application/json' in response.content_type:
            data = response.json()
            await context.push_data(data)
        else:
            # Process HTML or other content
            data = {
                'url': response.url,
                'status': response.status_code,
                'title': 'Extracted from HTML',  # Add your extraction logic
            }
            await context.push_data(data)

    await crawler.run(['https://api.example.com/data'])


asyncio.run(main())

import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError
async def main():
client = HttpxHttpClient()
try:
response = await client.send_request('https://httpbin.org/status/500')
# Check if response is successful
if not response.ok:
print(f"Request failed with status: {response.status_code}")
# Or raise exception for bad status
response.raise_for_status()
except HttpStatusCodeError as e:
print(f"HTTP error occurred: {e}")
print(f"Status code: {e.status_code}")
except Exception as e:
print(f"Other error occurred: {e}")
finally:
await client.close()
asyncio.run(main())Install with Tessl CLI
npx tessl i tessl/pypi-crawlee