# HTTP Clients

Pluggable HTTP client implementations supporting different libraries and browser impersonation for enhanced anti-detection capabilities. HTTP clients handle the actual network communication while providing consistent interfaces for different underlying implementations.

## Capabilities

### Base HTTP Client

Abstract base class defining the interface for all HTTP client implementations in Crawlee.

```python { .api }
class HttpClient:
    async def crawl(
        self,
        request: Request,
        *,
        session: Session | None = None,
        proxy_info: ProxyInfo | None = None,
        statistics: Statistics | None = None
    ) -> HttpCrawlingResult:
        """
        Perform HTTP request crawling.

        Args:
            request: Request to process
            session: Session for state management
            proxy_info: Proxy configuration
            statistics: Statistics collector

        Returns:
            HttpCrawlingResult with response data
        """

    async def send_request(
        self,
        url: str,
        *,
        method: HttpMethod = "GET",
        headers: dict[str, str] | None = None,
        payload: HttpPayload | None = None,
        **kwargs
    ) -> HttpResponse:
        """
        Send direct HTTP request.

        Args:
            url: Target URL
            method: HTTP method
            headers: Request headers
            payload: Request body

        Returns:
            HttpResponse object
        """
```

### HTTPX Client

HTTP client implementation using the httpx library with support for HTTP/2, connection pooling, and async operations.

```python { .api }
class HttpxHttpClient(HttpClient):
    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **httpx_kwargs
    ): ...

    @property
    def client(self) -> httpx.AsyncClient:
        """Access underlying httpx client."""
```

### Curl Impersonate Client

HTTP client using curl-cffi for browser impersonation and advanced anti-detection capabilities.

```python { .api }
class CurlImpersonateHttpClient(HttpClient):
    def __init__(
        self,
        *,
        persist_cookies_per_session: bool = True,
        impersonate: str = "chrome",
        additional_http_error_status_codes: set[int] | None = None,
        ignore_http_error_status_codes: set[int] | None = None,
        **curl_cffi_kwargs
    ): ...

    @property
    def impersonate(self) -> str:
        """Browser impersonation target."""
```

### HTTP Response

Response object containing response data, headers, and metadata from HTTP requests.

```python { .api }
class HttpResponse:
    def __init__(
        self,
        *,
        url: str,
        status_code: int,
        headers: HttpHeaders,
        content: bytes,
        encoding: str | None = None
    ): ...

    @property
    def url(self) -> str:
        """Final response URL (after redirects)."""

    @property
    def status_code(self) -> int:
        """HTTP status code."""

    @property
    def headers(self) -> HttpHeaders:
        """Response headers."""

    @property
    def content(self) -> bytes:
        """Raw response content."""

    @property
    def text(self) -> str:
        """Response content as string."""

    @property
    def encoding(self) -> str | None:
        """Character encoding of response."""

    @property
    def content_type(self) -> str | None:
        """MIME type from Content-Type header."""

    def json(self) -> Any:
        """
        Parse response content as JSON.

        Returns:
            Parsed JSON data

        Raises:
            JSONDecodeError: If content is not valid JSON
        """

    @property
    def ok(self) -> bool:
        """True if status code indicates success (200-299)."""

    def raise_for_status(self) -> None:
        """
        Raise HttpStatusCodeError for bad response status codes.

        Raises:
            HttpStatusCodeError: For 4xx and 5xx status codes
        """
```

### HTTP Crawling Result

Result object containing both HTTP response data and additional crawling metadata.

```python { .api }
class HttpCrawlingResult:
    def __init__(
        self,
        *,
        http_response: HttpResponse,
        encoding: str | None = None
    ): ...

    @property
    def http_response(self) -> HttpResponse:
        """HTTP response object."""

    @property
    def encoding(self) -> str | None:
        """Character encoding override."""
```

## Configuration Options

### HTTP Client Configuration

Common configuration options available across HTTP client implementations.

```python { .api }
class HttpClientConfig:
    persist_cookies_per_session: bool = True
    additional_http_error_status_codes: set[int] | None = None
    ignore_http_error_status_codes: set[int] | None = None
    timeout: float = 30.0
    max_redirects: int = 10
    verify_ssl: bool = True
    proxy_url: str | None = None
```

### Browser Impersonation Options

Configuration for curl-cffi browser impersonation capabilities.

```python { .api }
ImpersonateTarget = Literal[
    "chrome",
    "chrome99",
    "chrome100",
    "chrome101",
    "chrome104",
    "chrome107",
    "chrome110",
    "chrome116",
    "firefox",
    "firefox99",
    "firefox102",
    "firefox109",
    "safari",
    "safari15_3",
    "safari15_5",
    "safari17_0",
    "safari17_2_1"
]
```

## Usage Examples

### Basic HTTP Client Usage

```python
import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee import Request

async def main():
    client = HttpxHttpClient()

    # Send direct request
    response = await client.send_request(
        'https://api.example.com/data',
        method='GET',
        headers={'User-Agent': 'My Bot 1.0'}
    )

    print(f"Status: {response.status_code}")
    print(f"Content: {response.text}")

    # Process as JSON
    if response.content_type == 'application/json':
        data = response.json()
        print(f"JSON data: {data}")

    await client.close()

asyncio.run(main())
```

### Browser Impersonation

```python
import asyncio
from crawlee.http_clients import CurlImpersonateHttpClient

async def main():
    # Impersonate Chrome browser
    client = CurlImpersonateHttpClient(
        impersonate='chrome116'
    )

    response = await client.send_request('https://example.com')

    print(f"Impersonating: {client.impersonate}")
    print(f"Response: {response.status_code}")

    # The request appears to come from Chrome 116
    print(f"User-Agent: {response.headers.get('user-agent', 'Not set')}")

    await client.close()

asyncio.run(main())
```

### Custom HTTP Client Configuration

```python
import asyncio
import httpx
from crawlee.http_clients import HttpxHttpClient

async def main():
    # Custom httpx configuration
    client = HttpxHttpClient(
        timeout=60.0,
        verify=False,  # Disable SSL verification
        limits=httpx.Limits(
            max_keepalive_connections=100,
            max_connections=200
        ),
        ignore_http_error_status_codes={404, 503}
    )

    try:
        response = await client.send_request('https://example.com/may-not-exist')
        # Won't raise error for 404 due to ignore_http_error_status_codes
        print(f"Status: {response.status_code}")
    except Exception as e:
        print(f"Request failed: {e}")

    await client.close()

asyncio.run(main())
```

### Using HTTP Clients with Crawlers

```python
import asyncio
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.http_clients import CurlImpersonateHttpClient

async def main():
    # Configure crawler with custom HTTP client
    http_client = CurlImpersonateHttpClient(
        impersonate='safari17_0',
        persist_cookies_per_session=True
    )

    crawler = HttpCrawler(
        http_client=http_client,
        max_requests_per_crawl=50
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        response = context.response

        print(f"Crawled: {response.url}")
        print(f"Status: {response.status_code}")
        print(f"Content-Type: {response.content_type}")

        # Extract data based on content type
        if response.content_type and 'application/json' in response.content_type:
            data = response.json()
            await context.push_data(data)
        else:
            # Process HTML or other content
            data = {
                'url': response.url,
                'status': response.status_code,
                'title': 'Extracted from HTML'  # Add your extraction logic
            }
            await context.push_data(data)

    await crawler.run(['https://api.example.com/data'])

asyncio.run(main())
```

### Error Handling

```python
import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError

async def main():
    client = HttpxHttpClient()

    try:
        response = await client.send_request('https://httpbin.org/status/500')

        # Check if response is successful
        if not response.ok:
            print(f"Request failed with status: {response.status_code}")

        # Or raise exception for bad status
        response.raise_for_status()

    except HttpStatusCodeError as e:
        print(f"HTTP error occurred: {e}")
        print(f"Status code: {e.status_code}")

    except Exception as e:
        print(f"Other error occurred: {e}")

    finally:
        await client.close()

asyncio.run(main())
```