# Error Handling

Comprehensive exception hierarchy for handling various crawling scenarios and failure modes. Crawlee provides specific exception types for different error conditions to enable precise error handling and recovery strategies.

## Exception Hierarchy

### HTTP Errors

Exceptions related to HTTP requests and responses.

```python { .api }
class HttpStatusCodeError(Exception):
    """Raised when HTTP request returns error status code."""

    def __init__(
        self,
        message: str,
        *,
        status_code: int,
        response: HttpResponse | None = None
    ): ...

    @property
    def status_code(self) -> int:
        """HTTP status code that caused the error."""

    @property
    def response(self) -> HttpResponse | None:
        """HTTP response object if available."""
```

```python { .api }
class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised by HTTP clients for error status codes."""
    pass
```

### Proxy Errors

Exceptions related to proxy configuration and connectivity.

```python { .api }
class ProxyError(Exception):
    """Base class for proxy-related errors."""

    def __init__(
        self,
        message: str,
        *,
        proxy_info: ProxyInfo | None = None
    ): ...

    @property
    def proxy_info(self) -> ProxyInfo | None:
        """Proxy information associated with the error."""
```

### Session Errors

Exceptions related to session management and state.

```python { .api }
class SessionError(Exception):
    """Raised when session operations fail."""

    def __init__(
        self,
        message: str,
        *,
        session_id: str | None = None
    ): ...

    @property
    def session_id(self) -> str | None:
        """Session ID associated with the error."""
```

### Request Handling Errors

Exceptions that occur during request processing and handler execution.

```python { .api }
class RequestHandlerError(Exception):
    """Raised when request handler execution fails."""

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        original_exception: Exception | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that was being processed when error occurred."""

    @property
    def original_exception(self) -> Exception | None:
        """Original exception that caused the handler error."""
```

```python { .api }
class UserDefinedErrorHandlerError(Exception):
    """Wrapper for errors in user-defined error handlers."""

    def __init__(
        self,
        message: str,
        *,
        original_exception: Exception
    ): ...

    @property
    def original_exception(self) -> Exception:
        """Original exception that occurred in user handler."""
```

### Request Queue Errors

Exceptions related to request queue operations and resource conflicts.

```python { .api }
class RequestCollisionError(Exception):
    """Raised when request resource conflicts occur."""

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        conflicting_request: Request | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that caused the collision."""

    @property
    def conflicting_request(self) -> Request | None:
        """Existing request that conflicts."""
```

### Context Pipeline Errors

Exceptions related to context pipeline processing and middleware.

```python { .api }
class ContextPipelineInitializationError(Exception):
    """Raised when context pipeline initialization fails."""

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where initialization failed."""
```

```python { .api }
class ContextPipelineFinalizationError(Exception):
    """Raised when context pipeline finalization fails."""

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where finalization failed."""
```

```python { .api }
class ContextPipelineInterruptedError(Exception):
    """Signal for interrupting context pipeline processing."""

    def __init__(
        self,
        message: str = "Context pipeline interrupted",
        *,
        skip_to_error_handler: bool = False
    ): ...

    @property
    def skip_to_error_handler(self) -> bool:
        """Whether to skip remaining pipeline and go to error handler."""
```

### Service Container Errors

Exceptions related to service locator and dependency injection.

```python { .api }
class ServiceConflictError(Exception):
    """Raised when service registration conflicts occur."""

    def __init__(
        self,
        message: str,
        *,
        service_type: type | None = None
    ): ...

    @property
    def service_type(self) -> type | None:
        """Service type that caused the conflict."""
```

## Usage Examples

### HTTP Error Handling

```python
import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError, HttpClientStatusCodeError

async def main():
    client = HttpxHttpClient()

    try:
        response = await client.send_request('https://httpbin.org/status/404')

    except HttpClientStatusCodeError as e:
        print(f"HTTP client error: {e}")
        print(f"Status code: {e.status_code}")
        if e.response:
            print(f"Response URL: {e.response.url}")
            print(f"Response headers: {e.response.headers}")

    except HttpStatusCodeError as e:
        print(f"General HTTP error: {e}")
        print(f"Status code: {e.status_code}")

    except Exception as e:
        print(f"Unexpected error: {e}")

    finally:
        await client.close()

asyncio.run(main())
```

### Crawler Error Handling

```python
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.errors import HttpStatusCodeError, RequestHandlerError, SessionError

async def main():
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext):
        try:
            # Main scraping logic
            title = context.soup.title.string if context.soup.title else "No title"

            data = {
                'url': context.request.url,
                'title': title
            }

            await context.push_data(data)

        except Exception as e:
            context.log.error(f"Error processing {context.request.url}: {e}")
            # Re-raise to trigger retry logic
            raise

    @crawler.router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        """Handle errors that occur during request processing."""

        if isinstance(error, SessionError):
            context.log.warning(f"Session error for {context.request.url}: {error}")
            # Rotate session
            context.session.mark_blocked()

        elif isinstance(error, RequestHandlerError):
            context.log.error(f"Handler error for {context.request.url}: {error}")
            if error.original_exception:
                context.log.error(f"Original cause: {error.original_exception}")

        elif isinstance(error, HttpStatusCodeError):
            if error.status_code in [403, 429]:
                context.log.warning(f"Rate limited or blocked: {error.status_code}")
                # Mark session as potentially blocked
                context.session.mark_blocked()
            else:
                context.log.error(f"HTTP error {error.status_code}: {error}")

        else:
            context.log.error(f"Unexpected error: {error}")

        # Log error for analysis
        await context.push_data({
            'url': context.request.url,
            'error_type': type(error).__name__,
            'error_message': str(error),
            'status': 'failed'
        })

    await crawler.run(['https://example.com'])

asyncio.run(main())
```

### Proxy Error Handling

```python
import asyncio
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

async def main():
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://proxy2:8080',
        'http://proxy3:8080'
    ])

    crawler = HttpCrawler(
        proxy_configuration=proxy_config
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        try:
            # Process request normally
            data = {
                'url': context.request.url,
                'status': context.response.status_code
            }
            await context.push_data(data)

        except ProxyError as e:
            context.log.error(f"Proxy error: {e}")
            if e.proxy_info:
                context.log.error(f"Failed proxy: {e.proxy_info.url}")

            # Request will be automatically retried with different proxy
            raise

    await crawler.run(['https://example.com'])

asyncio.run(main())
```

### Context Pipeline Error Handling

```python
import asyncio
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext, ContextPipeline
from crawlee.errors import (
    ContextPipelineInitializationError,
    ContextPipelineFinalizationError,
    ContextPipelineInterruptedError
)

async def authentication_middleware(context: BasicCrawlingContext):
    """Middleware for handling authentication."""
    try:
        # Check if authentication is needed
        if not context.session.cookies.get_cookie('auth_token'):
            # Perform authentication
            await authenticate_session(context.session)

    except Exception as e:
        raise ContextPipelineInitializationError(
            f"Authentication failed: {e}",
            pipeline_stage="authentication"
        )

async def rate_limit_middleware(context: BasicCrawlingContext):
    """Middleware for rate limiting."""
    if should_skip_request(context.request):
        # Skip this request
        raise ContextPipelineInterruptedError(
            "Request skipped due to rate limiting",
            skip_to_error_handler=False
        )

async def cleanup_middleware(context: BasicCrawlingContext):
    """Cleanup middleware."""
    try:
        # Perform cleanup operations
        await cleanup_session_data(context.session)

    except Exception as e:
        raise ContextPipelineFinalizationError(
            f"Cleanup failed: {e}",
            pipeline_stage="cleanup"
        )

async def main():
    crawler = BasicCrawler()

    # Configure pipeline with error-prone middleware
    pipeline = ContextPipeline()
    pipeline.use(authentication_middleware)
    pipeline.use(rate_limit_middleware)
    pipeline.use(cleanup_middleware)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext):
        try:
            await pipeline.compose(context)

            # Main request processing
            await context.push_data({'url': context.request.url})

        except ContextPipelineInterruptedError as e:
            if e.skip_to_error_handler:
                context.log.warning(f"Pipeline interrupted: {e}")
                raise
            else:
                context.log.info(f"Request skipped: {e}")
                return

        except (ContextPipelineInitializationError, ContextPipelineFinalizationError) as e:
            context.log.error(f"Pipeline error in {e.pipeline_stage}: {e}")
            raise

    await crawler.run(['https://example.com'])

# Helper functions (implement based on your needs)
async def authenticate_session(session): pass
def should_skip_request(request): return False
async def cleanup_session_data(session): pass

asyncio.run(main())
```

### Service Container Error Handling

```python
from crawlee import service_locator
from crawlee.errors import ServiceConflictError
from crawlee.http_clients import HttpxHttpClient

def setup_services():
    try:
        # Register HTTP client
        client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=client)

        # Try to register again (will cause conflict)
        another_client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=another_client)

    except ServiceConflictError as e:
        print(f"Service conflict: {e}")
        print(f"Conflicting service type: {e.service_type}")

    # Use try_get to check if service exists
    existing_client = service_locator.try_get(HttpxHttpClient)
    if existing_client:
        print("Using existing HTTP client")
    else:
        print("No HTTP client registered")

setup_services()
```