or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

browser-automation.mdcli-tools.mdconfiguration.mdcore-types.mdcrawlers.mderror-handling.mdevents.mdfingerprinting.mdhttp-clients.mdindex.mdrequest-management.mdsessions.mdstatistics.mdstorage.md

error-handling.mddocs/

# Error Handling

Comprehensive exception hierarchy for handling various crawling scenarios and failure modes. Crawlee provides specific exception types for different error conditions to enable precise error handling and recovery strategies.

## Exception Hierarchy

### HTTP Errors

Exceptions related to HTTP requests and responses.

```python { .api }
class HttpStatusCodeError(Exception):
    """Raised when HTTP request returns error status code."""

    def __init__(
        self,
        message: str,
        *,
        status_code: int,
        response: HttpResponse | None = None
    ): ...

    @property
    def status_code(self) -> int:
        """HTTP status code that caused the error."""

    @property
    def response(self) -> HttpResponse | None:
        """HTTP response object if available."""
```

```python { .api }
class HttpClientStatusCodeError(HttpStatusCodeError):
    """Raised by HTTP clients for error status codes."""
    pass
```

### Proxy Errors

Exceptions related to proxy configuration and connectivity.

```python { .api }
class ProxyError(Exception):
    """Base class for proxy-related errors."""

    def __init__(
        self,
        message: str,
        *,
        proxy_info: ProxyInfo | None = None
    ): ...

    @property
    def proxy_info(self) -> ProxyInfo | None:
        """Proxy information associated with the error."""
```

### Session Errors

Exceptions related to session management and state.

```python { .api }
class SessionError(Exception):
    """Raised when session operations fail."""

    def __init__(
        self,
        message: str,
        *,
        session_id: str | None = None
    ): ...

    @property
    def session_id(self) -> str | None:
        """Session ID associated with the error."""
```

### Request Handling Errors

Exceptions that occur during request processing and handler execution.

```python { .api }
class RequestHandlerError(Exception):
    """Raised when request handler execution fails."""

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        original_exception: Exception | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that was being processed when error occurred."""

    @property
    def original_exception(self) -> Exception | None:
        """Original exception that caused the handler error."""
```

```python { .api }
class UserDefinedErrorHandlerError(Exception):
    """Wrapper for errors in user-defined error handlers."""

    def __init__(
        self,
        message: str,
        *,
        original_exception: Exception
    ): ...

    @property
    def original_exception(self) -> Exception:
        """Original exception that occurred in user handler."""
```

### Request Queue Errors

Exceptions related to request queue operations and resource conflicts.

```python { .api }
class RequestCollisionError(Exception):
    """Raised when request resource conflicts occur."""

    def __init__(
        self,
        message: str,
        *,
        request: Request | None = None,
        conflicting_request: Request | None = None
    ): ...

    @property
    def request(self) -> Request | None:
        """Request that caused the collision."""

    @property
    def conflicting_request(self) -> Request | None:
        """Existing request that conflicts."""
```

### Context Pipeline Errors

Exceptions related to context pipeline processing and middleware.

```python { .api }
class ContextPipelineInitializationError(Exception):
    """Raised when context pipeline initialization fails."""

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where initialization failed."""
```

```python { .api }
class ContextPipelineFinalizationError(Exception):
    """Raised when context pipeline finalization fails."""

    def __init__(
        self,
        message: str,
        *,
        pipeline_stage: str | None = None
    ): ...

    @property
    def pipeline_stage(self) -> str | None:
        """Pipeline stage where finalization failed."""
```

```python { .api }
class ContextPipelineInterruptedError(Exception):
    """Signal for interrupting context pipeline processing."""

    def __init__(
        self,
        message: str = "Context pipeline interrupted",
        *,
        skip_to_error_handler: bool = False
    ): ...

    @property
    def skip_to_error_handler(self) -> bool:
        """Whether to skip remaining pipeline and go to error handler."""
```

### Service Container Errors

Exceptions related to service locator and dependency injection.

```python { .api }
class ServiceConflictError(Exception):
    """Raised when service registration conflicts occur."""

    def __init__(
        self,
        message: str,
        *,
        service_type: type | None = None
    ): ...

    @property
    def service_type(self) -> type | None:
        """Service type that caused the conflict."""
```

## Usage Examples

### HTTP Error Handling

```python
import asyncio
from crawlee.http_clients import HttpxHttpClient
from crawlee.errors import HttpStatusCodeError, HttpClientStatusCodeError

async def main():
    client = HttpxHttpClient()

    try:
        response = await client.send_request('https://httpbin.org/status/404')

    except HttpClientStatusCodeError as e:
        print(f"HTTP client error: {e}")
        print(f"Status code: {e.status_code}")
        if e.response:
            print(f"Response URL: {e.response.url}")
            print(f"Response headers: {e.response.headers}")

    except HttpStatusCodeError as e:
        print(f"General HTTP error: {e}")
        print(f"Status code: {e.status_code}")

    except Exception as e:
        print(f"Unexpected error: {e}")

    finally:
        await client.close()

asyncio.run(main())
```

### Crawler Error Handling

```python
import asyncio
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.errors import HttpStatusCodeError, RequestHandlerError, SessionError

async def main():
    crawler = BeautifulSoupCrawler()

    @crawler.router.default_handler
    async def handler(context: BeautifulSoupCrawlingContext):
        try:
            # Main scraping logic
            title = context.soup.title.string if context.soup.title else "No title"

            data = {
                'url': context.request.url,
                'title': title
            }

            await context.push_data(data)

        except Exception as e:
            context.log.error(f"Error processing {context.request.url}: {e}")
            # Re-raise to trigger retry logic
            raise

    @crawler.router.error_handler
    async def error_handler(context: BeautifulSoupCrawlingContext, error: Exception):
        """Handle errors that occur during request processing."""

        if isinstance(error, SessionError):
            context.log.warning(f"Session error for {context.request.url}: {error}")
            # Rotate session
            context.session.mark_blocked()

        elif isinstance(error, RequestHandlerError):
            context.log.error(f"Handler error for {context.request.url}: {error}")
            if error.original_exception:
                context.log.error(f"Original cause: {error.original_exception}")

        elif isinstance(error, HttpStatusCodeError):
            if error.status_code in [403, 429]:
                context.log.warning(f"Rate limited or blocked: {error.status_code}")
                # Mark session as potentially blocked
                context.session.mark_blocked()
            else:
                context.log.error(f"HTTP error {error.status_code}: {error}")

        else:
            context.log.error(f"Unexpected error: {error}")

        # Log error for analysis
        await context.push_data({
            'url': context.request.url,
            'error_type': type(error).__name__,
            'error_message': str(error),
            'status': 'failed'
        })

    await crawler.run(['https://example.com'])

asyncio.run(main())
```

### Proxy Error Handling

```python
import asyncio
from crawlee.proxy_configuration import ProxyConfiguration
from crawlee.crawlers import HttpCrawler, HttpCrawlingContext
from crawlee.errors import ProxyError

async def main():
    # Configure proxy rotation
    proxy_config = ProxyConfiguration([
        'http://proxy1:8080',
        'http://proxy2:8080',
        'http://proxy3:8080'
    ])

    crawler = HttpCrawler(
        proxy_configuration=proxy_config
    )

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext):
        try:
            # Process request normally
            data = {
                'url': context.request.url,
                'status': context.response.status_code
            }
            await context.push_data(data)

        except ProxyError as e:
            context.log.error(f"Proxy error: {e}")
            if e.proxy_info:
                context.log.error(f"Failed proxy: {e.proxy_info.url}")

            # Request will be automatically retried with different proxy
            raise

    await crawler.run(['https://example.com'])

asyncio.run(main())
```

### Context Pipeline Error Handling

```python
import asyncio
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext, ContextPipeline
from crawlee.errors import (
    ContextPipelineInitializationError,
    ContextPipelineFinalizationError,
    ContextPipelineInterruptedError
)

async def authentication_middleware(context: BasicCrawlingContext):
    """Middleware for handling authentication."""
    try:
        # Check if authentication is needed
        if not context.session.cookies.get_cookie('auth_token'):
            # Perform authentication
            await authenticate_session(context.session)

    except Exception as e:
        raise ContextPipelineInitializationError(
            f"Authentication failed: {e}",
            pipeline_stage="authentication"
        )

async def rate_limit_middleware(context: BasicCrawlingContext):
    """Middleware for rate limiting."""
    if should_skip_request(context.request):
        # Skip this request
        raise ContextPipelineInterruptedError(
            "Request skipped due to rate limiting",
            skip_to_error_handler=False
        )

async def cleanup_middleware(context: BasicCrawlingContext):
    """Cleanup middleware."""
    try:
        # Perform cleanup operations
        await cleanup_session_data(context.session)

    except Exception as e:
        raise ContextPipelineFinalizationError(
            f"Cleanup failed: {e}",
            pipeline_stage="cleanup"
        )

async def main():
    crawler = BasicCrawler()

    # Configure pipeline with error-prone middleware
    pipeline = ContextPipeline()
    pipeline.use(authentication_middleware)
    pipeline.use(rate_limit_middleware)
    pipeline.use(cleanup_middleware)

    @crawler.router.default_handler
    async def handler(context: BasicCrawlingContext):
        try:
            await pipeline.compose(context)

            # Main request processing
            await context.push_data({'url': context.request.url})

        except ContextPipelineInterruptedError as e:
            if e.skip_to_error_handler:
                context.log.warning(f"Pipeline interrupted: {e}")
                raise
            else:
                context.log.info(f"Request skipped: {e}")
                return

        except (ContextPipelineInitializationError, ContextPipelineFinalizationError) as e:
            context.log.error(f"Pipeline error in {e.pipeline_stage}: {e}")
            raise

    await crawler.run(['https://example.com'])

# Helper functions (implement based on your needs)
async def authenticate_session(session): pass
def should_skip_request(request): return False
async def cleanup_session_data(session): pass

asyncio.run(main())
```

### Service Container Error Handling

```python
from crawlee import service_locator
from crawlee.errors import ServiceConflictError
from crawlee.http_clients import HttpxHttpClient

def setup_services():
    try:
        # Register HTTP client
        client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=client)

        # Try to register again (will cause conflict)
        another_client = HttpxHttpClient()
        service_locator.register(HttpxHttpClient, instance=another_client)

    except ServiceConflictError as e:
        print(f"Service conflict: {e}")
        print(f"Conflicting service type: {e.service_type}")

        # Use try_get to check if service exists
        existing_client = service_locator.try_get(HttpxHttpClient)
        if existing_client:
            print("Using existing HTTP client")
        else:
            print("No HTTP client registered")

setup_services()
```