# Core Crawling

Core crawling functionality provides the foundation classes and utilities that all other crawlers build upon. This includes the base `BasicCrawler` class, autoscaling capabilities, error handling, and request routing.

## Capabilities

### BasicCrawler

The foundation crawler class that all other crawlers extend. Provides core functionality for request management, autoscaling, session handling, and error recovery.

```typescript { .api }
/**
 * Base crawler class that provides the foundation for all crawler implementations
 * @template Context - The context type passed to request handlers
 */
class BasicCrawler<Context = BasicCrawlingContext> {
  constructor(options: BasicCrawlerOptions<Context>);

  /** Run the crawler until all requests are processed */
  run(): Promise<FinalStatistics>;

  /** Add requests to the crawler queue */
  addRequests(requests: (string | RequestOptions)[]): Promise<void>;

  /** Export data from the default dataset */
  exportData<T>(options?: DatasetExportOptions): Promise<T[]>;

  /** Get data from the default dataset */
  getData<T>(options?: DatasetDataOptions): Promise<T[]>;

  /** Set a value in the default key-value store */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

  /** Get a value from the default key-value store */
  getValue<T>(key: string): Promise<T | null>;

  /** Register event handlers */
  use(handler: CrawlerAddons<Context>): void;

  /** Get crawler statistics */
  readonly stats: Statistics;

  /** Get final statistics after crawler finishes */
  readonly finalStatistics?: FinalStatistics;
}
```

**Usage Examples:**

```typescript
import { BasicCrawler } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request, log }) => {
    log.info(`Processing ${request.url}`);

    // Custom processing logic here
    const response = await fetch(request.url);
    const data = await response.text();

    // Save processed data
    await crawler.setValue(`page-${request.uniqueKey}`, data);
  },
  maxConcurrency: 10,
  maxRequestRetries: 3,
});

// Add requests and run
await crawler.addRequests(['https://example.com']);
const stats = await crawler.run();

console.log(`Processed ${stats.requestsFinished} requests`);
```

### BasicCrawlerOptions

Configuration options for the BasicCrawler.

```typescript { .api }
interface BasicCrawlerOptions<Context = BasicCrawlingContext> {
  /** List of requests to process */
  requestList?: RequestList;

  /** Queue of requests to process */
  requestQueue?: RequestQueue;

  /** Function to handle each request */
  requestHandler: (context: Context) => Promise<void>;

  /** Handler for failed requests that won't be retried */
  failedRequestHandler?: (context: Context, error: Error) => Promise<void>;

  /** Maximum number of retries per request */
  maxRequestRetries?: number;

  /** Maximum number of requests to process */
  maxRequestsPerCrawl?: number;

  /** Maximum number of concurrent requests */
  maxConcurrency?: number;

  /** Minimum number of concurrent requests */
  minConcurrency?: number;

  /** Options for the autoscaled pool */
  autoscaledPoolOptions?: AutoscaledPoolOptions;

  /** Options for the session pool */
  sessionPoolOptions?: SessionPoolOptions;

  /** Whether to use session pool for requests */
  useSessionPool?: boolean;

  /** Whether to persist cookies per session */
  persistCookiesPerSession?: boolean;

  /** Configuration for proxy usage */
  proxyConfiguration?: ProxyConfiguration;

  /** Whether to keep the crawler running even when there are no more requests to process */
  keepAlive?: boolean;

  /** Custom statistics instance */
  statistics?: Statistics;

  /** Custom logger instance */
  log?: Log;

  /** Options for handling reclaimed requests */
  reclaimRequestHandler?: (context: Context) => Promise<void>;
}
```

### BasicCrawlingContext

The context object passed to request handlers, containing request information and helper methods.

```typescript { .api }
interface BasicCrawlingContext<UserData = Dictionary> {
  /** The current request being processed */
  request: Request<UserData>;

  /** Current session if session pool is used */
  session?: Session;

  /** Proxy information for the request */
  proxyInfo?: ProxyInfo;

  /** HTTP response object (when applicable) */
  response?: IncomingMessage;

  /** Reference to the crawler instance */
  crawler: BasicCrawler;

  /** Logger instance scoped to this request */
  log: Log;

  /** Send HTTP request with current session and proxy */
  sendRequest<T>(overrideOptions?: Partial<OptionsInit>): Promise<T>;

  /** Extract and enqueue links from current page */
  enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

  /** Push data to the default dataset */
  pushData(data: Dictionary | Dictionary[]): Promise<void>;

  /** Store value in the default key-value store */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

  /** Get value from the default key-value store */
  getValue<T>(key: string): Promise<T | null>;
}
```

### AutoscaledPool

Manages automatic scaling of concurrent tasks based on system resources and performance.

```typescript { .api }
/**
 * Manages parallel asynchronous tasks with automatic scaling based on system resources
 */
class AutoscaledPool {
  constructor(options: AutoscaledPoolOptions);

  /** Start running the pool */
  run(): Promise<void>;

  /** Abort all running and pending tasks */
  abort(): Promise<void>;

  /** Pause the pool (finish running tasks but don't start new ones) */
  pause(): Promise<void>;

  /** Resume a paused pool */
  resume(): Promise<void>;

  /** Notify the pool about task completion or failure */
  notify(): void;

  /** Current concurrency level */
  readonly currentConcurrency: number;

  /** Desired concurrency level calculated by autoscaling */
  readonly desiredConcurrency: number;

  /** Minimum concurrency level */
  readonly minConcurrency: number;

  /** Maximum concurrency level */
  readonly maxConcurrency: number;
}
```

### AutoscaledPoolOptions

Configuration options for the AutoscaledPool.

```typescript { .api }
interface AutoscaledPoolOptions {
  /** Function that runs a single task */
  runTaskFunction: () => Promise<void>;

  /** Function that checks if more tasks are available */
  isTaskReadyFunction?: () => Promise<boolean>;

  /** Function that checks if all tasks are finished */
  isFinishedFunction?: () => Promise<boolean>;

  /** Minimum number of concurrent tasks */
  minConcurrency?: number;

  /** Maximum number of concurrent tasks */
  maxConcurrency?: number;

  /** Initial number of concurrent tasks */
  desiredConcurrency?: number;

  /** Fraction of current concurrency to add when scaling up (0-1) */
  scaleUpStepRatio?: number;

  /** Fraction of current concurrency to remove when scaling down (0-1) */
  scaleDownStepRatio?: number;

  /** How long to maintain high concurrency after scaling up (in seconds) */
  maintainConcurrencyTimeoutSecs?: number;

  /** How long to wait for tasks to become ready before timing out (in seconds) */
  tasksReadyTimeoutSecs?: number;

  /** CPU usage threshold for scaling decisions */
  targetCpuRatio?: number;

  /** How often to log autoscaling state (in seconds) */
  loggingIntervalSecs?: number;

  /** Custom logger instance */
  log?: Log;
}
```

### Error Classes

Specialized error types for controlling crawler behavior and retry logic.

```typescript { .api }
/**
 * Thrown when a request should not be retried
 * Use this for permanent failures like 404 errors or validation failures
 */
class NonRetryableError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a critical error occurs that should stop the entire crawler
 * Use this for fatal errors like invalid configuration or system failures
 */
class CriticalError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a request should be retried immediately
 * Use this to force retries even if max retries would normally be exceeded
 */
class RetryRequestError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a session becomes invalid and should be rotated
 * Use this when encountering IP blocks or session-related failures
 */
class SessionError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a request has no matching route handler
 * Used internally by the router system
 */
class MissingRouteError extends Error {
  constructor(message?: string);
}
```

**Usage Examples:**

```typescript
import { BasicCrawler, NonRetryableError, SessionError } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request, response, session }) => {
    if (response?.statusCode === 404) {
      // Don't retry 404s
      throw new NonRetryableError(`Page not found: ${request.url}`);
    }

    if (response?.statusCode === 403) {
      // Rotate session on access denied
      throw new SessionError(`Access denied, rotating session: ${request.url}`);
    }

    // Process the request normally
    // ...
  },
});
```

### Router

URL pattern-based request routing system for handling different types of pages.

```typescript { .api }
/**
 * Routes requests to different handlers based on URL patterns and labels
 * @template Context - The crawler context type
 * @template UserData - The request user data type
 */
class Router<Context = BasicCrawlingContext, UserData = Dictionary> {
  constructor();

  /** Add a handler for requests matching the pattern */
  addHandler<Data extends UserData = UserData>(
    pattern: string | RegExp | RouteHandler<Context, Data>,
    handler?: RouteHandler<Context, Data>
  ): void;

  /** Add a default handler for requests that don't match any pattern */
  addDefaultHandler<Data extends UserData = UserData>(
    handler: RouteHandler<Context, Data>
  ): void;

  /** Find and return the handler for a given request */
  findMatchingHandler(request: Request): RouteHandler<Context, UserData> | null;

  /** Create a request handler function to use with crawlers */
  createRequestHandler(): (context: Context) => Promise<void>;
}
```

**Usage Examples:**

```typescript
import { CheerioCrawler, Dataset, Router } from "crawlee";

const router = new Router<CheerioCrawlingContext>();

// Handle product pages
router.addHandler('PRODUCT', async ({ $, request, enqueueLinks }) => {
  const title = $('h1').text();
  const price = $('.price').text();

  await Dataset.pushData({ title, price, url: request.url });
});

// Handle category pages
router.addHandler(/\/category\/.*/, async ({ $, enqueueLinks }) => {
  await enqueueLinks({
    selector: '.product-link',
    label: 'PRODUCT',
  });
});

// Handle any unmatched pages
router.addDefaultHandler(async ({ request, log }) => {
  log.warn(`No handler for ${request.url}`);
});

const crawler = new CheerioCrawler({
  requestHandler: router.createRequestHandler(),
});
```

### System Status and Monitoring

Classes for monitoring system resources and crawler performance.

```typescript { .api }
/**
 * Provides current and historical system status information
 */
class SystemStatus {
  constructor(options?: SystemStatusOptions);

  /** Get current system status snapshot */
  getCurrentStatus(): SystemInfo;

  /** Get historical status data */
  getHistoricalStatus(): SystemInfo[];

  /** Start monitoring system status */
  startCapturing(intervalMillis?: number): void;

  /** Stop monitoring system status */
  stopCapturing(): void;
}

/**
 * Takes snapshots of system resources for autoscaling decisions
 */
class Snapshotter {
  constructor(options?: SnapshotterOptions);

  /** Start taking snapshots of current system resources */
  start(): void;

  /** Stop taking snapshots */
  stop(): void;

  /** Get CPU usage ratio (0-1) */
  getCpuRatio(): number;

  /** Get memory usage ratio (0-1) */
  getMemoryRatio(): number;

  /** Check if system is overloaded */
  isOverloaded(): boolean;
}

interface SystemInfo {
  /** CPU usage as a ratio (0-1) */
  cpuUsage: number;

  /** Memory usage in bytes */
  memoryUsage: number;

  /** Available memory in bytes */
  memoryAvailable: number;

  /** Timestamp of the measurement */
  createdAt: Date;

  /** Whether the system is considered overloaded */
  isOverloaded: boolean;
}
```

### Event Handlers

Event handling system for crawler lifecycle management.

```typescript { .api }
interface CrawlerAddons<Context> {
  /** Called when the crawler starts */
  crawlerStarting?: (crawler: BasicCrawler) => Promise<void>;

  /** Called when the crawler finishes */
  crawlerFinishing?: (crawler: BasicCrawler) => Promise<void>;

  /** Called when a request starts processing */
  requestStarting?: (context: Context) => Promise<void>;

  /** Called when a request finishes successfully */
  requestFinished?: (context: Context) => Promise<void>;

  /** Called when a request fails */
  requestFailed?: (context: Context, error: Error) => Promise<void>;

  /** Called when a session is rotated */
  sessionRotating?: (context: Context) => Promise<void>;
}

/**
 * Handler function type for routing
 */
type RouteHandler<Context, UserData = Dictionary> = (
  context: Context,
  request: Request<UserData>
) => Promise<void>;
```

**Usage Examples:**

```typescript
import { BasicCrawler } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request }) => {
    // Main processing logic
  },
});

// Register event handlers
crawler.use({
  crawlerStarting: async (crawler) => {
    console.log('Crawler is starting...');
  },

  crawlerFinishing: async (crawler) => {
    console.log('Crawler finished!');
    console.log(`Final stats:`, crawler.finalStatistics);
  },

  requestFailed: async (context, error) => {
    console.error(`Request failed: ${context.request.url}`, error);
  },
});
```

## Types

```typescript { .api }
interface Statistics {
  /** Number of requests that finished successfully */
  requestsFinished: number;

  /** Number of requests that failed permanently */
  requestsFailed: number;

  /** Total number of request retries */
  requestsRetries: number;

  /** Average requests per minute (finished) */
  requestsFinishedPerMinute: number;

  /** Average failed requests per minute */
  requestsFailedPerMinute: number;

  /** Minimum request duration in milliseconds */
  requestMinDurationMillis: number;

  /** Maximum request duration in milliseconds */
  requestMaxDurationMillis: number;

  /** Total duration of all requests in milliseconds */
  requestTotalDurationMillis: number;

  /** When the crawler started */
  crawlerStartedAt: Date;

  /** When the crawler finished */
  crawlerFinishedAt?: Date;

  /** Unique identifier for these statistics */
  statsId: string;
}

type FinalStatistics = Statistics & {
  crawlerFinishedAt: Date;
};

interface RouteDefinition {
  pattern: string | RegExp;
  method: 'GET' | 'POST' | 'PUT' | 'DELETE' | '*';
  handler: RouteHandler<any, any>;
}

interface SystemStatusOptions {
  /** How often to capture system status (in milliseconds) */
  intervalMillis?: number;

  /** Maximum number of historical entries to keep */
  maxEntries?: number;
}

interface SnapshotterOptions {
  /** How often to take snapshots (in seconds) */
  intervalSecs?: number;

  /** Number of snapshots to average for CPU/memory calculations */
  windowSize?: number;
}
```