# Core Crawling

Core crawling functionality provides the foundation classes and utilities that all other crawlers build upon. This includes the base `BasicCrawler` class, autoscaling capabilities, error handling, and request routing.

## Capabilities

### BasicCrawler

The foundation crawler class that all other crawlers extend. Provides core functionality for request management, autoscaling, session handling, and error recovery.

```typescript { .api }
/**
 * Base crawler class that provides the foundation for all crawler implementations
 * @template Context - The context type passed to request handlers
 */
class BasicCrawler<Context = BasicCrawlingContext> {
  constructor(options: BasicCrawlerOptions<Context>);

  /** Run the crawler until all requests are processed */
  run(): Promise<FinalStatistics>;

  /** Add requests to the crawler queue */
  addRequests(requests: (string | RequestOptions)[]): Promise<void>;

  /** Export data from the default dataset */
  exportData<T>(options?: DatasetExportOptions): Promise<T[]>;

  /** Get data from the default dataset */
  getData<T>(options?: DatasetDataOptions): Promise<T[]>;

  /** Set a value in the default key-value store */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

  /** Get a value from the default key-value store */
  getValue<T>(key: string): Promise<T | null>;

  /** Register event handlers */
  use(handler: CrawlerAddons<Context>): void;

  /** Get crawler statistics */
  readonly stats: Statistics;

  /** Get final statistics after crawler finishes */
  readonly finalStatistics?: FinalStatistics;
}
```

**Usage Examples:**

```typescript
import { BasicCrawler } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request, log }) => {
    log.info(`Processing ${request.url}`);

    // Custom processing logic here
    const response = await fetch(request.url);
    const data = await response.text();

    // Save processed data
    await crawler.setValue(`page-${request.uniqueKey}`, data);
  },
  maxConcurrency: 10,
  maxRequestRetries: 3,
});

// Add requests and run
await crawler.addRequests(['https://example.com']);
const stats = await crawler.run();

console.log(`Processed ${stats.requestsFinished} requests`);
```

### BasicCrawlerOptions

Configuration options for the BasicCrawler.

```typescript { .api }
interface BasicCrawlerOptions<Context = BasicCrawlingContext> {
  /** List of requests to process */
  requestList?: RequestList;

  /** Queue of requests to process */
  requestQueue?: RequestQueue;

  /** Function to handle each request */
  requestHandler: (context: Context) => Promise<void>;

  /** Handler for failed requests that won't be retried */
  failedRequestHandler?: (context: Context, error: Error) => Promise<void>;

  /** Maximum number of retries per request */
  maxRequestRetries?: number;

  /** Maximum number of requests to process */
  maxRequestsPerCrawl?: number;

  /** Maximum number of concurrent requests */
  maxConcurrency?: number;

  /** Minimum number of concurrent requests */
  minConcurrency?: number;

  /** Options for the autoscaled pool */
  autoscaledPoolOptions?: AutoscaledPoolOptions;

  /** Options for the session pool */
  sessionPoolOptions?: SessionPoolOptions;

  /** Whether to use session pool for requests */
  useSessionPool?: boolean;

  /** Whether to persist cookies per session */
  persistCookiesPerSession?: boolean;

  /** Configuration for proxy usage */
  proxyConfiguration?: ProxyConfiguration;

  /** Whether to keep the crawler running even when there are no more requests to process */
  keepAlive?: boolean;

  /** Custom statistics instance */
  statistics?: Statistics;

  /** Custom logger instance */
  log?: Log;

  /** Options for handling reclaimed requests */
  reclaimRequestHandler?: (context: Context) => Promise<void>;
}
```

### BasicCrawlingContext

The context object passed to request handlers, containing request information and helper methods.

```typescript { .api }
interface BasicCrawlingContext<UserData = Dictionary> {
  /** The current request being processed */
  request: Request<UserData>;

  /** Current session if session pool is used */
  session?: Session;

  /** Proxy information for the request */
  proxyInfo?: ProxyInfo;

  /** HTTP response object (when applicable) */
  response?: IncomingMessage;

  /** Reference to the crawler instance */
  crawler: BasicCrawler;

  /** Logger instance scoped to this request */
  log: Log;

  /** Send HTTP request with current session and proxy */
  sendRequest<T>(overrideOptions?: Partial<OptionsInit>): Promise<T>;

  /** Extract and enqueue links from current page */
  enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

  /** Push data to the default dataset */
  pushData(data: Dictionary | Dictionary[]): Promise<void>;

  /** Store value in the default key-value store */
  setValue(key: string, value: any, options?: RecordOptions): Promise<void>;

  /** Get value from the default key-value store */
  getValue<T>(key: string): Promise<T | null>;
}
```

### AutoscaledPool

Manages automatic scaling of concurrent tasks based on system resources and performance.

```typescript { .api }
/**
 * Manages parallel asynchronous tasks with automatic scaling based on system resources
 */
class AutoscaledPool {
  constructor(options: AutoscaledPoolOptions);

  /** Start running the pool */
  run(): Promise<void>;

  /** Abort all running and pending tasks */
  abort(): Promise<void>;

  /** Pause the pool (finish running tasks but don't start new ones) */
  pause(): Promise<void>;

  /** Resume a paused pool */
  resume(): Promise<void>;

  /** Notify the pool about task completion or failure */
  notify(): void;

  /** Current concurrency level */
  readonly currentConcurrency: number;

  /** Desired concurrency level calculated by autoscaling */
  readonly desiredConcurrency: number;

  /** Minimum concurrency level */
  readonly minConcurrency: number;

  /** Maximum concurrency level */
  readonly maxConcurrency: number;
}
```

### AutoscaledPoolOptions

Configuration options for the AutoscaledPool.

```typescript { .api }
interface AutoscaledPoolOptions {
  /** Function that runs a single task */
  runTaskFunction: () => Promise<void>;

  /** Function that checks if more tasks are available */
  isTaskReadyFunction?: () => Promise<boolean>;

  /** Function that checks if all tasks are finished */
  isFinishedFunction?: () => Promise<boolean>;

  /** Minimum number of concurrent tasks */
  minConcurrency?: number;

  /** Maximum number of concurrent tasks */
  maxConcurrency?: number;

  /** Initial number of concurrent tasks */
  desiredConcurrency?: number;

  /** Fraction of current concurrency to add when scaling up (0-1) */
  scaleUpStepRatio?: number;

  /** Fraction of current concurrency to remove when scaling down (0-1) */
  scaleDownStepRatio?: number;

  /** How long to maintain high concurrency after scaling up (in seconds) */
  maintainConcurrencyTimeoutSecs?: number;

  /** How long to wait for tasks to become ready before timing out (in seconds) */
  tasksReadyTimeoutSecs?: number;

  /** CPU usage threshold for scaling decisions */
  targetCpuRatio?: number;

  /** How often to log autoscaling state (in seconds) */
  loggingIntervalSecs?: number;

  /** Custom logger instance */
  log?: Log;
}
```

### Error Classes

Specialized error types for controlling crawler behavior and retry logic.

```typescript { .api }
/**
 * Thrown when a request should not be retried
 * Use this for permanent failures like 404 errors or validation failures
 */
class NonRetryableError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a critical error occurs that should stop the entire crawler
 * Use this for fatal errors like invalid configuration or system failures
 */
class CriticalError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a request should be retried immediately
 * Use this to force retries even if max retries would normally be exceeded
 */
class RetryRequestError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a session becomes invalid and should be rotated
 * Use this when encountering IP blocks or session-related failures
 */
class SessionError extends Error {
  constructor(message?: string);
}

/**
 * Thrown when a request has no matching route handler
 * Used internally by the router system
 */
class MissingRouteError extends Error {
  constructor(message?: string);
}
```

**Usage Examples:**

```typescript
import { BasicCrawler, NonRetryableError, SessionError } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request, response, session }) => {
    if (response?.statusCode === 404) {
      // Don't retry 404s
      throw new NonRetryableError(`Page not found: ${request.url}`);
    }

    if (response?.statusCode === 403) {
      // Rotate session on access denied
      throw new SessionError(`Access denied, rotating session: ${request.url}`);
    }

    // Process the request normally
    // ...
  },
});
```

### Router

URL pattern-based request routing system for handling different types of pages.

```typescript { .api }
/**
 * Routes requests to different handlers based on URL patterns and labels
 * @template Context - The crawler context type
 * @template UserData - The request user data type
 */
class Router<Context = BasicCrawlingContext, UserData = Dictionary> {
  constructor();

  /** Add a handler for requests matching the pattern */
  addHandler<Data extends UserData = UserData>(
    pattern: string | RegExp | RouteHandler<Context, Data>,
    handler?: RouteHandler<Context, Data>
  ): void;

  /** Add a default handler for requests that don't match any pattern */
  addDefaultHandler<Data extends UserData = UserData>(
    handler: RouteHandler<Context, Data>
  ): void;

  /** Find and return the handler for a given request */
  findMatchingHandler(request: Request): RouteHandler<Context, UserData> | null;

  /** Create a request handler function to use with crawlers */
  createRequestHandler(): (context: Context) => Promise<void>;
}
```

**Usage Examples:**

```typescript
import { CheerioCrawler, Dataset, Router } from "crawlee";

const router = new Router<CheerioCrawlingContext>();

// Handle product pages
router.addHandler('PRODUCT', async ({ $, request, enqueueLinks }) => {
  const title = $('h1').text();
  const price = $('.price').text();

  await Dataset.pushData({ title, price, url: request.url });
});

// Handle category pages
router.addHandler(/\/category\/.*/, async ({ $, enqueueLinks }) => {
  await enqueueLinks({
    selector: '.product-link',
    label: 'PRODUCT',
  });
});

// Handle any unmatched pages
router.addDefaultHandler(async ({ request, log }) => {
  log.warn(`No handler for ${request.url}`);
});

const crawler = new CheerioCrawler({
  requestHandler: router.createRequestHandler(),
});
```

### System Status and Monitoring

Classes for monitoring system resources and crawler performance.

```typescript { .api }
/**
 * Provides current and historical system status information
 */
class SystemStatus {
  constructor(options?: SystemStatusOptions);

  /** Get current system status snapshot */
  getCurrentStatus(): SystemInfo;

  /** Get historical status data */
  getHistoricalStatus(): SystemInfo[];

  /** Start monitoring system status */
  startCapturing(intervalMillis?: number): void;

  /** Stop monitoring system status */
  stopCapturing(): void;
}

/**
 * Takes snapshots of system resources for autoscaling decisions
 */
class Snapshotter {
  constructor(options?: SnapshotterOptions);

  /** Start taking snapshots of current system resources */
  start(): void;

  /** Stop taking snapshots */
  stop(): void;

  /** Get CPU usage ratio (0-1) */
  getCpuRatio(): number;

  /** Get memory usage ratio (0-1) */
  getMemoryRatio(): number;

  /** Check if system is overloaded */
  isOverloaded(): boolean;
}

interface SystemInfo {
  /** CPU usage as a ratio (0-1) */
  cpuUsage: number;

  /** Memory usage in bytes */
  memoryUsage: number;

  /** Available memory in bytes */
  memoryAvailable: number;

  /** Timestamp of the measurement */
  createdAt: Date;

  /** Whether the system is considered overloaded */
  isOverloaded: boolean;
}
```

### Event Handlers

Event handling system for crawler lifecycle management.

```typescript { .api }
interface CrawlerAddons<Context> {
  /** Called when the crawler starts */
  crawlerStarting?: (crawler: BasicCrawler) => Promise<void>;

  /** Called when the crawler finishes */
  crawlerFinishing?: (crawler: BasicCrawler) => Promise<void>;

  /** Called when a request starts processing */
  requestStarting?: (context: Context) => Promise<void>;

  /** Called when a request finishes successfully */
  requestFinished?: (context: Context) => Promise<void>;

  /** Called when a request fails */
  requestFailed?: (context: Context, error: Error) => Promise<void>;

  /** Called when a session is rotated */
  sessionRotating?: (context: Context) => Promise<void>;
}

/**
 * Handler function type for routing
 */
type RouteHandler<Context, UserData = Dictionary> = (
  context: Context,
  request: Request<UserData>
) => Promise<void>;
```

**Usage Examples:**

```typescript
import { BasicCrawler } from "crawlee";

const crawler = new BasicCrawler({
  requestHandler: async ({ request }) => {
    // Main processing logic
  },
});

// Register event handlers
crawler.use({
  crawlerStarting: async (crawler) => {
    console.log('Crawler is starting...');
  },

  crawlerFinishing: async (crawler) => {
    console.log('Crawler finished!');
    console.log(`Final stats:`, crawler.finalStatistics);
  },

  requestFailed: async (context, error) => {
    console.error(`Request failed: ${context.request.url}`, error);
  },
});
```

## Types

```typescript { .api }
interface Statistics {
  /** Number of requests that finished successfully */
  requestsFinished: number;

  /** Number of requests that failed permanently */
  requestsFailed: number;

  /** Total number of request retries */
  requestsRetries: number;

  /** Average requests per minute (finished) */
  requestsFinishedPerMinute: number;

  /** Average failed requests per minute */
  requestsFailedPerMinute: number;

  /** Minimum request duration in milliseconds */
  requestMinDurationMillis: number;

  /** Maximum request duration in milliseconds */
  requestMaxDurationMillis: number;

  /** Total duration of all requests in milliseconds */
  requestTotalDurationMillis: number;

  /** When the crawler started */
  crawlerStartedAt: Date;

  /** When the crawler finished */
  crawlerFinishedAt?: Date;

  /** Unique identifier for these statistics */
  statsId: string;
}

type FinalStatistics = Statistics & {
  crawlerFinishedAt: Date;
};

interface RouteDefinition {
  pattern: string | RegExp;
  method: 'GET' | 'POST' | 'PUT' | 'DELETE' | '*';
  handler: RouteHandler<any, any>;
}

interface SystemStatusOptions {
  /** How often to capture system status (in milliseconds) */
  intervalMillis?: number;

  /** Maximum number of historical entries to keep */
  maxEntries?: number;
}

interface SnapshotterOptions {
  /** How often to take snapshots (in seconds) */
  intervalSecs?: number;

  /** Number of snapshots to average for CPU/memory calculations */
  windowSize?: number;
}
```