0
# HTTP Crawling
1
2
HTTP crawling provides server-side HTML parsing and scraping capabilities without the overhead of full browser automation. These crawlers use various DOM parsing libraries to extract data efficiently from web pages.
3
4
## Capabilities
5
6
### HttpCrawler
7
8
Base HTTP crawler that extends BasicCrawler with HTTP-specific functionality for making requests and handling responses.
9
10
```typescript { .api }
11
/**
12
* HTTP crawler for server-side request processing without browser automation
13
*/
14
class HttpCrawler extends BasicCrawler<HttpCrawlingContext> {
15
constructor(options: HttpCrawlerOptions);
16
}
17
```
18
19
### HttpCrawlerOptions
20
21
Configuration options for the HttpCrawler.
22
23
```typescript { .api }
24
interface HttpCrawlerOptions extends BasicCrawlerOptions<HttpCrawlingContext> {
25
/** HTTP client options for making requests */
26
requestHandlerOptions?: Partial<OptionsInit>;
27
28
/** Additional HTTP status codes to treat as errors (beyond the defaults) */
29
additionalHttpErrorStatusCodes?: number[];
30
31
/** Whether to ignore HTTP error status codes */
32
ignoreHttpErrorStatusCodes?: boolean;
33
34
/** Pre-navigation hooks to modify requests before sending */
35
preNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, requestAsBrowserOptions: OptionsInit) => Promise<void>>;
36
37
/** Post-navigation hooks to process responses after receiving */
38
postNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, response: Response) => Promise<void>>;
39
40
/** HTTP client configuration */
41
httpClient?: BaseHttpClient;
42
43
/** Whether to persist cookies between requests */
44
persistCookiesPerSession?: boolean;
45
46
/** Custom User-Agent string */
47
userAgent?: string;
48
49
/** Custom request transformation function */
50
requestTransform?: (options: OptionsInit) => Promise<OptionsInit>;
51
52
/** Custom response transformation function */
53
responseTransform?: (response: Response) => Promise<Response>;
54
}
55
```
56
57
### HttpCrawlingContext
58
59
The context object passed to HTTP crawler request handlers.
60
61
```typescript { .api }
62
interface HttpCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {
63
/** The HTTP response object */
64
response: Response;
65
66
/** Response body as text */
67
body: string;
68
69
/** Response headers */
70
headers: Dictionary<string>;
71
72
/** Content type of the response */
73
contentType: string;
74
75
/** Send HTTP request with custom options */
76
sendRequest<T = any>(overrideOptions?: Partial<OptionsInit>): Promise<T>;
77
}
78
```
79
80
**Usage Examples:**
81
82
```typescript
83
import { HttpCrawler } from "crawlee";
84
85
const crawler = new HttpCrawler({
86
requestHandler: async ({ request, response, body }) => {
87
console.log(`Status: ${response.statusCode} for ${request.url}`);
88
console.log(`Body length: ${body.length}`);
89
90
// Parse HTML manually or use simple text processing
91
const titleMatch = body.match(/<title>(.*?)<\/title>/i);
92
const title = titleMatch ? titleMatch[1] : 'No title';
93
94
await crawler.pushData({
95
url: request.url,
96
title,
97
statusCode: response.statusCode,
98
});
99
},
100
additionalHttpErrorStatusCodes: [429], // Treat 429 as error
101
userAgent: 'MyCustomCrawler/1.0',
102
});
103
```
104
105
### CheerioCrawler
106
107
Server-side HTML parsing crawler using the Cheerio library for jQuery-like DOM manipulation.
108
109
```typescript { .api }
110
/**
111
* Cheerio-based crawler for server-side HTML parsing with jQuery-like syntax
112
*/
113
class CheerioCrawler extends HttpCrawler {
114
constructor(options: CheerioCrawlerOptions);
115
}
116
```
117
118
### CheerioCrawlerOptions
119
120
Configuration options for the CheerioCrawler.
121
122
```typescript { .api }
123
interface CheerioCrawlerOptions extends HttpCrawlerOptions {
124
/** Handler function that receives Cheerio context */
125
requestHandler: (context: CheerioCrawlingContext) => Promise<void>;
126
127
/** Cheerio parsing options */
128
cheerioParseOptions?: CheerioParseOptions;
129
130
/** Force a specific character encoding for the response body */
131
forceResponseEncoding?: string;
132
133
/** Low-level HTML parser options passed to Cheerio */
134
parserOptions?: {
135
xmlMode?: boolean;
136
decodeEntities?: boolean;
137
lowerCaseAttributeNames?: boolean;
138
};
139
}
140
```
141
142
### CheerioCrawlingContext
143
144
The context object passed to Cheerio crawler request handlers.
145
146
```typescript { .api }
147
interface CheerioCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
148
/** Cheerio root object for DOM manipulation */
149
$: CheerioRoot;
150
151
/** Get text content from the current page */
152
body: string;
153
154
/** Parse additional HTML with Cheerio */
155
parseWithCheerio(html: string): CheerioRoot;
156
157
/** Enqueue links found on the page */
158
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
159
}
160
```
161
162
**Usage Examples:**
163
164
```typescript
165
import { CheerioCrawler, createCheerioRouter } from "crawlee";
166
167
const crawler = new CheerioCrawler({
168
requestHandler: async ({ $, request, enqueueLinks, pushData }) => {
169
// Extract data using jQuery-like syntax
170
const title = $('title').text();
171
const description = $('meta[name="description"]').attr('content');
172
173
// Extract all product information
174
const products = [];
175
$('.product').each((index, element) => {
176
const product = $(element);
177
products.push({
178
name: product.find('.product-name').text().trim(),
179
price: product.find('.price').text().trim(),
180
image: product.find('img').attr('src'),
181
});
182
});
183
184
// Save extracted data
185
await pushData({
186
url: request.loadedUrl,
187
title,
188
description,
189
products,
190
extractedAt: new Date(),
191
});
192
193
// Find and enqueue pagination links
194
await enqueueLinks({
195
selector: 'a.page-link',
196
label: 'LIST',
197
});
198
199
// Find and enqueue product detail links
200
await enqueueLinks({
201
selector: '.product a',
202
label: 'DETAIL',
203
});
204
  },

  maxConcurrency: 5,
  maxRequestRetries: 3,
});

// Handle product detail pages with a separate router
// (created via createCheerioRouter from 'crawlee'; a router can itself be
// passed as a crawler's requestHandler to dispatch requests by label).
const router = createCheerioRouter();

router.addHandler('DETAIL', async ({ $, request, pushData }) => {
  const productDetails = {
    url: request.loadedUrl,
    name: $('.product-title').text(),
    fullDescription: $('.description').text(),
    specifications: {},
    reviews: [],
  };

  // Extract specifications
  $('.spec-row').each((_, element) => {
    const key = $(element).find('.spec-name').text().trim();
    const value = $(element).find('.spec-value').text().trim();
    productDetails.specifications[key] = value;
  });

  // Extract reviews
  $('.review').each((_, element) => {
    productDetails.reviews.push({
      rating: $(element).find('.rating').attr('data-rating'),
      text: $(element).find('.review-text').text().trim(),
      author: $(element).find('.reviewer-name').text().trim(),
    });
  });

  await pushData(productDetails);
});
238
```
239
240
### JSDOMCrawler
241
242
Server-side DOM manipulation crawler using JSDOM for full DOM API support.
243
244
```typescript { .api }
245
/**
246
* JSDOM-based crawler for server-side DOM manipulation with full DOM API
247
*/
248
class JSDOMCrawler extends HttpCrawler {
249
constructor(options: JSDOMCrawlerOptions);
250
}
251
```
252
253
### JSDOMCrawlerOptions
254
255
Configuration options for the JSDOMCrawler.
256
257
```typescript { .api }
258
interface JSDOMCrawlerOptions extends HttpCrawlerOptions {
259
/** Handler function that receives JSDOM context */
260
requestHandler: (context: JSDOMCrawlingContext) => Promise<void>;
261
262
/** JSDOM constructor options */
263
jsdomOptions?: ConstructorOptions;
264
265
/** Whether to run scripts in JSDOM */
266
runScripts?: 'dangerously' | 'outside-only';
267
268
/** Custom resource loader for JSDOM */
269
resourceLoader?: ResourceLoader;
270
271
/** Virtual console options */
272
virtualConsole?: VirtualConsole;
273
}
274
```
275
276
### JSDOMCrawlingContext
277
278
The context object passed to JSDOM crawler request handlers.
279
280
```typescript { .api }
281
interface JSDOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
282
/** The JSDOM window object */
283
window: DOMWindow;
284
285
/** The document object */
286
document: Document;
287
288
/** Shortcut to document.querySelector */
289
$(selector: string): Element | null;
290
291
/** Shortcut to document.querySelectorAll */
292
$$(selector: string): NodeListOf<Element>;
293
294
/** Enqueue links found on the page */
295
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
296
}
297
```
298
299
**Usage Examples:**
300
301
```typescript
302
import { JSDOMCrawler } from "crawlee";
303
304
const crawler = new JSDOMCrawler({
305
requestHandler: async ({ window, document, $, $$, request, pushData, enqueueLinks }) => {
306
// Use full DOM API
307
const title = document.title;
308
const metaTags = document.getElementsByTagName('meta');
309
310
// Use convenience selectors
311
const mainContent = $('.main-content');
312
const allLinks = $$('a[href]');
313
314
// Execute JavaScript-like operations
315
const productList = Array.from($$('.product')).map(element => ({
316
name: element.querySelector('.name')?.textContent?.trim(),
317
price: element.querySelector('.price')?.textContent?.trim(),
318
inStock: element.classList.contains('in-stock'),
319
}));
320
321
// Access computed styles if needed
322
const computedStyle = window.getComputedStyle(mainContent);
323
324
await pushData({
325
url: request.loadedUrl,
326
title,
327
productCount: productList.length,
328
products: productList,
329
hasMainContent: !!mainContent,
330
});
331
332
// Enqueue links
333
await enqueueLinks({
334
selector: 'a[href*="/category/"]',
335
label: 'CATEGORY',
336
});
337
},
338
339
jsdomOptions: {
340
runScripts: 'dangerously', // Enable JavaScript execution
341
resources: 'usable', // Load external resources
342
},
343
});
344
```
345
346
### LinkedOMCrawler
347
348
Fast server-side DOM manipulation crawler using LinkedOM for performance-optimized parsing.
349
350
```typescript { .api }
351
/**
352
* LinkedOM-based crawler for fast server-side DOM manipulation
353
*/
354
class LinkedOMCrawler extends HttpCrawler {
355
constructor(options: LinkedOMCrawlerOptions);
356
}
357
```
358
359
### LinkedOMCrawlerOptions
360
361
Configuration options for the LinkedOMCrawler.
362
363
```typescript { .api }
364
interface LinkedOMCrawlerOptions extends HttpCrawlerOptions {
365
/** Handler function that receives LinkedOM context */
366
requestHandler: (context: LinkedOMCrawlingContext) => Promise<void>;
367
368
/** LinkedOM parsing options */
369
linkedomOptions?: {
370
/** Include comment nodes in parsing */
371
includeComments?: boolean;
372
/** Include text nodes in parsing */
373
includeTextNodes?: boolean;
374
};
375
}
376
```
377
378
### LinkedOMCrawlingContext
379
380
The context object passed to LinkedOM crawler request handlers.
381
382
```typescript { .api }
383
interface LinkedOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
384
/** The LinkedOM window object */
385
window: Window;
386
387
/** The document object */
388
document: Document;
389
390
/** Shortcut to document.querySelector */
391
$(selector: string): Element | null;
392
393
/** Shortcut to document.querySelectorAll */
394
$$(selector: string): NodeListOf<Element>;
395
396
/** Enqueue links found on the page */
397
enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
398
}
399
```
400
401
**Usage Examples:**
402
403
```typescript
404
import { LinkedOMCrawler } from "crawlee";
405
406
const crawler = new LinkedOMCrawler({
407
requestHandler: async ({ window, document, $, $$, request, pushData }) => {
408
// LinkedOM provides fast DOM manipulation
409
const title = document.title;
410
const description = $('meta[name="description"]')?.getAttribute('content');
411
412
// Fast element selection and text extraction
413
const headlines = Array.from($$('h1, h2, h3')).map(el => ({
414
tag: el.tagName.toLowerCase(),
415
text: el.textContent?.trim(),
416
level: parseInt(el.tagName.slice(1)),
417
}));
418
419
// Fast table parsing
420
const tableData = [];
421
$$('table tr').forEach(row => {
422
const cells = Array.from(row.querySelectorAll('td, th')).map(cell =>
423
cell.textContent?.trim()
424
);
425
if (cells.length > 0) {
426
tableData.push(cells);
427
}
428
});
429
430
await pushData({
431
url: request.loadedUrl,
432
title,
433
description,
434
headlines,
435
tableData,
436
});
437
},
438
439
maxConcurrency: 20, // LinkedOM is fast, can handle higher concurrency
440
});
441
```
442
443
### File Download Crawler
444
445
Specialized crawler for efficient file downloading using HTTP streams.
446
447
```typescript { .api }
448
/**
449
* Specialized crawler for downloading files efficiently
450
*/
451
class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {
452
constructor(options: FileDownloadOptions);
453
}
454
455
/**
456
* Create a router for file download handling
457
*/
458
function createFileRouter<Context extends FileDownloadCrawlingContext>(): Router<Context>;
459
460
/**
461
* Transform stream that monitors download speed and aborts if too slow
462
*/
463
function MinimumSpeedStream(options: MinimumSpeedStreamOptions): Transform;
464
465
/**
466
* Transform stream that logs download progress
467
*/
468
function ByteCounterStream(options: ByteCounterStreamOptions): Transform;
469
470
interface FileDownloadOptions<UserData = any, JSONData = any> {
471
/** Request handler for processing downloaded files */
472
requestHandler?: FileDownloadRequestHandler<UserData, JSONData>;
473
474
/** Stream handler for processing download streams */
475
streamHandler?: StreamHandler;
476
477
/** All standard HttpCrawlerOptions are supported */
478
requestList?: RequestList;
479
requestQueue?: RequestQueue;
480
maxRequestRetries?: number;
481
maxRequestsPerCrawl?: number;
482
maxConcurrency?: number;
483
navigationTimeoutSecs?: number;
484
}
485
486
interface FileDownloadCrawlingContext<UserData = any, JSONData = any>
487
extends HttpCrawlingContext<UserData> {
488
/** The download stream (when using streamHandler) */
489
stream?: Readable;
490
}
491
492
interface MinimumSpeedStreamOptions {
493
/** Minimum speed in KB/s */
494
minSpeedKbps: number;
495
496
/** Time window for speed calculation in ms (default: 10000) */
497
historyLengthMs?: number;
498
499
/** How often to check speed in ms (default: 5000) */
500
checkProgressInterval?: number;
501
}
502
503
interface ByteCounterStreamOptions {
504
/** Function to call with bytes transferred */
505
logTransferredBytes: (bytes: number) => void;
506
507
/** How often to log progress in ms (default: 5000) */
508
loggingInterval?: number;
509
}
510
```
511
512
**Usage Examples:**
513
514
```typescript
515
import { FileDownload, createFileRouter } from "crawlee";
import { writeFileSync, createWriteStream } from "node:fs";
import path from "node:path";
import { finished } from "node:stream/promises";
516
517
// Basic file download with requestHandler
518
const fileDownloader = new FileDownload({
519
requestHandler: async ({ body, request, pushData }) => {
520
// Save file to disk
521
const fileName = request.url.replace(/[^a-z0-9\.]/gi, '_');
522
writeFileSync(`./downloads/${fileName}`, body);
523
524
await pushData({
525
url: request.url,
526
fileName,
527
size: body.length,
528
downloadedAt: new Date(),
529
});
530
},
531
});
532
533
// Run with list of file URLs
534
await fileDownloader.run([
535
'http://www.example.com/document.pdf',
536
'http://www.example.com/image.jpg',
537
'http://www.example.com/video.mp4',
538
]);
539
540
// Advanced streaming with progress monitoring
541
const streamDownloader = new FileDownload({
542
streamHandler: async ({ stream, request, log }) => {
543
const filePath = `./downloads/${path.basename(request.url)}`;
544
const fileStream = createWriteStream(filePath);
545
546
// Add progress monitoring
547
const progressStream = ByteCounterStream({
548
logTransferredBytes: (bytes) => {
549
log.info(`Downloaded ${(bytes / 1024 / 1024).toFixed(2)} MB`);
550
},
551
loggingInterval: 2000,
552
});
553
554
// Add speed monitoring
555
const speedStream = MinimumSpeedStream({
556
minSpeedKbps: 100, // Minimum 100 KB/s
557
historyLengthMs: 10000,
558
checkProgressInterval: 3000,
559
});
560
561
// Pipe stream through monitors to file
562
stream
563
.pipe(progressStream)
564
.pipe(speedStream)
565
.pipe(fileStream);
566
567
// Wait for completion
568
await finished(fileStream);
569
log.info(`File saved: ${filePath}`);
570
},
571
});
572
573
// Using router for different file types
574
const router = createFileRouter();
575
576
router.addHandler('PDF', async ({ body, request, pushData }) => {
577
// Handle PDF files
578
const fileName = `pdf_${Date.now()}.pdf`;
579
writeFileSync(`./pdfs/${fileName}`, body);
580
await pushData({ type: 'pdf', fileName, url: request.url });
581
});
582
583
router.addHandler('IMAGE', async ({ body, request, pushData }) => {
584
// Handle image files
585
const fileName = `img_${Date.now()}.jpg`;
586
writeFileSync(`./images/${fileName}`, body);
587
await pushData({ type: 'image', fileName, url: request.url });
588
});
589
590
router.addDefaultHandler(async ({ body, request, pushData }) => {
591
// Handle other file types
592
const fileName = `file_${Date.now()}`;
593
writeFileSync(`./files/${fileName}`, body);
594
await pushData({ type: 'other', fileName, url: request.url });
595
});
596
597
const routerDownloader = new FileDownload({
598
requestHandler: router,
599
});
600
601
// Add requests with labels for routing
602
await routerDownloader.addRequests([
603
{ url: 'http://example.com/doc.pdf', label: 'PDF' },
604
{ url: 'http://example.com/photo.jpg', label: 'IMAGE' },
605
{ url: 'http://example.com/data.csv', label: 'OTHER' },
606
]);
607
```
608
609
## Types
610
611
```typescript { .api }
612
interface Response {
613
/** HTTP status code */
614
statusCode: number;
615
616
/** HTTP status message */
617
statusMessage: string;
618
619
/** Response headers */
620
headers: Dictionary<string | string[]>;
621
622
/** Response body as string */
623
body: string;
624
625
/** Response body as buffer */
626
rawBody: Buffer;
627
628
/** Whether the request was redirected */
629
isRedirect: boolean;
630
631
/** Final URL after redirects */
632
url: string;
633
634
/** Request timing information */
635
timings: {
636
start: number;
637
socket: number;
638
lookup: number;
639
connect: number;
640
secureConnect: number;
641
upload: number;
642
response: number;
643
end: number;
644
};
645
}
646
647
interface OptionsInit {
648
/** HTTP method */
649
method?: HttpMethod;
650
651
/** Request headers */
652
headers?: Dictionary<string>;
653
654
/** Request body */
655
body?: string | Buffer;
656
657
/** Request timeout in milliseconds */
658
timeout?: number;
659
660
/** Whether to follow redirects */
661
followRedirect?: boolean;
662
663
/** Maximum number of redirects to follow */
664
maxRedirects?: number;
665
666
/** Proxy URL */
667
proxy?: string;
668
669
/** User agent string */
670
userAgent?: string;
671
672
/** Whether to validate SSL certificates */
673
rejectUnauthorized?: boolean;
674
}
675
676
interface CheerioParseOptions {
677
/** Whether to parse as XML */
678
xmlMode?: boolean;
679
680
/** Whether to decode HTML entities */
681
decodeEntities?: boolean;
682
683
/** Whether to lowercase attribute names */
684
lowerCaseAttributeNames?: boolean;
685
686
/** Whether to recognize CDATA sections */
687
recognizeCDATA?: boolean;
688
689
/** Whether to recognize self-closing tags */
690
recognizeSelfClosing?: boolean;
691
}
692
693
interface CrawlerEnqueueLinksOptions {
694
/** CSS selector for finding links */
695
selector?: string;
696
697
/** Base URL for resolving relative links */
698
baseUrl?: string;
699
700
/** URLs to exclude from enqueueing */
701
exclude?: (string | RegExp)[];
702
703
/** Glob patterns for URLs to include */
704
globs?: string[];
705
706
/** Pseudo-URLs for matching links */
707
pseudoUrls?: string[];
708
709
/** Label to assign to enqueued requests */
710
label?: string;
711
712
/** Additional data to attach to requests */
713
userData?: Dictionary;
714
715
/** Whether to transform relative URLs to absolute */
716
transformRequestFunction?: (request: RequestOptions) => RequestOptions;
717
718
/** Request queue to add requests to */
719
requestQueue?: RequestQueue;
720
721
/** Maximum number of links to enqueue */
722
limit?: number;
723
}
724
725
type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
726
type CheerioRoot = ReturnType<typeof cheerio.load>;
727
```