# Utilities

Crawlee provides extensive utility functions for common crawling tasks including URL extraction, social media parsing, system detection, and various helper functions for web scraping operations.

## Capabilities

### Sleep Utility

Promise-based sleep function for introducing delays in crawling operations.

```typescript { .api }
/**
 * Promise-based sleep function
 * @param millis - Milliseconds to sleep (defaults to random between 1-5 seconds)
 */
function sleep(millis?: number): Promise<void>;
```

**Usage Examples:**

```typescript
import { sleep } from "crawlee";

// Sleep for 2 seconds
await sleep(2000);

// Random sleep between 1-5 seconds
await sleep();

// Use in crawler for rate limiting
const crawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // Process request
    console.log(`Processing: ${request.url}`);

    // Add delay between requests
    await sleep(1000);
  },
});
```

### Link Enqueueing

Extract and enqueue links from web pages with powerful filtering and transformation options.

```typescript { .api }
/**
 * Extract and enqueue links from HTML pages
 */
function enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

interface EnqueueLinksOptions {
  /** Cheerio root object or HTML string */
  $?: CheerioRoot;

  /** Base URL for resolving relative links */
  baseUrl?: string;

  /** CSS selector for finding links */
  selector?: string;

  /** Pseudo-URLs for matching links */
  pseudoUrls?: (string | PseudoUrl)[];

  /** Glob patterns for URLs to include */
  globs?: string[];

  /** URLs or patterns to exclude */
  exclude?: (string | RegExp)[];

  /** Label to assign to enqueued requests */
  label?: string;

  /** Custom user data to attach */
  userData?: Dictionary;

  /** Transform function for request options */
  transformRequestFunction?: (request: RequestOptions) => RequestOptions;

  /** Request queue to add requests to */
  requestQueue?: RequestQueue;

  /** Maximum number of links to enqueue */
  limit?: number;

  /** Strategy restricting which links are enqueued (by domain/origin) */
  strategy?: EnqueueStrategy;
}

type EnqueueStrategy = 'all' | 'same-domain' | 'same-subdomain' | 'same-origin';
```

**Usage Examples:**

```typescript
import { CheerioCrawler, enqueueLinks } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks: crawlerEnqueueLinks }) => {
    // Using crawler's built-in enqueueLinks
    await crawlerEnqueueLinks({
      selector: 'a[href]',
      globs: ['**/products/**', '**/category/**'],
      exclude: [/\/admin\//, /\/login/],
      label: 'PRODUCT_PAGE',
      transformRequestFunction: (req) => ({
        ...req,
        userData: { parentUrl: request.url },
      }),
      limit: 50,
    });

    // Using standalone enqueueLinks function
    const result = await enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: '.pagination a',
      label: 'PAGINATION',
      strategy: 'same-domain',
    });

    console.log(`Enqueued ${result.processedRequests.length} pagination links`);
  },
});
```

### Social Media Parsing

Comprehensive social media handle and contact extraction from text and HTML content.

```typescript { .api }
const social: {
  /** Extract email addresses from text */
  emailsFromText(text: string): string[];

  /** Extract emails from mailto: URLs */
  emailsFromUrls(urls: string[]): string[];

  /** Extract phone numbers from text */
  phonesFromText(text: string): string[];

  /** Extract phones from tel: URLs */
  phonesFromUrls(urls: string[]): string[];

  /** Parse all social handles from HTML */
  parseHandlesFromHtml(html: string): SocialHandles;

  /** Regular expression patterns for matching emails */
  EMAIL_REGEX: RegExp;
  EMAIL_REGEX_GLOBAL: RegExp;

  /** Social platform URL patterns */
  LINKEDIN_REGEX: RegExp;
  LINKEDIN_REGEX_GLOBAL: RegExp;
  INSTAGRAM_REGEX: RegExp;
  INSTAGRAM_REGEX_GLOBAL: RegExp;
  TWITTER_REGEX: RegExp;
  TWITTER_REGEX_GLOBAL: RegExp;
  FACEBOOK_REGEX: RegExp;
  FACEBOOK_REGEX_GLOBAL: RegExp;
  YOUTUBE_REGEX: RegExp;
  YOUTUBE_REGEX_GLOBAL: RegExp;
  TIKTOK_REGEX: RegExp;
  TIKTOK_REGEX_GLOBAL: RegExp;
  PINTEREST_REGEX: RegExp;
  PINTEREST_REGEX_GLOBAL: RegExp;
  DISCORD_REGEX: RegExp;
  DISCORD_REGEX_GLOBAL: RegExp;
};

interface SocialHandles {
  emails: string[];
  phones: string[];
  linkedIns: string[];
  twitters: string[];
  instagrams: string[];
  facebooks: string[];
  youtubes: string[];
  tiktoks: string[];
  pinterests: string[];
  discords: string[];
  phonesUncertain: string[];
}
```

**Usage Examples:**

```typescript
import { CheerioCrawler, utils } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    const html = $.html();
    const textContent = $.text();

    // Extract all social handles
    const socialHandles = utils.social.parseHandlesFromHtml(html);

    // Extract emails from text content
    const emailsInText = utils.social.emailsFromText(textContent);

    // Extract phones from text
    const phonesInText = utils.social.phonesFromText(textContent);

    // Get all links and extract emails/phones from them
    const allLinks = [];
    $('a[href]').each((_, link) => {
      allLinks.push($(link).attr('href'));
    });

    const emailsFromLinks = utils.social.emailsFromUrls(allLinks);
    const phonesFromLinks = utils.social.phonesFromUrls(allLinks);

    // Combine all contacts
    const allContacts = {
      url: request.loadedUrl,
      emails: [...new Set([...socialHandles.emails, ...emailsInText, ...emailsFromLinks])],
      phones: [...new Set([...socialHandles.phones, ...phonesInText, ...phonesFromLinks])],
      socialMedia: {
        linkedin: socialHandles.linkedIns,
        twitter: socialHandles.twitters,
        instagram: socialHandles.instagrams,
        facebook: socialHandles.facebooks,
        youtube: socialHandles.youtubes,
        tiktok: socialHandles.tiktoks,
        pinterest: socialHandles.pinterests,
        discord: socialHandles.discords,
      },
    };

    await pushData(allContacts);
  },
});

// Custom social media extraction
const customText = "Contact us at info@example.com or follow @example on Twitter";
const emails = utils.social.emailsFromText(customText);
const twitterMatches = customText.match(utils.social.TWITTER_REGEX_GLOBAL);

console.log('Emails found:', emails);
console.log('Twitter handles:', twitterMatches);
```

### URL Utilities

Functions for URL extraction, validation, and manipulation.

```typescript { .api }
/** Regular expressions for matching URLs */
const URL_NO_COMMAS_REGEX: RegExp;
const URL_WITH_COMMAS_REGEX: RegExp;

/**
 * Extract URLs from text content
 */
function extractUrls(options: ExtractUrlsOptions): string[];

/**
 * Download and parse a list of URLs from a remote source
 */
function downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;

/**
 * Safely create absolute URLs from relative URLs
 */
function tryAbsoluteURL(href: string, baseUrl: string): string | null;

interface ExtractUrlsOptions {
  /** Text content to extract URLs from */
  string: string;

  /** Custom regex for matching URLs (e.g. URL_WITH_COMMAS_REGEX to include URLs containing commas) */
  urlRegex?: RegExp;
}

interface DownloadListOfUrlsOptions {
  /** URL of the list to download */
  url: string;

  /** Character encoding */
  encoding?: BufferEncoding;

  /** Regex pattern to match URLs in the content */
  urlRegex?: RegExp;
}
```

**Usage Examples:**

```typescript
import { utils, CheerioCrawler } from "crawlee";

// Extract URLs from text
const textWithUrls = "Visit https://example.com or check out http://test.com/page";
const extractedUrls = utils.extractUrls({ string: textWithUrls });
console.log('Found URLs:', extractedUrls);

// Download URL list from remote source
const urlList = await utils.downloadListOfUrls({
  url: 'https://example.com/sitemap.txt',
  encoding: 'utf8',
});

// Use in crawler for URL validation
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks }) => {
    // Extract and validate URLs
    const allLinks = [];
    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      const absoluteUrl = utils.tryAbsoluteURL(href, request.loadedUrl);

      if (absoluteUrl) {
        allLinks.push(absoluteUrl);
      }
    });

    // Find URLs in text content
    const textContent = $.text();
    const urlsInText = utils.extractUrls({
      string: textContent,
      urlRegex: utils.URL_WITH_COMMAS_REGEX,
    });

    console.log(`Found ${allLinks.length} links and ${urlsInText.length} URLs in text`);

    await enqueueLinks({
      urls: allLinks.slice(0, 100), // Limit to first 100 URLs
      label: 'DISCOVERED',
    });
  },
});
```

### System Detection

Functions for detecting the runtime environment and system capabilities.

```typescript { .api }
/**
 * Detect if running in Docker container
 * @param forceReset - Force rechecking (internal use)
 */
function isDocker(forceReset?: boolean): Promise<boolean>;

/**
 * Detect if running in any containerized environment
 */
function isContainerized(): Promise<boolean>;

/**
 * Detect if running in AWS Lambda
 */
function isLambda(): boolean;

/**
 * Get cgroup version (V1 or V2)
 * @param forceReset - Force rechecking (internal use)
 */
function getCgroupsVersion(forceReset?: boolean): Promise<'V1' | 'V2' | null>;

interface CpuTicks {
  /** User CPU time */
  user: number;

  /** System CPU time */
  system: number;

  /** Idle CPU time */
  idle: number;

  /** I/O wait time */
  iowait: number;

  /** IRQ time */
  irq: number;

  /** Soft IRQ time */
  softirq: number;

  /** Steal time */
  steal: number;

  /** Guest time */
  guest: number;
}

interface MemoryInfo {
  /** Total system memory in bytes */
  totalBytes: number;

  /** Free memory in bytes */
  freeBytes: number;

  /** Used memory in bytes */
  usedBytes: number;

  /** Available memory in bytes */
  availableBytes: number;

  /** Memory usage as a ratio (0-1) */
  ratio: number;
}
```

**Usage Examples:**

```typescript
import { utils, Configuration, BasicCrawler } from "crawlee";

// Detect environment and configure accordingly
if (await utils.isDocker()) {
  console.log('Running in Docker - using optimized settings');
  Configuration.getGlobalConfig().set('defaultDatasetId', 'docker-dataset');
}

if (utils.isLambda()) {
  console.log('Running in Lambda - reducing memory usage');
  Configuration.getGlobalConfig().set('memoryMbytes', 512);
}

// Monitor system resources
async function logSystemInfo() {
  console.log('System Status:');
  console.log(`Containerized: ${await utils.isContainerized()}`);
  console.log(`Cgroups version: ${await utils.getCgroupsVersion()}`);
  console.log(`Lambda environment: ${utils.isLambda()}`);
}

// Use in crawler for adaptive behavior
const crawler = new BasicCrawler({
  requestHandler: async ({ request }) => {
    // Check environment before processing
    if (await utils.isContainerized()) {
      console.log('Running in containerized environment');
    }

    // Process request...
  },

  // Adjust concurrency based on environment (set at initialization)
  maxConcurrency: utils.isLambda() ? 1 : 10,
});
```

### OpenGraph Parsing

Extract OpenGraph metadata from HTML pages.

```typescript { .api }
/**
 * Parse OpenGraph tags from HTML content
 */
function parseOpenGraph(html: string): Dictionary<string>;
```

**Usage Examples:**

```typescript
import { utils, CheerioCrawler } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData, body }) => {
    // Parse OpenGraph data
    const ogData = utils.parseOpenGraph(body);

    // Extract standard metadata
    const metadata = {
      url: request.loadedUrl,
      title: $('title').text() || ogData['og:title'],
      description: $('meta[name="description"]').attr('content') || ogData['og:description'],
      image: ogData['og:image'],
      type: ogData['og:type'],
      siteName: ogData['og:site_name'],
      author: ogData['article:author'],
      publishedTime: ogData['article:published_time'],
      twitterCard: ogData['twitter:card'],
      twitterSite: ogData['twitter:site'],
      // Include all OpenGraph data
      openGraph: ogData,
    };

    await pushData(metadata);
  },
});
```

### Mathematical Utilities

Helper functions for calculations and data processing.

```typescript { .api }
/**
 * Calculate weighted average from values and weights
 */
function weightedAvg(values: number[], weights: number[]): number;

/**
 * Convert snake_case strings to camelCase
 */
function snakeCaseToCamelCase(str: string): string;
```

**Usage Examples:**

```typescript
import { utils } from "crawlee";

// Calculate weighted ratings
const ratings = [4.5, 3.8, 4.9, 4.1];
const weights = [100, 50, 200, 75]; // Number of reviews
const averageRating = utils.weightedAvg(ratings, weights);

console.log(`Weighted average rating: ${averageRating.toFixed(2)}`);

// Convert API response keys
const apiResponse = {
  product_name: 'Widget',
  price_usd: 29.99,
  is_available: true,
  created_at: '2023-01-01',
};

const camelCaseResponse = {};
Object.entries(apiResponse).forEach(([key, value]) => {
  const camelKey = utils.snakeCaseToCamelCase(key);
  camelCaseResponse[camelKey] = value;
});

console.log(camelCaseResponse);
// Result: { productName: 'Widget', priceUsd: 29.99, isAvailable: true, createdAt: '2023-01-01' }
```

### DOM Utilities

Helper functions for DOM manipulation and processing.

```typescript { .api }
/**
 * Expand shadow DOM roots to access shadow content
 */
function expandShadowRoots(document: Document): void;
```

**Usage Examples:**

```typescript
import { JSDOMCrawler, utils } from "crawlee";

const crawler = new JSDOMCrawler({
  requestHandler: async ({ window, document, request, pushData }) => {
    // Expand shadow DOM to access hidden content
    utils.expandShadowRoots(document);

    // Now you can query shadow DOM content
    const shadowContent = document.querySelectorAll('[data-shadow-content]');

    const extractedData = Array.from(shadowContent).map(element => ({
      text: element.textContent?.trim(),
      attributes: Array.from(element.attributes).reduce((attrs, attr) => {
        attrs[attr.name] = attr.value;
        return attrs;
      }, {}),
    }));

    await pushData({
      url: request.loadedUrl,
      shadowDomData: extractedData,
      hasShadowContent: shadowContent.length > 0,
    });
  },
});
```

### Unified Utils Object

The main utils object that combines all utility functions.

```typescript { .api }
const utils: {
  /** Puppeteer utility functions */
  puppeteer: typeof puppeteerUtils;

  /** Playwright utility functions */
  playwright: typeof playwrightUtils;

  /** Logging utility */
  log: Log;

  /** Link enqueueing function */
  enqueueLinks: typeof enqueueLinks;

  /** Social media parsing utilities */
  social: typeof social;

  /** Sleep function */
  sleep: typeof sleep;

  /** URL list downloading */
  downloadListOfUrls: typeof downloadListOfUrls;

  /** OpenGraph parsing */
  parseOpenGraph: typeof parseOpenGraph;

  /** System detection functions */
  isDocker: typeof isDocker;
  isLambda: typeof isLambda;
  isContainerized: typeof isContainerized;
  getCgroupsVersion: typeof getCgroupsVersion;

  // Note: System monitoring functions are available in utils object but not directly exported

  /** Mathematical utilities */
  weightedAvg: typeof weightedAvg;

  /** String utilities */
  snakeCaseToCamelCase: typeof snakeCaseToCamelCase;

  /** URL utilities */
  extractUrls: typeof extractUrls;
  tryAbsoluteURL: typeof tryAbsoluteURL;
  URL_NO_COMMAS_REGEX: RegExp;
  URL_WITH_COMMAS_REGEX: RegExp;

  /** DOM utilities */
  expandShadowRoots: typeof expandShadowRoots;
};
```

**Usage Examples:**

```typescript
import { CheerioCrawler, utils } from "crawlee";

// All utilities available through single import
console.log('Environment check:');
console.log(`Docker: ${await utils.isDocker()}`);
console.log(`Lambda: ${utils.isLambda()}`);

// Use social media parsing
const html = '<p>Contact: info@example.com, Twitter: @company</p>';
const contacts = utils.social.parseHandlesFromHtml(html);

// Use URL extraction
const text = 'Visit https://example.com for more info';
const urls = utils.extractUrls({ string: text });

// Use system detection
const isInDocker = await utils.isDocker();
console.log(`Running in Docker: ${isInDocker}`);

// Use in crawler with all utilities
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    // Rate limiting
    await utils.sleep(1000);

    // Extract data
    const ogData = utils.parseOpenGraph($.html());
    const socialData = utils.social.parseHandlesFromHtml($.html());
    const urls = utils.extractUrls({ string: $.text() });

    await pushData({
      url: request.loadedUrl,
      metadata: ogData,
      contacts: socialData,
      extractedUrls: urls,
      systemInfo: {
        isDocker: await utils.isDocker(),
        isLambda: utils.isLambda(),
      },
    });

    // Environment-aware link enqueueing
    const isLimitedEnv = utils.isLambda() || await utils.isContainerized();
    await utils.enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: 'a[href]',
      limit: isLimitedEnv ? 10 : 50, // Reduce links in constrained environments
    });
  },
});
```

## Types

```typescript { .api }
interface Log {
  /** Log debug message */
  debug(message: string, data?: any): void;

  /** Log info message */
  info(message: string, data?: any): void;

  /** Log warning message */
  warning(message: string, data?: any): void;

  /** Log error message */
  error(message: string, error?: Error): void;

  /** Log exception */
  exception(error: Error, message?: string, data?: any): void;

  /** Get child logger with prefix */
  child(options: { prefix?: string; suffix?: string }): Log;
}

interface PseudoUrl {
  /** Create pseudo-URL matcher */
  new (purl: string, requestTemplate?: Partial<RequestOptions>): PseudoUrl;

  /** Test if URL matches pattern */
  matches(url: string): boolean;

  /** Create request from matched URL */
  createRequest(url: string): RequestOptions;
}

type BufferEncoding = 'ascii' | 'utf8' | 'utf16le' | 'ucs2' | 'base64' | 'latin1' | 'binary' | 'hex';

interface RequestTemplate {
  /** Default user data for matched requests */
  userData?: Dictionary;

  /** Default label for matched requests */
  label?: string;

  /** Default HTTP method */
  method?: HttpMethod;

  /** Default headers */
  headers?: Dictionary<string>;
}
```