# Utilities

Crawlee provides extensive utility functions for common crawling tasks including URL extraction, social media parsing, system detection, and various helper functions for web scraping operations.

## Capabilities

### Sleep Utility

Promise-based sleep function for introducing delays in crawling operations.

```typescript { .api }
/**
 * Promise-based sleep function
 * @param millis - Milliseconds to sleep (defaults to random between 1-5 seconds)
 */
function sleep(millis?: number): Promise<void>;
```

**Usage Examples:**

```typescript
import { sleep } from "crawlee";

// Sleep for 2 seconds
await sleep(2000);

// Random sleep between 1-5 seconds
await sleep();

// Use in crawler for rate limiting
const crawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // Process request
    console.log(`Processing: ${request.url}`);

    // Add delay between requests
    await sleep(1000);
  },
});
```

### Link Enqueueing

Extract and enqueue links from web pages with powerful filtering and transformation options.

```typescript { .api }
/**
 * Extract and enqueue links from HTML pages
 */
function enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

interface EnqueueLinksOptions {
  /** Cheerio root object or HTML string */
  $?: CheerioRoot;

  /** Base URL for resolving relative links */
  baseUrl?: string;

  /** CSS selector for finding links */
  selector?: string;

  /** Pseudo-URLs for matching links */
  pseudoUrls?: (string | PseudoUrl)[];

  /** Glob patterns for URLs to include */
  globs?: string[];

  /** URLs or patterns to exclude */
  exclude?: (string | RegExp)[];

  /** Label to assign to enqueued requests */
  label?: string;

  /** Custom user data to attach */
  userData?: Dictionary;

  /** Transform function for request options */
  transformRequestFunction?: (request: RequestOptions) => RequestOptions;

  /** Request queue to add requests to */
  requestQueue?: RequestQueue;

  /** Maximum number of links to enqueue */
  limit?: number;

  /** Strategy restricting which links are enqueued (by domain/origin) */
  strategy?: EnqueueStrategy;
}

type EnqueueStrategy = 'all' | 'same-domain' | 'same-subdomain' | 'same-origin';
```

**Usage Examples:**

```typescript
import { CheerioCrawler, enqueueLinks } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks: crawlerEnqueueLinks }) => {
    // Using crawler's built-in enqueueLinks
    await crawlerEnqueueLinks({
      selector: 'a[href]',
      globs: ['**/products/**', '**/category/**'],
      exclude: [/\/admin\//, /\/login/],
      label: 'PRODUCT_PAGE',
      transformRequestFunction: (req) => ({
        ...req,
        userData: { parentUrl: request.url },
      }),
      limit: 50,
    });

    // Using standalone enqueueLinks function
    const result = await enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: '.pagination a',
      label: 'PAGINATION',
      strategy: 'same-domain',
    });

    console.log(`Enqueued ${result.processedRequests.length} pagination links`);
  },
});
```

### Social Media Parsing

Comprehensive social media handle and contact extraction from text and HTML content.

```typescript { .api }
const social: {
  /** Extract email addresses from text */
  emailsFromText(text: string): string[];

  /** Extract emails from mailto: URLs */
  emailsFromUrls(urls: string[]): string[];

  /** Extract phone numbers from text */
  phonesFromText(text: string): string[];

  /** Extract phones from tel: URLs */
  phonesFromUrls(urls: string[]): string[];

  /** Parse all social handles from HTML */
  parseHandlesFromHtml(html: string): SocialHandles;

  /** Regular expression patterns for matching emails */
  EMAIL_REGEX: RegExp;
  EMAIL_REGEX_GLOBAL: RegExp;

  /** Social platform URL patterns */
  LINKEDIN_REGEX: RegExp;
  LINKEDIN_REGEX_GLOBAL: RegExp;
  INSTAGRAM_REGEX: RegExp;
  INSTAGRAM_REGEX_GLOBAL: RegExp;
  TWITTER_REGEX: RegExp;
  TWITTER_REGEX_GLOBAL: RegExp;
  FACEBOOK_REGEX: RegExp;
  FACEBOOK_REGEX_GLOBAL: RegExp;
  YOUTUBE_REGEX: RegExp;
  YOUTUBE_REGEX_GLOBAL: RegExp;
  TIKTOK_REGEX: RegExp;
  TIKTOK_REGEX_GLOBAL: RegExp;
  PINTEREST_REGEX: RegExp;
  PINTEREST_REGEX_GLOBAL: RegExp;
  DISCORD_REGEX: RegExp;
  DISCORD_REGEX_GLOBAL: RegExp;
};

interface SocialHandles {
  emails: string[];
  phones: string[];
  linkedIns: string[];
  twitters: string[];
  instagrams: string[];
  facebooks: string[];
  youtubes: string[];
  tiktoks: string[];
  pinterests: string[];
  discords: string[];
  phonesUncertain: string[];
}
```

**Usage Examples:**

```typescript
import { CheerioCrawler, utils } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    const html = $.html();
    const textContent = $.text();

    // Extract all social handles
    const socialHandles = utils.social.parseHandlesFromHtml(html);

    // Extract emails from text content
    const emailsInText = utils.social.emailsFromText(textContent);

    // Extract phones from text
    const phonesInText = utils.social.phonesFromText(textContent);

    // Get all links and extract emails/phones from them
    const allLinks = [];
    $('a[href]').each((_, link) => {
      allLinks.push($(link).attr('href'));
    });

    const emailsFromLinks = utils.social.emailsFromUrls(allLinks);
    const phonesFromLinks = utils.social.phonesFromUrls(allLinks);

    // Combine all contacts
    const allContacts = {
      url: request.loadedUrl,
      emails: [...new Set([...socialHandles.emails, ...emailsInText, ...emailsFromLinks])],
      phones: [...new Set([...socialHandles.phones, ...phonesInText, ...phonesFromLinks])],
      socialMedia: {
        linkedin: socialHandles.linkedIns,
        twitter: socialHandles.twitters,
        instagram: socialHandles.instagrams,
        facebook: socialHandles.facebooks,
        youtube: socialHandles.youtubes,
        tiktok: socialHandles.tiktoks,
        pinterest: socialHandles.pinterests,
        discord: socialHandles.discords,
      },
    };

    await pushData(allContacts);
  },
});

// Custom social media extraction
const customText = "Contact us at info@example.com or follow @example on Twitter";
const emails = utils.social.emailsFromText(customText);
const twitterMatches = customText.match(utils.social.TWITTER_REGEX_GLOBAL);

console.log('Emails found:', emails);
console.log('Twitter handles:', twitterMatches);
```

### URL Utilities

Functions for URL extraction, validation, and manipulation.

```typescript { .api }
/** Regular expressions for matching URLs */
const URL_NO_COMMAS_REGEX: RegExp;
const URL_WITH_COMMAS_REGEX: RegExp;

/**
 * Extract URLs from text content
 */
function extractUrls(options: ExtractUrlsOptions): string[];

/**
 * Download and parse a list of URLs from a remote source
 */
function downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;

/**
 * Safely create absolute URLs from relative URLs
 */
function tryAbsoluteURL(href: string, baseUrl: string): string | null;

interface ExtractUrlsOptions {
  /** Text content to extract URLs from */
  string: string;

  /** Custom regex for matching URLs (e.g. URL_WITH_COMMAS_REGEX to include URLs containing commas) */
  urlRegex?: RegExp;
}

interface DownloadListOfUrlsOptions {
  /** URL of the list to download */
  url: string;

  /** Character encoding */
  encoding?: BufferEncoding;

  /** Regex pattern to match URLs in the content */
  urlRegex?: RegExp;
}
```

**Usage Examples:**

```typescript
import { utils, CheerioCrawler } from "crawlee";

// Extract URLs from text
const textWithUrls = "Visit https://example.com or check out http://test.com/page";
const extractedUrls = utils.extractUrls({ string: textWithUrls });
console.log('Found URLs:', extractedUrls);

// Download URL list from remote source
const urlList = await utils.downloadListOfUrls({
  url: 'https://example.com/sitemap.txt',
  encoding: 'utf8',
});

// Use in crawler for URL validation
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks }) => {
    // Extract and validate URLs
    const allLinks = [];
    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      const absoluteUrl = utils.tryAbsoluteURL(href, request.loadedUrl);

      if (absoluteUrl) {
        allLinks.push(absoluteUrl);
      }
    });

    // Find URLs in text content
    const textContent = $.text();
    const urlsInText = utils.extractUrls({
      string: textContent,
      urlRegex: utils.URL_WITH_COMMAS_REGEX,
    });

    console.log(`Found ${allLinks.length} links and ${urlsInText.length} URLs in text`);

    await enqueueLinks({
      urls: allLinks.slice(0, 100), // Limit to first 100 URLs
      label: 'DISCOVERED',
    });
  },
});
```

### System Detection

Functions for detecting the runtime environment and system capabilities.

```typescript { .api }
/**
 * Detect if running in Docker container
 * @param forceReset - Force rechecking (internal use)
 */
function isDocker(forceReset?: boolean): Promise<boolean>;

/**
 * Detect if running in any containerized environment
 */
function isContainerized(): Promise<boolean>;

/**
 * Detect if running in AWS Lambda
 */
function isLambda(): boolean;

/**
 * Get cgroup version (V1 or V2)
 * @param forceReset - Force rechecking (internal use)
 */
function getCgroupsVersion(forceReset?: boolean): Promise<'V1' | 'V2' | null>;

interface CpuTicks {
  /** User CPU time */
  user: number;

  /** System CPU time */
  system: number;

  /** Idle CPU time */
  idle: number;

  /** I/O wait time */
  iowait: number;

  /** IRQ time */
  irq: number;

  /** Soft IRQ time */
  softirq: number;

  /** Steal time */
  steal: number;

  /** Guest time */
  guest: number;
}

interface MemoryInfo {
  /** Total system memory in bytes */
  totalBytes: number;

  /** Free memory in bytes */
  freeBytes: number;

  /** Used memory in bytes */
  usedBytes: number;

  /** Available memory in bytes */
  availableBytes: number;

  /** Memory usage as a ratio (0-1) */
  ratio: number;
}
```

**Usage Examples:**

```typescript
import { utils, Configuration, BasicCrawler } from "crawlee";

// Detect environment and configure accordingly
if (await utils.isDocker()) {
  console.log('Running in Docker - using optimized settings');
  Configuration.getGlobalConfig().set('defaultDatasetId', 'docker-dataset');
}

if (utils.isLambda()) {
  console.log('Running in Lambda - reducing memory usage');
  Configuration.getGlobalConfig().set('memoryMbytes', 512);
}

// Monitor system resources
async function logSystemInfo() {
  console.log('System Status:');
  console.log(`Containerized: ${await utils.isContainerized()}`);
  console.log(`Cgroups version: ${await utils.getCgroupsVersion()}`);
  console.log(`Lambda environment: ${utils.isLambda()}`);
}

// Use in crawler for adaptive behavior
const crawler = new BasicCrawler({
  requestHandler: async ({ request }) => {
    // Check environment before processing
    if (await utils.isContainerized()) {
      console.log('Running in containerized environment');
    }

    // Process request...
  },

  // Adjust concurrency based on environment (set at initialization)
  maxConcurrency: utils.isLambda() ? 1 : 10,
});
```

### OpenGraph Parsing

Extract OpenGraph metadata from HTML pages.

```typescript { .api }
/**
 * Parse OpenGraph tags from HTML content
 */
function parseOpenGraph(html: string): Dictionary<string>;
```

**Usage Examples:**

```typescript
import { utils, CheerioCrawler } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData, body }) => {
    // Parse OpenGraph data
    const ogData = utils.parseOpenGraph(body);

    // Extract standard metadata
    const metadata = {
      url: request.loadedUrl,
      title: $('title').text() || ogData['og:title'],
      description: $('meta[name="description"]').attr('content') || ogData['og:description'],
      image: ogData['og:image'],
      type: ogData['og:type'],
      siteName: ogData['og:site_name'],
      author: ogData['article:author'],
      publishedTime: ogData['article:published_time'],
      twitterCard: ogData['twitter:card'],
      twitterSite: ogData['twitter:site'],
      // Include all OpenGraph data
      openGraph: ogData,
    };

    await pushData(metadata);
  },
});
```

### Mathematical Utilities

Helper functions for calculations and data processing.

```typescript { .api }
/**
 * Calculate weighted average from values and weights
 */
function weightedAvg(values: number[], weights: number[]): number;

/**
 * Convert snake_case strings to camelCase
 */
function snakeCaseToCamelCase(str: string): string;
```

**Usage Examples:**

```typescript
import { utils } from "crawlee";

// Calculate weighted ratings
const ratings = [4.5, 3.8, 4.9, 4.1];
const weights = [100, 50, 200, 75]; // Number of reviews
const averageRating = utils.weightedAvg(ratings, weights);

console.log(`Weighted average rating: ${averageRating.toFixed(2)}`);

// Convert API response keys
const apiResponse = {
  product_name: 'Widget',
  price_usd: 29.99,
  is_available: true,
  created_at: '2023-01-01',
};

const camelCaseResponse = {};
Object.entries(apiResponse).forEach(([key, value]) => {
  const camelKey = utils.snakeCaseToCamelCase(key);
  camelCaseResponse[camelKey] = value;
});

console.log(camelCaseResponse);
// Result: { productName: 'Widget', priceUsd: 29.99, isAvailable: true, createdAt: '2023-01-01' }
```

### DOM Utilities

Helper functions for DOM manipulation and processing.

```typescript { .api }
/**
 * Expand shadow DOM roots to access shadow content
 */
function expandShadowRoots(document: Document): void;
```

**Usage Examples:**

```typescript
import { JSDOMCrawler, utils } from "crawlee";

const crawler = new JSDOMCrawler({
  requestHandler: async ({ window, document, request, pushData }) => {
    // Expand shadow DOM to access hidden content
    utils.expandShadowRoots(document);

    // Now you can query shadow DOM content
    const shadowContent = document.querySelectorAll('[data-shadow-content]');

    const extractedData = Array.from(shadowContent).map(element => ({
      text: element.textContent?.trim(),
      attributes: Array.from(element.attributes).reduce((attrs, attr) => {
        attrs[attr.name] = attr.value;
        return attrs;
      }, {}),
    }));

    await pushData({
      url: request.loadedUrl,
      shadowDomData: extractedData,
      hasShadowContent: shadowContent.length > 0,
    });
  },
});
```

### Unified Utils Object

The main utils object that combines all utility functions.

```typescript { .api }
const utils: {
  /** Puppeteer utility functions */
  puppeteer: typeof puppeteerUtils;

  /** Playwright utility functions */
  playwright: typeof playwrightUtils;

  /** Logging utility */
  log: Log;

  /** Link enqueueing function */
  enqueueLinks: typeof enqueueLinks;

  /** Social media parsing utilities */
  social: typeof social;

  /** Sleep function */
  sleep: typeof sleep;

  /** URL list downloading */
  downloadListOfUrls: typeof downloadListOfUrls;

  /** OpenGraph parsing */
  parseOpenGraph: typeof parseOpenGraph;

  /** System detection functions */
  isDocker: typeof isDocker;
  isLambda: typeof isLambda;
  isContainerized: typeof isContainerized;
  getCgroupsVersion: typeof getCgroupsVersion;

  // Note: System monitoring functions are available in utils object but not directly exported

  /** Mathematical utilities */
  weightedAvg: typeof weightedAvg;

  /** String utilities */
  snakeCaseToCamelCase: typeof snakeCaseToCamelCase;

  /** URL utilities */
  extractUrls: typeof extractUrls;
  tryAbsoluteURL: typeof tryAbsoluteURL;
  URL_NO_COMMAS_REGEX: RegExp;
  URL_WITH_COMMAS_REGEX: RegExp;

  /** DOM utilities */
  expandShadowRoots: typeof expandShadowRoots;
};
```

**Usage Examples:**

```typescript
import { CheerioCrawler, utils } from "crawlee";

// All utilities available through single import
console.log('Environment check:');
console.log(`Docker: ${await utils.isDocker()}`);
console.log(`Lambda: ${utils.isLambda()}`);

// Use social media parsing
const html = '<p>Contact: info@example.com, Twitter: @company</p>';
const contacts = utils.social.parseHandlesFromHtml(html);

// Use URL extraction
const text = 'Visit https://example.com for more info';
const urls = utils.extractUrls({ string: text });

// Use system detection
const isInDocker = await utils.isDocker();
console.log(`Running in Docker: ${isInDocker}`);

// Use in crawler with all utilities
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    // Rate limiting
    await utils.sleep(1000);

    // Extract data
    const ogData = utils.parseOpenGraph($.html());
    const socialData = utils.social.parseHandlesFromHtml($.html());
    const urls = utils.extractUrls({ string: $.text() });

    await pushData({
      url: request.loadedUrl,
      metadata: ogData,
      contacts: socialData,
      extractedUrls: urls,
      systemInfo: {
        isDocker: await utils.isDocker(),
        isLambda: utils.isLambda(),
      },
    });

    // Environment-aware link enqueueing
    const isLimitedEnv = utils.isLambda() || await utils.isContainerized();
    await utils.enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: 'a[href]',
      limit: isLimitedEnv ? 10 : 50, // Reduce links in constrained environments
    });
  },
});
```

## Types

```typescript { .api }
interface Log {
  /** Log debug message */
  debug(message: string, data?: any): void;

  /** Log info message */
  info(message: string, data?: any): void;

  /** Log warning message */
  warning(message: string, data?: any): void;

  /** Log error message */
  error(message: string, error?: Error): void;

  /** Log exception */
  exception(error: Error, message?: string, data?: any): void;

  /** Get child logger with prefix */
  child(options: { prefix?: string; suffix?: string }): Log;
}

interface PseudoUrl {
  /** Create pseudo-URL matcher */
  new (purl: string, requestTemplate?: Partial<RequestOptions>): PseudoUrl;

  /** Test if URL matches pattern */
  matches(url: string): boolean;

  /** Create request from matched URL */
  createRequest(url: string): RequestOptions;
}

type BufferEncoding = 'ascii' | 'utf8' | 'utf16le' | 'ucs2' | 'base64' | 'latin1' | 'binary' | 'hex';

interface RequestTemplate {
  /** Default user data for matched requests */
  userData?: Dictionary;

  /** Default label for matched requests */
  label?: string;

  /** Default HTTP method */
  method?: HttpMethod;

  /** Default headers */
  headers?: Dictionary<string>;
}
```