The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless browsers via Puppeteer and Playwright.
Crawlee provides extensive utility functions for common crawling tasks including URL extraction, social media parsing, system detection, and various helper functions for web scraping operations.
Promise-based sleep function for introducing delays in crawling operations.
/**
* Promise-based sleep function
* @param millis - Milliseconds to sleep (defaults to random between 1-5 seconds)
*/
function sleep(millis?: number): Promise<void>;

Usage Examples:
import { sleep } from "crawlee";
// Sleep for 2 seconds
await sleep(2000);
// Random sleep between 1-5 seconds
await sleep();
// Use in crawler for rate limiting
const crawler = new CheerioCrawler({
requestHandler: async ({ request }) => {
// Process request
console.log(`Processing: ${request.url}`);
// Add delay between requests
await sleep(1000);
},
});

Extract and enqueue links from web pages with powerful filtering and transformation options.
/**
* Extract and enqueue links from HTML pages
*/
function enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
interface EnqueueLinksOptions {
  /** Cheerio root object or HTML string */
  $?: CheerioRoot;
  /** URLs to enqueue directly, as an alternative to extracting them from `$` (see usage examples) */
  urls?: string[];
  /** Base URL for resolving relative links */
  baseUrl?: string;
  /** CSS selector for finding links */
  selector?: string;
  /** Pseudo-URLs for matching links */
  pseudoUrls?: (string | PseudoUrl)[];
  /** Glob patterns for URLs to include */
  globs?: string[];
  /** URLs or patterns to exclude */
  exclude?: (string | RegExp)[];
  /** Label to assign to enqueued requests */
  label?: string;
  /** Custom user data to attach */
  userData?: Dictionary;
  /** Transform function for request options */
  transformRequestFunction?: (request: RequestOptions) => RequestOptions;
  /** Request queue to add requests to */
  requestQueue?: RequestQueue;
  /** Maximum number of links to enqueue */
  limit?: number;
  /** Scoping strategy restricting which discovered URLs are enqueued (same domain, origin, etc.) */
  strategy?: EnqueueStrategy;
}
// NOTE(review): Crawlee's EnqueueStrategy values are 'all', 'same-domain',
// 'same-hostname' and 'same-origin' — 'same-subdomain' is not a valid value.
type EnqueueStrategy = 'all' | 'same-domain' | 'same-hostname' | 'same-origin';

Usage Examples:
import { CheerioCrawler, enqueueLinks } from "crawlee";
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks: crawlerEnqueueLinks }) => {
    // Using crawler's built-in enqueueLinks
    await crawlerEnqueueLinks({
      selector: 'a[href]',
      globs: ['**/products/**', '**/category/**'],
      exclude: [/\/admin\//, /\/login/],
      label: 'PRODUCT_PAGE',
      transformRequestFunction: (req) => ({
        ...req,
        userData: { parentUrl: request.url },
      }),
      limit: 50,
    });
    // Using standalone enqueueLinks function
    const result = await enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: '.pagination a',
      label: 'PAGINATION',
      strategy: 'same-domain',
    });
    // processedRequests is an array (BatchAddRequestsResult) — report its length,
    // not the array itself, which would stringify to "[object Object],…"
    console.log(`Enqueued ${result.processedRequests.length} pagination links`);
  },
});

Comprehensive social media handle and contact extraction from text and HTML content.
const social: {
/** Extract email addresses from text */
emailsFromText(text: string): string[];
/** Extract emails from mailto: URLs */
emailsFromUrls(urls: string[]): string[];
/** Extract phone numbers from text */
phonesFromText(text: string): string[];
/** Extract phones from tel: URLs */
phonesFromUrls(urls: string[]): string[];
/** Parse all social handles from HTML */
parseHandlesFromHtml(html: string): SocialHandles;
/** Regular expression patterns for matching emails */
EMAIL_REGEX: RegExp;
EMAIL_REGEX_GLOBAL: RegExp;
/** Social platform URL patterns */
LINKEDIN_REGEX: RegExp;
LINKEDIN_REGEX_GLOBAL: RegExp;
INSTAGRAM_REGEX: RegExp;
INSTAGRAM_REGEX_GLOBAL: RegExp;
TWITTER_REGEX: RegExp;
TWITTER_REGEX_GLOBAL: RegExp;
FACEBOOK_REGEX: RegExp;
FACEBOOK_REGEX_GLOBAL: RegExp;
YOUTUBE_REGEX: RegExp;
YOUTUBE_REGEX_GLOBAL: RegExp;
TIKTOK_REGEX: RegExp;
TIKTOK_REGEX_GLOBAL: RegExp;
PINTEREST_REGEX: RegExp;
PINTEREST_REGEX_GLOBAL: RegExp;
DISCORD_REGEX: RegExp;
DISCORD_REGEX_GLOBAL: RegExp;
};
interface SocialHandles {
emails: string[];
phones: string[];
linkedIns: string[];
twitters: string[];
instagrams: string[];
facebooks: string[];
youtubes: string[];
tiktoks: string[];
pinterests: string[];
discords: string[];
phonesUncertain: string[];
}Usage Examples:
import { CheerioCrawler, utils } from "crawlee";
const crawler = new CheerioCrawler({
requestHandler: async ({ $, request, pushData }) => {
const html = $.html();
const textContent = $.text();
// Extract all social handles
const socialHandles = utils.social.parseHandlesFromHtml(html);
// Extract emails from text content
const emailsInText = utils.social.emailsFromText(textContent);
// Extract phones from text
const phonesInText = utils.social.phonesFromText(textContent);
// Get all links and extract emails/phones from them
const allLinks = [];
$('a[href]').each((_, link) => {
allLinks.push($(link).attr('href'));
});
const emailsFromLinks = utils.social.emailsFromUrls(allLinks);
const phonesFromLinks = utils.social.phonesFromUrls(allLinks);
// Combine all contacts
const allContacts = {
url: request.loadedUrl,
emails: [...new Set([...socialHandles.emails, ...emailsInText, ...emailsFromLinks])],
phones: [...new Set([...socialHandles.phones, ...phonesInText, ...phonesFromLinks])],
socialMedia: {
linkedin: socialHandles.linkedIns,
twitter: socialHandles.twitters,
instagram: socialHandles.instagrams,
facebook: socialHandles.facebooks,
youtube: socialHandles.youtubes,
tiktok: socialHandles.tiktoks,
pinterest: socialHandles.pinterests,
discord: socialHandles.discords,
},
};
await pushData(allContacts);
},
});
// Custom social media extraction
const customText = "Contact us at info@example.com or follow @example on Twitter";
const emails = utils.social.emailsFromText(customText);
const twitterMatches = customText.match(utils.social.TWITTER_REGEX_GLOBAL);
console.log('Emails found:', emails);
console.log('Twitter handles:', twitterMatches);Functions for URL extraction, validation, and manipulation.
/** Regular expressions for matching URLs */
const URL_NO_COMMAS_REGEX: RegExp;
const URL_WITH_COMMAS_REGEX: RegExp;
/**
* Extract URLs from text content
*/
function extractUrls(options: ExtractUrlsOptions): string[];
/**
* Download and parse a list of URLs from a remote source
*/
function downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;
/**
* Safely create absolute URLs from relative URLs
*/
function tryAbsoluteURL(href: string, baseUrl: string): string | null;
interface ExtractUrlsOptions {
/** Text content to extract URLs from */
string: string;
/** Custom regular expression for matching URLs (e.g. URL_WITH_COMMAS_REGEX to also match URLs containing commas) */
urlRegex?: RegExp;
}
interface DownloadListOfUrlsOptions {
/** URL of the list to download */
url: string;
/** Character encoding */
encoding?: BufferEncoding;
/** Regex pattern to match URLs in the content */
urlRegex?: RegExp;
}Usage Examples:
import { utils, CheerioCrawler } from "crawlee";
// Extract URLs from text
const textWithUrls = "Visit https://example.com or check out http://test.com/page";
const extractedUrls = utils.extractUrls({ string: textWithUrls });
console.log('Found URLs:', extractedUrls);
// Download URL list from remote source
const urlList = await utils.downloadListOfUrls({
url: 'https://example.com/sitemap.txt',
encoding: 'utf8',
});
// Use in crawler for URL validation
const crawler = new CheerioCrawler({
requestHandler: async ({ $, request, enqueueLinks }) => {
// Extract and validate URLs
const allLinks = [];
$('a[href]').each((_, element) => {
const href = $(element).attr('href');
const absoluteUrl = utils.tryAbsoluteURL(href, request.loadedUrl);
if (absoluteUrl) {
allLinks.push(absoluteUrl);
}
});
// Find URLs in text content
const textContent = $.text();
const urlsInText = utils.extractUrls({
string: textContent,
urlRegex: utils.URL_WITH_COMMAS_REGEX,
});
console.log(`Found ${allLinks.length} links and ${urlsInText.length} URLs in text`);
await enqueueLinks({
urls: allLinks.slice(0, 100), // Limit to first 100 URLs
label: 'DISCOVERED',
});
},
});Functions for detecting the runtime environment and system capabilities.
/**
* Detect if running in Docker container
* @param forceReset - Force rechecking (internal use)
*/
function isDocker(forceReset?: boolean): Promise<boolean>;
/**
* Detect if running in any containerized environment
*/
function isContainerized(): Promise<boolean>;
/**
* Detect if running in AWS Lambda
*/
function isLambda(): boolean;
/**
* Get cgroup version (V1 or V2)
* @param forceReset - Force rechecking (internal use)
*/
function getCgroupsVersion(forceReset?: boolean): Promise<'V1' | 'V2' | null>;
interface CpuTicks {
/** User CPU time */
user: number;
/** System CPU time */
system: number;
/** Idle CPU time */
idle: number;
/** I/O wait time */
iowait: number;
/** IRQ time */
irq: number;
/** Soft IRQ time */
softirq: number;
/** Steal time */
steal: number;
/** Guest time */
guest: number;
}
interface MemoryInfo {
/** Total system memory in bytes */
totalBytes: number;
/** Free memory in bytes */
freeBytes: number;
/** Used memory in bytes */
usedBytes: number;
/** Available memory in bytes */
availableBytes: number;
/** Memory usage as a ratio (0-1) */
ratio: number;
}Usage Examples:
import { utils, Configuration } from "crawlee";
// Detect environment and configure accordingly
if (await utils.isDocker()) {
console.log('Running in Docker - using optimized settings');
Configuration.getGlobalConfig().set('defaultDatasetId', 'docker-dataset');
}
if (utils.isLambda()) {
console.log('Running in Lambda - reducing memory usage');
Configuration.getGlobalConfig().set('memoryMbytes', 512);
}
// Monitor system resources
async function logSystemInfo() {
console.log('System Status:');
console.log(`Containerized: ${await utils.isContainerized()}`);
console.log(`Cgroups version: ${await utils.getCgroupsVersion()}`);
console.log(`Lambda environment: ${utils.isLambda()}`);
}
// Use in crawler for adaptive behavior
const crawler = new BasicCrawler({
requestHandler: async ({ request }) => {
// Check environment before processing
if (await utils.isContainerized()) {
console.log('Running in containerized environment');
}
// Process request...
},
// Adjust concurrency based on environment (set at initialization)
maxConcurrency: utils.isLambda() ? 1 : 10,
});Extract OpenGraph metadata from HTML pages.
/**
* Parse OpenGraph tags from HTML content
*/
function parseOpenGraph(html: string): Dictionary<string>;Usage Examples:
import { utils, CheerioCrawler } from "crawlee";
const crawler = new CheerioCrawler({
requestHandler: async ({ $, request, pushData, body }) => {
// Parse OpenGraph data
const ogData = utils.parseOpenGraph(body);
// Extract standard metadata
const metadata = {
url: request.loadedUrl,
title: $('title').text() || ogData['og:title'],
description: $('meta[name="description"]').attr('content') || ogData['og:description'],
image: ogData['og:image'],
type: ogData['og:type'],
siteName: ogData['og:site_name'],
author: ogData['article:author'],
publishedTime: ogData['article:published_time'],
twitterCard: ogData['twitter:card'],
twitterSite: ogData['twitter:site'],
// Include all OpenGraph data
openGraph: ogData,
};
await pushData(metadata);
},
});Helper functions for calculations and data processing.
/**
* Calculate weighted average from values and weights
*/
function weightedAvg(values: number[], weights: number[]): number;
/**
* Convert snake_case strings to camelCase
*/
function snakeCaseToCamelCase(str: string): string;Usage Examples:
import { utils } from "crawlee";
// Calculate weighted ratings
const ratings = [4.5, 3.8, 4.9, 4.1];
const weights = [100, 50, 200, 75]; // Number of reviews
const averageRating = utils.weightedAvg(ratings, weights);
console.log(`Weighted average rating: ${averageRating.toFixed(2)}`);
// Convert API response keys
const apiResponse = {
product_name: 'Widget',
price_usd: 29.99,
is_available: true,
created_at: '2023-01-01',
};
const camelCaseResponse = {};
Object.entries(apiResponse).forEach(([key, value]) => {
const camelKey = utils.snakeCaseToCamelCase(key);
camelCaseResponse[camelKey] = value;
});
console.log(camelCaseResponse);
// Result: { productName: 'Widget', priceUsd: 29.99, isAvailable: true, createdAt: '2023-01-01' }Helper functions for DOM manipulation and processing.
/**
* Expand shadow DOM roots to access shadow content
*/
function expandShadowRoots(document: Document): void;Usage Examples:
import { JSDOMCrawler, utils } from "crawlee";
const crawler = new JSDOMCrawler({
requestHandler: async ({ window, document, request, pushData }) => {
// Expand shadow DOM to access hidden content
utils.expandShadowRoots(document);
// Now you can query shadow DOM content
const shadowContent = document.querySelectorAll('[data-shadow-content]');
const extractedData = Array.from(shadowContent).map(element => ({
text: element.textContent?.trim(),
attributes: Array.from(element.attributes).reduce((attrs, attr) => {
attrs[attr.name] = attr.value;
return attrs;
}, {}),
}));
await pushData({
url: request.loadedUrl,
shadowDomData: extractedData,
hasShadowContent: shadowContent.length > 0,
});
},
});The main utils object that combines all utility functions.
const utils: {
/** Puppeteer utility functions */
puppeteer: typeof puppeteerUtils;
/** Playwright utility functions */
playwright: typeof playwrightUtils;
/** Logging utility */
log: Log;
/** Link enqueueing function */
enqueueLinks: typeof enqueueLinks;
/** Social media parsing utilities */
social: typeof social;
/** Sleep function */
sleep: typeof sleep;
/** URL list downloading */
downloadListOfUrls: typeof downloadListOfUrls;
/** OpenGraph parsing */
parseOpenGraph: typeof parseOpenGraph;
/** System detection functions */
isDocker: typeof isDocker;
isLambda: typeof isLambda;
isContainerized: typeof isContainerized;
getCgroupsVersion: typeof getCgroupsVersion;
// Note: System monitoring functions are available in utils object but not directly exported
/** Mathematical utilities */
weightedAvg: typeof weightedAvg;
/** String utilities */
snakeCaseToCamelCase: typeof snakeCaseToCamelCase;
/** URL utilities */
extractUrls: typeof extractUrls;
tryAbsoluteURL: typeof tryAbsoluteURL;
URL_NO_COMMAS_REGEX: RegExp;
URL_WITH_COMMAS_REGEX: RegExp;
/** DOM utilities */
expandShadowRoots: typeof expandShadowRoots;
};Usage Examples:
import { utils } from "crawlee";
// All utilities available through single import
console.log('Environment check:');
console.log(`Docker: ${utils.isDocker()}`);
console.log(`Lambda: ${utils.isLambda()}`);
// Use social media parsing
const html = '<p>Contact: info@example.com, Twitter: @company</p>';
const contacts = utils.social.parseHandlesFromHtml(html);
// Use URL extraction
const text = 'Visit https://example.com for more info';
const urls = utils.extractUrls({ string: text });
// Use system detection
const isInDocker = await utils.isDocker();
console.log(`Running in Docker: ${isInDocker}`);
// Use in crawler with all utilities
const crawler = new CheerioCrawler({
requestHandler: async ({ $, request, pushData }) => {
// Rate limiting
await utils.sleep(1000);
// Extract data
const ogData = utils.parseOpenGraph($.html());
const socialData = utils.social.parseHandlesFromHtml($.html());
const urls = utils.extractUrls({ string: $.text() });
await pushData({
url: request.loadedUrl,
metadata: ogData,
contacts: socialData,
extractedUrls: urls,
systemInfo: {
isDocker: await utils.isDocker(),
isLambda: utils.isLambda(),
},
});
// Environment-aware link enqueueing
const isLimitedEnv = utils.isLambda() || await utils.isContainerized();
await utils.enqueueLinks({
$,
baseUrl: request.loadedUrl,
selector: 'a[href]',
limit: isLimitedEnv ? 10 : 50, // Reduce links in constrained environments
});
},
});interface Log {
/** Log debug message */
debug(message: string, data?: any): void;
/** Log info message */
info(message: string, data?: any): void;
/** Log warning message */
warning(message: string, data?: any): void;
/** Log error message */
error(message: string, error?: Error): void;
/** Log exception */
exception(error: Error, message?: string, data?: any): void;
/** Get child logger with prefix */
child(options: { prefix?: string; suffix?: string }): Log;
}
interface PseudoUrl {
/** Create pseudo-URL matcher */
// NOTE(review): this interface mixes a construct signature with instance
// methods; in Crawlee, PseudoUrl is a class — confirm the intended shape.
new (purl: string, requestTemplate?: Partial<RequestOptions>): PseudoUrl;
/** Test if URL matches pattern */
matches(url: string): boolean;
/** Create request from matched URL */
createRequest(url: string): RequestOptions;
}
type BufferEncoding = 'ascii' | 'utf8' | 'utf16le' | 'ucs2' | 'base64' | 'latin1' | 'binary' | 'hex';
interface RequestTemplate {
/** Default user data for matched requests */
userData?: Dictionary;
/** Default label for matched requests */
label?: string;
/** Default HTTP method */
method?: HttpMethod;
/** Default headers */
headers?: Dictionary<string>;
}Install with Tessl CLI
npx tessl i tessl/npm-crawlee