CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-crawlee

The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.

Overview
Eval results
Files

docs/utilities.md

Utilities

Crawlee provides extensive utility functions for common crawling tasks including URL extraction, social media parsing, system detection, and various helper functions for web scraping operations.

Capabilities

Sleep Utility

Promise-based sleep function for introducing delays in crawling operations.

/**
 * Promise-based sleep function
 * @param millis - Milliseconds to sleep (defaults to random between 1-5 seconds)
 */
function sleep(millis?: number): Promise<void>;

Usage Examples:

import { sleep } from "crawlee";

// Sleep for 2 seconds
await sleep(2000);

// Random sleep between 1-5 seconds
await sleep();

// Use in crawler for rate limiting
const crawler = new CheerioCrawler({
  requestHandler: async ({ request }) => {
    // Process request
    console.log(`Processing: ${request.url}`);

    // Add delay between requests
    await sleep(1000);
  },
});

Link Enqueueing

Extract and enqueue links from web pages with powerful filtering and transformation options.

/**
 * Extract and enqueue links from HTML pages
 */
function enqueueLinks(options: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;

interface EnqueueLinksOptions {
  /** Cheerio root object or HTML string */
  $?: CheerioRoot;

  /** Base URL for resolving relative links */
  baseUrl?: string;

  /** CSS selector for finding links */
  selector?: string;

  /** Pseudo-URLs for matching links */
  pseudoUrls?: (string | PseudoUrl)[];

  /** Glob patterns for URLs to include */
  globs?: string[];

  /** URLs or patterns to exclude */
  exclude?: (string | RegExp)[];

  /** Label to assign to enqueued requests */
  label?: string;

  /** Custom user data to attach */
  userData?: Dictionary;

  /** Transform function for request options */
  transformRequestFunction?: (request: RequestOptions) => RequestOptions;

  /** Request queue to add requests to */
  requestQueue?: RequestQueue;

  /** Maximum number of links to enqueue */
  limit?: number;

  /** Strategy restricting which discovered URLs are enqueued (all, same domain, same subdomain, or same origin) */
  strategy?: EnqueueStrategy;
}

type EnqueueStrategy = 'all' | 'same-domain' | 'same-subdomain' | 'same-origin';

Usage Examples:

import { CheerioCrawler, enqueueLinks } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks: crawlerEnqueueLinks }) => {
    // Using crawler's built-in enqueueLinks
    await crawlerEnqueueLinks({
      selector: 'a[href]',
      globs: ['**/products/**', '**/category/**'],
      exclude: [/\/admin\//, /\/login/],
      label: 'PRODUCT_PAGE',
      transformRequestFunction: (req) => ({
        ...req,
        userData: { parentUrl: request.url },
      }),
      limit: 50,
    });

    // Using standalone enqueueLinks function
    const result = await enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: '.pagination a',
      label: 'PAGINATION',
      strategy: 'same-domain',
    });

    console.log(`Enqueued ${result.processedRequests.length} pagination links`);
  },
});

Social Media Parsing

Comprehensive social media handle and contact extraction from text and HTML content.

const social: {
  /** Extract email addresses from text */
  emailsFromText(text: string): string[];

  /** Extract emails from mailto: URLs */
  emailsFromUrls(urls: string[]): string[];

  /** Extract phone numbers from text */
  phonesFromText(text: string): string[];

  /** Extract phones from tel: URLs */
  phonesFromUrls(urls: string[]): string[];

  /** Parse all social handles from HTML */
  parseHandlesFromHtml(html: string): SocialHandles;

  /** Regular expression patterns for matching emails */
  EMAIL_REGEX: RegExp;
  EMAIL_REGEX_GLOBAL: RegExp;

  /** Social platform URL patterns */
  LINKEDIN_REGEX: RegExp;
  LINKEDIN_REGEX_GLOBAL: RegExp;
  INSTAGRAM_REGEX: RegExp;
  INSTAGRAM_REGEX_GLOBAL: RegExp;
  TWITTER_REGEX: RegExp;
  TWITTER_REGEX_GLOBAL: RegExp;
  FACEBOOK_REGEX: RegExp;
  FACEBOOK_REGEX_GLOBAL: RegExp;
  YOUTUBE_REGEX: RegExp;
  YOUTUBE_REGEX_GLOBAL: RegExp;
  TIKTOK_REGEX: RegExp;
  TIKTOK_REGEX_GLOBAL: RegExp;
  PINTEREST_REGEX: RegExp;
  PINTEREST_REGEX_GLOBAL: RegExp;
  DISCORD_REGEX: RegExp;
  DISCORD_REGEX_GLOBAL: RegExp;
};

interface SocialHandles {
  emails: string[];
  phones: string[];
  linkedIns: string[];
  twitters: string[];
  instagrams: string[];
  facebooks: string[];
  youtubes: string[];
  tiktoks: string[];
  pinterests: string[];
  discords: string[];
  phonesUncertain: string[];
}

Usage Examples:

import { CheerioCrawler, utils } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    const html = $.html();
    const textContent = $.text();

    // Extract all social handles
    const socialHandles = utils.social.parseHandlesFromHtml(html);

    // Extract emails from text content
    const emailsInText = utils.social.emailsFromText(textContent);

    // Extract phones from text
    const phonesInText = utils.social.phonesFromText(textContent);

    // Get all links and extract emails/phones from them
    const allLinks = [];
    $('a[href]').each((_, link) => {
      allLinks.push($(link).attr('href'));
    });

    const emailsFromLinks = utils.social.emailsFromUrls(allLinks);
    const phonesFromLinks = utils.social.phonesFromUrls(allLinks);

    // Combine all contacts
    const allContacts = {
      url: request.loadedUrl,
      emails: [...new Set([...socialHandles.emails, ...emailsInText, ...emailsFromLinks])],
      phones: [...new Set([...socialHandles.phones, ...phonesInText, ...phonesFromLinks])],
      socialMedia: {
        linkedin: socialHandles.linkedIns,
        twitter: socialHandles.twitters,
        instagram: socialHandles.instagrams,
        facebook: socialHandles.facebooks,
        youtube: socialHandles.youtubes,
        tiktok: socialHandles.tiktoks,
        pinterest: socialHandles.pinterests,
        discord: socialHandles.discords,
      },
    };

    await pushData(allContacts);
  },
});

// Custom social media extraction
const customText = "Contact us at info@example.com or follow @example on Twitter";
const emails = utils.social.emailsFromText(customText);
const twitterMatches = customText.match(utils.social.TWITTER_REGEX_GLOBAL);

console.log('Emails found:', emails);
console.log('Twitter handles:', twitterMatches);

URL Utilities

Functions for URL extraction, validation, and manipulation.

/** Regular expressions for matching URLs */
const URL_NO_COMMAS_REGEX: RegExp;
const URL_WITH_COMMAS_REGEX: RegExp;

/**
 * Extract URLs from text content
 */
function extractUrls(options: ExtractUrlsOptions): string[];

/**
 * Download and parse a list of URLs from a remote source
 */
function downloadListOfUrls(options: DownloadListOfUrlsOptions): Promise<string[]>;

/**
 * Safely create absolute URLs from relative URLs
 */
function tryAbsoluteURL(href: string, baseUrl: string): string | null;

interface ExtractUrlsOptions {
  /** Text content to extract URLs from */
  string: string;

  /** Whether to include URLs with commas */
  urlRegex?: RegExp;
}

interface DownloadListOfUrlsOptions {
  /** URL of the list to download */
  url: string;

  /** Character encoding */
  encoding?: BufferEncoding;

  /** Regex pattern to match URLs in the content */
  urlRegex?: RegExp;
}

Usage Examples:

import { utils, CheerioCrawler } from "crawlee";

// Extract URLs from text
const textWithUrls = "Visit https://example.com or check out http://test.com/page";
const extractedUrls = utils.extractUrls({ string: textWithUrls });
console.log('Found URLs:', extractedUrls);

// Download URL list from remote source
const urlList = await utils.downloadListOfUrls({
  url: 'https://example.com/sitemap.txt',
  encoding: 'utf8',
});

// Use in crawler for URL validation
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, enqueueLinks }) => {
    // Extract and validate URLs
    const allLinks = [];
    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      const absoluteUrl = utils.tryAbsoluteURL(href, request.loadedUrl);

      if (absoluteUrl) {
        allLinks.push(absoluteUrl);
      }
    });

    // Find URLs in text content
    const textContent = $.text();
    const urlsInText = utils.extractUrls({
      string: textContent,
      urlRegex: utils.URL_WITH_COMMAS_REGEX,
    });

    console.log(`Found ${allLinks.length} links and ${urlsInText.length} URLs in text`);

    await enqueueLinks({
      urls: allLinks.slice(0, 100), // Limit to first 100 URLs
      label: 'DISCOVERED',
    });
  },
});

System Detection

Functions for detecting the runtime environment and system capabilities.

/**
 * Detect if running in Docker container
 * @param forceReset - Force rechecking (internal use)
 */
function isDocker(forceReset?: boolean): Promise<boolean>;

/**
 * Detect if running in any containerized environment
 */
function isContainerized(): Promise<boolean>;

/**
 * Detect if running in AWS Lambda
 */
function isLambda(): boolean;

/**
 * Get cgroup version (V1 or V2)
 * @param forceReset - Force rechecking (internal use)
 */
function getCgroupsVersion(forceReset?: boolean): Promise<'V1' | 'V2' | null>;

interface CpuTicks {
  /** User CPU time */
  user: number;

  /** System CPU time */
  system: number;

  /** Idle CPU time */
  idle: number;

  /** I/O wait time */
  iowait: number;

  /** IRQ time */
  irq: number;

  /** Soft IRQ time */
  softirq: number;

  /** Steal time */
  steal: number;

  /** Guest time */
  guest: number;
}

interface MemoryInfo {
  /** Total system memory in bytes */
  totalBytes: number;

  /** Free memory in bytes */
  freeBytes: number;

  /** Used memory in bytes */
  usedBytes: number;

  /** Available memory in bytes */
  availableBytes: number;

  /** Memory usage as a ratio (0-1) */
  ratio: number;
}

Usage Examples:

import { utils, Configuration } from "crawlee";

// Detect environment and configure accordingly
if (await utils.isDocker()) {
  console.log('Running in Docker - using optimized settings');
  Configuration.getGlobalConfig().set('defaultDatasetId', 'docker-dataset');
}

if (utils.isLambda()) {
  console.log('Running in Lambda - reducing memory usage');
  Configuration.getGlobalConfig().set('memoryMbytes', 512);
}

// Monitor system resources
async function logSystemInfo() {
  console.log('System Status:');
  console.log(`Containerized: ${await utils.isContainerized()}`);
  console.log(`Cgroups version: ${await utils.getCgroupsVersion()}`);
  console.log(`Lambda environment: ${utils.isLambda()}`);
}

// Use in crawler for adaptive behavior
const crawler = new BasicCrawler({
  requestHandler: async ({ request }) => {
    // Check environment before processing
    if (await utils.isContainerized()) {
      console.log('Running in containerized environment');
    }

    // Process request...
  },

  // Adjust concurrency based on environment (set at initialization)
  maxConcurrency: utils.isLambda() ? 1 : 10,
});

OpenGraph Parsing

Extract OpenGraph metadata from HTML pages.

/**
 * Parse OpenGraph tags from HTML content
 */
function parseOpenGraph(html: string): Dictionary<string>;

Usage Examples:

import { utils, CheerioCrawler } from "crawlee";

const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData, body }) => {
    // Parse OpenGraph data
    const ogData = utils.parseOpenGraph(body);

    // Extract standard metadata
    const metadata = {
      url: request.loadedUrl,
      title: $('title').text() || ogData['og:title'],
      description: $('meta[name="description"]').attr('content') || ogData['og:description'],
      image: ogData['og:image'],
      type: ogData['og:type'],
      siteName: ogData['og:site_name'],
      author: ogData['article:author'],
      publishedTime: ogData['article:published_time'],
      twitterCard: ogData['twitter:card'],
      twitterSite: ogData['twitter:site'],
      // Include all OpenGraph data
      openGraph: ogData,
    };

    await pushData(metadata);
  },
});

Mathematical Utilities

Helper functions for calculations and data processing.

/**
 * Calculate weighted average from values and weights
 */
function weightedAvg(values: number[], weights: number[]): number;

/**
 * Convert snake_case strings to camelCase
 */
function snakeCaseToCamelCase(str: string): string;

Usage Examples:

import { utils } from "crawlee";

// Calculate weighted ratings
const ratings = [4.5, 3.8, 4.9, 4.1];
const weights = [100, 50, 200, 75]; // Number of reviews
const averageRating = utils.weightedAvg(ratings, weights);

console.log(`Weighted average rating: ${averageRating.toFixed(2)}`);

// Convert API response keys
const apiResponse = {
  product_name: 'Widget',
  price_usd: 29.99,
  is_available: true,
  created_at: '2023-01-01',
};

const camelCaseResponse = {};
Object.entries(apiResponse).forEach(([key, value]) => {
  const camelKey = utils.snakeCaseToCamelCase(key);
  camelCaseResponse[camelKey] = value;
});

console.log(camelCaseResponse);
// Result: { productName: 'Widget', priceUsd: 29.99, isAvailable: true, createdAt: '2023-01-01' }

DOM Utilities

Helper functions for DOM manipulation and processing.

/**
 * Expand shadow DOM roots to access shadow content
 */
function expandShadowRoots(document: Document): void;

Usage Examples:

import { JSDOMCrawler, utils } from "crawlee";

const crawler = new JSDOMCrawler({
  requestHandler: async ({ window, document, request, pushData }) => {
    // Expand shadow DOM to access hidden content
    utils.expandShadowRoots(document);

    // Now you can query shadow DOM content
    const shadowContent = document.querySelectorAll('[data-shadow-content]');

    const extractedData = Array.from(shadowContent).map(element => ({
      text: element.textContent?.trim(),
      attributes: Array.from(element.attributes).reduce((attrs, attr) => {
        attrs[attr.name] = attr.value;
        return attrs;
      }, {}),
    }));

    await pushData({
      url: request.loadedUrl,
      shadowDomData: extractedData,
      hasShadowContent: shadowContent.length > 0,
    });
  },
});

Unified Utils Object

The main utils object that combines all utility functions.

const utils: {
  /** Puppeteer utility functions */
  puppeteer: typeof puppeteerUtils;

  /** Playwright utility functions */
  playwright: typeof playwrightUtils;

  /** Logging utility */
  log: Log;

  /** Link enqueueing function */
  enqueueLinks: typeof enqueueLinks;

  /** Social media parsing utilities */
  social: typeof social;

  /** Sleep function */
  sleep: typeof sleep;

  /** URL list downloading */
  downloadListOfUrls: typeof downloadListOfUrls;

  /** OpenGraph parsing */
  parseOpenGraph: typeof parseOpenGraph;

  /** System detection functions */
  isDocker: typeof isDocker;
  isLambda: typeof isLambda;
  isContainerized: typeof isContainerized;
  getCgroupsVersion: typeof getCgroupsVersion;

  // Note: System monitoring functions are available in utils object but not directly exported

  /** Mathematical utilities */
  weightedAvg: typeof weightedAvg;

  /** String utilities */
  snakeCaseToCamelCase: typeof snakeCaseToCamelCase;

  /** URL utilities */
  extractUrls: typeof extractUrls;
  tryAbsoluteURL: typeof tryAbsoluteURL;
  URL_NO_COMMAS_REGEX: RegExp;
  URL_WITH_COMMAS_REGEX: RegExp;

  /** DOM utilities */
  expandShadowRoots: typeof expandShadowRoots;
};

Usage Examples:

import { utils } from "crawlee";

// All utilities available through single import
console.log('Environment check:');
console.log(`Docker: ${await utils.isDocker()}`);
console.log(`Lambda: ${utils.isLambda()}`);

// Use social media parsing
const html = '<p>Contact: info@example.com, Twitter: @company</p>';
const contacts = utils.social.parseHandlesFromHtml(html);

// Use URL extraction
const text = 'Visit https://example.com for more info';
const urls = utils.extractUrls({ string: text });

// Use system detection
const isInDocker = await utils.isDocker();
console.log(`Running in Docker: ${isInDocker}`);

// Use in crawler with all utilities
const crawler = new CheerioCrawler({
  requestHandler: async ({ $, request, pushData }) => {
    // Rate limiting
    await utils.sleep(1000);

    // Extract data
    const ogData = utils.parseOpenGraph($.html());
    const socialData = utils.social.parseHandlesFromHtml($.html());
    const urls = utils.extractUrls({ string: $.text() });

    await pushData({
      url: request.loadedUrl,
      metadata: ogData,
      contacts: socialData,
      extractedUrls: urls,
      systemInfo: {
        isDocker: await utils.isDocker(),
        isLambda: utils.isLambda(),
      },
    });

    // Environment-aware link enqueueing
    const isLimitedEnv = utils.isLambda() || await utils.isContainerized();
    await utils.enqueueLinks({
      $,
      baseUrl: request.loadedUrl,
      selector: 'a[href]',
      limit: isLimitedEnv ? 10 : 50, // Reduce links in constrained environments
    });
  },
});

Types

interface Log {
  /** Log debug message */
  debug(message: string, data?: any): void;

  /** Log info message */
  info(message: string, data?: any): void;

  /** Log warning message */
  warning(message: string, data?: any): void;

  /** Log error message */
  error(message: string, error?: Error): void;

  /** Log exception */
  exception(error: Error, message?: string, data?: any): void;

  /** Get child logger with prefix */
  child(options: { prefix?: string; suffix?: string }): Log;
}

interface PseudoUrl {
  /** Create pseudo-URL matcher */
  new (purl: string, requestTemplate?: Partial<RequestOptions>): PseudoUrl;

  /** Test if URL matches pattern */
  matches(url: string): boolean;

  /** Create request from matched URL */
  createRequest(url: string): RequestOptions;
}

type BufferEncoding = 'ascii' | 'utf8' | 'utf16le' | 'ucs2' | 'base64' | 'latin1' | 'binary' | 'hex';

interface RequestTemplate {
  /** Default user data for matched requests */
  userData?: Dictionary;

  /** Default label for matched requests */
  label?: string;

  /** Default HTTP method */
  method?: HttpMethod;

  /** Default headers */
  headers?: Dictionary<string>;
}

Install with Tessl CLI

npx tessl i tessl/npm-crawlee

docs

browser-crawling.md

configuration-proxies.md

core-crawling.md

http-crawling.md

index.md

session-management.md

storage.md

utilities.md

tile.json