The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.
Core crawling functionality provides the foundation classes and utilities that all other crawlers build upon. This includes the base BasicCrawler class, autoscaling capabilities, error handling, and request routing.
The foundation crawler class that all other crawlers extend. Provides core functionality for request management, autoscaling, session handling, and error recovery.
/**
 * Base crawler class that provides the foundation for all crawler implementations
 * @template Context - The context type passed to request handlers
 */
class BasicCrawler<Context = BasicCrawlingContext> {
constructor(options: BasicCrawlerOptions<Context>);
/** Run the crawler until all requests are processed; resolves with the final run statistics */
run(): Promise<FinalStatistics>;
/** Add requests (plain URL strings or request option objects) to the crawler queue */
addRequests(requests: (string | RequestOptions)[]): Promise<void>;
/** Export data from the default dataset */
exportData<T>(options?: DatasetExportOptions): Promise<T[]>;
/** Get data from the default dataset */
getData<T>(options?: DatasetDataOptions): Promise<T[]>;
/** Set a value in the default key-value store */
setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
/** Get a value from the default key-value store; resolves to null when the key is missing */
getValue<T>(key: string): Promise<T | null>;
/** Register lifecycle event handlers (see CrawlerAddons) */
use(handler: CrawlerAddons<Context>): void;
/** Crawler statistics, available while the crawler runs */
readonly stats: Statistics;
/** Final statistics; only present after the crawler finishes */
readonly finalStatistics?: FinalStatistics;
}Usage Examples:
import { BasicCrawler } from "crawlee";
const crawler = new BasicCrawler({
requestHandler: async ({ request, log }) => {
log.info(`Processing ${request.url}`);
// Custom processing logic here
const response = await fetch(request.url);
const data = await response.text();
// Save processed data, keyed by the request's unique key
await crawler.setValue(`page-${request.uniqueKey}`, data);
},
// Up to 10 parallel requests; retry each failing request up to 3 times
maxConcurrency: 10,
maxRequestRetries: 3,
});
// Add requests and run
await crawler.addRequests(['https://example.com']);
const stats = await crawler.run();
console.log(`Processed ${stats.requestsFinished} requests`);Configuration options for the BasicCrawler.
interface BasicCrawlerOptions<Context = BasicCrawlingContext> {
/** List of requests to process */
requestList?: RequestList;
/** Queue of requests to process */
requestQueue?: RequestQueue;
/** Function to handle each request */
requestHandler: (context: Context) => Promise<void>;
/** Handler for failed requests that won't be retried */
failedRequestHandler?: (context: Context, error: Error) => Promise<void>;
/** Maximum number of retries per request */
maxRequestRetries?: number;
/** Maximum number of requests to process */
maxRequestsPerCrawl?: number;
/** Maximum number of concurrent requests */
maxConcurrency?: number;
/** Minimum number of concurrent requests */
minConcurrency?: number;
/** Options for the autoscaled pool */
autoscaledPoolOptions?: AutoscaledPoolOptions;
/** Options for the session pool */
sessionPoolOptions?: SessionPoolOptions;
/** Whether to use session pool for requests */
useSessionPool?: boolean;
/** Whether to persist cookies per session */
persistCookiesPerSession?: boolean;
/** Configuration for proxy usage */
proxyConfiguration?: ProxyConfiguration;
/**
 * NOTE(review): the previous comment ("keep URL fragments") does not match
 * this option's name; presumably keeps the crawler running even when no
 * requests remain, so more can be added later — confirm against implementation
 */
keepAlive?: boolean;
/** Custom statistics instance */
statistics?: Statistics;
/** Custom logger instance */
log?: Log;
/** Options for handling reclaimed requests */
reclaimRequestHandler?: (context: Context) => Promise<void>;
}The context object passed to request handlers, containing request information and helper methods.
interface BasicCrawlingContext<UserData = Dictionary> {
/** The current request being processed */
request: Request<UserData>;
/** Current session if session pool is used */
session?: Session;
/** Proxy information for the request */
proxyInfo?: ProxyInfo;
/** HTTP response object (when applicable) */
response?: IncomingMessage;
/** Reference to the crawler instance */
crawler: BasicCrawler;
/** Logger instance scoped to this request */
log: Log;
/** Send HTTP request with current session and proxy */
sendRequest<T>(overrideOptions?: Partial<OptionsInit>): Promise<T>;
/** Extract and enqueue links from current page */
enqueueLinks(options?: EnqueueLinksOptions): Promise<BatchAddRequestsResult>;
/** Push data to the default dataset */
pushData(data: Dictionary | Dictionary[]): Promise<void>;
/** Store value in the default key-value store */
setValue(key: string, value: any, options?: RecordOptions): Promise<void>;
/** Get value from the default key-value store; resolves to null when the key is missing */
getValue<T>(key: string): Promise<T | null>;
}Manages automatic scaling of concurrent tasks based on system resources and performance.
/**
 * Manages parallel asynchronous tasks with automatic scaling based on system resources
 */
class AutoscaledPool {
constructor(options: AutoscaledPoolOptions);
/** Start running the pool; presumably resolves once isFinishedFunction reports completion — confirm */
run(): Promise<void>;
/** Abort all running and pending tasks */
abort(): Promise<void>;
/** Pause the pool (finish running tasks but don't start new ones) */
pause(): Promise<void>;
/** Resume a paused pool */
resume(): Promise<void>;
/** Notify the pool about task completion or failure so it can schedule further work */
notify(): void;
/** Current concurrency level */
readonly currentConcurrency: number;
/** Desired concurrency level calculated by autoscaling */
readonly desiredConcurrency: number;
/** Minimum concurrency level */
readonly minConcurrency: number;
/** Maximum concurrency level */
readonly maxConcurrency: number;
}Configuration options for the AutoscaledPool.
interface AutoscaledPoolOptions {
/** Function that runs a single task */
runTaskFunction: () => Promise<void>;
/** Function that checks if more tasks are available */
isTaskReadyFunction?: () => Promise<boolean>;
/** Function that checks if all tasks are finished */
isFinishedFunction?: () => Promise<boolean>;
/** Minimum number of concurrent tasks */
minConcurrency?: number;
/** Maximum number of concurrent tasks */
maxConcurrency?: number;
/** Initial number of concurrent tasks */
desiredConcurrency?: number;
/** Fraction by which desired concurrency grows on each scale-up (a ratio, not a time — previous comment said milliseconds) */
scaleUpStepRatio?: number;
/** Fraction by which desired concurrency shrinks on each scale-down (a ratio, not a time) */
scaleDownStepRatio?: number;
/** How long to maintain high concurrency after scaling up, in seconds */
maintainConcurrencyTimeoutSecs?: number;
/** Timeout in seconds — presumably how long to wait for tasks to become ready; confirm exact semantics */
tasksReadyTimeoutSecs?: number;
/** CPU usage threshold for scaling decisions */
targetCpuRatio?: number;
/** How often to log the autoscaling state, in seconds (a number, not a boolean as the previous comment implied) */
loggingIntervalSecs?: number;
/** Custom logger instance */
log?: Log;
}Specialized error types for controlling crawler behavior and retry logic.
/**
 * Thrown when a request should not be retried
 * Use this for permanent failures like 404 errors or validation failures
 * The request is failed immediately instead of consuming further retries
 */
class NonRetryableError extends Error {
constructor(message?: string);
}
/**
 * Thrown when a critical error occurs that should stop the entire crawler
 * Use this for fatal errors like invalid configuration or system failures
 * Unlike NonRetryableError, this aborts the whole run, not just one request
 */
class CriticalError extends Error {
constructor(message?: string);
}
/**
 * Thrown when a request should be retried immediately
 * Use this to force retries even if max retries would normally be exceeded
 * Counterpart of NonRetryableError: it forces, rather than prevents, a retry
 */
class RetryRequestError extends Error {
constructor(message?: string);
}
/**
 * Thrown when a session becomes invalid and should be rotated
 * Use this when encountering IP blocks or session-related failures
 * The crawler presumably retires the current session and retries with a fresh one — confirm
 */
class SessionError extends Error {
constructor(message?: string);
}
/**
 * Thrown when no route handler matches a request
 * NOTE(review): the previous text ("request URL is blocked and should be
 * skipped") contradicts the name; per the Router API in this document,
 * handlers are matched by pattern/label — confirm exact semantics
 * Internal use by the router system
 */
class MissingRouteError extends Error {
constructor(message?: string);
}Usage Examples:
import { BasicCrawler, NonRetryableError, SessionError } from "crawlee";
// Example: use the specialized error types to control retry behavior per response status
const crawler = new BasicCrawler({
requestHandler: async ({ request, response, session }) => {
if (response?.statusCode === 404) {
// Don't retry 404s
throw new NonRetryableError(`Page not found: ${request.url}`);
}
if (response?.statusCode === 403) {
// Rotate session on access denied
throw new SessionError(`Access denied, rotating session: ${request.url}`);
}
// Process the request normally
// ...
},
});URL pattern-based request routing system for handling different types of pages.
/**
 * Routes requests to different handlers based on URL patterns and labels
 * @template Context - The crawler context type
 * @template UserData - The request user data type
 */
class Router<Context = BasicCrawlingContext, UserData = Dictionary> {
constructor();
/**
 * Add a handler for requests matching the pattern.
 * The pattern may be a label string or a RegExp; the signature also permits
 * passing a handler as the first argument with the second omitted —
 * presumably an overloaded convenience form, confirm its matching behavior.
 */
addHandler<Data extends UserData = UserData>(
pattern: string | RegExp | RouteHandler<Context, Data>,
handler?: RouteHandler<Context, Data>
): void;
/** Add a default handler for requests that don't match any pattern */
addDefaultHandler<Data extends UserData = UserData>(
handler: RouteHandler<Context, Data>
): void;
/** Find and return the handler for a given request; null when nothing matches */
findMatchingHandler(request: Request): RouteHandler<Context, UserData> | null;
/** Create a request handler function suitable for a crawler's requestHandler option */
createRequestHandler(): (context: Context) => Promise<void>;
}Usage Examples:
// Example: route requests by label and URL pattern.
// Fix: the original example used Dataset.pushData and the CheerioCrawlingContext
// type without importing them; both are public exports of crawlee.
import { CheerioCrawler, CheerioCrawlingContext, Dataset, Router } from "crawlee";
const router = new Router<CheerioCrawlingContext>();
// Handle product pages (requests labeled 'PRODUCT')
router.addHandler('PRODUCT', async ({ $, request, enqueueLinks }) => {
const title = $('h1').text();
const price = $('.price').text();
await Dataset.pushData({ title, price, url: request.url });
});
// Handle category pages (matched by URL pattern)
router.addHandler(/\/category\/.*/, async ({ $, enqueueLinks }) => {
await enqueueLinks({
selector: '.product-link',
label: 'PRODUCT',
});
});
// Handle any unmatched pages
router.addDefaultHandler(async ({ request, log }) => {
log.warn(`No handler for ${request.url}`);
});
const crawler = new CheerioCrawler({
requestHandler: router.createRequestHandler(),
});Classes for monitoring system resources and crawler performance.
/**
 * Provides current and historical system status information
 */
class SystemStatus {
constructor(options?: SystemStatusOptions);
/** Get current system status snapshot */
getCurrentStatus(): SystemInfo;
/** Get historical status data (retention limited by SystemStatusOptions.maxEntries) */
getHistoricalStatus(): SystemInfo[];
/** Start monitoring system status; intervalMillis presumably overrides the constructor option — confirm */
startCapturing(intervalMillis?: number): void;
/** Stop monitoring system status */
stopCapturing(): void;
}
/**
 * Takes snapshots of system resources for autoscaling decisions
 */
class Snapshotter {
constructor(options?: SnapshotterOptions);
/** Start taking periodic snapshots (previous comment described a single snapshot, which does not match the start/stop pair) */
start(): void;
/** Stop taking snapshots */
stop(): void;
/** Get CPU usage ratio (0-1) */
getCpuRatio(): number;
/** Get memory usage ratio (0-1) */
getMemoryRatio(): number;
/** Check if system is overloaded */
isOverloaded(): boolean;
}
/** A single measurement of system resource usage */
interface SystemInfo {
/** CPU usage as a ratio (0-1) */
cpuUsage: number;
/** Memory usage in bytes */
memoryUsage: number;
/** Available memory in bytes */
memoryAvailable: number;
/** Timestamp of the measurement */
createdAt: Date;
/** Whether the system is considered overloaded */
isOverloaded: boolean;
}Event handling system for crawler lifecycle management.
/** Lifecycle hooks that can be registered with BasicCrawler.use() */
interface CrawlerAddons<Context> {
/** Called when the crawler starts */
crawlerStarting?: (crawler: BasicCrawler) => Promise<void>;
/** Called when the crawler finishes */
crawlerFinishing?: (crawler: BasicCrawler) => Promise<void>;
/** Called when a request starts processing */
requestStarting?: (context: Context) => Promise<void>;
/** Called when a request finishes successfully */
requestFinished?: (context: Context) => Promise<void>;
/** Called when a request fails */
requestFailed?: (context: Context, error: Error) => Promise<void>;
/** Called when a session is rotated */
sessionRotating?: (context: Context) => Promise<void>;
}
/**
 * Handler function type for routing; receives the crawling context and the
 * matched request (typed with its user data)
 */
type RouteHandler<Context, UserData = Dictionary> = (
context: Context,
request: Request<UserData>
) => Promise<void>;Usage Examples:
import { BasicCrawler } from "crawlee";
const crawler = new BasicCrawler({
requestHandler: async ({ request }) => {
// Main processing logic
},
});
// Register event handlers
crawler.use({
crawlerStarting: async (crawler) => {
console.log('Crawler is starting...');
},
crawlerFinishing: async (crawler) => {
console.log('Crawler finished!');
console.log(`Final stats:`, crawler.finalStatistics);
},
requestFailed: async (context, error) => {
console.error(`Request failed: ${context.request.url}`, error);
},
});
/** Aggregated counters and timings collected during a crawl */
interface Statistics {
/** Number of requests that finished successfully */
requestsFinished: number;
/** Number of requests that failed permanently */
requestsFailed: number;
/** Total number of request retries */
requestsRetries: number;
/** Average requests per minute (finished) */
requestsFinishedPerMinute: number;
/** Average failed requests per minute */
requestsFailedPerMinute: number;
/** Minimum request duration in milliseconds */
requestMinDurationMillis: number;
/** Maximum request duration in milliseconds */
requestMaxDurationMillis: number;
/** Total duration of all requests in milliseconds */
requestTotalDurationMillis: number;
/** When the crawler started */
crawlerStartedAt: Date;
/** When the crawler finished (unset while still running) */
crawlerFinishedAt?: Date;
/** Unique identifier for these statistics */
statsId: string;
}
/** Statistics with a guaranteed finish timestamp, as returned by BasicCrawler.run() */
type FinalStatistics = Statistics & {
crawlerFinishedAt: Date;
}
/** Describes a single registered route */
interface RouteDefinition {
/** Label string or URL pattern the route matches (see Router.addHandler) */
pattern: string | RegExp;
/** HTTP method restriction; '*' matches any method */
method: 'GET' | 'POST' | 'PUT' | 'DELETE' | '*';
/** Handler invoked for matching requests */
handler: RouteHandler<any, any>;
}
/** Configuration options for SystemStatus */
interface SystemStatusOptions {
/** How often to capture system status (in milliseconds) */
intervalMillis?: number;
/** Maximum number of historical entries to keep */
maxEntries?: number;
}
/** Configuration options for Snapshotter */
interface SnapshotterOptions {
/** How often to take snapshots, in seconds (the Secs suffix contradicts the previous "milliseconds" note) */
intervalSecs?: number;
/** Number of snapshots to average for CPU/memory calculations */
windowSize?: number;
}Install with Tessl CLI
npx tessl i tessl/npm-crawlee