CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/npm-crawlee

The scalable web crawling and scraping library for JavaScript/Node.js that enables development of data extraction and web automation jobs with headless Chrome and Puppeteer.

Overview
Eval results
Files

http-crawling.mddocs/

HTTP Crawling

HTTP crawling provides server-side HTML parsing and scraping capabilities without the overhead of full browser automation. These crawlers use various DOM parsing libraries to extract data efficiently from web pages.

Capabilities

HttpCrawler

Base HTTP crawler that extends BasicCrawler with HTTP-specific functionality for making requests and handling responses.

/**
 * HTTP crawler for server-side request processing without browser automation
 */
class HttpCrawler<Context extends HttpCrawlingContext = HttpCrawlingContext> extends BasicCrawler<Context> {
  constructor(options: HttpCrawlerOptions);
}

HttpCrawlerOptions

Configuration options for the HttpCrawler.

interface HttpCrawlerOptions extends BasicCrawlerOptions<HttpCrawlingContext> {
  /** HTTP client options for making requests */
  requestHandlerOptions?: Partial<OptionsInit>;

  /** Additional HTTP status codes to treat as errors */
  additionalHttpErrorStatusCodes?: number[];

  /** Whether to ignore HTTP error status codes */
  ignoreHttpErrorStatusCodes?: boolean;

  /** Pre-navigation hooks to modify requests before sending */
  preNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, requestAsBrowserOptions: OptionsInit) => Promise<void>>;

  /** Post-navigation hooks to process responses after receiving */
  postNavigationHooks?: Array<(crawlingContext: HttpCrawlingContext, response: Response) => Promise<void>>;

  /** HTTP client configuration */
  httpClient?: BaseHttpClient;

  /** Whether to persist cookies between requests */
  persistCookiesPerSession?: boolean;

  /** Custom User-Agent string */
  userAgent?: string;

  /** Custom request transformation function */
  requestTransform?: (options: OptionsInit) => Promise<OptionsInit>;

  /** Custom response transformation function */
  responseTransform?: (response: Response) => Promise<Response>;
}

HttpCrawlingContext

The context object passed to HTTP crawler request handlers.

interface HttpCrawlingContext<UserData = Dictionary> extends BasicCrawlingContext<UserData> {
  /** The HTTP response object */
  response: Response;

  /** Response body as text */
  body: string;

  /** Response headers */
  headers: Dictionary<string>;

  /** Content type of the response */
  contentType: string;

  /** Send HTTP request with custom options */
  sendRequest<T = any>(overrideOptions?: Partial<OptionsInit>): Promise<T>;
}

Usage Examples:

import { HttpCrawler } from "crawlee";

const crawler = new HttpCrawler({
  requestHandler: async ({ request, response, body }) => {
    console.log(`Status: ${response.statusCode} for ${request.url}`);
    console.log(`Body length: ${body.length}`);

    // Parse HTML manually or use simple text processing
    const titleMatch = body.match(/<title>(.*?)<\/title>/i);
    const title = titleMatch ? titleMatch[1] : 'No title';

    await crawler.pushData({
      url: request.url,
      title,
      statusCode: response.statusCode,
    });
  },
  additionalHttpErrorStatusCodes: [429], // Treat 429 as error
  userAgent: 'MyCustomCrawler/1.0',
});

CheerioCrawler

Server-side HTML parsing crawler using the Cheerio library for jQuery-like DOM manipulation.

/**
 * Cheerio-based crawler for server-side HTML parsing with jQuery-like syntax
 */
class CheerioCrawler extends HttpCrawler {
  constructor(options: CheerioCrawlerOptions);
}

CheerioCrawlerOptions

Configuration options for the CheerioCrawler.

interface CheerioCrawlerOptions extends HttpCrawlerOptions {
  /** Handler function that receives Cheerio context */
  requestHandler: (context: CheerioCrawlingContext) => Promise<void>;

  /** Cheerio parsing options */
  cheerioParseOptions?: CheerioParseOptions;

  /** Force a specific response encoding, overriding the detected charset */
  forceResponseEncoding?: string;

  /** Underlying HTML parser options */
  parserOptions?: {
    xmlMode?: boolean;
    decodeEntities?: boolean;
    lowerCaseAttributeNames?: boolean;
  };
}

CheerioCrawlingContext

The context object passed to Cheerio crawler request handlers.

interface CheerioCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
  /** Cheerio root object for DOM manipulation */
  $: CheerioRoot;

  /** Get text content from the current page */
  body: string;

  /** Parse additional HTML with Cheerio */
  parseWithCheerio(html: string): CheerioRoot;

  /** Enqueue links found on the page */
  enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
}

Usage Examples:

import { CheerioCrawler, createCheerioRouter } from "crawlee";

const router = createCheerioRouter();

// Handle listing pages (default route)
router.addDefaultHandler(async ({ $, request, enqueueLinks, pushData }) => {
  // Extract data using jQuery-like syntax
  const title = $('title').text();
  const description = $('meta[name="description"]').attr('content');

  // Extract all product information
  const products = [];
  $('.product').each((index, element) => {
    const product = $(element);
    products.push({
      name: product.find('.product-name').text().trim(),
      price: product.find('.price').text().trim(),
      image: product.find('img').attr('src'),
    });
  });

  // Save extracted data
  await pushData({
    url: request.loadedUrl,
    title,
    description,
    products,
    extractedAt: new Date(),
  });

  // Find and enqueue pagination links
  await enqueueLinks({
    selector: 'a.page-link',
    label: 'LIST',
  });

  // Find and enqueue product detail links
  await enqueueLinks({
    selector: '.product a',
    label: 'DETAIL',
  });
});

// Handle product detail pages
router.addHandler('DETAIL', async ({ $, request, pushData }) => {
  const productDetails = {
    url: request.loadedUrl,
    name: $('.product-title').text(),
    fullDescription: $('.description').text(),
    specifications: {},
    reviews: [],
  };

  // Extract specifications
  $('.spec-row').each((_, element) => {
    const key = $(element).find('.spec-name').text().trim();
    const value = $(element).find('.spec-value').text().trim();
    productDetails.specifications[key] = value;
  });

  // Extract reviews
  $('.review').each((_, element) => {
    productDetails.reviews.push({
      rating: $(element).find('.rating').attr('data-rating'),
      text: $(element).find('.review-text').text().trim(),
      author: $(element).find('.reviewer-name').text().trim(),
    });
  });

  await pushData(productDetails);
});

const crawler = new CheerioCrawler({
  requestHandler: router,
  maxConcurrency: 5,
  maxRequestRetries: 3,
});

JSDOMCrawler

Server-side DOM manipulation crawler using JSDOM for full DOM API support.

/**
 * JSDOM-based crawler for server-side DOM manipulation with full DOM API
 */
class JSDOMCrawler extends HttpCrawler {
  constructor(options: JSDOMCrawlerOptions);
}

JSDOMCrawlerOptions

Configuration options for the JSDOMCrawler.

interface JSDOMCrawlerOptions extends HttpCrawlerOptions {
  /** Handler function that receives JSDOM context */
  requestHandler: (context: JSDOMCrawlingContext) => Promise<void>;

  /** JSDOM constructor options */
  jsdomOptions?: ConstructorOptions;

  /** Whether to run scripts in JSDOM */
  runScripts?: 'dangerously' | 'outside-only';

  /** Custom resource loader for JSDOM */
  resourceLoader?: ResourceLoader;

  /** Virtual console options */
  virtualConsole?: VirtualConsole;
}

JSDOMCrawlingContext

The context object passed to JSDOM crawler request handlers.

interface JSDOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
  /** The JSDOM window object */
  window: DOMWindow;

  /** The document object */
  document: Document;

  /** Shortcut to document.querySelector */
  $(selector: string): Element | null;

  /** Shortcut to document.querySelectorAll */
  $$(selector: string): NodeListOf<Element>;

  /** Enqueue links found on the page */
  enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
}

Usage Examples:

import { JSDOMCrawler } from "crawlee";

const crawler = new JSDOMCrawler({
  requestHandler: async ({ window, document, $, $$, request, pushData, enqueueLinks }) => {
    // Use full DOM API
    const title = document.title;
    const metaTags = document.getElementsByTagName('meta');

    // Use convenience selectors
    const mainContent = $('.main-content');
    const allLinks = $$('a[href]');

    // Execute JavaScript-like operations
    const productList = Array.from($$('.product')).map(element => ({
      name: element.querySelector('.name')?.textContent?.trim(),
      price: element.querySelector('.price')?.textContent?.trim(),
      inStock: element.classList.contains('in-stock'),
    }));

    // Access computed styles if needed ($ may return null, so guard first)
    const computedStyle = mainContent ? window.getComputedStyle(mainContent) : null;

    await pushData({
      url: request.loadedUrl,
      title,
      productCount: productList.length,
      products: productList,
      hasMainContent: !!mainContent,
    });

    // Enqueue links
    await enqueueLinks({
      selector: 'a[href*="/category/"]',
      label: 'CATEGORY',
    });
  },

  jsdomOptions: {
    runScripts: 'dangerously', // Enable JavaScript execution
    resources: 'usable', // Load external resources
  },
});

LinkedOMCrawler

Fast server-side DOM manipulation crawler using LinkedOM for performance-optimized parsing.

/**
 * LinkedOM-based crawler for fast server-side DOM manipulation
 */
class LinkedOMCrawler extends HttpCrawler {
  constructor(options: LinkedOMCrawlerOptions);
}

LinkedOMCrawlerOptions

Configuration options for the LinkedOMCrawler.

interface LinkedOMCrawlerOptions extends HttpCrawlerOptions {
  /** Handler function that receives LinkedOM context */
  requestHandler: (context: LinkedOMCrawlingContext) => Promise<void>;

  /** LinkedOM parsing options */
  linkedomOptions?: {
    /** Include comment nodes in parsing */
    includeComments?: boolean;
    /** Include text nodes in parsing */
    includeTextNodes?: boolean;
  };
}

LinkedOMCrawlingContext

The context object passed to LinkedOM crawler request handlers.

interface LinkedOMCrawlingContext<UserData = Dictionary> extends HttpCrawlingContext<UserData> {
  /** The LinkedOM window object */
  window: Window;

  /** The document object */
  document: Document;

  /** Shortcut to document.querySelector */
  $(selector: string): Element | null;

  /** Shortcut to document.querySelectorAll */
  $$(selector: string): NodeListOf<Element>;

  /** Enqueue links found on the page */
  enqueueLinks(options?: CrawlerEnqueueLinksOptions): Promise<BatchAddRequestsResult>;
}

Usage Examples:

import { LinkedOMCrawler } from "crawlee";

const crawler = new LinkedOMCrawler({
  requestHandler: async ({ window, document, $, $$, request, pushData }) => {
    // LinkedOM provides fast DOM manipulation
    const title = document.title;
    const description = $('meta[name="description"]')?.getAttribute('content');

    // Fast element selection and text extraction
    const headlines = Array.from($$('h1, h2, h3')).map(el => ({
      tag: el.tagName.toLowerCase(),
      text: el.textContent?.trim(),
      level: parseInt(el.tagName.slice(1)),
    }));

    // Fast table parsing
    const tableData = [];
    $$('table tr').forEach(row => {
      const cells = Array.from(row.querySelectorAll('td, th')).map(cell =>
        cell.textContent?.trim()
      );
      if (cells.length > 0) {
        tableData.push(cells);
      }
    });

    await pushData({
      url: request.loadedUrl,
      title,
      description,
      headlines,
      tableData,
    });
  },

  maxConcurrency: 20, // LinkedOM is fast, can handle higher concurrency
});

File Download Crawler

Specialized crawler for efficient file downloading using HTTP streams.

/**
 * Specialized crawler for downloading files efficiently
 */
class FileDownload extends HttpCrawler<FileDownloadCrawlingContext> {
  constructor(options: FileDownloadOptions);
}

/**
 * Create a router for file download handling
 */
function createFileRouter<Context extends FileDownloadCrawlingContext>(): Router<Context>;

/**
 * Transform stream that monitors download speed and aborts if too slow
 */
function MinimumSpeedStream(options: MinimumSpeedStreamOptions): Transform;

/**
 * Transform stream that logs download progress
 */
function ByteCounterStream(options: ByteCounterStreamOptions): Transform;

interface FileDownloadOptions<UserData = any, JSONData = any> {
  /** Request handler for processing downloaded files */
  requestHandler?: FileDownloadRequestHandler<UserData, JSONData>;

  /** Stream handler for processing download streams */
  streamHandler?: StreamHandler;

  /** All standard HttpCrawlerOptions are supported */
  requestList?: RequestList;
  requestQueue?: RequestQueue;
  maxRequestRetries?: number;
  maxRequestsPerCrawl?: number;
  maxConcurrency?: number;
  navigationTimeoutSecs?: number;
}

interface FileDownloadCrawlingContext<UserData = any, JSONData = any>
  extends HttpCrawlingContext<UserData> {
  /** The download stream (when using streamHandler) */
  stream?: Readable;
}

interface MinimumSpeedStreamOptions {
  /** Minimum speed in KB/s */
  minSpeedKbps: number;

  /** Time window for speed calculation in ms (default: 10000) */
  historyLengthMs?: number;

  /** How often to check speed in ms (default: 5000) */
  checkProgressInterval?: number;
}

interface ByteCounterStreamOptions {
  /** Function to call with bytes transferred */
  logTransferredBytes: (bytes: number) => void;

  /** How often to log progress in ms (default: 5000) */
  loggingInterval?: number;
}

Usage Examples:

import { FileDownload, createFileRouter } from "crawlee";
import { writeFileSync, createWriteStream } from "node:fs";
import { finished } from "node:stream/promises";
import path from "node:path";

// Basic file download with requestHandler
const fileDownloader = new FileDownload({
  requestHandler: async ({ body, request, pushData }) => {
    // Save file to disk
    const fileName = request.url.replace(/[^a-z0-9\.]/gi, '_');
    writeFileSync(`./downloads/${fileName}`, body);

    await pushData({
      url: request.url,
      fileName,
      size: body.length,
      downloadedAt: new Date(),
    });
  },
});

// Run with list of file URLs
await fileDownloader.run([
  'http://www.example.com/document.pdf',
  'http://www.example.com/image.jpg',
  'http://www.example.com/video.mp4',
]);

// Advanced streaming with progress monitoring
const streamDownloader = new FileDownload({
  streamHandler: async ({ stream, request, log }) => {
    const filePath = `./downloads/${path.basename(request.url)}`;
    const fileStream = createWriteStream(filePath);

    // Add progress monitoring
    const progressStream = ByteCounterStream({
      logTransferredBytes: (bytes) => {
        log.info(`Downloaded ${(bytes / 1024 / 1024).toFixed(2)} MB`);
      },
      loggingInterval: 2000,
    });

    // Add speed monitoring
    const speedStream = MinimumSpeedStream({
      minSpeedKbps: 100, // Minimum 100 KB/s
      historyLengthMs: 10000,
      checkProgressInterval: 3000,
    });

    // Pipe stream through monitors to file
    stream
      .pipe(progressStream)
      .pipe(speedStream)
      .pipe(fileStream);

    // Wait for completion
    await finished(fileStream);
    log.info(`File saved: ${filePath}`);
  },
});

// Using router for different file types
const router = createFileRouter();

router.addHandler('PDF', async ({ body, request, pushData }) => {
  // Handle PDF files
  const fileName = `pdf_${Date.now()}.pdf`;
  writeFileSync(`./pdfs/${fileName}`, body);
  await pushData({ type: 'pdf', fileName, url: request.url });
});

router.addHandler('IMAGE', async ({ body, request, pushData }) => {
  // Handle image files
  const fileName = `img_${Date.now()}.jpg`;
  writeFileSync(`./images/${fileName}`, body);
  await pushData({ type: 'image', fileName, url: request.url });
});

router.addDefaultHandler(async ({ body, request, pushData }) => {
  // Handle other file types
  const fileName = `file_${Date.now()}`;
  writeFileSync(`./files/${fileName}`, body);
  await pushData({ type: 'other', fileName, url: request.url });
});

const routerDownloader = new FileDownload({
  requestHandler: router,
});

// Add requests with labels for routing
await routerDownloader.addRequests([
  { url: 'http://example.com/doc.pdf', label: 'PDF' },
  { url: 'http://example.com/photo.jpg', label: 'IMAGE' },
  { url: 'http://example.com/data.csv', label: 'OTHER' },
]);

Types

interface Response {
  /** HTTP status code */
  statusCode: number;

  /** HTTP status message */
  statusMessage: string;

  /** Response headers */
  headers: Dictionary<string | string[]>;

  /** Response body as string */
  body: string;

  /** Response body as buffer */
  rawBody: Buffer;

  /** Whether the request was redirected */
  isRedirect: boolean;

  /** Final URL after redirects */
  url: string;

  /** Request timing information */
  timings: {
    start: number;
    socket: number;
    lookup: number;
    connect: number;
    secureConnect: number;
    upload: number;
    response: number;
    end: number;
  };
}

interface OptionsInit {
  /** HTTP method */
  method?: HttpMethod;

  /** Request headers */
  headers?: Dictionary<string>;

  /** Request body */
  body?: string | Buffer;

  /** Request timeout in milliseconds */
  timeout?: number;

  /** Whether to follow redirects */
  followRedirect?: boolean;

  /** Maximum number of redirects to follow */
  maxRedirects?: number;

  /** Proxy URL */
  proxy?: string;

  /** User agent string */
  userAgent?: string;

  /** Whether to validate SSL certificates */
  rejectUnauthorized?: boolean;
}

interface CheerioParseOptions {
  /** Whether to parse as XML */
  xmlMode?: boolean;

  /** Whether to decode HTML entities */
  decodeEntities?: boolean;

  /** Whether to lowercase attribute names */
  lowerCaseAttributeNames?: boolean;

  /** Whether to recognize CDATA sections */
  recognizeCDATA?: boolean;

  /** Whether to recognize self-closing tags */
  recognizeSelfClosing?: boolean;
}

interface CrawlerEnqueueLinksOptions {
  /** CSS selector for finding links */
  selector?: string;

  /** Base URL for resolving relative links */
  baseUrl?: string;

  /** URLs to exclude from enqueueing */
  exclude?: (string | RegExp)[];

  /** Glob patterns for URLs to include */
  globs?: string[];

  /** Pseudo-URLs for matching links */
  pseudoUrls?: string[];

  /** Label to assign to enqueued requests */
  label?: string;

  /** Additional data to attach to requests */
  userData?: Dictionary;

  /** Whether to transform relative URLs to absolute */
  transformRequestFunction?: (request: RequestOptions) => RequestOptions;

  /** Request queue to add requests to */
  requestQueue?: RequestQueue;

  /** Maximum number of links to enqueue */
  limit?: number;
}

type HttpMethod = 'GET' | 'POST' | 'PUT' | 'DELETE' | 'HEAD' | 'OPTIONS' | 'PATCH';
type CheerioRoot = ReturnType<typeof cheerio.load>;

Install with Tessl CLI

npx tessl i tessl/npm-crawlee

docs

browser-crawling.md

configuration-proxies.md

core-crawling.md

http-crawling.md

index.md

session-management.md

storage.md

utilities.md

tile.json