or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

configuration.md, crawler.md, index.md, queues.md, rate-limiting.md, utilities.md
tile.json

docs/configuration.md

Configuration Options

Comprehensive configuration system supporting global crawler settings and per-request options.

Capabilities

Crawler Options

Main configuration interface combining global and per-request options.

/**
 * Main crawler configuration interface.
 *
 * Combines every global-only option (each made optional through `Partial`)
 * with all per-request options, so a single object can carry both kinds.
 * It declares no members of its own.
 */
interface CrawlerOptions extends Partial<GlobalOnlyOptions>, RequestOptions {}

Global-Only Options

Configuration options that can only be set at crawler initialization.

/**
 * Options that apply to the crawler as a whole and can only be set when the
 * crawler is initialized.
 *
 * Note that `maxConnections` and `rateLimit` interact: a non-zero `rateLimit`
 * forces `maxConnections` to 1, so `maxConnections` is only meaningful while
 * `rateLimit` is 0.
 */
interface GlobalOnlyOptions {
  /**
   * Maximum number of requests that can be sent simultaneously
   * @default 10
   * @description Only valid if global rateLimit is 0; see `rateLimit`
   */
  maxConnections: number;

  /**
   * Number of priority levels for request queuing
   * @default 10
   * @description Can only be assigned at initialization. A request's
   * `priority` must be less than this value.
   */
  priorityLevels: number;

  /**
   * Global rate limit in milliseconds between requests
   * @default 0
   * @description If > 0, maxConnections is forced to 1
   */
  rateLimit: number;

  /**
   * Skip duplicate requests based on URL
   * @default false
   * @description Uses seenreq for duplicate detection
   */
  skipDuplicates: boolean;

  /**
   * Enable dynamic task reallocation between queues
   * @default false
   * @description Reallocates blocked tasks to other queues
   */
  homogeneous: boolean;

  /**
   * User agent string, or an array of strings for rotation
   * @default undefined
   * @description If array, rotates through user agents
   */
  userAgents?: string | string[];

  /**
   * Mute all warning and error messages
   * @default false
   * @description Only log output is muted; request errors are still thrown
   */
  silence?: boolean;
}

Request Options

Per-request configuration options that can be set globally or per-request.

/**
 * Options that describe an individual request.
 *
 * Each one can be supplied on the crawler (as a default for every request)
 * or on a single request. All members are optional.
 */
interface RequestOptions {
  /**
   * Target URL or function returning URL
   *
   * NOTE(review): the function form is typed as bare `Function`; the expected
   * signature is not shown here.
   */
  url?: string | Function;

  /**
   * HTTP method
   * @default "GET"
   */
  method?: string;

  /**
   * HTTP headers object
   */
  headers?: Record<string, unknown>;

  /**
   * Request body for POST/PUT requests
   */
  body?: string | Record<string, unknown>;

  /**
   * Enable Cheerio jQuery integration
   * @default true
   * @description Adds $ property to response for DOM manipulation
   */
  jQuery?: boolean;

  /**
   * Request timeout in milliseconds
   * @default 15000
   */
  timeout?: number;

  /**
   * Number of retry attempts on failure
   * @default 2
   */
  retries?: number;

  /**
   * Delay between retry attempts in milliseconds
   * @default 2000
   */
  retryInterval?: number;

  /**
   * Request priority level (0 = highest)
   * @default 5
   * @description Must be less than the crawler's priorityLevels setting
   */
  priority?: number;

  /**
   * Response body encoding
   * @default "utf8"
   * @description Set to null to return Buffer
   */
  encoding?: string | null;

  /**
   * Force UTF-8 encoding regardless of headers
   * @default false
   */
  forceUTF8?: boolean;

  /**
   * Parse response body as JSON
   * @default false
   * @description Automatically parses JSON responses
   */
  isJson?: boolean;

  /**
   * Single proxy URL
   */
  proxy?: string;

  /**
   * Array of proxy URLs for rotation
   */
  proxies?: string[];

  /**
   * Enable HTTP/2 support
   * @default false
   */
  http2?: boolean;

  /**
   * Validate SSL certificates
   * @default true
   * @description Set false to ignore SSL errors
   */
  rejectUnauthorized?: boolean;

  /**
   * Enable response decompression
   * @default true
   */
  decompress?: boolean;

  /**
   * Cookie jar instance for session management
   */
  cookieJar?: object;

  /**
   * URL search parameters
   */
  searchParams?: Record<string, unknown>;

  /**
   * HTTP referer header
   * @description Auto-generated from URL if not provided
   */
  referer?: string;

  /**
   * Rate limiter ID for this request
   * @default 0
   * @description Assigns request to specific rate limiter
   */
  rateLimiterId?: number;

  /**
   * Custom user parameters (passed through to callbacks)
   */
  userParams?: unknown;

  /**
   * JSON parsing function
   * @description Custom JSON parser for responses
   */
  parseJson?: Function;

  /**
   * JSON stringification function
   * @description Custom JSON stringifier for request bodies
   */
  stringifyJson?: Function;

  /**
   * Pre-request hook function
   * @description Called before sending request, can modify options.
   * Receives the request options and an optional `done` callback that
   * accepts an error (or null).
   */
  preRequest?: (options: RequestOptions, done?: (error?: Error | null) => void) => void;

  /**
   * Response callback function
   * @description Called when request completes
   *
   * NOTE(review): `done` is typed `unknown` here, yet the usage examples in
   * this document invoke it as `done()` — confirm the intended type.
   */
  callback?: (error: unknown, response: CrawlerResponse, done?: unknown) => void;

  /**
   * Resource release function (internal)
   * @description Used internally to release connections
   */
  release?: () => void;

  /**
   * Skip request event emission
   * @default false
   * @description Prevents 'request' event from being emitted
   */
  skipEventRequest?: boolean;

  /**
   * Process HTML content directly without making HTTP request
   * @description Useful for testing or processing cached content
   *
   * NOTE(review): declared as `boolean` although the description speaks of
   * HTML content — confirm whether a string value is also accepted.
   */
  html?: boolean;

  /**
   * HTTP agent for request customization
   *
   * NOTE(review): typed `any`; the expected agent shape is not specified here.
   */
  agent?: any;

  /**
   * Seenreq configuration for duplicate detection
   *
   * NOTE(review): typed `any`; the configuration shape is not specified here.
   */
  seenreq?: any;
}

Deprecated Options

Legacy options maintained for backward compatibility.

/**
 * Legacy option names kept for backward compatibility.
 *
 * Every member is deprecated and names the current option that replaces it.
 */
interface DeprecatedOptions {
  /**
   * Target URL or function returning URL (legacy name).
   * @deprecated Use "url" instead
   */
  uri?: string | Function;

  /**
   * URL query parameters (legacy name). Unlike "searchParams", this is also
   * typed to accept a plain string.
   * @deprecated Use "searchParams" instead
   */
  qs?: string | Record<string, unknown>;

  /**
   * SSL certificate validation flag (legacy name).
   * @deprecated Use "rejectUnauthorized" instead
   */
  strictSSL?: boolean;

  /**
   * Response body encoding (legacy name).
   * @deprecated Use "encoding" instead
   */
  incomingEncoding?: string | null;

  /**
   * Response decompression flag (legacy name).
   * @deprecated Use "decompress" instead
   */
  gzip?: boolean;

  /**
   * Cookie jar instance (legacy name).
   * @deprecated Use "cookieJar" instead
   */
  jar?: object;

  /**
   * JSON-related function (legacy name).
   * @deprecated Use "parseJson" instead
   */
  jsonReviver?: Function;

  /**
   * JSON-related function (legacy name).
   * @deprecated Use "stringifyJson" instead
   */
  jsonReplacer?: Function;
}

Configuration Examples

Basic Configuration

import Crawler from "crawler";

// maxConnections is omitted on purpose: a non-zero rateLimit forces
// maxConnections to 1, so setting it alongside rateLimit has no effect.
const crawler = new Crawler({
    rateLimit: 1000, // 1 second between requests (sent one at a time)
    timeout: 30000,  // 30 second timeout
    retries: 3,      // Retry failed requests 3 times
    jQuery: true,    // Enable Cheerio
    callback: (error, res, done) => {
        if (!error) {
            // res.$ is available because jQuery (Cheerio) is enabled
            console.log(res.$("title").text());
        }
        done();
    }
});

Advanced Configuration

// User agents to rotate through (an array enables rotation).
const rotatingUserAgents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
];

// Headers applied to requests made by this crawler.
const defaultHeaders = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5"
};

const crawler = new Crawler({
    maxConnections: 10,
    priorityLevels: 5,
    skipDuplicates: true,
    userAgents: rotatingUserAgents,
    headers: defaultHeaders,
    // Log the size of each crawled page, or the error message on failure.
    callback: (error, res, done) => {
        if (!error) {
            console.log(`Crawled ${res.options.url}: ${res.body.length} bytes`);
        } else {
            console.error("Crawl error:", error.message);
        }
        done();
    }
});

Proxy Configuration

// Each variant below is an independent crawler. They carry distinct names so
// the snippet is valid as a whole (a `const` cannot be declared twice in the
// same scope).

// Single proxy
const singleProxyCrawler = new Crawler({
    proxy: "http://proxy.company.com:8080",
    rejectUnauthorized: false, // For self-signed certificates
});

// Rotating proxies
const rotatingProxyCrawler = new Crawler({
    proxies: [
        "http://proxy1.company.com:8080",
        "http://proxy2.company.com:8080",
        "http://proxy3.company.com:8080"
    ]
});

// HTTP/2 with proxy
const http2ProxyCrawler = new Crawler({
    http2: true,
    proxy: "https://http2-proxy.company.com:8080"
});

JSON API Configuration

// maxConnections is omitted on purpose: a non-zero rateLimit forces
// maxConnections to 1, so a larger value would never take effect.
const apiCrawler = new Crawler({
    rateLimit: 500, // 500 ms between requests (sent one at a time)
    headers: {
        "Content-Type": "application/json",
        "User-Agent": "MyApp/1.0"
    },
    isJson: true, // Auto-parse JSON responses
    callback: (error, res, done) => {
        if (!error) {
            // With isJson enabled, res.body holds the parsed JSON
            console.log("API Response:", res.body);
        }
        done();
    }
});

Custom Processing Configuration

const crawler = new Crawler({
    // Hook run before each request is sent; it may modify the options.
    preRequest: (options, done) => {
        // Make sure there is a headers object to attach the token to
        if (!options.headers) {
            options.headers = {};
        }

        // Add authentication token
        const authHeader = `Bearer ${getAuthToken()}`;
        options.headers.Authorization = authHeader;

        console.log(`Processing ${options.url}`);
        done();
    },
    // Custom JSON parsing: malformed input yields an error object instead of throwing.
    parseJson: (text) => {
        let parsed;
        try {
            parsed = JSON.parse(text);
        } catch {
            parsed = { error: "Invalid JSON", raw: text };
        }
        return parsed;
    }
});

Per-Request Option Override

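Any request option set on the crawler can be overridden for an individual request (global-only options such as maxConnections or rateLimit are fixed at initialization):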

// Crawler-wide defaults shared by every request
const crawlerDefaults = {
    timeout: 10000,
    jQuery: true,
    callback: defaultCallback
};
const crawler = new Crawler(crawlerDefaults);

// A single request that replaces some of those defaults
const apiRequest = {
    url: "https://api.example.com",
    timeout: 30000,        // Takes precedence over the global timeout
    jQuery: false,         // No jQuery for this request
    isJson: true,          // Parse the response as JSON
    callback: apiCallback  // Request-specific callback
};
crawler.add(apiRequest);