// A comprehensive configuration system that supports both global crawler settings and per-request options.
// `CrawlerOptions` is the main configuration interface; it combines the global and the per-request options.
/**
 * Main crawler configuration interface.
 *
 * Combines every per-request option from `RequestOptions` with the
 * global-only settings from `GlobalOnlyOptions`. The global-only settings are
 * wrapped in `Partial`, so each of them may be omitted.
 */
interface CrawlerOptions extends Partial<GlobalOnlyOptions>, RequestOptions {}

// Configuration options that can only be set at crawler initialization.
/**
 * Crawler-wide settings. These can only be set at crawler initialization;
 * they are not per-request options.
 */
interface GlobalOnlyOptions {
  /**
   * Maximum number of requests that can be sent simultaneously.
   * @default 10
   * @description Only valid if the global `rateLimit` is 0.
   */
  maxConnections: number;
  /**
   * Number of priority levels used for request queuing.
   * @default 10
   * @description Can only be assigned at initialization.
   */
  priorityLevels: number;
  /**
   * Global rate limit: milliseconds to wait between requests.
   * @default 0
   * @description If greater than 0, `maxConnections` is forced to 1.
   */
  rateLimit: number;
  /**
   * Skip duplicate requests, based on URL.
   * @default false
   * @description Uses seenreq for duplicate detection.
   */
  skipDuplicates: boolean;
  /**
   * Enable dynamic task reallocation between queues.
   * @default false
   * @description Reallocates blocked tasks to other queues.
   */
  homogeneous: boolean;
  /**
   * A single user agent string, or an array of them for rotation.
   * @default undefined
   * @description If an array is given, the crawler rotates through its entries.
   */
  userAgents?: string | string[];
  /**
   * Mute all warning and error messages.
   * @default false
   * @description Request errors are still thrown.
   */
  silence?: boolean;
}

// Per-request configuration options that can be set globally or per-request.
/**
 * Options describing a request. They can be set globally (as crawler
 * defaults) or on an individual request.
 */
interface RequestOptions {
  /**
   * Target URL, or a function returning the URL.
   */
  url?: string | Function;
  /**
   * HTTP method.
   * @default "GET"
   */
  method?: string;
  /**
   * HTTP headers object.
   */
  headers?: Record<string, unknown>;
  /**
   * Request body for POST/PUT requests.
   */
  body?: string | Record<string, unknown>;
  /**
   * Enable Cheerio jQuery integration.
   * @default true
   * @description Adds a `$` property to the response for DOM manipulation.
   */
  jQuery?: boolean;
  /**
   * Request timeout in milliseconds.
   * @default 15000
   */
  timeout?: number;
  /**
   * Number of retry attempts on failure.
   * @default 2
   */
  retries?: number;
  /**
   * Delay between retry attempts, in milliseconds.
   * @default 2000
   */
  retryInterval?: number;
  /**
   * Request priority level (0 = highest).
   * @default 5
   * @description Must be less than `priorityLevels`.
   */
  priority?: number;
  /**
   * Response body encoding.
   * @default "utf8"
   * @description Set to `null` to receive the body as a Buffer.
   */
  encoding?: string | null;
  /**
   * Force UTF-8 encoding regardless of headers.
   * @default false
   */
  forceUTF8?: boolean;
  /**
   * Parse the response body as JSON.
   * @default false
   * @description Automatically parses JSON responses.
   */
  isJson?: boolean;
  /**
   * Single proxy URL.
   */
  proxy?: string;
  /**
   * Array of proxy URLs for rotation.
   */
  proxies?: string[];
  /**
   * Enable HTTP/2 support.
   * @default false
   */
  http2?: boolean;
  /**
   * Validate SSL certificates.
   * @default true
   * @description Set to `false` to ignore SSL errors.
   */
  rejectUnauthorized?: boolean;
  /**
   * Enable response decompression.
   * @default true
   */
  decompress?: boolean;
  /**
   * Cookie jar instance for session management.
   */
  cookieJar?: object;
  /**
   * URL search parameters.
   */
  searchParams?: Record<string, unknown>;
  /**
   * HTTP referer header.
   * @description Auto-generated from the URL if not provided.
   */
  referer?: string;
  /**
   * Rate limiter ID for this request.
   * @default 0
   * @description Assigns the request to a specific rate limiter.
   */
  rateLimiterId?: number;
  /**
   * Custom user parameters (passed through to callbacks).
   */
  userParams?: unknown;
  /**
   * JSON parsing function.
   * @description Custom JSON parser for responses.
   */
  parseJson?: Function;
  /**
   * JSON stringification function.
   * @description Custom JSON stringifier for request bodies.
   */
  stringifyJson?: Function;
  /**
   * Pre-request hook.
   * @description Called before the request is sent; it receives the request
   * options and may modify them. The optional `done` argument accepts an
   * optional error (or `null`).
   */
  preRequest?: (options: RequestOptions, done?: (error?: Error | null) => void) => void;
  /**
   * Response callback.
   * @description Called when the request completes. Note that `error` is
   * typed `unknown`, so narrow it before reading properties such as `message`.
   */
  callback?: (error: unknown, response: CrawlerResponse, done?: unknown) => void;
  /**
   * Resource release function (internal).
   * @description Used internally to release connections.
   */
  release?: () => void;
  /**
   * Skip request event emission.
   * @default false
   * @description Prevents the 'request' event from being emitted.
   */
  skipEventRequest?: boolean;
  /**
   * Process HTML content directly without making an HTTP request.
   * @description Useful for testing or for processing cached content.
   */
  html?: boolean;
  /**
   * HTTP agent for request customization.
   */
  agent?: any;
  /**
   * Seenreq configuration for duplicate detection.
   */
  seenreq?: any;
}

// Legacy options maintained for backward compatibility.
/**
 * Legacy option names maintained for backward compatibility. Every member is
 * deprecated; the option that replaces it is named in its `@deprecated` tag.
 *
 * NOTE(review): `CrawlerOptions` as declared in this file does not extend this
 * interface — confirm where these legacy keys are actually accepted.
 */
interface DeprecatedOptions {
  /** @deprecated Use "url" instead */
  uri?: string | Function;

  /** @deprecated Use "searchParams" instead */
  qs?: string | Record<string, unknown>;

  /** @deprecated Use "rejectUnauthorized" instead */
  strictSSL?: boolean;

  /** @deprecated Use "encoding" instead */
  incomingEncoding?: string | null;

  /** @deprecated Use "decompress" instead */
  gzip?: boolean;

  /** @deprecated Use "cookieJar" instead */
  jar?: object;

  /** @deprecated Use "parseJson" instead */
  jsonReviver?: Function;

  /** @deprecated Use "stringifyJson" instead */
  jsonReplacer?: Function;
}

import Crawler from "crawler";
// Usage examples. Each crawler gets its own name so the snippets can live in
// one scope without redeclaring the same `const`.

// Basic crawler: prints the <title> of every page it fetches.
const basicCrawler = new Crawler({
  // Per the option docs, a non-zero rateLimit forces maxConnections to 1,
  // so this value only matters if rateLimit is set back to 0.
  maxConnections: 5,
  rateLimit: 1000, // 1 second between requests
  timeout: 30000, // 30 second timeout
  retries: 3, // Retry failed requests 3 times
  jQuery: true, // Enable Cheerio
  callback: (error, res, done) => {
    if (!error) {
      console.log(res.$("title").text());
    }
    done();
  }
});

// Global-only settings combined with default headers and user-agent rotation.
const rotatingAgentCrawler = new Crawler({
  maxConnections: 10,
  priorityLevels: 5,
  skipDuplicates: true,
  userAgents: [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
  ],
  headers: {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5"
  },
  callback: (error, res, done) => {
    if (error) {
      // The callback's error is typed `unknown`; only Error instances are
      // guaranteed to carry a `message`.
      console.error("Crawl error:", error instanceof Error ? error.message : error);
    } else {
      console.log(`Crawled ${res.options.url}: ${res.body.length} bytes`);
    }
    done();
  }
});

// Single proxy
const proxyCrawler = new Crawler({
  proxy: "http://proxy.company.com:8080",
  // For self-signed certificates. This turns SSL certificate validation off,
  // so SSL errors are ignored for every request made through this crawler.
  rejectUnauthorized: false,
});

// Rotating proxies
const rotatingProxyCrawler = new Crawler({
  proxies: [
    "http://proxy1.company.com:8080",
    "http://proxy2.company.com:8080",
    "http://proxy3.company.com:8080"
  ]
});

// HTTP/2 with proxy
const http2ProxyCrawler = new Crawler({
  http2: true,
  proxy: "https://http2-proxy.company.com:8080"
});

// JSON API crawler: sends JSON headers and auto-parses JSON responses.
const apiCrawler = new Crawler({
  // As above: with rateLimit > 0, maxConnections is forced to 1.
  maxConnections: 3,
  rateLimit: 500,
  headers: {
    "Content-Type": "application/json",
    "User-Agent": "MyApp/1.0"
  },
  isJson: true, // Auto-parse JSON responses
  callback: (error, res, done) => {
    if (!error) {
      console.log("API Response:", res.body);
    }
    done();
  }
});

// Hooks: a pre-request hook that injects auth, and a custom JSON parser.
const hookedCrawler = new Crawler({
  preRequest: (options, done) => {
    // Add authentication token
    options.headers = options.headers || {};
    options.headers.Authorization = `Bearer ${getAuthToken()}`;
    console.log(`Processing ${options.url}`);
    done();
  },
  parseJson: (text) => {
    // Custom JSON parsing with error handling: malformed input yields a
    // descriptive object instead of throwing.
    try {
      return JSON.parse(text);
    } catch (e) {
      return { error: "Invalid JSON", raw: text };
    }
  }
});

// Any global option can be overridden per request:
// Crawler-wide defaults: used by every queued request unless that request
// supplies its own value for the same option.
const crawler = new Crawler({
  timeout: 10000,
  jQuery: true,
  callback: defaultCallback,
});

// One request that replaces several of the defaults for itself only.
crawler.add({
  url: "https://api.example.com",
  timeout: 30000, // takes the place of the global timeout
  jQuery: false, // jQuery is switched off for this request
  isJson: true, // JSON parsing is switched on
  callback: apiCallback, // handled by a different callback
});