Internal advanced rate limiting system with multiple limiters, priority queues, and cluster management for controlling request frequency and concurrency. These rate limiting classes are not directly exported from the main crawler module.
Individual rate limiter managing request scheduling and execution.
/**
* Individual rate limiter for request scheduling
*/
class RateLimiter {
constructor(options: RateLimiterOptions);
/** Rate limiter ID */
id?: number;
/** Maximum concurrent connections */
maxConnections: number;
/** Rate limit delay in milliseconds */
rateLimit: number;
/** Number of currently running tasks */
runningSize: number;
/** Number of priority levels */
priorityLevels: number;
/** Default priority for tasks */
defaultPriority: number;
/** Next allowed request time */
nextRequestTime: number;
/** Number of waiting tasks (readonly) */
readonly waitingSize: number;
}
Creates a new rate limiter with specified configuration.
/**
* Rate limiter configuration options
*/
interface RateLimiterOptions {
/** Maximum concurrent connections */
maxConnections: number;
/** Rate limit delay in milliseconds */
rateLimit: number;
/** Number of priority levels */
priorityLevels: number;
/** Default priority for tasks */
defaultPriority: number;
/** Optional cluster reference */
cluster?: Cluster;
}
/**
* Creates a new rate limiter
* @param options - Rate limiter configuration
*/
constructor(options: RateLimiterOptions);
Submits a task to the rate limiter for execution.
/**
* Task function signature
*/
type Task = (done: () => void, limiter?: number) => void;
/**
* Submit a task for execution
* @param options - Priority configuration or priority number
* @param task - Task function to execute
*/
submit(options: {priority: number} | number, task: Task): void;
Usage Example:
const rateLimiter = new RateLimiter({
maxConnections: 3,
rateLimit: 1000, // 1 second between requests
priorityLevels: 5,
defaultPriority: 2
});
// Submit high priority task
rateLimiter.submit(0, (done) => {
console.log("High priority task executing");
// Perform task work
setTimeout(() => {
console.log("High priority task completed");
done(); // Signal completion
}, 500);
});
// Submit normal priority task
rateLimiter.submit(2, (done) => {
console.log("Normal priority task executing");
done();
});
Dynamically updates the rate limit for the limiter.
/**
* Update the rate limit
* @param rateLimit - New rate limit in milliseconds
*/
setRateLimit(rateLimit: number): void;
Usage Example:
// Start with 1 second rate limit
const limiter = new RateLimiter({
maxConnections: 1,
rateLimit: 1000,
priorityLevels: 3,
defaultPriority: 1
});
// Increase rate limit to 2 seconds
limiter.setRateLimit(2000);
// Remove rate limit
limiter.setRateLimit(0);
Directly removes and returns a task from the rate limiter's internal queue without scheduling.
/**
* Directly dequeue a task from the internal queue
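* @returns Task function from the queue. NOTE: the usage example guards the result with `if (task)`, so the result is presumably empty (undefined) when no task is queued — confirm against the implementation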
* @description Internal method for direct task removal, bypasses scheduling
*/
directDequeue(): Task;
Usage Example:
const limiter = new RateLimiter({
maxConnections: 2,
rateLimit: 500,
priorityLevels: 3,
defaultPriority: 1
});
// Add some tasks
limiter.submit(0, (done) => {
console.log("Task 1");
done();
});
limiter.submit(1, (done) => {
console.log("Task 2");
done();
});
// Directly dequeue without waiting for scheduling
const task = limiter.directDequeue();
if (task) {
task(() => console.log("Task executed directly"));
}
Methods to check task status and queue state.
/**
* Check if there are waiting tasks in this limiter or cluster
* @returns True if tasks are waiting
*/
hasWaitingTasks(): boolean;
/**
* Set the limiter ID
* @param id - Unique identifier for this limiter
*/
setId(id: number): void;
Manages multiple rate limiters with shared task distribution.
/**
* Rate limiter cluster for managing multiple limiters
*/
class Cluster {
constructor(options: ClusterOptions);
/** Global maximum connections setting */
globalMaxConnections: number;
/** Global rate limit setting */
globalRateLimit: number;
/** Global priority levels setting */
globalpriorityLevels: number;
/** Global default priority setting */
globalDefaultPriority: number;
/** Total waiting tasks across all limiters (readonly) */
readonly waitingSize: number;
/** Total unfinished tasks across all limiters (readonly) */
readonly unfinishedSize: number;
/** Cluster empty status (readonly) */
readonly empty: boolean;
/** Human-readable status string (readonly) */
readonly status: string;
}
Creates a new cluster manager for rate limiters.
/**
* Cluster configuration options
*/
interface ClusterOptions extends RateLimiterOptions {
/** Enable homogeneous task distribution */
homogeneous?: boolean;
}
/**
* Creates a new cluster manager
* @param options - Cluster configuration
*/
constructor(options: ClusterOptions);
Retrieves or creates a rate limiter by ID.
/**
* Get or create a rate limiter by ID
* @param id - Rate limiter ID (defaults to 0)
* @returns Rate limiter instance
*/
getRateLimiter(id?: number): RateLimiter;
Usage Example:
const cluster = new Cluster({
maxConnections: 5,
rateLimit: 500,
priorityLevels: 3,
defaultPriority: 1,
homogeneous: true
});
// Get default limiter (ID 0)
const defaultLimiter = cluster.getRateLimiter();
// Get specific limiter (creates if doesn't exist)
const domainLimiter = cluster.getRateLimiter(1);
const apiLimiter = cluster.getRateLimiter(2);
// Each limiter inherits cluster settings
console.log(domainLimiter.maxConnections); // 5
console.log(apiLimiter.rateLimit); // 500
Methods for managing rate limiters within the cluster.
/**
* Check if a rate limiter exists
* @param id - Rate limiter ID
* @returns True if limiter exists
*/
hasRateLimiter(id: number): boolean;
/**
* Delete a rate limiter from the cluster
* @param id - Rate limiter ID to delete
* @returns True if limiter was deleted
*/
deleteRateLimiter(id: number): boolean;
/**
* Check if any limiter has waiting tasks
* @returns True if any limiter has waiting tasks
*/
hasWaitingTasks(): boolean;
Properties for monitoring cluster state.
/**
* @deprecated Use waitingSize instead
*/
readonly waitingClients: number;
/**
* @deprecated Use unfinishedSize instead
*/
readonly unfinishedClients: number;
Internal structure for wrapped tasks with limiter information.
/**
* Wrapped task with rate limiter context
*/
interface TaskWrapper {
/** Task function to execute */
next: Task;
/** ID of the rate limiter handling this task */
rateLimiterId?: number;
}
// RateLimiter is used internally by the crawler and not directly accessible
// Create rate limiter: max 2 concurrent, 1 second between requests
const limiter = new RateLimiter({
maxConnections: 2,
rateLimit: 1000,
priorityLevels: 3,
defaultPriority: 1
});
// Submit tasks with different priorities
for (let i = 0; i < 10; i++) {
const priority = i < 3 ? 0 : 1; // First 3 are high priority
limiter.submit(priority, (done) => {
console.log(`Task ${i} started (priority ${priority})`);
// Simulate work
setTimeout(() => {
console.log(`Task ${i} completed`);
done();
}, Math.random() * 2000);
});
}
// Cluster is used internally by the crawler and not directly accessible
// Create cluster for multi-domain crawling
const cluster = new Cluster({
  maxConnections: 3,
  rateLimit: 500,
  priorityLevels: 5,
  defaultPriority: 2,
  homogeneous: true // Enable task redistribution
});
// Domain-specific limiters
const googleLimiter = cluster.getRateLimiter(1);
const githubLimiter = cluster.getRateLimiter(2);
const apiLimiter = cluster.getRateLimiter(3);
// Configure different rate limits per domain
googleLimiter.setRateLimit(2000); // Slower for Google
apiLimiter.setRateLimit(100); // Faster for API
// Monitor cluster status every 5 seconds.
// The timer handle is kept so polling can be stopped once the cluster
// reports empty; an uncleared interval would keep firing (and keep the
// Node.js process alive) after all work is done.
const statusMonitor = setInterval(() => {
  console.log("Cluster status:", cluster.status);
  console.log("Waiting tasks:", cluster.waitingSize);
  console.log("Total unfinished:", cluster.unfinishedSize);
  if (cluster.empty) {
    console.log("All tasks completed!");
    clearInterval(statusMonitor);
  }
}, 5000);
import Crawler from "crawler";
// Crawler that routes requests through per-domain rate limiters.
const crawler = new Crawler({
  maxConnections: 5,
  priorityLevels: 10,
  homogeneous: true, // Enable task redistribution
  callback: (error, res, done) => {
    if (!error) {
      console.log(`Crawled: ${res.options.url}`);
    }
    done();
  }
});
// Use different rate limiters for different domains
crawler.add({
  url: "https://api.github.com/users",
  rateLimiterId: 1, // GitHub API limiter
  priority: 1
});
crawler.add({
  url: "https://www.google.com",
  rateLimiterId: 2, // Google limiter
  priority: 5
});
// Adjust rate limits dynamically
crawler.setLimiter(1, "rateLimit", 1000); // 1 second for GitHub API
crawler.setLimiter(2, "rateLimit", 3000); // 3 seconds for Google

// Adaptive rate limiting: back off when a response reports rate limiting,
// and speed up gradually while requests succeed.
// Lower bound for the adaptive delay. It is also the first back-off step:
// if limiter 0 currently has no delay (0 ms), doubling alone would stay at 0.
const MIN_RATE_LIMIT_MS = 100;
const adaptiveCrawler = new Crawler({
  maxConnections: 10,
  callback: (error, res, done) => {
    // Adjust rate limit based on response
    if (error && error.message.includes("rate limit")) {
      // Increase rate limit if we hit limits
      // NOTE(review): `_limiters` is underscore-prefixed, presumably an
      // internal field — confirm it is reachable in your crawler version.
      const currentLimit = adaptiveCrawler._limiters.getRateLimiter(0).rateLimit;
      const increasedLimit = Math.max(MIN_RATE_LIMIT_MS, currentLimit * 2);
      adaptiveCrawler.setLimiter(0, "rateLimit", increasedLimit);
      console.log(`Rate limit increased to ${increasedLimit}ms`);
    } else if (!error) {
      // Decrease rate limit on success (be more aggressive)
      const currentLimit = adaptiveCrawler._limiters.getRateLimiter(0).rateLimit;
      if (currentLimit > MIN_RATE_LIMIT_MS) {
        // Round so the delay stays a whole number of milliseconds.
        const decreasedLimit = Math.max(MIN_RATE_LIMIT_MS, Math.round(currentLimit * 0.9));
        adaptiveCrawler.setLimiter(0, "rateLimit", decreasedLimit);
      }
    }
    done();
  }
});
// Conservative approach for unknown sites
const conservativeCrawler = new Crawler({
  maxConnections: 2,
  rateLimit: 2000, // 2 seconds between requests
  retries: 3,
  retryInterval: 5000
});
// Aggressive approach for known APIs
const aggressiveCrawler = new Crawler({
  maxConnections: 10,
  rateLimit: 100, // 0.1 seconds between requests
  timeout: 5000
});

// Single crawler serving several domains, with per-domain response handling.
const multiDomainCrawler = new Crawler({
  homogeneous: true,
  callback: (error, res, done) => {
    const domain = new URL(res.options.url).hostname;
    // Different handling per domain
    switch (domain) {
      case "api.github.com":
        // Handle GitHub API responses
        break;
      case "www.reddit.com":
        // Handle Reddit responses
        break;
      default:
        // Handle general websites
        break;
    }
    done();
  }
});
// Configure rate limits per domain
// Maps a readable domain label to the rate limiter ID used for it.
const domains = {
  github: 1,
  reddit: 2,
  general: 0
};
// Apply the limits to the crawler defined in this example.
multiDomainCrawler.setLimiter(domains.github, "rateLimit", 1000);
multiDomainCrawler.setLimiter(domains.reddit, "rateLimit", 2000);
multiDomainCrawler.setLimiter(domains.general, "rateLimit", 500);