Internal helper functions for type checking, object manipulation, URL validation, and data processing used throughout the crawler system. These utilities are not directly exported from the main crawler module.
Functions for runtime type detection and validation.
/**
* Get the type of a value as a lowercase string
* @param value - Value to check
* @returns Type name in lowercase (e.g., "string", "number", "object"); per the usage examples, arrays yield "array", null yields "null", and undefined yields "undefined"
*/
function getType(value: unknown): string;
/**
* Check if value is a number and not NaN
* @param value - Value to check
* @returns True if value is a valid number; per the usage examples, Infinity is accepted while NaN and numeric strings such as "42" are rejected
*/
function isNumber(value: unknown): boolean;
/**
* Check if value is a function
* @param value - Value to check
* @returns True if value is a function
*/
function isFunction(value: unknown): boolean;
/**
* Check if value is a boolean
* @param value - Value to check
* @returns True if value is a boolean
*/
function isBoolean(value: unknown): boolean;
Usage Examples:
// These utilities are internal to the crawler and not directly accessible
// They are used internally by the crawler for type checking and validation
// Type detection (internal usage)
console.log(getType("hello")); // "string"
console.log(getType(42)); // "number"
console.log(getType([])); // "array"
console.log(getType({})); // "object"
console.log(getType(null)); // "null"
console.log(getType(undefined)); // "undefined"
// Number validation
console.log(isNumber(42)); // true
console.log(isNumber("42")); // false
console.log(isNumber(NaN)); // false
console.log(isNumber(Infinity)); // true
// Function validation
console.log(isFunction(() => {})); // true
console.log(isFunction("function")); // false
// Boolean validation
console.log(isBoolean(true)); // true
console.log(isBoolean("true")); // false
console.log(isBoolean(1)); // false
Functions for working with objects and setting default values.
/**
* Set default values for an object
* @param target - Target object to modify
* @param source - Source object containing default values
* @returns Modified target object
* @description Adds properties from source to target if they don't exist in target
*/
function setDefaults(
target: Record<string, unknown>,
source: Record<string, unknown>
): Record<string, unknown>;
/**
* Remove null and undefined values from an object
* @param obj - Object to clean
* @returns Cleaned object with null/undefined values removed
* @description Recursively removes null/undefined but preserves empty objects
*/
function cleanObject(obj: Record<string, unknown>): Record<string, unknown>;
/**
* Convert all object keys to lowercase
* @param obj - Object to transform
* @returns New object with all keys in lowercase
*/
function lowerObjectKeys(obj: Record<string, unknown>): Record<string, unknown>;
Usage Examples:
// These utilities are internal to the crawler and not directly accessible
// Setting defaults
const userOptions = { timeout: 5000 };
const defaultOptions = {
timeout: 10000,
retries: 3,
method: "GET"
};
setDefaults(userOptions, defaultOptions);
console.log(userOptions);
// { timeout: 5000, retries: 3, method: "GET" }
// Cleaning objects
const dirtyData = {
name: "John",
age: null,
email: "john@example.com",
phone: undefined,
address: {
street: "123 Main St",
city: null,
zip: "12345"
}
};
const cleanData = cleanObject(dirtyData);
console.log(cleanData);
// {
// name: "John",
// email: "john@example.com",
// address: { street: "123 Main St", zip: "12345" }
// }
// Lowercasing keys
const headers = {
"Content-Type": "application/json",
"Authorization": "Bearer token",
"X-Custom-Header": "value"
};
const normalizedHeaders = lowerObjectKeys(headers);
console.log(normalizedHeaders);
// {
// "content-type": "application/json",
// "authorization": "Bearer token",
// "x-custom-header": "value"
// }
Function for validating URL strings.
/**
* Validate if a string is a valid URL
* @param url - String to validate
* @returns True if string is a valid URL
*/
function isValidUrl(url: string): boolean;
Usage Examples:
import { isValidUrl } from "crawler";
// Valid URLs
console.log(isValidUrl("https://example.com")); // true
console.log(isValidUrl("http://localhost:3000")); // true
console.log(isValidUrl("ftp://files.example.com")); // true
console.log(isValidUrl("https://example.com/path?q=1")); // true
// Invalid URLs
console.log(isValidUrl("not-a-url")); // false
console.log(isValidUrl("://missing-protocol")); // false
console.log(isValidUrl("")); // false
console.log(isValidUrl("https://")); // false
// Use in request validation
/**
 * Validate a crawl request given either as a URL string or as an options object.
 * @param options - URL string, or an options object that may carry a `url` property
 * @returns `{ url }` wrapping a string input; otherwise the options value itself, unchanged
 * @throws Error "Invalid URL provided" when a string input fails isValidUrl
 * @throws Error "Invalid URL in options" when a present `url` is not a string accepted by isValidUrl
 */
function validateRequest(options: any) {
  if (typeof options === "string") {
    if (!isValidUrl(options)) {
      throw new Error("Invalid URL provided");
    }
    return { url: options };
  }
  // Compare against null/undefined instead of relying on truthiness, so an
  // empty-string (or otherwise falsy) url is validated rather than waved through.
  const url: unknown = options.url;
  if (url != null && (typeof url !== "string" || !isValidUrl(url))) {
    throw new Error("Invalid URL in options");
  }
  return options;
}
Functions for working with nested arrays.
/**
* Flatten nested arrays recursively
* @param array - Array to flatten (can contain nested arrays)
* @returns Flattened array with all nesting removed
*/
function flattenDeep(array: any[]): any[];
Usage Examples:
import { flattenDeep } from "crawler";
// Simple flattening
const simple = [1, [2, 3], 4];
console.log(flattenDeep(simple)); // [1, 2, 3, 4]
// Deep nesting
const deep = [1, [2, [3, [4, 5]]], 6];
console.log(flattenDeep(deep)); // [1, 2, 3, 4, 5, 6]
// Mixed types
const mixed = ["a", ["b", ["c"]], "d", [1, [2]]];
console.log(flattenDeep(mixed)); // ["a", "b", "c", "d", 1, 2]
// Use with crawler requests
const requestBatches = [
"https://site1.com",
["https://site2.com", "https://site3.com"],
[
"https://site4.com",
["https://site5.com", "https://site6.com"]
]
];
const allUrls = flattenDeep(requestBatches);
crawler.add(allUrls);
import {
getType,
isValidUrl,
setDefaults,
cleanObject,
lowerObjectKeys
} from "crawler";
/**
* Request options accepted and returned by validateAndNormalizeRequest.
* Every field is optional; validateAndNormalizeRequest fills omitted fields
* from the defaults its caller passes in.
*/
interface RequestOptions {
url?: string;
method?: string;
// Header names are lowercased by validateAndNormalizeRequest before it returns
headers?: Record<string, unknown>;
timeout?: number;
}
/**
 * Validate a request given as a URL string or an options object, then
 * normalize it: null/undefined values are removed, options the caller did not
 * provide are filled from `defaults`, and header names are lowercased.
 * @param input - URL string or options object
 * @param defaults - Values applied to options missing from the request
 * @returns The normalized request options
 * @throws Error "Invalid URL string" when a string input fails isValidUrl
 * @throws Error "Request must be string URL or options object" for any other non-object input
 * @throws Error "Invalid URL in options" when a present `url` is not a string accepted by isValidUrl
 */
function validateAndNormalizeRequest(
  input: unknown,
  defaults: RequestOptions
): RequestOptions {
  const inputType = getType(input);
  let raw: Record<string, unknown>;

  if (inputType === "string") {
    if (!isValidUrl(input as string)) {
      throw new Error("Invalid URL string");
    }
    raw = { url: input };
  } else if (inputType === "object" && input !== null) {
    raw = input as Record<string, unknown>;
    // Check against null/undefined rather than truthiness: an empty-string url
    // must be rejected here just as it is when passed as a bare string. A null
    // url is tolerated because cleanObject drops it below.
    const url = raw.url;
    if (url != null && (typeof url !== "string" || !isValidUrl(url))) {
      throw new Error("Invalid URL in options");
    }
  } else {
    throw new Error("Request must be string URL or options object");
  }

  // Drop null/undefined values so they cannot block a default from applying.
  const cleaned = cleanObject(raw);

  // setDefaults fills `cleaned` in place. Spreading `defaults` yields a plain
  // object type, which satisfies the helper's Record<string, unknown> parameter.
  setDefaults(cleaned, { ...defaults });

  const options = cleaned as RequestOptions;
  if (options.headers) {
    options.headers = lowerObjectKeys(options.headers);
  }
  return options;
}
// Usage
const defaults = {
  method: "GET",
  timeout: 10000,
  headers: { "user-agent": "MyCrawler/1.0" }
};
try {
  const request1 = validateAndNormalizeRequest("https://example.com", defaults);
  const request2 = validateAndNormalizeRequest({
    url: "https://api.example.com",
    method: "POST",
    headers: { "Content-Type": "application/json" },
    timeout: null // Will be cleaned out, then refilled from defaults
  }, defaults);
  console.log(request1);
  // { url: "https://example.com", method: "GET", timeout: 10000, headers: ... }
  console.log(request2);
  // { url: "https://api.example.com", method: "POST", timeout: 10000, headers: { "content-type": "application/json", "user-agent": "MyCrawler/1.0" } }
  // NOTE(review): the "user-agent" entry above assumes setDefaults merges nested
  // objects; its documented contract only mentions adding properties missing
  // from the target, in which case headers would hold "content-type" alone — verify.
} catch (error) {
  // A thrown value is not guaranteed to be an Error, so narrow before reading `.message`.
  const message = error instanceof Error ? error.message : String(error);
  console.error("Validation failed:", message);
}
import { setDefaults, isBoolean, isNumber, cleanObject } from "crawler";
/**
* Crawler options handled by ConfigurationManager.mergeConfig.
* All fields are optional in user input. When present, mergeConfig requires
* maxConnections, rateLimit, and timeout to pass isNumber and jQuery to pass isBoolean.
*/
interface CrawlerConfig {
maxConnections?: number;
rateLimit?: number;
jQuery?: boolean;
timeout?: number;
headers?: Record<string, unknown>;
}
/**
 * Builds an effective crawler configuration by validating user-supplied
 * options and filling whatever is missing from built-in defaults.
 */
class ConfigurationManager {
  /** Fallback values used for any option the caller leaves unset. */
  private readonly globalDefaults: CrawlerConfig = {
    maxConnections: 10,
    rateLimit: 0,
    jQuery: true,
    timeout: 15000,
    headers: {}
  };

  /**
   * Validate a user configuration and merge it with the global defaults.
   * User-provided values take precedence; defaults only fill the gaps.
   * @param userConfig - Options supplied by the caller; null/undefined entries are discarded
   * @returns A new configuration object
   * @throws Error "<option> must be a number" when maxConnections, rateLimit, or timeout is present but fails isNumber
   * @throws Error "jQuery must be a boolean" when jQuery is present but fails isBoolean
   */
  mergeConfig(userConfig: Partial<CrawlerConfig>): CrawlerConfig {
    // Strip null/undefined first so "unset" options fall through to the defaults.
    const cleaned = cleanObject(userConfig) as CrawlerConfig;

    for (const key of ["maxConnections", "rateLimit", "timeout"] as const) {
      if (cleaned[key] !== undefined && !isNumber(cleaned[key])) {
        throw new Error(`${key} must be a number`);
      }
    }
    if (cleaned.jQuery !== undefined && !isBoolean(cleaned.jQuery)) {
      throw new Error("jQuery must be a boolean");
    }

    // setDefaults only adds properties that are missing from its first
    // argument, so the user's values must be the target and the defaults the
    // source. The reverse order would return the defaults untouched.
    const config = { ...cleaned };
    setDefaults(config, {
      ...this.globalDefaults,
      // Copy the default headers so a caller mutating the result cannot
      // alter the shared defaults.
      headers: { ...this.globalDefaults.headers }
    });
    return config;
  }
}
// Usage
const configManager = new ConfigurationManager();
const userConfig = {
maxConnections: 5,
jQuery: false,
invalidValue: null, // Will be cleaned out
headers: {
"User-Agent": "MyBot/1.0",
"Accept": "text/html"
}
};
const finalConfig = configManager.mergeConfig(userConfig);
console.log(finalConfig);
import { getType, isFunction, isBoolean, flattenDeep } from "crawler";
/**
 * Turns a loosely-typed request configuration (URL string, options object, or
 * an arbitrarily nested array of either) into a flat list of request objects.
 */
class RequestProcessor {
  /**
   * Dispatch on the runtime type reported by getType.
   * @param config - URL string, options object, or (nested) array of those
   * @returns The processed requests together with their count
   * @throws Error when the config is not a string, array, or object
   */
  processRequestConfig(config: unknown): ProcessedConfig {
    const kind = getType(config);
    if (kind === "string") {
      return this.processUrlString(config as string);
    }
    if (kind === "array") {
      return this.processRequestArray(config as unknown[]);
    }
    if (kind === "object") {
      return this.processRequestObject(config as Record<string, unknown>);
    }
    throw new Error(`Unsupported config type: ${kind}`);
  }

  /** Wrap a bare URL in a single GET request. */
  private processUrlString(url: string): ProcessedConfig {
    const request = { url, method: "GET" };
    return { requests: [request], count: 1 };
  }

  /** Flatten nested arrays, then process every entry on its own. */
  private processRequestArray(requests: unknown[]): ProcessedConfig {
    const toSingleRequest = (entry: unknown) =>
      this.processRequestConfig(entry).requests[0];
    const collected = flattenDeep(requests).map(toSingleRequest);
    return { requests: collected, count: collected.length };
  }

  /** Copy an options object, flagging callbacks and coercing the jQuery flag. */
  private processRequestObject(obj: Record<string, unknown>): ProcessedConfig {
    const request: Record<string, unknown> = { ...obj };
    const { callback, jQuery } = request;

    // Record that a usable callback was supplied.
    if (callback && isFunction(callback)) {
      request.hasCallback = true;
    }

    // A jQuery value that is set but not a boolean falls back to true.
    if (jQuery !== undefined && !isBoolean(jQuery)) {
      request.jQuery = true;
    }

    return { requests: [request], count: 1 };
  }
}
interface ProcessedConfig {
requests: Record<string, unknown>[];
count: number;
}
import { getType, isValidUrl, isNumber } from "crawler";
/**
 * Validate a request without throwing: every outcome, including unexpected
 * exceptions, is reported through the returned RequestResult.
 * @param input - URL string or request options object
 * @returns `success: true` with the request data, or `success: false` with an
 *          error message and the value that was rejected
 */
function safeProcessRequest(input: unknown): RequestResult {
  try {
    const inputType = getType(input);

    if (inputType === "string") {
      const url = input as string;
      if (!isValidUrl(url)) {
        return { success: false, error: "Invalid URL format", input: url };
      }
      return { success: true, data: { url, method: "GET" } };
    }

    if (inputType === "object" && input !== null) {
      const obj = input as Record<string, unknown>;
      const { url, timeout } = obj;

      // Test for presence with `!= null` rather than truthiness: falsy values
      // such as "" (url) or NaN (timeout) are invalid and must not be skipped.
      if (url != null && (typeof url !== "string" || !isValidUrl(url))) {
        return { success: false, error: "Invalid URL in object", input: url };
      }
      if (timeout != null && !isNumber(timeout)) {
        return { success: false, error: "Invalid timeout value", input: timeout };
      }
      return { success: true, data: obj };
    }

    return {
      success: false,
      error: `Unsupported input type: ${inputType}`,
      input
    };
  } catch (error) {
    // Anything can be thrown, so only read `.message` from real Error objects.
    return {
      success: false,
      error: error instanceof Error ? error.message : String(error),
      input
    };
  }
}
/**
* Outcome returned by safeProcessRequest.
*/
interface RequestResult {
// True when the input was accepted; false for a validation failure or a caught exception
success: boolean;
// Request data; only set on success
data?: Record<string, unknown>;
// Failure description; only set when success is false
error?: string;
// The rejected value (the whole input, or the offending url/timeout); only set when success is false
input?: unknown;
}