JavaScript SDK for the Firecrawl API that enables comprehensive web scraping, crawling, and data extraction with AI-ready output formats.
Recursive website crawling with configurable limits, path filtering, webhook support, and job monitoring.
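All examples below assume an initialized client named `app`. A minimal setup sketch, assuming the @mendable/firecrawl-js package; the exact import and constructor shape may differ between SDK versions:

import Firecrawl from '@mendable/firecrawl-js';

// Illustrative setup: reads the API key from the environment
const app = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });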
/**
* Start an async crawl job
* @param url - Root URL to crawl
* @param req - Crawl configuration options
* @returns Promise resolving to job ID and URL
*/
startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse>;
/**
* Get crawl job status and partial data
* @param jobId - Crawl job identifier
* @param pagination - Pagination configuration for results
* @returns Promise resolving to job status and data
*/
getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
/**
* Cancel a running crawl job
* @param jobId - Crawl job identifier
* @returns Promise resolving to true if cancelled
*/
cancelCrawl(jobId: string): Promise<boolean>;
/**
* Convenience waiter: start crawl and poll until completion
* @param url - Root URL to crawl
* @param req - Crawl configuration plus waiter controls
* @returns Promise resolving to final job snapshot
*/
crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number }): Promise<CrawlJob>;
/**
* Retrieve crawl errors and robots.txt blocks
* @param crawlId - Crawl job identifier
* @returns Promise resolving to error details
*/
getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse>;
/**
* List active crawls for the authenticated team
* @returns Promise resolving to active crawls list
*/
getActiveCrawls(): Promise<ActiveCrawlsResponse>;
/**
* Preview normalized crawl parameters from natural language
* @param url - Root URL
* @param prompt - Natural language instruction
* @returns Promise resolving to normalized parameters
*/
crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;

interface CrawlOptions {
// Natural language crawl configuration
prompt?: string | null;
// Path filtering
excludePaths?: string[] | null;
includePaths?: string[] | null;
// Crawl behavior
maxDiscoveryDepth?: number | null;
sitemap?: "skip" | "include";
ignoreQueryParameters?: boolean;
limit?: number | null;
crawlEntireDomain?: boolean;
allowExternalLinks?: boolean;
allowSubdomains?: boolean;
// Performance control
delay?: number | null;
maxConcurrency?: number | null;
// Notifications
webhook?: string | WebhookConfig | null;
// Content processing
scrapeOptions?: ScrapeOptions | null;
// Privacy
zeroDataRetention?: boolean;
// Integration tracking
integration?: string;
}

// Crawl initiation response
interface CrawlResponse {
id: string;
url: string;
}
// Crawl job status and data
interface CrawlJob {
status: "scraping" | "completed" | "failed" | "cancelled";
total: number;
completed: number;
creditsUsed?: number;
expiresAt?: string;
next?: string | null;
data: Document[];
}
// Crawl error details
interface CrawlErrorsResponse {
errors: {
id: string;
timestamp?: string;
url: string;
code?: string;
error: string;
}[];
robotsBlocked: string[];
}
// Active crawls listing
interface ActiveCrawlsResponse {
success: boolean;
crawls: ActiveCrawl[];
}
interface ActiveCrawl {
id: string;
teamId: string;
url: string;
options?: Record<string, unknown> | null;
}

interface WebhookConfig {
url: string;
headers?: Record<string, string>;
metadata?: Record<string, string>;
events?: Array<"completed" | "failed" | "page" | "started">;
}

interface PaginationConfig {
// Automatically follow `next` links and aggregate documents
autoPaginate?: boolean;
// Maximum additional pages to fetch after first response
maxPages?: number;
// Maximum total documents to return across all pages
maxResults?: number;
// Maximum time to spend fetching additional pages (seconds)
maxWaitTime?: number;
}

// Simple crawl with limit
const crawlJob = await app.crawl('https://example.com', {
limit: 50,
scrapeOptions: {
formats: ['markdown']
}
});
console.log(`Crawled ${crawlJob.completed} of ${crawlJob.total} pages`);
console.log(crawlJob.data); // Array of scraped documents
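// The crawl() waiter above blocks until completion; per its signature it
// also accepts pollInterval and timeout controls. A sketch; the units are
// assumed to be seconds, so verify against your SDK version
const finished = await app.crawl('https://example.com', {
  limit: 50,
  pollInterval: 2, // assumed: seconds between status checks
  timeout: 120,    // assumed: seconds before the waiter gives up
  scrapeOptions: { formats: ['markdown'] }
});
console.log(`Finished with status: ${finished.status}`);

// Start crawl job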
const crawlResponse = await app.startCrawl('https://example.com', {
limit: 100,
maxConcurrency: 5,
scrapeOptions: {
formats: ['markdown', 'links']
}
});
console.log(`Started crawl job: ${crawlResponse.id}`);
// Monitor status
let job: CrawlJob;
do {
await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
job = await app.getCrawlStatus(crawlResponse.id);
console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');
console.log('Crawl completed!', job.data.length, 'pages scraped');

// Crawl with path filtering
const crawlJob = await app.crawl('https://docs.example.com', {
includePaths: ['/api/*', '/guides/*'],
excludePaths: ['/api/v1/*', '*/deprecated/*'],
limit: 200,
scrapeOptions: {
formats: ['markdown'],
onlyMainContent: true
}
});

// Preview what the natural language prompt will do
const preview = await app.crawlParamsPreview(
'https://blog.example.com',
'Crawl all blog posts from 2024, exclude author pages and tag pages'
);
console.log('Generated parameters:', preview);
// Use natural language prompt
const crawlJob = await app.crawl('https://blog.example.com', {
prompt: 'Crawl all blog posts from 2024, exclude author pages and tag pages',
limit: 500,
scrapeOptions: {
formats: ['markdown', {
type: 'json',
schema: {
type: 'object',
properties: {
title: { type: 'string' },
author: { type: 'string' },
publishDate: { type: 'string' },
content: { type: 'string' },
tags: { type: 'array', items: { type: 'string' } }
}
}
}]
}
});

// Crawl with webhook notifications
const crawlJob = await app.crawl('https://example.com', {
limit: 100,
webhook: {
url: 'https://myapp.com/webhooks/crawl-complete',
headers: {
'Authorization': 'Bearer my-webhook-token'
},
metadata: {
'userId': '12345',
'jobType': 'content-audit'
},
events: ['completed', 'failed', 'page']
},
scrapeOptions: {
formats: ['markdown']
}
});

// Fully configured crawl
const crawlJob = await app.crawl('https://example.com', {
// Crawl configuration
maxDiscoveryDepth: 3,
sitemap: 'include',
crawlEntireDomain: false,
allowSubdomains: true,
allowExternalLinks: false,
ignoreQueryParameters: true,
// Performance
  delay: 1, // 1 second between requests (delay is specified in seconds)
maxConcurrency: 3,
limit: 500,
// Content filtering
includePaths: ['/docs/*', '/api/*'],
excludePaths: ['*/private/*', '/admin/*'],
// Privacy
zeroDataRetention: true,
// Scraping options
scrapeOptions: {
formats: ['markdown', 'links'],
onlyMainContent: true,
blockAds: true,
mobile: false
}
});

// Check a crawl for errors; keep the job ID from startCrawl, since the
// CrawlJob snapshot returned by crawl() does not include an `id` field
try {
  const started = await app.startCrawl('https://example.com', {
    limit: 100
  });
  // ... wait for the crawl to run or complete ...
  // Check for errors
  const errors = await app.getCrawlErrors(started.id);
if (errors.errors.length > 0) {
console.log('Crawl errors:', errors.errors);
}
if (errors.robotsBlocked.length > 0) {
console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
}
} catch (error) {
console.error('Crawl failed:', error);
}
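// A running job can also be stopped early with cancelCrawl; a minimal
// sketch using the job ID returned by startCrawl
const started = await app.startCrawl('https://example.com', { limit: 1000 });
const cancelled = await app.cancelCrawl(started.id);
console.log(cancelled ? 'Crawl cancelled' : 'Cancellation failed');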
// List all active crawls
const activeCrawls = await app.getActiveCrawls();
console.log('Currently active crawls:', activeCrawls.crawls);

// Get first page of results
let job = await app.getCrawlStatus('crawl-job-id', {
autoPaginate: false,
maxResults: 10
});
console.log('First 10 results:', job.data);
// Get all remaining results with pagination
if (job.next) {
const allResults = await app.getCrawlStatus('crawl-job-id', {
autoPaginate: true,
maxPages: 10,
maxResults: 1000,
maxWaitTime: 300 // 5 minutes
});
console.log('All results:', allResults.data);
}