Web Crawling

Recursive website crawling with configurable limits, path filtering, webhook support, and job monitoring.

Core Crawling Methods

/**
 * Start an async crawl job
 * @param url - Root URL to crawl
 * @param req - Crawl configuration options
 * @returns Promise resolving to job ID and URL
 */
startCrawl(url: string, req?: CrawlOptions): Promise<CrawlResponse>;

/**
 * Get crawl job status and partial data
 * @param jobId - Crawl job identifier
 * @param pagination - Pagination configuration for results
 * @returns Promise resolving to job status and data
 */
getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;

/**
 * Cancel a running crawl job
 * @param jobId - Crawl job identifier
 * @returns Promise resolving to true if cancelled
 */
cancelCrawl(jobId: string): Promise<boolean>;

/**
 * Convenience waiter: start crawl and poll until completion
 * @param url - Root URL to crawl
 * @param req - Crawl configuration plus waiter controls
 * @returns Promise resolving to final job snapshot
 */
crawl(url: string, req?: CrawlOptions & { pollInterval?: number; timeout?: number }): Promise<CrawlJob>;

/**
 * Retrieve crawl errors and robots.txt blocks
 * @param crawlId - Crawl job identifier
 * @returns Promise resolving to error details
 */
getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse>;

/**
 * List active crawls for the authenticated team
 * @returns Promise resolving to active crawls list
 */
getActiveCrawls(): Promise<ActiveCrawlsResponse>;

/**
 * Preview normalized crawl parameters from natural language
 * @param url - Root URL
 * @param prompt - Natural language instruction
 * @returns Promise resolving to normalized parameters
 */
crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>>;

Crawl Configuration

interface CrawlOptions {
  // Natural language crawl configuration
  prompt?: string | null;
  
  // Path filtering
  excludePaths?: string[] | null;
  includePaths?: string[] | null;
  
  // Crawl behavior
  maxDiscoveryDepth?: number | null;
  sitemap?: "skip" | "include";
  ignoreQueryParameters?: boolean;
  limit?: number | null;
  crawlEntireDomain?: boolean;
  allowExternalLinks?: boolean;
  allowSubdomains?: boolean;
  
  // Performance control
  delay?: number | null;
  maxConcurrency?: number | null;
  
  // Notifications
  webhook?: string | WebhookConfig | null;
  
  // Content processing
  scrapeOptions?: ScrapeOptions | null;
  
  // Privacy
  zeroDataRetention?: boolean;
  
  // Integration tracking
  integration?: string;
}

Response Types

// Crawl initiation response
interface CrawlResponse {
  id: string;
  url: string;
}

// Crawl job status and data
interface CrawlJob {
  status: "scraping" | "completed" | "failed" | "cancelled";
  total: number;
  completed: number;
  creditsUsed?: number;
  expiresAt?: string;
  next?: string | null;
  data: Document[];
}

// Crawl error details
interface CrawlErrorsResponse {
  errors: {
    id: string;
    timestamp?: string;
    url: string;
    code?: string;
    error: string;
  }[];
  robotsBlocked: string[];
}

// Active crawls listing
interface ActiveCrawlsResponse {
  success: boolean;
  crawls: ActiveCrawl[];
}

interface ActiveCrawl {
  id: string;
  teamId: string;
  url: string;
  options?: Record<string, unknown> | null;
}

Webhook Configuration

interface WebhookConfig {
  url: string;
  headers?: Record<string, string>;
  metadata?: Record<string, string>;
  events?: Array<"completed" | "failed" | "page" | "started">;
}
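
Per the `CrawlOptions` type above, `webhook` also accepts a plain string. A short sketch; treating the string as shorthand for a config containing only `url`, with default event selection, is an assumption to verify against the API docs:

// String shorthand: just the webhook URL, default events (assumed)
await app.startCrawl('https://example.com', {
  limit: 100,
  webhook: 'https://myapp.com/webhooks/firecrawl'
});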

Pagination Configuration

interface PaginationConfig {
  // Automatically follow `next` links and aggregate documents
  autoPaginate?: boolean;
  
  // Maximum additional pages to fetch after first response
  maxPages?: number;
  
  // Maximum total documents to return across all pages
  maxResults?: number;
  
  // Maximum time to spend fetching additional pages (seconds)
  maxWaitTime?: number;
}

Usage Examples
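
All examples below assume a configured client named `app`. A minimal setup sketch; the default export name is an assumption and may differ between SDK versions, so verify it against the package README:

// Import name assumed; verify against the @mendable/firecrawl-js README
import Firecrawl from '@mendable/firecrawl-js';

const app = new Firecrawl({ apiKey: 'fc-your-api-key' });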

Basic Crawling

// Simple crawl with limit
const crawlJob = await app.crawl('https://example.com', {
  limit: 50,
  scrapeOptions: {
    formats: ['markdown']
  }
});

console.log(`Crawled ${crawlJob.completed} of ${crawlJob.total} pages`);
console.log(crawlJob.data); // Array of scraped documents

Async Crawl with Status Monitoring

// Start crawl job
const crawlResponse = await app.startCrawl('https://example.com', {
  limit: 100,
  maxConcurrency: 5,
  scrapeOptions: {
    formats: ['markdown', 'links']
  }
});

console.log(`Started crawl job: ${crawlResponse.id}`);

// Monitor status
let job: CrawlJob;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  job = await app.getCrawlStatus(crawlResponse.id);
  console.log(`Progress: ${job.completed}/${job.total} - Status: ${job.status}`);
} while (job.status === 'scraping');

// The loop also exits on 'failed' or 'cancelled', so check the final status
if (job.status === 'completed') {
  console.log('Crawl completed!', job.data.length, 'pages scraped');
} else {
  console.error(`Crawl ended with status: ${job.status}`);
}
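
Cancelling a Crawl

If a job runs longer than you are willing to wait, stop it with `cancelCrawl`. A minimal sketch extending the monitoring loop above; the ten-minute budget is an arbitrary value for illustration:

// Cancel the job if it exceeds a time budget
const startedAt = Date.now();
const maxRuntimeMs = 10 * 60 * 1000; // arbitrary 10-minute budget

let job = await app.getCrawlStatus(crawlResponse.id);
while (job.status === 'scraping') {
  if (Date.now() - startedAt > maxRuntimeMs) {
    const cancelled = await app.cancelCrawl(crawlResponse.id);
    console.log(`Cancelled crawl ${crawlResponse.id}: ${cancelled}`);
    break;
  }
  await new Promise(resolve => setTimeout(resolve, 5000));
  job = await app.getCrawlStatus(crawlResponse.id);
}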

Path Filtering

const crawlJob = await app.crawl('https://docs.example.com', {
  includePaths: ['/api/*', '/guides/*'],
  excludePaths: ['/api/v1/*', '*/deprecated/*'],
  limit: 200,
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});

Natural Language Crawl Configuration

// Preview what the natural language prompt will do
const preview = await app.crawlParamsPreview(
  'https://blog.example.com',
  'Crawl all blog posts from 2024, exclude author pages and tag pages'
);
console.log('Generated parameters:', preview);

// Use natural language prompt
const crawlJob = await app.crawl('https://blog.example.com', {
  prompt: 'Crawl all blog posts from 2024, exclude author pages and tag pages',
  limit: 500,
  scrapeOptions: {
    formats: ['markdown', {
      type: 'json',
      schema: {
        type: 'object',
        properties: {
          title: { type: 'string' },
          author: { type: 'string' },
          publishDate: { type: 'string' },
          content: { type: 'string' },
          tags: { type: 'array', items: { type: 'string' } }
        }
      }
    }]
  }
});

Webhook Integration

const crawlJob = await app.crawl('https://example.com', {
  limit: 100,
  webhook: {
    url: 'https://myapp.com/webhooks/crawl-complete',
    headers: {
      'Authorization': 'Bearer my-webhook-token'
    },
    metadata: {
      'userId': '12345',
      'jobType': 'content-audit'
    },
    events: ['completed', 'failed', 'page']
  },
  scrapeOptions: {
    formats: ['markdown']
  }
});
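
On the receiving side, the endpoint gets POSTed JSON for each subscribed event. A sketch of a receiver, assuming an Express server; the payload field names (`type`, `id`) are assumptions based on the event names above, so verify them against the Firecrawl webhook documentation:

import express from 'express';

const server = express();
server.use(express.json());

// Hypothetical receiver for the webhook configured above
server.post('/webhooks/crawl-complete', (req, res) => {
  const event = req.body; // assumed shape: { type, id, data?, metadata? }
  switch (event.type) {
    case 'started':
      console.log(`Crawl ${event.id} started`);
      break;
    case 'page':
      console.log(`Crawl ${event.id} scraped a page`);
      break;
    case 'completed':
    case 'failed':
      console.log(`Crawl ${event.id} finished: ${event.type}`);
      break;
  }
  res.sendStatus(200); // acknowledge quickly; defer heavy processing
});

server.listen(3000);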

Advanced Crawl Configuration

const crawlJob = await app.crawl('https://example.com', {
  // Crawl configuration
  maxDiscoveryDepth: 3,
  sitemap: 'include',
  crawlEntireDomain: false,
  allowSubdomains: true,
  allowExternalLinks: false,
  ignoreQueryParameters: true,
  
  // Performance
  delay: 1, // 1 second between requests (delay is specified in seconds)
  maxConcurrency: 3,
  limit: 500,
  
  // Content filtering
  includePaths: ['/docs/*', '/api/*'],
  excludePaths: ['*/private/*', '/admin/*'],
  
  // Privacy
  zeroDataRetention: true,
  
  // Scraping options
  scrapeOptions: {
    formats: ['markdown', 'links'],
    onlyMainContent: true,
    blockAds: true,
    mobile: false
  }
});

Error Handling and Monitoring

try {
  // Use startCrawl so the job id is available for error reporting
  // (the CrawlJob returned by crawl() does not include an id)
  const started = await app.startCrawl('https://example.com', {
    limit: 100
  });

  // Check for errors reported so far
  const errors = await app.getCrawlErrors(started.id);
  if (errors.errors.length > 0) {
    console.log('Crawl errors:', errors.errors);
  }
  if (errors.robotsBlocked.length > 0) {
    console.log('URLs blocked by robots.txt:', errors.robotsBlocked);
  }

} catch (error) {
  console.error('Crawl failed:', error);
}

// List all active crawls
const activeCrawls = await app.getActiveCrawls();
console.log('Currently active crawls:', activeCrawls.crawls);
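
Active crawls can be cleaned up in bulk by combining `getActiveCrawls` with `cancelCrawl`, for example in a test teardown. A short sketch continuing from the listing above:

// Cancel every crawl currently running for the team
for (const active of activeCrawls.crawls) {
  const cancelled = await app.cancelCrawl(active.id);
  console.log(`Cancelled ${active.id}: ${cancelled}`);
}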

Pagination Handling

// Fetch only the first page of results (no auto-pagination)
let job = await app.getCrawlStatus('crawl-job-id', {
  autoPaginate: false
});

console.log('First page of results:', job.data);

// Re-fetch with auto-pagination to aggregate results across all pages
if (job.next) {
  const allResults = await app.getCrawlStatus('crawl-job-id', {
    autoPaginate: true,
    maxPages: 10,
    maxResults: 1000,
    maxWaitTime: 300 // 5 minutes
  });
  console.log('All results:', allResults.data);
}
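
Each entry in `data` is a scraped document. A short sketch of consuming the results fetched above; the `markdown` and `metadata.sourceURL` fields are assumptions here, see scraping.md for the actual Document shape:

// Inspect the collected documents (field names assumed; see scraping.md)
for (const doc of job.data) {
  console.log(doc.metadata?.sourceURL, `${doc.markdown?.length ?? 0} chars`);
}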
