CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/npm-mendable--firecrawl-js

JavaScript SDK for Firecrawl API that enables comprehensive web scraping, crawling, and data extraction with AI-ready output formats.

Pending
Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Security by Snyk

Pending

The risk profile of this skill

Overview
Eval results
Files

docs/extraction.md

Data Extraction

LLM-powered structured data extraction using natural language prompts, schemas, or AI agents for intelligent content processing.

Core Extraction Methods

/**
 * Start an extract job (async)
 *
 * Fire-and-forget variant: returns immediately with a job handle rather than
 * waiting for the extraction to finish. Poll with getExtractStatus().
 * @param args - Extraction request configuration
 * @returns Promise resolving to job ID or processing state
 */
startExtract(args: ExtractRequest): Promise<ExtractResponse>;

/**
 * Get extract job status/data
 *
 * @param jobId - Extract job identifier (the `id` returned by startExtract)
 * @returns Promise resolving to extraction results; `status` tells whether the
 *          job is still processing, completed, failed, or cancelled
 */
getExtractStatus(jobId: string): Promise<ExtractResponse>;

/**
 * Convenience waiter: start extract and poll until completion
 *
 * NOTE(review): the units of pollInterval/timeout are not stated here —
 * examples below suggest seconds; confirm against the SDK implementation.
 * @param args - Extraction request plus waiter controls (pollInterval, timeout)
 * @returns Promise resolving to final extract response
 */
extract(args: ExtractRequest & { pollInterval?: number; timeout?: number }): Promise<ExtractResponse>;

Extraction Configuration

// Note: The exact ExtractRequest interface is inferred from method signatures
// Based on the v1 API, here are the typical extraction parameters:
interface ExtractRequest {
  // URLs to extract data from; optional — when omitted, the prompt together
  // with enableWebSearch is expected to locate sources (see examples below)
  urls?: string[];

  // Natural language extraction prompt describing the data to pull out
  prompt?: string;

  // Structured schema for extraction: a plain JSON-schema object or a zod type
  schema?: Record<string, unknown> | ZodTypeAny;

  // System prompt for AI context (persona/instructions for the extractor)
  systemPrompt?: string;

  // Allow external link following
  allowExternalLinks?: boolean;

  // Enable web search for additional context
  enableWebSearch?: boolean;

  // Include subdomains in extraction
  includeSubdomains?: boolean;

  // Source origin tracking
  origin?: string;

  // Show source URLs in results (populates ExtractResponse.sources)
  showSources?: boolean;

  // Scraping options applied when fetching each URL
  // (formats, onlyMainContent, timeout, ...)
  scrapeOptions?: ScrapeOptions;

  // AI agent configuration
  // NOTE(review): supported model identifiers and sessionId semantics are not
  // documented here — confirm against the SDK.
  agent?: {
    model?: string;
    sessionId?: string;
  };
}

Extraction Response

// Response shape shared by startExtract / getExtractStatus / extract.
// All fields are optional: which ones are populated depends on whether the
// job is still running, finished, or failed.
interface ExtractResponse {
  success?: boolean;                 // overall request success flag
  id?: string;                       // job identifier — pass to getExtractStatus
  status?: "processing" | "completed" | "failed" | "cancelled";
  data?: unknown;                    // extracted payload; narrow/validate before use
  error?: string;                    // error message when the request or job failed
  warning?: string;                  // non-fatal notices
  sources?: Record<string, unknown>; // source URLs, when showSources was requested
  expiresAt?: string;                // presumably an ISO timestamp — confirm
}

Usage Examples

Basic Data Extraction

// Prompt-driven extraction: describe the desired fields in plain English and
// let the API return structured data — no schema required.
const request = {
  urls: ['https://company.example.com/about'],
  prompt: 'Extract the company name, founding year, number of employees, and main business areas'
};

const extractResult = await app.extract(request);

// `data` is shaped by the prompt rather than by an explicit schema.
console.log('Extracted data:', extractResult.data);

Schema-Based Extraction

// Schema-based extraction: a zod schema both documents and constrains the
// expected output shape.
import { z } from 'zod';

// Define extraction schema
const CompanySchema = z.object({
  name: z.string(),
  foundingYear: z.number(),
  employees: z.number().optional(),
  industry: z.string(),
  headquarters: z.string(),
  revenue: z.string().optional(),
  products: z.array(z.string()),
  keyExecutives: z.array(z.object({
    name: z.string(),
    title: z.string()
  }))
});

// Several URLs feed one extraction; the prompt steers which parts of each
// page are relevant to the schema fields.
const extractResult = await app.extract({
  urls: [
    'https://company.example.com/about',
    'https://company.example.com/leadership',
    'https://company.example.com/products'
  ],
  schema: CompanySchema,
  prompt: 'Extract comprehensive company information including leadership and product details'
});

// Result is typed according to CompanySchema
// NOTE(review): whether the SDK statically infers the TS type of `data` from
// the zod schema is not shown here — confirm before relying on it.
console.log('Company data:', extractResult.data);

Multi-URL Product Extraction

// Batch extraction across several product pages with a shared schema.
const urls = [
  'https://shop.example.com/products/laptop-pro',
  'https://shop.example.com/products/tablet-air',
  'https://shop.example.com/products/phone-max'
];

// Each product page is validated against the same zod shape.
const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  currency: z.string(),
  description: z.string(),
  specifications: z.record(z.string()),
  availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),
  rating: z.number().optional(),
  reviews: z.number().optional(),
  images: z.array(z.string()),
  category: z.string(),
  brand: z.string()
});

// showSources asks the API to report which URL each datum came from.
const result = await app.extract({
  urls,
  schema: ProductSchema,
  prompt: 'Extract comprehensive product information including pricing, specifications, and availability',
  showSources: true
});

console.log('Products extracted:', result.data);
console.log('Source URLs:', result.sources);

News Article Analysis

// News analysis schema: mixes free-text fields with a constrained enum
// (sentiment) to get consistent categorical output.
const NewsArticleSchema = z.object({
  headline: z.string(),
  summary: z.string(),
  mainPoints: z.array(z.string()),
  author: z.string().optional(),
  publishDate: z.string(),
  source: z.string(),
  sentiment: z.enum(['positive', 'negative', 'neutral']),
  topics: z.array(z.string()),
  keyQuotes: z.array(z.string()),
  relatedCompanies: z.array(z.string()),
  impact: z.string().optional()
});

const extractResult = await app.extract({
  urls: [
    'https://news.example.com/tech-breakthrough',
    'https://news.example.com/market-analysis',
    'https://news.example.com/industry-trends'
  ],
  schema: NewsArticleSchema,
  prompt: 'Analyze news articles for key information, sentiment, and business impact',
  // systemPrompt sets the extractor's persona; enableWebSearch lets it pull
  // in context beyond the listed URLs.
  systemPrompt: 'You are a business analyst extracting key insights from news articles. Focus on factual information and business implications.',
  enableWebSearch: true,
  showSources: true
});

Research Paper Extraction

// Academic extraction: many optional fields, since papers vary in metadata.
const ResearchPaperSchema = z.object({
  title: z.string(),
  authors: z.array(z.string()),
  abstract: z.string(),
  methodology: z.string(),
  keyFindings: z.array(z.string()),
  conclusions: z.string(),
  futureWork: z.string().optional(),
  citations: z.array(z.string()),
  keywords: z.array(z.string()),
  publishedDate: z.string().optional(),
  journal: z.string().optional(),
  doi: z.string().optional()
});

const extractResult = await app.extract({
  urls: [
    'https://research.example.com/papers/ai-ethics',
    'https://research.example.com/papers/machine-learning-bias'
  ],
  schema: ResearchPaperSchema,
  prompt: 'Extract comprehensive research paper information including methodology, findings, and citations',
  systemPrompt: 'You are an academic researcher extracting structured information from research papers. Focus on scientific accuracy and completeness.',
  // allowExternalLinks lets the extractor follow citation links off-site.
  allowExternalLinks: true,
  scrapeOptions: {
    formats: ['markdown'],
    // onlyMainContent strips navigation/boilerplate before extraction
    onlyMainContent: true
  }
});

Async Extraction with Monitoring

// Start extraction job for 50 review pages. This variant uses a plain
// JSON-schema object instead of zod, so no extra dependency is needed.
const extractResponse = await app.startExtract({
  urls: Array.from({ length: 50 }, (_, i) =>
    `https://reviews.example.com/product/${i + 1}`
  ),
  schema: {
    type: 'object',
    properties: {
      productName: { type: 'string' },
      rating: { type: 'number' },
      reviewText: { type: 'string' },
      reviewer: { type: 'string' },
      reviewDate: { type: 'string' },
      pros: { type: 'array', items: { type: 'string' } },
      cons: { type: 'array', items: { type: 'string' } },
      recommended: { type: 'boolean' }
    }
  },
  prompt: 'Extract detailed product review information including pros, cons, and recommendations'
});

// Fail fast if the API did not hand back a job ID, rather than using a
// non-null assertion (`id!`) that would surface as a confusing error later.
if (!extractResponse.id) {
  throw new Error(`startExtract did not return a job id: ${extractResponse.error ?? 'unknown error'}`);
}
const jobId = extractResponse.id;

console.log(`Started extraction job: ${jobId}`);

// Monitor progress: poll every 5 seconds while the job reports 'processing'.
let result: ExtractResponse;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  result = await app.getExtractStatus(jobId);
  console.log(`Extraction status: ${result.status}`);
} while (result.status === 'processing');

// Any terminal status other than 'completed' is treated as a failure here.
if (result.status === 'completed') {
  console.log('Extraction completed:', result.data);
} else {
  console.error('Extraction failed:', result.error);
}

Web Search Enhanced Extraction

// Web-search-backed extraction: no `urls` are given, so enableWebSearch lets
// the API discover its own sources from the prompt.
// FIX: extract() takes a single argument — waiter controls such as `timeout`
// are part of the request object (ExtractRequest & { pollInterval?; timeout? }),
// not a second options parameter as this example previously showed.
const extractResult = await app.extract({
  prompt: 'Find information about recent AI safety research developments, key researchers, and policy recommendations',
  enableWebSearch: true,
  schema: {
    type: 'object',
    properties: {
      recentDevelopments: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            title: { type: 'string' },
            description: { type: 'string' },
            researchers: { type: 'array', items: { type: 'string' } },
            institution: { type: 'string' },
            date: { type: 'string' },
            significance: { type: 'string' }
          }
        }
      },
      keyResearchers: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            affiliation: { type: 'string' },
            expertise: { type: 'array', items: { type: 'string' } },
            recentWork: { type: 'string' }
          }
        }
      },
      policyRecommendations: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            recommendation: { type: 'string' },
            rationale: { type: 'string' },
            source: { type: 'string' }
          }
        }
      }
    }
  },
  showSources: true,
  timeout: 300 // 5 minutes
});

console.log('AI safety research analysis:', extractResult.data);

Financial Data Extraction

// Financial extraction: most numeric fields are optional because coverage
// varies by source page.
const FinancialDataSchema = z.object({
  companyName: z.string(),
  ticker: z.string().optional(),
  currentPrice: z.number().optional(),
  marketCap: z.string().optional(),
  revenue: z.string(),
  netIncome: z.string(),
  eps: z.number().optional(),
  peRatio: z.number().optional(),
  dividendYield: z.number().optional(),
  quarterlyGrowth: z.string().optional(),
  keyMetrics: z.record(z.string()),
  riskFactors: z.array(z.string()),
  businessSegments: z.array(z.object({
    segment: z.string(),
    revenue: z.string(),
    percentage: z.number().optional()
  }))
});

const extractResult = await app.extract({
  urls: [
    'https://investor.example.com/financials',
    'https://finance.yahoo.com/quote/EXAMPLE',
    'https://www.sec.gov/example-10k'
  ],
  schema: FinancialDataSchema,
  prompt: 'Extract comprehensive financial data including revenue, profitability, key metrics, and risk factors',
  systemPrompt: 'You are a financial analyst extracting key financial metrics and business information. Focus on numerical accuracy and current data.',
  allowExternalLinks: true,
  scrapeOptions: {
    formats: ['markdown'],
    // per-page scrape timeout — presumably milliseconds (contrast with the
    // top-level waiter timeout used in seconds elsewhere); confirm in the SDK
    timeout: 30000
  }
});

Error Handling and Validation

// Defensive pattern: check the success flag and data shape, surface warnings
// and sources, and fall back to a simpler request on a hard failure.
// NOTE(review): ComplexSchema is assumed to be defined elsewhere in the docs.
try {
  const extractResult = await app.extract({
    urls: ['https://complex-site.example.com'],
    schema: ComplexSchema,
    prompt: 'Extract detailed information',
    timeout: 180 // 3 minutes
  });
  
  if (extractResult.success && extractResult.data) {
    // Validate extracted data (data is `unknown` — narrow before use)
    if (typeof extractResult.data === 'object' && extractResult.data !== null) {
      console.log('Extraction successful:', extractResult.data);
      
      // Non-fatal issues are reported via `warning`
      if (extractResult.warning) {
        console.log('Warning:', extractResult.warning);
      }
      
      // Present only when showSources was requested / sources were tracked
      if (extractResult.sources) {
        console.log('Sources used:', extractResult.sources);
      }
    } else {
      console.log('Extraction returned unexpected data format');
    }
  } else {
    console.error('Extraction failed:', extractResult.error);
  }
  
} catch (error) {
  // Thrown errors (network, timeout, SDK) land here, not in `error` field
  console.error('Extraction error:', error);
  
  // Fallback to simpler extraction
  try {
    const fallbackResult = await app.extract({
      urls: ['https://simple-fallback.example.com'],
      prompt: 'Extract basic information',
      timeout: 60
    });
    console.log('Fallback extraction:', fallbackResult.data);
  } catch (fallbackError) {
    console.error('Fallback extraction also failed:', fallbackError);
  }
}

Custom Agent Configuration

// Agent-driven extraction: route the job through a specific model/session.
const extractResult = await app.extract({
  urls: ['https://technical-docs.example.com'],
  prompt: 'Extract API documentation including endpoints, parameters, and examples',
  schema: {
    type: 'object',
    properties: {
      endpoints: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            method: { type: 'string' },
            path: { type: 'string' },
            description: { type: 'string' },
            parameters: { type: 'array', items: { type: 'object' } },
            responses: { type: 'object' },
            examples: { type: 'array', items: { type: 'string' } }
          }
        }
      }
    }
  },
  // NOTE(review): `model` selects the underlying LLM and `sessionId`
  // presumably groups related extraction calls — confirm against the SDK.
  agent: {
    model: 'gpt-4',
    sessionId: 'api-docs-extraction-session'
  },
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});

docs

batch.md

crawling.md

extraction.md

index.md

mapping.md

monitoring.md

scraping.md

search.md

usage.md

v1-api.md

tile.json