Tessl Tile for npm/@mendable/firecrawl-js@4.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

batch.md crawling.md extraction.md index.md mapping.md monitoring.md scraping.md search.md usage.md v1-api.md

docs/extraction.md

0
# Data Extraction
1

2
LLM-powered structured data extraction using natural language prompts, schemas, or AI agents for intelligent content processing.
3

4
## Core Extraction Methods
5

6
```typescript { .api }
7
/**
8
 * Start an extract job (async)
9
 * @param args - Extraction request configuration
10
 * @returns Promise resolving to job ID or processing state
11
 */
12
startExtract(args: ExtractRequest): Promise<ExtractResponse>;
13

14
/**
15
 * Get extract job status/data
16
 * @param jobId - Extract job identifier
17
 * @returns Promise resolving to extraction results
18
 */
19
getExtractStatus(jobId: string): Promise<ExtractResponse>;
20

21
/**
22
 * Convenience waiter: start extract and poll until completion
23
 * @param args - Extraction request plus waiter controls
24
 * @returns Promise resolving to final extract response
25
 */
26
extract(args: ExtractRequest & { pollInterval?: number; timeout?: number }): Promise<ExtractResponse>;
27
```
28

29
## Extraction Configuration
30

31
```typescript { .api }
32
// Note: The exact ExtractRequest interface is inferred from method signatures
33
// Based on the v1 API, here are the typical extraction parameters:
34
interface ExtractRequest {
35
  // URLs to extract data from
36
  urls?: string[];
37
  
38
  // Natural language extraction prompt
39
  prompt?: string;
40
  
41
  // Structured schema for extraction
42
  schema?: Record<string, unknown> | ZodTypeAny;
43
  
44
  // System prompt for AI context
45
  systemPrompt?: string;
46
  
47
  // Allow external link following
48
  allowExternalLinks?: boolean;
49
  
50
  // Enable web search for additional context
51
  enableWebSearch?: boolean;
52
  
53
  // Include subdomains in extraction
54
  includeSubdomains?: boolean;
55
  
56
  // Source origin tracking
57
  origin?: string;
58
  
59
  // Show source URLs in results
60
  showSources?: boolean;
61
  
62
  // Scraping options for URL processing
63
  scrapeOptions?: ScrapeOptions;
64
  
65
  // AI agent configuration
66
  agent?: {
67
    model?: string;
68
    sessionId?: string;
69
  };
70
}
71
```
72

73
## Extraction Response
74

75
```typescript { .api }
76
interface ExtractResponse {
77
  success?: boolean;
78
  id?: string;
79
  status?: "processing" | "completed" | "failed" | "cancelled";
80
  data?: unknown;
81
  error?: string;
82
  warning?: string;
83
  sources?: Record<string, unknown>;
84
  expiresAt?: string;
85
}
86
```
87

88
## Usage Examples
89

90
### Basic Data Extraction
91

92
```typescript
93
// Extract structured data using natural language
94
const extractResult = await app.extract({
95
  urls: ['https://company.example.com/about'],
96
  prompt: 'Extract the company name, founding year, number of employees, and main business areas'
97
});
98

99
console.log('Extracted data:', extractResult.data);
100
// Returns structured data based on the prompt
101
```
102

103
### Schema-Based Extraction
104

105
```typescript
106
import { z } from 'zod';
107

108
// Define extraction schema
109
const CompanySchema = z.object({
110
  name: z.string(),
111
  foundingYear: z.number(),
112
  employees: z.number().optional(),
113
  industry: z.string(),
114
  headquarters: z.string(),
115
  revenue: z.string().optional(),
116
  products: z.array(z.string()),
117
  keyExecutives: z.array(z.object({
118
    name: z.string(),
119
    title: z.string()
120
  }))
121
});
122

123
const extractResult = await app.extract({
124
  urls: [
125
    'https://company.example.com/about',
126
    'https://company.example.com/leadership',
127
    'https://company.example.com/products'
128
  ],
129
  schema: CompanySchema,
130
  prompt: 'Extract comprehensive company information including leadership and product details'
131
});
132

133
// extractResult.data is declared as `unknown`; it should follow CompanySchema, so validate it (e.g. CompanySchema.parse) before use
134
console.log('Company data:', extractResult.data);
135
```
136

137
### Multi-URL Product Extraction
138

139
```typescript
140
const productUrls = [
141
  'https://shop.example.com/products/laptop-pro',
142
  'https://shop.example.com/products/tablet-air',
143
  'https://shop.example.com/products/phone-max'
144
];
145

146
const ProductSchema = z.object({
147
  name: z.string(),
148
  price: z.number(),
149
  currency: z.string(),
150
  description: z.string(),
151
  specifications: z.record(z.string()),
152
  availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),
153
  rating: z.number().optional(),
154
  reviews: z.number().optional(),
155
  images: z.array(z.string()),
156
  category: z.string(),
157
  brand: z.string()
158
});
159

160
const extractResult = await app.extract({
161
  urls: productUrls,
162
  schema: ProductSchema,
163
  prompt: 'Extract comprehensive product information including pricing, specifications, and availability',
164
  showSources: true
165
});
166

167
console.log('Products extracted:', extractResult.data);
168
console.log('Source URLs:', extractResult.sources);
169
```
170

171
### News Article Analysis
172

173
```typescript
174
const NewsArticleSchema = z.object({
175
  headline: z.string(),
176
  summary: z.string(),
177
  mainPoints: z.array(z.string()),
178
  author: z.string().optional(),
179
  publishDate: z.string(),
180
  source: z.string(),
181
  sentiment: z.enum(['positive', 'negative', 'neutral']),
182
  topics: z.array(z.string()),
183
  keyQuotes: z.array(z.string()),
184
  relatedCompanies: z.array(z.string()),
185
  impact: z.string().optional()
186
});
187

188
const extractResult = await app.extract({
189
  urls: [
190
    'https://news.example.com/tech-breakthrough',
191
    'https://news.example.com/market-analysis',
192
    'https://news.example.com/industry-trends'
193
  ],
194
  schema: NewsArticleSchema,
195
  prompt: 'Analyze news articles for key information, sentiment, and business impact',
196
  systemPrompt: 'You are a business analyst extracting key insights from news articles. Focus on factual information and business implications.',
197
  enableWebSearch: true,
198
  showSources: true
199
});
200
```
201

202
### Research Paper Extraction
203

204
```typescript
205
const ResearchPaperSchema = z.object({
206
  title: z.string(),
207
  authors: z.array(z.string()),
208
  abstract: z.string(),
209
  methodology: z.string(),
210
  keyFindings: z.array(z.string()),
211
  conclusions: z.string(),
212
  futureWork: z.string().optional(),
213
  citations: z.array(z.string()),
214
  keywords: z.array(z.string()),
215
  publishedDate: z.string().optional(),
216
  journal: z.string().optional(),
217
  doi: z.string().optional()
218
});
219

220
const extractResult = await app.extract({
221
  urls: [
222
    'https://research.example.com/papers/ai-ethics',
223
    'https://research.example.com/papers/machine-learning-bias'
224
  ],
225
  schema: ResearchPaperSchema,
226
  prompt: 'Extract comprehensive research paper information including methodology, findings, and citations',
227
  systemPrompt: 'You are an academic researcher extracting structured information from research papers. Focus on scientific accuracy and completeness.',
228
  allowExternalLinks: true,
229
  scrapeOptions: {
230
    formats: ['markdown'],
231
    onlyMainContent: true
232
  }
233
});
234
```
235

236
### Async Extraction with Monitoring
237

238
```typescript
239
// Start extraction job
240
const extractResponse = await app.startExtract({
241
  urls: Array.from({ length: 50 }, (_, i) => 
242
    `https://reviews.example.com/product/${i + 1}`
243
  ),
244
  schema: {
245
    type: 'object',
246
    properties: {
247
      productName: { type: 'string' },
248
      rating: { type: 'number' },
249
      reviewText: { type: 'string' },
250
      reviewer: { type: 'string' },
251
      reviewDate: { type: 'string' },
252
      pros: { type: 'array', items: { type: 'string' } },
253
      cons: { type: 'array', items: { type: 'string' } },
254
      recommended: { type: 'boolean' }
255
    }
256
  },
257
  prompt: 'Extract detailed product review information including pros, cons, and recommendations'
258
});
259

260
console.log(`Started extraction job: ${extractResponse.id}`);
261

262
// Monitor progress
263
let result: ExtractResponse;
264
do {
265
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
266
  result = await app.getExtractStatus(extractResponse.id!);
267
  console.log(`Extraction status: ${result.status}`);
268
} while (result.status === 'processing');
269

270
if (result.status === 'completed') {
271
  console.log('Extraction completed:', result.data);
272
} else {
273
  console.error('Extraction failed:', result.error);
274
}
275
```
276

277
### Web Search Enhanced Extraction
278

279
```typescript
280
const extractResult = await app.extract({
281
  prompt: 'Find information about recent AI safety research developments, key researchers, and policy recommendations',
282
  enableWebSearch: true,
283
  schema: {
284
    type: 'object',
285
    properties: {
286
      recentDevelopments: {
287
        type: 'array',
288
        items: {
289
          type: 'object',
290
          properties: {
291
            title: { type: 'string' },
292
            description: { type: 'string' },
293
            researchers: { type: 'array', items: { type: 'string' } },
294
            institution: { type: 'string' },
295
            date: { type: 'string' },
296
            significance: { type: 'string' }
297
          }
298
        }
299
      },
300
      keyResearchers: {
301
        type: 'array',
302
        items: {
303
          type: 'object',
304
          properties: {
305
            name: { type: 'string' },
306
            affiliation: { type: 'string' },
307
            expertise: { type: 'array', items: { type: 'string' } },
308
            recentWork: { type: 'string' }
309
          }
310
        }
311
      },
312
      policyRecommendations: {
313
        type: 'array',
314
        items: {
315
          type: 'object',
316
          properties: {
317
            recommendation: { type: 'string' },
318
            rationale: { type: 'string' },
319
            source: { type: 'string' }
320
          }
321
        }
322
      }
323
    }
324
  },
325
  showSources: true,
326
  // Waiter controls (pollInterval/timeout) go in the same args object as the request
327
  timeout: 300 // 5 minutes
328
});
329

330
console.log('AI safety research analysis:', extractResult.data);
331
```
332

333
### Financial Data Extraction
334

335
```typescript
336
const FinancialDataSchema = z.object({
337
  companyName: z.string(),
338
  ticker: z.string().optional(),
339
  currentPrice: z.number().optional(),
340
  marketCap: z.string().optional(),
341
  revenue: z.string(),
342
  netIncome: z.string(),
343
  eps: z.number().optional(),
344
  peRatio: z.number().optional(),
345
  dividendYield: z.number().optional(),
346
  quarterlyGrowth: z.string().optional(),
347
  keyMetrics: z.record(z.string()),
348
  riskFactors: z.array(z.string()),
349
  businessSegments: z.array(z.object({
350
    segment: z.string(),
351
    revenue: z.string(),
352
    percentage: z.number().optional()
353
  }))
354
});
355

356
const extractResult = await app.extract({
357
  urls: [
358
    'https://investor.example.com/financials',
359
    'https://finance.yahoo.com/quote/EXAMPLE',
360
    'https://www.sec.gov/example-10k'
361
  ],
362
  schema: FinancialDataSchema,
363
  prompt: 'Extract comprehensive financial data including revenue, profitability, key metrics, and risk factors',
364
  systemPrompt: 'You are a financial analyst extracting key financial metrics and business information. Focus on numerical accuracy and current data.',
365
  allowExternalLinks: true,
366
  scrapeOptions: {
367
    formats: ['markdown'],
368
    timeout: 30000
369
  }
370
});
371
```
372

373
### Error Handling and Validation
374

375
```typescript
376
try {
377
  const extractResult = await app.extract({
378
    urls: ['https://complex-site.example.com'],
379
    schema: ComplexSchema,
380
    prompt: 'Extract detailed information',
381
    timeout: 180 // 3 minutes
382
  });
383
  
384
  if (extractResult.success && extractResult.data) {
385
    // Validate extracted data
386
    if (typeof extractResult.data === 'object' && extractResult.data !== null) {
387
      console.log('Extraction successful:', extractResult.data);
388
      
389
      if (extractResult.warning) {
390
        console.log('Warning:', extractResult.warning);
391
      }
392
      
393
      if (extractResult.sources) {
394
        console.log('Sources used:', extractResult.sources);
395
      }
396
    } else {
397
      console.log('Extraction returned unexpected data format');
398
    }
399
  } else {
400
    console.error('Extraction failed:', extractResult.error);
401
  }
402
  
403
} catch (error) {
404
  console.error('Extraction error:', error);
405
  
406
  // Fallback to simpler extraction
407
  try {
408
    const fallbackResult = await app.extract({
409
      urls: ['https://simple-fallback.example.com'],
410
      prompt: 'Extract basic information',
411
      timeout: 60
412
    });
413
    console.log('Fallback extraction:', fallbackResult.data);
414
  } catch (fallbackError) {
415
    console.error('Fallback extraction also failed:', fallbackError);
416
  }
417
}
418
```
419

420
### Custom Agent Configuration
421

422
```typescript
423
const extractResult = await app.extract({
424
  urls: ['https://technical-docs.example.com'],
425
  prompt: 'Extract API documentation including endpoints, parameters, and examples',
426
  schema: {
427
    type: 'object',
428
    properties: {
429
      endpoints: {
430
        type: 'array',
431
        items: {
432
          type: 'object',
433
          properties: {
434
            method: { type: 'string' },
435
            path: { type: 'string' },
436
            description: { type: 'string' },
437
            parameters: { type: 'array', items: { type: 'object' } },
438
            responses: { type: 'object' },
439
            examples: { type: 'array', items: { type: 'string' } }
440
          }
441
        }
442
      }
443
    }
444
  },
445
  agent: {
446
    model: 'gpt-4',
447
    sessionId: 'api-docs-extraction-session'
448
  },
449
  scrapeOptions: {
450
    formats: ['markdown'],
451
    onlyMainContent: true
452
  }
453
});
454
```

Version

Tile

Files

docs/extraction.md

docs/extraction.md