# Data Extraction

LLM-powered structured data extraction using natural language prompts, schemas, or AI agents for intelligent content processing.

## Core Extraction Methods

```typescript { .api }
/**
 * Start an extract job without waiting for it to finish.
 *
 * @param args - Extraction request configuration
 * @returns Promise resolving to the job ID or processing state; the returned
 *   `id` is what `getExtractStatus` expects as its `jobId`
 */
startExtract(args: ExtractRequest): Promise<ExtractResponse>;

/**
 * Get the current status (and any available data) of an extract job.
 *
 * @param jobId - Extract job identifier
 * @returns Promise resolving to extraction results
 */
getExtractStatus(jobId: string): Promise<ExtractResponse>;

/**
 * Convenience waiter: start an extract job and poll until completion.
 *
 * The waiter controls are part of the same, single argument object as the
 * extraction request; there is no separate options parameter.
 *
 * @param args - Extraction request plus waiter controls. The usage examples
 *   pass `timeout` in seconds (e.g. `300` for 5 minutes); `pollInterval`
 *   presumably uses the same unit — TODO confirm.
 * @returns Promise resolving to final extract response
 */
extract(args: ExtractRequest & { pollInterval?: number; timeout?: number }): Promise<ExtractResponse>;
```

## Extraction Configuration

```typescript { .api }
// Note: The exact ExtractRequest interface is inferred from method signatures.
// Based on the v1 API, here are the typical extraction parameters.
// `ZodTypeAny` is a type from the `zod` package; `ScrapeOptions` is not defined in this section.
interface ExtractRequest {
  /** URLs to extract data from */
  urls?: string[];

  /** Natural language extraction prompt */
  prompt?: string;

  /** Structured schema for extraction: a plain JSON-schema-style object or a Zod schema */
  schema?: Record<string, unknown> | ZodTypeAny;

  /** System prompt for AI context */
  systemPrompt?: string;

  /** Allow external link following */
  allowExternalLinks?: boolean;

  /** Enable web search for additional context */
  enableWebSearch?: boolean;

  /** Include subdomains in extraction */
  includeSubdomains?: boolean;

  /** Source origin tracking */
  origin?: string;

  /** Show source URLs in results */
  showSources?: boolean;

  /** Scraping options for URL processing */
  scrapeOptions?: ScrapeOptions;

  /** AI agent configuration */
  agent?: {
    model?: string;
    sessionId?: string;
  };
}
```

## Extraction Response

```typescript { .api }
/**
 * Response shape shared by `startExtract`, `getExtractStatus` and `extract`.
 * Every field is optional, so callers must check for presence before use.
 */
interface ExtractResponse {
  success?: boolean;
  /** Job identifier; this is the value passed to `getExtractStatus` */
  id?: string;
  /** Job state; only "processing" means the job is still running */
  status?: "processing" | "completed" | "failed" | "cancelled";
  /** Extracted payload; typed `unknown`, so narrow or validate it before use */
  data?: unknown;
  error?: string;
  warning?: string;
  /** Source information; the examples request it with `showSources: true` */
  sources?: Record<string, unknown>;
  /** NOTE(review): presumably a timestamp string for result expiry — confirm format */
  expiresAt?: string;
}
```

## Usage Examples

### Basic Data Extraction

```typescript
// Extract structured data using natural language.
// With no schema supplied, the prompt alone describes what to extract.
const extractResult = await app.extract({
  urls: ['https://company.example.com/about'],
  prompt: 'Extract the company name, founding year, number of employees, and main business areas'
});

console.log('Extracted data:', extractResult.data);
// Returns structured data based on the prompt
```

### Schema-Based Extraction

```typescript
import { z } from 'zod';

// Define extraction schema
const CompanySchema = z.object({
  name: z.string(),
  foundingYear: z.number(),
  employees: z.number().optional(),
  industry: z.string(),
  headquarters: z.string(),
  revenue: z.string().optional(),
  products: z.array(z.string()),
  keyExecutives: z.array(z.object({
    name: z.string(),
    title: z.string()
  }))
});

const extractResult = await app.extract({
  urls: [
    'https://company.example.com/about',
    'https://company.example.com/leadership',
    'https://company.example.com/products'
  ],
  schema: CompanySchema,
  prompt: 'Extract comprehensive company information including leadership and product details'
});

// `ExtractResponse.data` is declared as `unknown`, so it is not statically typed
// by CompanySchema; narrow it (e.g. `CompanySchema.parse(extractResult.data)`)
// before relying on its shape.
console.log('Company data:', extractResult.data);
```

### Multi-URL Product Extraction

```typescript
// `z` is needed for the schema below; imported here so the snippet is self-contained.
import { z } from 'zod';

const productUrls = [
  'https://shop.example.com/products/laptop-pro',
  'https://shop.example.com/products/tablet-air',
  'https://shop.example.com/products/phone-max'
];

const ProductSchema = z.object({
  name: z.string(),
  price: z.number(),
  currency: z.string(),
  description: z.string(),
  specifications: z.record(z.string()),
  availability: z.enum(['in-stock', 'out-of-stock', 'pre-order']),
  rating: z.number().optional(),
  reviews: z.number().optional(),
  images: z.array(z.string()),
  category: z.string(),
  brand: z.string()
});

const extractResult = await app.extract({
  urls: productUrls,
  schema: ProductSchema,
  prompt: 'Extract comprehensive product information including pricing, specifications, and availability',
  // Ask for source information so `extractResult.sources` can be logged below
  showSources: true
});

console.log('Products extracted:', extractResult.data);
console.log('Source URLs:', extractResult.sources);
```

### News Article Analysis

```typescript
// `z` is needed for the schema below; imported here so the snippet is self-contained.
import { z } from 'zod';

const NewsArticleSchema = z.object({
  headline: z.string(),
  summary: z.string(),
  mainPoints: z.array(z.string()),
  author: z.string().optional(),
  publishDate: z.string(),
  source: z.string(),
  sentiment: z.enum(['positive', 'negative', 'neutral']),
  topics: z.array(z.string()),
  keyQuotes: z.array(z.string()),
  relatedCompanies: z.array(z.string()),
  impact: z.string().optional()
});

// `systemPrompt` sets the analyst persona; `prompt` states the actual task.
const extractResult = await app.extract({
  urls: [
    'https://news.example.com/tech-breakthrough',
    'https://news.example.com/market-analysis',
    'https://news.example.com/industry-trends'
  ],
  schema: NewsArticleSchema,
  prompt: 'Analyze news articles for key information, sentiment, and business impact',
  systemPrompt: 'You are a business analyst extracting key insights from news articles. Focus on factual information and business implications.',
  enableWebSearch: true,
  showSources: true
});
```

### Research Paper Extraction

```typescript
// `z` is needed for the schema below; imported here so the snippet is self-contained.
import { z } from 'zod';

const ResearchPaperSchema = z.object({
  title: z.string(),
  authors: z.array(z.string()),
  abstract: z.string(),
  methodology: z.string(),
  keyFindings: z.array(z.string()),
  conclusions: z.string(),
  futureWork: z.string().optional(),
  citations: z.array(z.string()),
  keywords: z.array(z.string()),
  publishedDate: z.string().optional(),
  journal: z.string().optional(),
  doi: z.string().optional()
});

const extractResult = await app.extract({
  urls: [
    'https://research.example.com/papers/ai-ethics',
    'https://research.example.com/papers/machine-learning-bias'
  ],
  schema: ResearchPaperSchema,
  prompt: 'Extract comprehensive research paper information including methodology, findings, and citations',
  systemPrompt: 'You are an academic researcher extracting structured information from research papers. Focus on scientific accuracy and completeness.',
  allowExternalLinks: true,
  // Scrape settings applied while processing the URLs above
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});
```

### Async Extraction with Monitoring

```typescript
// Start extraction job
const extractResponse = await app.startExtract({
  urls: Array.from({ length: 50 }, (_, i) =>
    `https://reviews.example.com/product/${i + 1}`
  ),
  schema: {
    type: 'object',
    properties: {
      productName: { type: 'string' },
      rating: { type: 'number' },
      reviewText: { type: 'string' },
      reviewer: { type: 'string' },
      reviewDate: { type: 'string' },
      pros: { type: 'array', items: { type: 'string' } },
      cons: { type: 'array', items: { type: 'string' } },
      recommended: { type: 'boolean' }
    }
  },
  prompt: 'Extract detailed product review information including pros, cons, and recommendations'
});

// `id` is optional on ExtractResponse, so guard it instead of asserting with `!`
const jobId = extractResponse.id;
if (!jobId) {
  throw new Error(`Extract job did not return an id: ${extractResponse.error ?? 'unknown error'}`);
}

console.log(`Started extraction job: ${jobId}`);

// Monitor progress: poll until the job leaves the 'processing' state
let result: ExtractResponse;
do {
  await new Promise(resolve => setTimeout(resolve, 5000)); // Wait 5 seconds
  result = await app.getExtractStatus(jobId);
  console.log(`Extraction status: ${result.status}`);
} while (result.status === 'processing');

if (result.status === 'completed') {
  console.log('Extraction completed:', result.data);
} else {
  // Covers 'failed', 'cancelled', and a missing status, so report which one it was
  console.error(`Extraction did not complete (status: ${result.status}):`, result.error);
}
```

### Web Search Enhanced Extraction

```typescript
// No `urls` are given: the extraction relies on the prompt plus web search.
const extractResult = await app.extract({
  prompt: 'Find information about recent AI safety research developments, key researchers, and policy recommendations',
  enableWebSearch: true,
  schema: {
    type: 'object',
    properties: {
      recentDevelopments: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            title: { type: 'string' },
            description: { type: 'string' },
            researchers: { type: 'array', items: { type: 'string' } },
            institution: { type: 'string' },
            date: { type: 'string' },
            significance: { type: 'string' }
          }
        }
      },
      keyResearchers: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            name: { type: 'string' },
            affiliation: { type: 'string' },
            expertise: { type: 'array', items: { type: 'string' } },
            recentWork: { type: 'string' }
          }
        }
      },
      policyRecommendations: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            recommendation: { type: 'string' },
            rationale: { type: 'string' },
            source: { type: 'string' }
          }
        }
      }
    }
  },
  showSources: true,
  // `extract` takes a single argument, so the waiter timeout goes in the same object
  timeout: 300 // 5 minutes
});

console.log('AI safety research analysis:', extractResult.data);
```

### Financial Data Extraction

```typescript
// `z` is needed for the schema below; imported here so the snippet is self-contained.
import { z } from 'zod';

const FinancialDataSchema = z.object({
  companyName: z.string(),
  ticker: z.string().optional(),
  currentPrice: z.number().optional(),
  marketCap: z.string().optional(),
  revenue: z.string(),
  netIncome: z.string(),
  eps: z.number().optional(),
  peRatio: z.number().optional(),
  dividendYield: z.number().optional(),
  quarterlyGrowth: z.string().optional(),
  keyMetrics: z.record(z.string()),
  riskFactors: z.array(z.string()),
  businessSegments: z.array(z.object({
    segment: z.string(),
    revenue: z.string(),
    percentage: z.number().optional()
  }))
});

const extractResult = await app.extract({
  urls: [
    'https://investor.example.com/financials',
    'https://finance.yahoo.com/quote/EXAMPLE',
    'https://www.sec.gov/example-10k'
  ],
  schema: FinancialDataSchema,
  prompt: 'Extract comprehensive financial data including revenue, profitability, key metrics, and risk factors',
  systemPrompt: 'You are a financial analyst extracting key financial metrics and business information. Focus on numerical accuracy and current data.',
  allowExternalLinks: true,
  scrapeOptions: {
    formats: ['markdown'],
    // NOTE(review): presumably milliseconds, unlike the waiter `timeout` that
    // other examples pass in seconds — confirm against ScrapeOptions
    timeout: 30000
  }
});
```

### Error Handling and Validation

```typescript
// `ComplexSchema` is a placeholder for a caller-defined Zod or JSON schema;
// it is not defined in this snippet.
try {
  const extractResult = await app.extract({
    urls: ['https://complex-site.example.com'],
    schema: ComplexSchema,
    prompt: 'Extract detailed information',
    timeout: 180 // 3 minutes
  });

  if (extractResult.success && extractResult.data) {
    // Validate extracted data: `data` is `unknown`, so check its runtime shape first
    if (typeof extractResult.data === 'object' && extractResult.data !== null) {
      console.log('Extraction successful:', extractResult.data);

      if (extractResult.warning) {
        console.log('Warning:', extractResult.warning);
      }

      if (extractResult.sources) {
        console.log('Sources used:', extractResult.sources);
      }
    } else {
      console.log('Extraction returned unexpected data format');
    }
  } else {
    console.error('Extraction failed:', extractResult.error);
  }

} catch (error) {
  console.error('Extraction error:', error);

  // Fallback to simpler extraction: no schema, shorter timeout
  try {
    const fallbackResult = await app.extract({
      urls: ['https://simple-fallback.example.com'],
      prompt: 'Extract basic information',
      timeout: 60
    });
    console.log('Fallback extraction:', fallbackResult.data);
  } catch (fallbackError) {
    console.error('Fallback extraction also failed:', fallbackError);
  }
}
```

### Custom Agent Configuration

```typescript
const extractResult = await app.extract({
  urls: ['https://technical-docs.example.com'],
  prompt: 'Extract API documentation including endpoints, parameters, and examples',
  schema: {
    type: 'object',
    properties: {
      endpoints: {
        type: 'array',
        items: {
          type: 'object',
          properties: {
            method: { type: 'string' },
            path: { type: 'string' },
            description: { type: 'string' },
            parameters: { type: 'array', items: { type: 'object' } },
            responses: { type: 'object' },
            examples: { type: 'array', items: { type: 'string' } }
          }
        }
      }
    }
  },
  agent: {
    // NOTE(review): confirm this model identifier is one the agent option accepts
    model: 'gpt-4',
    sessionId: 'api-docs-extraction-session'
  },
  scrapeOptions: {
    formats: ['markdown'],
    onlyMainContent: true
  }
});
```