# Data Extraction

AI-powered structured data extraction using custom schemas. Supports both immediate extraction with result polling and asynchronous job-based extraction for complex data processing.

## Capabilities

### Complete Data Extraction

Extract structured data from a URL using AI and return complete results, automatically polling for completion. Best for smaller extractions or when you need immediate results.

```python { .api }
def extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> ExtractResponse:
    """
    Extract structured data from a URL using AI.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - ExtractResponse: extracted structured data matching the schema
    """
```

### Asynchronous Data Extraction

Start an extraction job and manage it asynchronously, ideal for complex extractions or when you need to track progress.

```python { .api }
def start_extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> str:
    """
    Start an extraction job asynchronously.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - str: extraction job ID for status tracking
    """

def get_extract_status(extract_id: str) -> ExtractJobStatus:
    """
    Get status of a running extraction job.

    Parameters:
    - extract_id: str, extraction job ID from start_extract

    Returns:
    - ExtractJobStatus: current status and progress information
    """
```

## Usage Examples

### Basic Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Define schema for product information
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "string"}
    },
    "required": ["title", "price"]
}

# Extract product data
result = app.extract("https://store.example.com/product/123", schema)
print(result.data)
```

### Complex Schema Extraction

```python
from firecrawl import Firecrawl, ExtractOptions

app = Firecrawl(api_key="your-api-key")

# Complex schema for news article
schema = {
    "type": "object",
    "properties": {
        "headline": {"type": "string"},
        "author": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "email": {"type": "string"}
            }
        },
        "published_date": {"type": "string", "format": "date-time"},
        "content": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"}
        },
        "related_articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        }
    },
    "required": ["headline", "content", "published_date"]
}

# Extract with options
options = ExtractOptions(
    prompt="Focus on extracting accurate publication dates and author information"
)

result = app.extract("https://news.example.com/article/123", schema, options)
print(f"Headline: {result.data['headline']}")
print(f"Author: {result.data['author']['name']}")
# tags is not in "required", so guard against it being absent
print(f"Tags: {', '.join(result.data.get('tags', []))}")
```

### Asynchronous Extraction

```python
from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Schema for e-commerce catalog
schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "category": {"type": "string"},
                    "rating": {"type": "number"},
                    "reviews_count": {"type": "integer"}
                }
            }
        },
        "total_products": {"type": "integer"},
        "page_info": {
            "type": "object",
            "properties": {
                "current_page": {"type": "integer"},
                "total_pages": {"type": "integer"}
            }
        }
    }
}

# Start extraction job
extract_id = app.start_extract("https://store.example.com/catalog", schema)
print(f"Started extraction job: {extract_id}")

# Monitor progress
while True:
    status = app.get_extract_status(extract_id)
    print(f"Status: {status.status}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(5)

# Get results
if status.status == "completed":
    products = status.data['products']
    print(f"Extracted {len(products)} products")
    for product in products[:5]:  # Show first 5
        print(f"- {product['name']}: ${product['price']}")
```

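The `while True` loop above polls until the job reaches a terminal state, but it will spin forever if a job stalls. A small helper that caps the total wait is a reasonable safeguard. This is a sketch built only on the documented `get_extract_status` call; the `wait_for_extract` name and the timeout policy are illustrative, not part of the SDK:

```python
import time

def wait_for_extract(app, extract_id: str, timeout: float = 300, interval: float = 5):
    """Poll a job until it reaches a terminal state or the timeout expires.

    Helper sketch, not an SDK function: only get_extract_status and the
    documented status strings are assumed here.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = app.get_extract_status(extract_id)
        if status.status in ["completed", "failed", "cancelled"]:
            return status
        time.sleep(interval)
    raise TimeoutError(f"Extraction {extract_id} did not finish within {timeout}s")
```
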
### Multi-Page Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Schema for extracting company information
company_schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "industry": {"type": "string"},
        "founded": {"type": "string"},
        "employees": {"type": "string"},
        "headquarters": {"type": "string"},
        "description": {"type": "string"},
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"}
                }
            }
        },
        "contact": {
            "type": "object",
            "properties": {
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"}
            }
        }
    }
}

# Extract from multiple company pages
companies = []
urls = [
    "https://example1.com/about",
    "https://example2.com/company",
    "https://example3.com/about-us"
]

for url in urls:
    try:
        result = app.extract(url, company_schema)
        companies.append(result.data)
        # No field is marked required, so fall back gracefully
        print(f"Extracted: {result.data.get('company_name', 'unknown')}")
    except Exception as e:
        print(f"Failed to extract from {url}: {e}")

print(f"Total companies extracted: {len(companies)}")
```

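Each `extract` call above is network-bound and runs sequentially. Running them concurrently with the standard-library `ThreadPoolExecutor` can cut wall-clock time roughly in proportion to the pool size. A sketch reusing `company_schema` and `urls` from the example above, and assuming the Firecrawl client can be safely shared across threads (worth verifying against the SDK documentation):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

# Reuses app, company_schema, and urls from the example above.

def extract_company(url):
    # Return (url, data) so each result can be traced back to its page
    return url, app.extract(url, company_schema).data

companies = []
with ThreadPoolExecutor(max_workers=3) as pool:
    futures = [pool.submit(extract_company, u) for u in urls]
    for future in as_completed(futures):
        try:
            url, data = future.result()
            companies.append(data)
            print(f"Extracted from {url}")
        except Exception as e:
            print(f"Extraction failed: {e}")
```
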
## Types

```python { .api }
class ExtractOptions:
    """Configuration options for extraction operations"""
    prompt: Optional[str]  # Additional prompt to guide extraction
    schema_description: Optional[str]  # Description of the schema purpose

class ExtractResponse:
    """Response from extract operation"""
    success: bool
    data: dict  # Extracted data matching the provided schema

class ExtractJobStatus:
    """Status information for extraction job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    data: Optional[dict]  # Extracted data (available when completed)

class ExtractRequest:
    """Request structure for data extraction"""
    url: str
    schema: dict
    options: Optional[ExtractOptions]
    webhook: Optional[str]  # Webhook URL for completion notification
```

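The `webhook` field on `ExtractRequest` indicates the service can notify a URL when a job completes, which avoids polling. The notification payload is not documented here, so the sketch below (using Flask, a third-party package unrelated to the SDK) simply captures and logs the raw JSON for inspection:

```python
from flask import Flask, request

app = Flask(__name__)

@app.route("/extract-webhook", methods=["POST"])
def extract_webhook():
    # Payload shape is not documented above; inspect it before relying on fields
    payload = request.get_json(force=True, silent=True) or {}
    print(f"Extraction notification received: {payload}")
    return "", 204

if __name__ == "__main__":
    app.run(port=8000)
```
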
## Schema Design Best Practices

### Simple Schema

```python
# For basic information extraction
simple_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "date": {"type": "string"}
    },
    "required": ["title", "content"]
}
```

### Nested Schema

```python
# For complex structured data
nested_schema = {
    "type": "object",
    "properties": {
        "article": {
            "type": "object",
            "properties": {
                "metadata": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "date": {"type": "string"}
                    }
                },
                "content": {
                    "type": "object",
                    "properties": {
                        "body": {"type": "string"},
                        "sections": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "heading": {"type": "string"},
                                    "text": {"type": "string"}
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
```

### Array Schema

```python
# For extracting lists of items
array_schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "value": {"type": "string"},
                    "category": {"type": "string"}
                },
                "required": ["name"]
            }
        }
    }
}
```

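Because these schemas are plain JSON Schema documents, extracted results can also be validated client-side before downstream use. A sketch using the third-party `jsonschema` package (not part of the Firecrawl SDK; install it separately), reusing `array_schema` from above:

```python
from jsonschema import validate, ValidationError

# Example payload shaped like an extraction result for array_schema
extracted = {"items": [{"name": "Widget", "value": "9.99", "category": "tools"}]}

try:
    validate(instance=extracted, schema=array_schema)
    print("Extracted data matches the schema")
except ValidationError as e:
    print(f"Schema violation: {e.message}")
```
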
## Error Handling

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"}
    }
}

try:
    result = app.extract("https://example.com/product", schema)
    if result.success:
        print(f"Extracted: {result.data}")
    else:
        print("Extraction failed")
except Exception as e:
    print(f"Error during extraction: {e}")
```

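Transient failures (network hiccups, rate limits) are often worth retrying. Below is a retry wrapper with exponential backoff built only on the documented `extract` call; the helper name and retry policy are illustrative, and in real code you would catch the SDK's specific exception types rather than bare `Exception`:

```python
import time

def extract_with_retry(app, url, schema, retries: int = 3, base_delay: float = 2.0):
    """Retry sketch: waits base_delay * 2**attempt between attempts."""
    for attempt in range(retries):
        try:
            return app.extract(url, schema)
        except Exception as e:
            if attempt == retries - 1:
                raise  # Out of attempts; surface the last error
            delay = base_delay * (2 ** attempt)
            print(f"Attempt {attempt + 1} failed ({e}); retrying in {delay:.0f}s")
            time.sleep(delay)
```
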
## Async Usage

All extraction operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    schema = {"type": "object", "properties": {"title": {"type": "string"}}}

    # Async complete extraction
    result = await app.extract("https://example.com", schema)

    # Async job management
    extract_id = await app.start_extract("https://example.com", schema)
    status = await app.get_extract_status(extract_id)

asyncio.run(extract_async())
```

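Since the async client returns awaitables, multiple pages can be extracted concurrently with `asyncio.gather`. A sketch, assuming a single `AsyncFirecrawl` instance handles concurrent requests:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_many(urls):
    app = AsyncFirecrawl(api_key="your-api-key")
    schema = {"type": "object", "properties": {"title": {"type": "string"}}}

    # return_exceptions=True keeps one failed page from cancelling the rest
    results = await asyncio.gather(
        *(app.extract(url, schema) for url in urls),
        return_exceptions=True,
    )
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"{url}: failed ({result})")
        else:
            print(f"{url}: {result.data}")

asyncio.run(extract_many(["https://example.com/a", "https://example.com/b"]))
```
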