# Data Extraction

AI-powered structured data extraction using custom schemas. Supports both immediate extraction with result polling and asynchronous job-based extraction for complex data processing.

## Capabilities

### Complete Data Extraction

Extract structured data from a URL using AI and return complete results, automatically polling for completion. Best for smaller extractions or when you need immediate results.

```python { .api }
def extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> ExtractResponse:
    """
    Extract structured data from a URL using AI.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - ExtractResponse: extracted structured data matching the schema
    """
```

### Asynchronous Data Extraction

Start an extraction job and manage it asynchronously, ideal for complex extractions or when you need to track progress.

```python { .api }
def start_extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> str:
    """
    Start an extraction job asynchronously.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - str: extraction job ID for status tracking
    """

def get_extract_status(extract_id: str) -> ExtractJobStatus:
    """
    Get status of a running extraction job.

    Parameters:
    - extract_id: str, extraction job ID from start_extract

    Returns:
    - ExtractJobStatus: current status and progress information
    """
```

## Usage Examples

### Basic Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Define schema for product information
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "string"}
    },
    "required": ["title", "price"]
}

# Extract product data
result = app.extract("https://store.example.com/product/123", schema)
print(result.data)
```

### Complex Schema Extraction

```python
from firecrawl import Firecrawl, ExtractOptions

app = Firecrawl(api_key="your-api-key")

# Complex schema for news article
schema = {
    "type": "object",
    "properties": {
        "headline": {"type": "string"},
        "author": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "email": {"type": "string"}
            }
        },
        "published_date": {"type": "string", "format": "date-time"},
        "content": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"}
        },
        "related_articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        }
    },
    "required": ["headline", "content", "published_date"]
}

# Extract with options
options = ExtractOptions(
    prompt="Focus on extracting accurate publication dates and author information"
)

result = app.extract("https://news.example.com/article/123", schema, options)
print(f"Headline: {result.data['headline']}")
print(f"Author: {result.data['author']['name']}")
# tags is not in "required", so guard against it being absent
print(f"Tags: {', '.join(result.data.get('tags', []))}")
```

### Asynchronous Extraction

```python
from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Schema for e-commerce catalog
schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "category": {"type": "string"},
                    "rating": {"type": "number"},
                    "reviews_count": {"type": "integer"}
                }
            }
        },
        "total_products": {"type": "integer"},
        "page_info": {
            "type": "object",
            "properties": {
                "current_page": {"type": "integer"},
                "total_pages": {"type": "integer"}
            }
        }
    }
}

# Start extraction job
extract_id = app.start_extract("https://store.example.com/catalog", schema)
print(f"Started extraction job: {extract_id}")

# Monitor progress
while True:
    status = app.get_extract_status(extract_id)
    print(f"Status: {status.status}")

    if status.status in ["completed", "failed", "cancelled"]:
        break

    time.sleep(5)

# Get results
if status.status == "completed":
    products = status.data['products']
    print(f"Extracted {len(products)} products")
    for product in products[:5]:  # Show first 5
        print(f"- {product['name']}: ${product['price']}")
```

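The `while True` loop above polls until the job reaches a terminal state, but it will spin forever if a job stalls. A small helper that caps the total wait is a reasonable safeguard. This is a sketch built only on the documented `get_extract_status` call; the `wait_for_extract` name and the timeout policy are illustrative, not part of the SDK:

```python
import time

def wait_for_extract(app, extract_id: str, timeout: float = 300, interval: float = 5):
    """Poll a job until it reaches a terminal state or the timeout expires.

    Helper sketch, not an SDK function: only get_extract_status and the
    documented status strings are assumed here.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = app.get_extract_status(extract_id)
        if status.status in ["completed", "failed", "cancelled"]:
            return status
        time.sleep(interval)
    raise TimeoutError(f"Extraction {extract_id} did not finish within {timeout}s")
```
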
### Multi-Page Data Extraction

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Schema for extracting company information
company_schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "industry": {"type": "string"},
        "founded": {"type": "string"},
        "employees": {"type": "string"},
        "headquarters": {"type": "string"},
        "description": {"type": "string"},
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"}
                }
            }
        },
        "contact": {
            "type": "object",
            "properties": {
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"}
            }
        }
    }
}

# Extract from multiple company pages
companies = []
urls = [
    "https://example1.com/about",
    "https://example2.com/company",
    "https://example3.com/about-us"
]

for url in urls:
    try:
        result = app.extract(url, company_schema)
        companies.append(result.data)
        # No field is marked required, so fall back gracefully
        print(f"Extracted: {result.data.get('company_name', 'unknown')}")
    except Exception as e:
        print(f"Failed to extract from {url}: {e}")

print(f"Total companies extracted: {len(companies)}")
```

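Each `extract` call above is network-bound and runs sequentially. Running them concurrently with the standard-library `ThreadPoolExecutor` can cut wall-clock time roughly in proportion to the pool size. A sketch reusing `company_schema` and `urls` from the example above, and assuming the Firecrawl client can be safely shared across threads (worth verifying against the SDK documentation):

```python
from concurrent.futures import ThreadPoolExecutor, as_completed

# Reuses app, company_schema, and urls from the example above.

def extract_company(url):
    # Return (url, data) so each result can be traced back to its page
    return url, app.extract(url, company_schema).data

companies = []
with ThreadPoolExecutor(max_workers=3) as pool:
    futures = [pool.submit(extract_company, u) for u in urls]
    for future in as_completed(futures):
        try:
            url, data = future.result()
            companies.append(data)
            print(f"Extracted from {url}")
        except Exception as e:
            print(f"Extraction failed: {e}")
```
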
## Types

```python { .api }
class ExtractOptions:
    """Configuration options for extraction operations"""
    prompt: Optional[str]  # Additional prompt to guide extraction
    schema_description: Optional[str]  # Description of the schema purpose

class ExtractResponse:
    """Response from extract operation"""
    success: bool
    data: dict  # Extracted data matching the provided schema

class ExtractJobStatus:
    """Status information for extraction job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    data: Optional[dict]  # Extracted data (available when completed)

class ExtractRequest:
    """Request structure for data extraction"""
    url: str
    schema: dict
    options: Optional[ExtractOptions]
    webhook: Optional[str]  # Webhook URL for completion notification
```

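The `webhook` field on `ExtractRequest` indicates the service can notify a URL when a job completes, which avoids polling. The notification payload is not documented here, so the sketch below (using Flask, a third-party package unrelated to the SDK) simply captures and logs the raw JSON for inspection:

```python
from flask import Flask, request

app = Flask(__name__)

@app.route("/extract-webhook", methods=["POST"])
def extract_webhook():
    # Payload shape is not documented above; inspect it before relying on fields
    payload = request.get_json(force=True, silent=True) or {}
    print(f"Extraction notification received: {payload}")
    return "", 204

if __name__ == "__main__":
    app.run(port=8000)
```
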
## Schema Design Best Practices

### Simple Schema

```python
# For basic information extraction
simple_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "date": {"type": "string"}
    },
    "required": ["title", "content"]
}
```

### Nested Schema

```python
# For complex structured data
nested_schema = {
    "type": "object",
    "properties": {
        "article": {
            "type": "object",
            "properties": {
                "metadata": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "date": {"type": "string"}
                    }
                },
                "content": {
                    "type": "object",
                    "properties": {
                        "body": {"type": "string"},
                        "sections": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "heading": {"type": "string"},
                                    "text": {"type": "string"}
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}
```

### Array Schema

```python
# For extracting lists of items
array_schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "value": {"type": "string"},
                    "category": {"type": "string"}
                },
                "required": ["name"]
            }
        }
    }
}
```

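Because these schemas are plain JSON Schema documents, extracted results can also be validated client-side before downstream use. A sketch using the third-party `jsonschema` package (not part of the Firecrawl SDK; install it separately), reusing `array_schema` from above:

```python
from jsonschema import validate, ValidationError

# Example payload shaped like an extraction result for array_schema
extracted = {"items": [{"name": "Widget", "value": "9.99", "category": "tools"}]}

try:
    validate(instance=extracted, schema=array_schema)
    print("Extracted data matches the schema")
except ValidationError as e:
    print(f"Schema violation: {e.message}")
```
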
## Error Handling

```python
from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"}
    }
}

try:
    result = app.extract("https://example.com/product", schema)
    if result.success:
        print(f"Extracted: {result.data}")
    else:
        print("Extraction failed")
except Exception as e:
    print(f"Error during extraction: {e}")
```

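Transient failures (network hiccups, rate limits) are often worth retrying. Below is a retry wrapper with exponential backoff built only on the documented `extract` call; the helper name and retry policy are illustrative, and in real code you would catch the SDK's specific exception types rather than bare `Exception`:

```python
import time

def extract_with_retry(app, url, schema, retries: int = 3, base_delay: float = 2.0):
    """Retry sketch: waits base_delay * 2**attempt between attempts."""
    for attempt in range(retries):
        try:
            return app.extract(url, schema)
        except Exception as e:
            if attempt == retries - 1:
                raise  # Out of attempts; surface the last error
            delay = base_delay * (2 ** attempt)
            print(f"Attempt {attempt + 1} failed ({e}); retrying in {delay:.0f}s")
            time.sleep(delay)
```
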
## Async Usage

All extraction operations have async equivalents:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_async():
    app = AsyncFirecrawl(api_key="your-api-key")

    schema = {"type": "object", "properties": {"title": {"type": "string"}}}

    # Async complete extraction
    result = await app.extract("https://example.com", schema)

    # Async job management
    extract_id = await app.start_extract("https://example.com", schema)
    status = await app.get_extract_status(extract_id)

asyncio.run(extract_async())
```

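Since the async client returns awaitables, multiple pages can be extracted concurrently with `asyncio.gather`. A sketch, assuming a single `AsyncFirecrawl` instance handles concurrent requests:

```python
import asyncio
from firecrawl import AsyncFirecrawl

async def extract_many(urls):
    app = AsyncFirecrawl(api_key="your-api-key")
    schema = {"type": "object", "properties": {"title": {"type": "string"}}}

    # return_exceptions=True keeps one failed page from cancelling the rest
    results = await asyncio.gather(
        *(app.extract(url, schema) for url in urls),
        return_exceptions=True,
    )
    for url, result in zip(urls, results):
        if isinstance(result, Exception):
            print(f"{url}: failed ({result})")
        else:
            print(f"{url}: {result.data}")

asyncio.run(extract_many(["https://example.com/a", "https://example.com/b"]))
```
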