Python SDK for the Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats.
—
Quality: Pending — does it follow best practices?
Impact: Pending — no eval scenarios have been run.
AI-powered structured data extraction using custom schemas. Supports both immediate extraction with result polling and asynchronous job-based extraction for complex data processing.
Extract structured data from a URL using AI and return complete results, automatically polling for completion. Best for smaller extractions or when you need immediate results.
def extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> ExtractResponse:
    """
    Extract structured data from a URL using AI.

    Blocking variant: per the surrounding docs it polls until the job
    finishes and returns the complete result; use start_extract /
    get_extract_status for asynchronous job tracking.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - ExtractResponse: extracted structured data matching the schema
    """

Start an extraction job and manage it asynchronously, ideal for complex extractions or when you need to track progress.
def start_extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> str:
    """
    Start an extraction job asynchronously.

    Returns immediately with a job ID; poll get_extract_status with that
    ID to track progress and retrieve results.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema defining the data structure to extract
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - str: extraction job ID for status tracking
    """
def get_extract_status(extract_id: str) -> ExtractJobStatus:
    """
    Get status of a running extraction job.

    Parameters:
    - extract_id: str, extraction job ID from start_extract

    Returns:
    - ExtractJobStatus: current status and progress information
      (extracted data is populated once the job completes)
    """

from firecrawl import Firecrawl
app = Firecrawl(api_key="your-api-key")

# Define schema for product information
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "string"}
    },
    "required": ["title", "price"]
}

# Extract product data (blocking call; result carries data matching the schema)
result = app.extract("https://store.example.com/product/123", schema)
print(result.data)

from firecrawl import Firecrawl, ExtractOptions
app = Firecrawl(api_key="your-api-key")

# Complex schema for a news article: nested author object, tag array,
# and an array of related-article objects.
schema = {
    "type": "object",
    "properties": {
        "headline": {"type": "string"},
        "author": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "email": {"type": "string"}
            }
        },
        "published_date": {"type": "string", "format": "date-time"},
        "content": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"}
        },
        "related_articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        }
    },
    "required": ["headline", "content", "published_date"]
}

# Extract with options: the prompt steers extraction toward the fields
# that matter most for this page.
options = ExtractOptions(
    prompt="Focus on extracting accurate publication dates and author information"
)
result = app.extract("https://news.example.com/article/123", schema, options)
print(f"Headline: {result.data['headline']}")
print(f"Author: {result.data['author']['name']}")
print(f"Tags: {', '.join(result.data['tags'])}")

from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Schema for an e-commerce catalog page: product list plus pagination info.
schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "category": {"type": "string"},
                    "rating": {"type": "number"},
                    "reviews_count": {"type": "integer"}
                }
            }
        },
        "total_products": {"type": "integer"},
        "page_info": {
            "type": "object",
            "properties": {
                "current_page": {"type": "integer"},
                "total_pages": {"type": "integer"}
            }
        }
    }
}

# Start extraction job
extract_id = app.start_extract("https://store.example.com/catalog", schema)
print(f"Started extraction job: {extract_id}")

# Monitor progress: poll every 5 seconds until the job reaches a terminal state.
while True:
    status = app.get_extract_status(extract_id)
    print(f"Status: {status.status}")
    if status.status in ["completed", "failed", "cancelled"]:
        break
    time.sleep(5)

# Get results (status.data is only populated on completion)
if status.status == "completed":
    products = status.data['products']
    print(f"Extracted {len(products)} products")
    for product in products[:5]:  # Show first 5
        print(f"- {product['name']}: ${product['price']}")

from firecrawl import Firecrawl
app = Firecrawl(api_key="your-api-key")

# Schema for extracting company information
company_schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "industry": {"type": "string"},
        "founded": {"type": "string"},
        "employees": {"type": "string"},
        "headquarters": {"type": "string"},
        "description": {"type": "string"},
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"}
                }
            }
        },
        "contact": {
            "type": "object",
            "properties": {
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"}
            }
        }
    }
}

# Extract from multiple company pages, collecting successes and logging
# failures without aborting the whole batch.
companies = []
urls = [
    "https://example1.com/about",
    "https://example2.com/company",
    "https://example3.com/about-us"
]
for url in urls:
    try:
        result = app.extract(url, company_schema)
        companies.append(result.data)
        print(f"Extracted: {result.data['company_name']}")
    except Exception as e:
        print(f"Failed to extract from {url}: {e}")
print(f"Total companies extracted: {len(companies)}")

class ExtractOptions:
    """Configuration options for extraction operations."""

    prompt: Optional[str]  # Additional prompt to guide extraction
    schema_description: Optional[str]  # Description of the schema purpose

class ExtractResponse:
    """Response from extract operation."""

    success: bool
    data: dict  # Extracted data matching the provided schema

class ExtractJobStatus:
    """Status information for extraction job."""

    # One of: "pending", "running", "completed", "failed", "cancelled"
    status: str
    job_id: str
    data: Optional[dict]  # Extracted data (available when completed)

class ExtractRequest:
    """Request structure for data extraction."""

    url: str
    schema: dict
    options: Optional[ExtractOptions]
    webhook: Optional[str]  # Webhook URL for completion notification

# For basic information extraction
# Flat object with three string fields; title and content are mandatory.
simple_schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "content": {"type": "string"},
        "date": {"type": "string"},
    },
    "required": ["title", "content"],
}

# For complex structured data
# Deeply nested schema: article -> {metadata, content}, where content
# holds a body string plus an array of {heading, text} sections.
nested_schema = {
    "type": "object",
    "properties": {
        "article": {
            "type": "object",
            "properties": {
                "metadata": {
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "date": {"type": "string"},
                    },
                },
                "content": {
                    "type": "object",
                    "properties": {
                        "body": {"type": "string"},
                        "sections": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "heading": {"type": "string"},
                                    "text": {"type": "string"},
                                },
                            },
                        },
                    },
                },
            },
        },
    },
}

# For extracting lists of items
# Schema for extracting a homogeneous list of items; each item must at
# least carry a name, while value and category are optional.
array_schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "value": {"type": "string"},
                    "category": {"type": "string"}
                },
                "required": ["name"]
            }
        }
    }
}

from firecrawl import Firecrawl
app = Firecrawl(api_key="your-api-key")
schema = {
"type": "object",
"properties": {
"title": {"type": "string"},
"price": {"type": "number"}
}
}
try:
result = app.extract("https://example.com/product", schema)
if result.success:
print(f"Extracted: {result.data}")
else:
print("Extraction failed")
except Exception as e:
print(f"Error during extraction: {e}")All extraction operations have async equivalents:
import asyncio
from firecrawl import AsyncFirecrawl
async def extract_async():
app = AsyncFirecrawl(api_key="your-api-key")
schema = {"type": "object", "properties": {"title": {"type": "string"}}}
# Async complete extraction
result = await app.extract("https://example.com", schema)
# Async job management
extract_id = await app.start_extract("https://example.com", schema)
status = await app.get_extract_status(extract_id)
asyncio.run(extract_async())Install with Tessl CLI
npx tessl i tessl/pypi-firecrawl-py