CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-firecrawl-py

Python SDK for Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/extraction.md

Data Extraction

AI-powered structured data extraction using custom schemas. Supports both immediate extraction with result polling and asynchronous job-based extraction for complex data processing.

Capabilities

Complete Data Extraction

Extract structured data from a URL using AI and return complete results, automatically polling for completion. Best for smaller extractions or when you need immediate results.

def extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> ExtractResponse:
    """
    Extract structured data from a URL using AI, blocking until results are ready.

    The call submits the extraction and polls internally, so the returned
    response already contains the final data.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema describing the desired output structure
    - options: ExtractOptions, optional configuration (e.g. a guiding prompt)

    Returns:
    - ExtractResponse: structured data conforming to the supplied schema
    """

Asynchronous Data Extraction

Start an extraction job and manage it asynchronously, ideal for complex extractions or when you need to track progress.

def start_extract(url: str, schema: dict, options: Optional[ExtractOptions] = None) -> str:
    """
    Kick off an extraction job without waiting for it to finish.

    Parameters:
    - url: str, target URL to extract data from
    - schema: dict, JSON schema describing the desired output structure
    - options: ExtractOptions, optional extraction configuration

    Returns:
    - str: job ID to pass to get_extract_status for progress tracking
    """

def get_extract_status(extract_id: str) -> ExtractJobStatus:
    """
    Look up the current state of an asynchronous extraction job.

    Parameters:
    - extract_id: str, job ID previously returned by start_extract

    Returns:
    - ExtractJobStatus: status string, job ID, and extracted data once completed
    """

Usage Examples

Basic Data Extraction

from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Define schema for product information.
# "required" marks title and price as mandatory; the remaining fields
# may be omitted when the page does not contain them.
schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"},
        "description": {"type": "string"},
        "features": {
            "type": "array",
            "items": {"type": "string"}
        },
        "availability": {"type": "string"}
    },
    "required": ["title", "price"]
}

# Extract product data; extract() blocks and polls until the result is ready.
result = app.extract("https://store.example.com/product/123", schema)
print(result.data)

Complex Schema Extraction

from firecrawl import Firecrawl, ExtractOptions

app = Firecrawl(api_key="your-api-key")

# Complex schema for a news article: nested objects (author), arrays
# (tags), and arrays of objects (related_articles) are all supported.
schema = {
    "type": "object",
    "properties": {
        "headline": {"type": "string"},
        "author": {
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "bio": {"type": "string"},
                "email": {"type": "string"}
            }
        },
        "published_date": {"type": "string", "format": "date-time"},
        "content": {"type": "string"},
        "tags": {
            "type": "array",
            "items": {"type": "string"}
        },
        "related_articles": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "title": {"type": "string"},
                    "url": {"type": "string"}
                }
            }
        }
    },
    "required": ["headline", "content", "published_date"]
}

# Extract with options: the prompt steers what the AI prioritizes.
options = ExtractOptions(
    prompt="Focus on extracting accurate publication dates and author information"
)

result = app.extract("https://news.example.com/article/123", schema, options)
# NOTE(review): 'author' and 'tags' are not in "required", so these lookups
# may raise KeyError when the page lacks them — confirm before copying.
print(f"Headline: {result.data['headline']}")
print(f"Author: {result.data['author']['name']}")
print(f"Tags: {', '.join(result.data['tags'])}")

Asynchronous Extraction

from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Schema for e-commerce catalog
schema = {
    "type": "object",
    "properties": {
        "products": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "price": {"type": "number"},
                    "category": {"type": "string"},
                    "rating": {"type": "number"},
                    "reviews_count": {"type": "integer"}
                }
            }
        },
        "total_products": {"type": "integer"},
        "page_info": {
            "type": "object",
            "properties": {
                "current_page": {"type": "integer"},
                "total_pages": {"type": "integer"}
            }
        }
    }
}

# Start extraction job
extract_id = app.start_extract("https://store.example.com/catalog", schema)
print(f"Started extraction job: {extract_id}")

# Monitor progress with a bounded deadline so a stuck job cannot make the
# original `while True` loop spin forever.
POLL_INTERVAL = 5   # seconds between status checks
MAX_WAIT = 600      # give up after 10 minutes of polling

deadline = time.monotonic() + MAX_WAIT
while True:
    status = app.get_extract_status(extract_id)
    print(f"Status: {status.status}")

    # Terminal states: stop polling.
    if status.status in ["completed", "failed", "cancelled"]:
        break
    # Safety valve: stop waiting once the deadline passes.
    if time.monotonic() >= deadline:
        print(f"Timed out after {MAX_WAIT}s waiting for job {extract_id}")
        break

    time.sleep(POLL_INTERVAL)

# Get results. status.data is only populated on completion, and 'products'
# is not a required schema field, so guard both before indexing.
if status.status == "completed" and status.data:
    products = status.data.get('products', [])
    print(f"Extracted {len(products)} products")
    for product in products[:5]:  # Show first 5
        print(f"- {product['name']}: ${product['price']}")

Multi-Page Data Extraction

from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Schema for extracting company information from an "about" page.
# No "required" list: every field is optional in the output.
company_schema = {
    "type": "object",
    "properties": {
        "company_name": {"type": "string"},
        "industry": {"type": "string"},
        "founded": {"type": "string"},
        "employees": {"type": "string"},
        "headquarters": {"type": "string"},
        "description": {"type": "string"},
        "key_people": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {
                    "name": {"type": "string"},
                    "position": {"type": "string"}
                }
            }
        },
        "contact": {
            "type": "object",
            "properties": {
                "email": {"type": "string"},
                "phone": {"type": "string"},
                "website": {"type": "string"}
            }
        }
    }
}

# Extract from multiple company pages, one synchronous call per URL.
companies = []
urls = [
    "https://example1.com/about",
    "https://example2.com/company",
    "https://example3.com/about-us"
]

for url in urls:
    try:
        result = app.extract(url, company_schema)
        companies.append(result.data)
        # NOTE(review): company_name is not required by the schema, so this
        # lookup can raise KeyError and skip the URL — confirm intended.
        print(f"Extracted: {result.data['company_name']}")
    except Exception as e:
        # Broad catch is deliberate best-effort: one failing URL should not
        # abort the whole batch.
        print(f"Failed to extract from {url}: {e}")

print(f"Total companies extracted: {len(companies)}")

Types

class ExtractOptions:
    """Configuration options for extraction operations.

    NOTE(review): the Optional annotations below require
    `from typing import Optional`, which this snippet does not show — confirm.
    """
    prompt: Optional[str]  # Free-text prompt that steers the AI extraction
    schema_description: Optional[str]  # Human-readable description of the schema purpose

class ExtractResponse:
    """Response from a completed extract() call."""
    success: bool  # True when the extraction finished without error
    data: dict  # Extracted data matching the provided schema

class ExtractJobStatus:
    """Status snapshot for an asynchronous extraction job."""
    status: str  # One of: "pending", "running", "completed", "failed", "cancelled"
    job_id: str  # Identifier returned by start_extract()
    data: Optional[dict]  # Extracted data; only available once status is "completed"

class ExtractRequest:
    """Request structure for data extraction."""
    url: str  # Target URL to extract from
    schema: dict  # JSON schema describing the desired output
    options: Optional[ExtractOptions]  # Optional extraction configuration
    webhook: Optional[str]  # Webhook URL notified on completion

Schema Design Best Practices

Simple Schema

# For basic information extraction: three free-text fields,
# of which only title and content are mandatory.
simple_schema = {
    "type": "object",
    "properties": {name: {"type": "string"} for name in ("title", "content", "date")},
    "required": ["title", "content"],
}

Nested Schema

# For complex structured data: objects may nest arbitrarily deep.
# Here an article wraps a metadata object and a content object, and the
# content object carries an array of section objects.
nested_schema = {
    "type": "object",
    "properties": {
        "article": {
            "type": "object",
            "properties": {
                "metadata": {  # who/when information about the article
                    "type": "object",
                    "properties": {
                        "title": {"type": "string"},
                        "author": {"type": "string"},
                        "date": {"type": "string"}
                    }
                },
                "content": {  # the article body plus its per-section breakdown
                    "type": "object",
                    "properties": {
                        "body": {"type": "string"},
                        "sections": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "heading": {"type": "string"},
                                    "text": {"type": "string"}
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}

Array Schema

# For extracting lists of items: a single "items" array whose entries are
# objects with three string attributes; only "name" is mandatory per item.
array_schema = {
    "type": "object",
    "properties": {
        "items": {
            "type": "array",
            "items": {
                "type": "object",
                "properties": {key: {"type": "string"} for key in ("name", "value", "category")},
                "required": ["name"],
            },
        },
    },
}

Error Handling

from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

schema = {
    "type": "object",
    "properties": {
        "title": {"type": "string"},
        "price": {"type": "number"}
    }
}

try:
    result = app.extract("https://example.com/product", schema)
    # The API can return a well-formed but unsuccessful response, so check
    # the success flag in addition to catching raised exceptions.
    if result.success:
        print(f"Extracted: {result.data}")
    else:
        print("Extraction failed")
except Exception as e:
    # Top-level boundary of the example script: report and continue.
    print(f"Error during extraction: {e}")

Async Usage

All extraction operations have async equivalents:

import asyncio
from firecrawl import AsyncFirecrawl

async def extract_async():
    # AsyncFirecrawl mirrors the synchronous client's API with awaitables.
    app = AsyncFirecrawl(api_key="your-api-key")
    
    schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    
    # Async complete extraction: awaits until results are ready.
    result = await app.extract("https://example.com", schema)
    
    # Async job management: start a job, then poll its status separately.
    extract_id = await app.start_extract("https://example.com", schema)
    status = await app.get_extract_status(extract_id)

asyncio.run(extract_async())

Install with Tessl CLI

npx tessl i tessl/pypi-firecrawl-py

docs

batch.md

crawling.md

extraction.md

index.md

monitoring.md

scraping.md

usage.md

v1-api.md

tile.json