CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-firecrawl-py

Python SDK for Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/batch.md

Batch Processing

Batch operations for processing multiple URLs efficiently. Includes both batch scraping with full result polling and asynchronous job management for large-scale operations.

Capabilities

Complete Batch Scraping

Process multiple URLs in batch and return complete results, automatically polling for completion. Best for smaller batches or when you need immediate complete results.

def batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> BatchScrapeResponse:
    """
    Scrape multiple URLs in batch and return complete results.
    
    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration applied to all URLs
    
    Returns:
    - BatchScrapeResponse: complete batch scraping results
    """

Asynchronous Batch Processing

Start a batch scrape job and manage it asynchronously — ideal for large batches, or when you need to track progress while the job runs.

def start_batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> str:
    """
    Start a batch scrape job asynchronously.
    
    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration for scraping behavior
    
    Returns:
    - str: batch job ID for status tracking
    """

def get_batch_scrape_status(batch_id: str) -> BatchScrapeJobStatus:
    """
    Get status of a running batch scrape job.
    
    Parameters:
    - batch_id: str, batch job ID from start_batch_scrape
    
    Returns:
    - BatchScrapeJobStatus: current status and progress information
    """

def cancel_batch_scrape(batch_id: str) -> dict:
    """
    Cancel a running batch scrape job.
    
    Parameters:
    - batch_id: str, batch job ID to cancel
    
    Returns:
    - dict: cancellation confirmation
    """

# Async clients only
async def wait_batch_scrape(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> BatchScrapeResponse:
    """
    Wait for batch scrape completion with automatic polling (AsyncFirecrawl only).
    
    Parameters:
    - job_id: str, batch job ID to wait for
    - poll_interval: int, polling interval in seconds (default: 2)
    - timeout: Optional[int], maximum wait time in seconds
    
    Returns:
    - BatchScrapeResponse: completed batch scrape results
    """

Batch Job Management

Manage and monitor batch jobs with error handling and progress tracking.

def get_batch_scrape_errors(batch_id: str) -> dict:
    """
    Get errors from a batch scrape job.
    
    Parameters:
    - batch_id: str, batch job ID
    
    Returns:
    - dict: error information for failed URLs
    """

Usage Examples

Basic Batch Scraping

from firecrawl import Firecrawl, ScrapeOptions

app = Firecrawl(api_key="your-api-key")

# Simple batch scrape
urls = [
    "https://example.com/page1",
    "https://example.com/page2", 
    "https://example.com/page3"
]
result = app.batch_scrape(urls)
print(f"Scraped {len(result.data)} URLs")

# Batch scrape with options
options = ScrapeOptions(
    formats=["markdown"],
    include_tags=["article", "main"],
    wait_for=1000
)
result = app.batch_scrape(urls, options)

Asynchronous Batch Management

from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Large batch of URLs
urls = [f"https://example.com/page{i}" for i in range(1, 101)]

# Start batch job
batch_id = app.start_batch_scrape(urls, 
                                 ScrapeOptions(formats=["markdown"]))
print(f"Started batch job: {batch_id}")

# Monitor progress
while True:
    status = app.get_batch_scrape_status(batch_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")
    
    if status.status in ["completed", "failed", "cancelled"]:
        break
        
    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Batch completed with {len(status.data)} pages")
    for doc in status.data:
        print(f"URL: {doc.url}, Content length: {len(doc.content)}")
else:
    # Check for errors
    errors = app.get_batch_scrape_errors(batch_id)
    print(f"Batch failed URLs: {len(errors.get('failed_urls', []))}")

Processing Large URL Lists

from firecrawl import Firecrawl, ScrapeOptions
import csv

app = Firecrawl(api_key="your-api-key")

# Read URLs from CSV file
urls = []
with open('urls.csv', 'r') as file:
    reader = csv.reader(file)
    urls = [row[0] for row in reader]

print(f"Processing {len(urls)} URLs")

# Configure scraping options
options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "sidebar"],
    wait_for=2000
)

# Process in batches to manage resources
batch_size = 50
results = []

for i in range(0, len(urls), batch_size):
    batch_urls = urls[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}")
    
    batch_result = app.batch_scrape(batch_urls, options)
    results.extend(batch_result.data)
    
    print(f"Completed {len(results)} URLs so far")

print(f"Total scraped: {len(results)} pages")

Types

class BatchScrapeResponse:
    """Response from batch scrape operation"""
    success: bool
    data: List[Document]
    
class BatchScrapeJobStatus:
    """Status information for batch scrape job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str
    total: int  # Total URLs to scrape
    completed: int  # URLs completed
    data: Optional[List[Document]]  # Results (available when completed)
    
class BatchScrapeRequest:
    """Request structure for batch scraping"""
    urls: List[str]
    options: Optional[ScrapeOptions]
    webhook: Optional[str]  # Webhook URL for completion notification

Error Handling

Batch operations handle individual URL failures gracefully:

from firecrawl import Firecrawl

app = Firecrawl(api_key="your-api-key")

# Mix of valid and invalid URLs
urls = [
    "https://example.com/valid1",
    "https://invalid-domain-12345.com",  # This will fail
    "https://example.com/valid2",
    "https://httpstat.us/404"  # This will fail
]

result = app.batch_scrape(urls)

# Check results
successful_count = len(result.data)
total_count = len(urls)
failed_count = total_count - successful_count

print(f"Successful: {successful_count}/{total_count}")
print(f"Failed: {failed_count}")

# Individual results contain status information
for doc in result.data:
    if hasattr(doc, 'error'):
        print(f"Failed URL: {doc.url}, Error: {doc.error}")
    else:
        print(f"Success URL: {doc.url}, Content length: {len(doc.content)}")

Async Usage

All batch operations have async equivalents:

import asyncio
from firecrawl import AsyncFirecrawl

async def batch_scrape_async():
    app = AsyncFirecrawl(api_key="your-api-key")
    
    urls = ["https://example.com/1", "https://example.com/2"]
    
    # Async complete batch scrape
    result = await app.batch_scrape(urls)
    
    # Async job management
    batch_id = await app.start_batch_scrape(urls)
    status = await app.get_batch_scrape_status(batch_id)
    
    # Wait for completion (async-specific method)
    final_result = await app.wait_batch_scrape(batch_id)

asyncio.run(batch_scrape_async())

Install with Tessl CLI

npx tessl i tessl/pypi-firecrawl-py

docs

batch.md

crawling.md

extraction.md

index.md

monitoring.md

scraping.md

usage.md

v1-api.md

tile.json