Python SDK for Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Batch operations for processing multiple URLs efficiently. Includes both batch scraping with full result polling and asynchronous job management for large-scale operations.
Process multiple URLs in batch and return complete results, automatically polling for completion. Best for smaller batches or when you need immediate complete results.
def batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> BatchScrapeResponse:
"""
Scrape multiple URLs in batch and return complete results.
Parameters:
- urls: List[str], list of URLs to scrape
- options: ScrapeOptions, optional configuration applied to all URLs
Returns:
- BatchScrapeResponse: complete batch scraping results
    """

Start a batch scrape job and manage it asynchronously, ideal for large batches or when you need to track progress.
def start_batch_scrape(urls: List[str], options: Optional[ScrapeOptions] = None) -> str:
    """
    Start a batch scrape job asynchronously.

    Returns immediately with a job ID; poll progress with
    get_batch_scrape_status.

    Parameters:
    - urls: List[str], list of URLs to scrape
    - options: ScrapeOptions, optional configuration for scraping behavior

    Returns:
    - str: batch job ID for status tracking
    """
def get_batch_scrape_status(batch_id: str) -> BatchScrapeJobStatus:
    """
    Get status of a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID from start_batch_scrape

    Returns:
    - BatchScrapeJobStatus: current status and progress information
      (results are included once the job is completed)
    """
def cancel_batch_scrape(batch_id: str) -> dict:
    """
    Cancel a running batch scrape job.

    Parameters:
    - batch_id: str, batch job ID to cancel

    Returns:
    - dict: cancellation confirmation
    """
# Async clients only
async def wait_batch_scrape(job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> BatchScrapeResponse:
"""
Wait for batch scrape completion with automatic polling (AsyncFirecrawl only).
Parameters:
- job_id: str, batch job ID to wait for
- poll_interval: int, polling interval in seconds (default: 2)
- timeout: Optional[int], maximum wait time in seconds
Returns:
- BatchScrapeResponse: completed batch scrape results
    """

Manage and monitor batch jobs with error handling and progress tracking.
def get_batch_scrape_errors(batch_id: str) -> dict:
    """
    Get errors from a batch scrape job.

    Parameters:
    - batch_id: str, batch job ID

    Returns:
    - dict: error information for failed URLs
    """

from firecrawl import Firecrawl, ScrapeOptions
app = Firecrawl(api_key="your-api-key")

# Simple batch scrape
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]
result = app.batch_scrape(urls)
print(f"Scraped {len(result.data)} URLs")

# Batch scrape with options applied to every URL in the batch
options = ScrapeOptions(
    formats=["markdown"],
    include_tags=["article", "main"],
    wait_for=1000
)
result = app.batch_scrape(urls, options)

from firecrawl import Firecrawl
import time

app = Firecrawl(api_key="your-api-key")

# Large batch of URLs
urls = [f"https://example.com/page{i}" for i in range(1, 101)]

# Start batch job (returns immediately with a job ID)
batch_id = app.start_batch_scrape(urls, ScrapeOptions(formats=["markdown"]))
print(f"Started batch job: {batch_id}")

# Monitor progress until the job reaches a terminal state
while True:
    status = app.get_batch_scrape_status(batch_id)
    print(f"Status: {status.status}")
    print(f"Completed: {status.completed}/{status.total}")
    if status.status in ["completed", "failed", "cancelled"]:
        break
    time.sleep(10)

# Get final results
if status.status == "completed":
    print(f"Batch completed with {len(status.data)} pages")
    for doc in status.data:
        print(f"URL: {doc.url}, Content length: {len(doc.content)}")
else:
    # Check for errors
    errors = app.get_batch_scrape_errors(batch_id)
    print(f"Batch failed URLs: {len(errors.get('failed_urls', []))}")

from firecrawl import Firecrawl, ScrapeOptions
import csv

app = Firecrawl(api_key="your-api-key")

# Read URLs from CSV file (first column of each row)
urls = []
with open('urls.csv', 'r') as file:
    reader = csv.reader(file)
    urls = [row[0] for row in reader]
print(f"Processing {len(urls)} URLs")

# Configure scraping options
options = ScrapeOptions(
    formats=["markdown", "html"],
    include_tags=["article", "main", "content"],
    exclude_tags=["nav", "footer", "sidebar"],
    wait_for=2000
)

# Process in batches to manage resources
batch_size = 50
results = []
for i in range(0, len(urls), batch_size):
    batch_urls = urls[i:i+batch_size]
    print(f"Processing batch {i//batch_size + 1}")
    batch_result = app.batch_scrape(batch_urls, options)
    results.extend(batch_result.data)
    print(f"Completed {len(results)} URLs so far")
print(f"Total scraped: {len(results)} pages")

class BatchScrapeResponse:
"""Response from batch scrape operation"""
success: bool
data: List[Document]
class BatchScrapeJobStatus:
    """Status information for batch scrape job"""
    status: str  # "pending", "running", "completed", "failed", "cancelled"
    job_id: str  # ID returned by start_batch_scrape
    total: int  # Total URLs to scrape
    completed: int  # URLs completed
    data: Optional[List[Document]]  # Results (available when completed)
class BatchScrapeRequest:
    """Request structure for batch scraping"""
    urls: List[str]  # URLs to scrape
    options: Optional[ScrapeOptions]  # Shared configuration for all URLs
    webhook: Optional[str]  # Webhook URL for completion notification

Batch operations handle individual URL failures gracefully:
from firecrawl import Firecrawl
app = Firecrawl(api_key="your-api-key")
# Mix of valid and invalid URLs
urls = [
"https://example.com/valid1",
"https://invalid-domain-12345.com", # This will fail
"https://example.com/valid2",
"https://httpstat.us/404" # This will fail
]
result = app.batch_scrape(urls)
# Check results
successful_count = len(result.data)
total_count = len(urls)
failed_count = total_count - successful_count
print(f"Successful: {successful_count}/{total_count}")
print(f"Failed: {failed_count}")
# Individual results contain status information
for doc in result.data:
if hasattr(doc, 'error'):
print(f"Failed URL: {doc.url}, Error: {doc.error}")
else:
        print(f"Success URL: {doc.url}, Content length: {len(doc.content)}")

All batch operations have async equivalents:
import asyncio
from firecrawl import AsyncFirecrawl


async def batch_scrape_async():
    app = AsyncFirecrawl(api_key="your-api-key")
    urls = ["https://example.com/1", "https://example.com/2"]

    # Async complete batch scrape
    result = await app.batch_scrape(urls)

    # Async job management
    batch_id = await app.start_batch_scrape(urls)
    status = await app.get_batch_scrape_status(batch_id)

    # Wait for completion (async-specific method)
    final_result = await app.wait_batch_scrape(batch_id)
asyncio.run(batch_scrape_async())

Install with Tessl CLI
npx tessl i tessl/pypi-firecrawl-py