Python SDK for Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Essential web scraping functionality for extracting content from single URLs, searching the web, and mapping website structures. These operations provide immediate results with comprehensive format and processing options.
Extract content from a single webpage with extensive formatting and processing options including markdown conversion, HTML extraction, screenshots, and metadata collection.
def scrape(
    url: str,
    *,
    formats: Optional[List[str]] = None,
    headers: Optional[Dict[str, str]] = None,
    include_tags: Optional[List[str]] = None,
    exclude_tags: Optional[List[str]] = None,
    only_main_content: Optional[bool] = None,
    timeout: Optional[int] = None,
    wait_for: Optional[int] = None,
    mobile: Optional[bool] = None,
    parsers: Optional[List[str]] = None,
    actions: Optional[List[dict]] = None,
    location: Optional[dict] = None,
    skip_tls_verification: Optional[bool] = None,
    remove_base64_images: Optional[bool] = None,
    fast_mode: Optional[bool] = None,
    use_mock: Optional[str] = None,
    block_ads: Optional[bool] = None,
    proxy: Optional[str] = None,
    max_age: Optional[int] = None,
    store_in_cache: Optional[bool] = None,
    integration: Optional[str] = None
) -> Document:
    """
    Scrape content from a single URL.

    All options are keyword-only. Every option defaults to None, which means
    "use the API's server-side default" rather than an explicit value.

    Parameters:
    - url: str, target URL to scrape
    - formats: List[str], output formats ("markdown", "html", "rawHtml", "screenshot", "links")
    - headers: Dict[str, str], custom HTTP headers
    - include_tags: List[str], HTML tags to include
    - exclude_tags: List[str], HTML tags to exclude
    - only_main_content: bool, extract only main content
    - timeout: int, request timeout in milliseconds
    - wait_for: int, wait time before scraping in milliseconds
    - mobile: bool, use mobile user agent
    - parsers: List[str], content parsers to use
    - actions: List[dict], browser actions to perform
    - location: dict, geographic location settings
    - skip_tls_verification: bool, skip SSL certificate verification
    - remove_base64_images: bool, remove base64 encoded images
    - fast_mode: bool, use faster scraping mode
    - use_mock: str, use mock response for testing
    - block_ads: bool, block advertisements
    - proxy: str, proxy server to use
    - max_age: int, maximum cache age in seconds
    - store_in_cache: bool, store result in cache
    - integration: str, integration identifier

    Returns:
    - Document: scraped content and metadata
    """
Search the web with content extraction, returning relevant results with extracted content formatted for LLM consumption.
def search(
    query: str,
    *,
    sources: Optional[List[str]] = None,
    categories: Optional[List[str]] = None,
    limit: Optional[int] = None,
    tbs: Optional[str] = None,
    location: Optional[str] = None,
    ignore_invalid_urls: Optional[bool] = None,
    timeout: Optional[int] = None,
    scrape_options: Optional[dict] = None,
    integration: Optional[str] = None
) -> SearchData:
    """
    Search the web and extract content from results.

    All options are keyword-only; None means "use the API's server-side default".

    Parameters:
    - query: str, search query
    - sources: List[str], search sources to use
    - categories: List[str], content categories to filter
    - limit: int, maximum number of results
    - tbs: str, time-based search parameters
    - location: str, geographic location for search
    - ignore_invalid_urls: bool, skip invalid URLs in results
    - timeout: int, request timeout in milliseconds
    - scrape_options: dict, options for scraping search results
    - integration: str, integration identifier

    Returns:
    - SearchData: search results with extracted content
    """
Generate a structural map of a website showing available pages and their relationships, useful for understanding site architecture before crawling.
def map(
    url: str,
    *,
    search: Optional[str] = None,
    include_subdomains: Optional[bool] = None,
    limit: Optional[int] = None,
    sitemap: str = "include",
    timeout: Optional[int] = None,
    integration: Optional[str] = None,
    location: Optional[dict] = None
) -> MapData:
    """
    Generate a map of website structure.

    All options are keyword-only. Unlike the other options, `sitemap` has an
    explicit default of "include" rather than None.

    Parameters:
    - url: str, target website URL
    - search: Optional[str], search term to filter URLs
    - include_subdomains: Optional[bool], include subdomain URLs
    - limit: Optional[int], maximum number of URLs to return
    - sitemap: str, sitemap handling ("include", "exclude", "only")
    - timeout: Optional[int], request timeout in milliseconds
    - integration: Optional[str], integration identifier
    - location: Optional[dict], geographic location settings

    Returns:
    - MapData: website structure map with URLs and metadata
    """
from firecrawl import Firecrawl, ScrapeOptions
app = Firecrawl(api_key="your-api-key")

# Simple scraping
result = app.scrape("https://example.com")
print(result.data.content)

# Scraping with options.
# scrape() takes keyword-only parameters (see its signature in this spec),
# so options are passed directly rather than via a positional options object.
# A screenshot is requested through the "screenshot" entry of `formats`.
result = app.scrape(
    "https://example.com",
    formats=["markdown", "html", "screenshot"],
    include_tags=["article", "main"],
    wait_for=2000,
)
from firecrawl import Firecrawl, SearchOptions
app = Firecrawl(api_key="your-api-key")

# Basic search
results = app.search("latest AI developments")
for doc in results.data:
    print(f"Title: {doc.metadata.get('title')}")
    print(f"Content: {doc.content[:200]}...")

# Search with options.
# search() takes keyword-only parameters (see its signature in this spec);
# there is no SearchOptions positional argument.
results = app.search(
    "AI breakthrough",
    limit=10,
    sources=["news"],  # presumably the equivalent of a "news" search type — verify against the API
    location="US",     # NOTE(review): location is a plain string; confirm country codes are accepted
)
from firecrawl import Firecrawl, MapOptions
app = Firecrawl(api_key="your-api-key")
# Generate site map
options = MapOptions(max_depth=3)
site_map = app.map("https://example.com", options)
for page in site_map.data:
print(f"URL: {page.url}")
print(f"Status: {page.status}")class ScrapeOptions:
"""Configuration options for scraping operations"""
formats: Optional[List[str]] # Output formats: ["markdown", "html", "rawHtml", "screenshot", "links"]
include_tags: Optional[List[str]] # HTML tags to include
exclude_tags: Optional[List[str]] # HTML tags to exclude
wait_for: Optional[int] # Wait time in milliseconds
screenshot: Optional[bool] # Capture screenshot
full_page_screenshot: Optional[bool] # Full page screenshot
mobile: Optional[bool] # Use mobile user agent
class ScrapeResponse:
"""Response from scrape operation"""
success: bool
data: Document
class SearchOptions:
"""Configuration options for search operations"""
limit: Optional[int] # Maximum number of results (default: 5)
search_type: Optional[str] # Search type: "web", "news", "academic"
language: Optional[str] # Language code (e.g., "en")
country: Optional[str] # Country code (e.g., "US")
class SearchResponse:
"""Response from search operation"""
success: bool
data: List[Document]
class MapOptions:
"""Configuration options for mapping operations"""
max_depth: Optional[int] # Maximum crawl depth
limit: Optional[int] # Maximum pages to map
ignore_sitemap: Optional[bool] # Ignore sitemap.xml
class MapResponse:
"""Response from map operation"""
success: bool
data: List[dict] # List of page informationAll scraping operations have async equivalents:
import asyncio
from firecrawl import AsyncFirecrawl
async def scrape_async():
app = AsyncFirecrawl(api_key="your-api-key")
# Async scraping
result = await app.scrape("https://example.com")
# Async search
search_results = await app.search("query")
# Async mapping
site_map = await app.map("https://example.com")
asyncio.run(scrape_async())Install with Tessl CLI
npx tessl i tessl/pypi-firecrawl-py