Python SDK for Firecrawl API that enables web scraping, crawling, and content extraction with LLM-optimized output formats
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Essential web scraping functionality for extracting content from single URLs, searching the web, and mapping website structures. These operations provide immediate results with comprehensive format and processing options.
Extract content from a single webpage with extensive formatting and processing options including markdown conversion, HTML extraction, screenshots, and metadata collection.
def scrape(
    url: str,
    *,
    formats: Optional[List[str]] = None,
    headers: Optional[Dict[str, str]] = None,
    include_tags: Optional[List[str]] = None,
    exclude_tags: Optional[List[str]] = None,
    only_main_content: Optional[bool] = None,
    timeout: Optional[int] = None,
    wait_for: Optional[int] = None,
    mobile: Optional[bool] = None,
    parsers: Optional[List[str]] = None,
    actions: Optional[List[dict]] = None,
    location: Optional[dict] = None,
    skip_tls_verification: Optional[bool] = None,
    remove_base64_images: Optional[bool] = None,
    fast_mode: Optional[bool] = None,
    use_mock: Optional[str] = None,
    block_ads: Optional[bool] = None,
    proxy: Optional[str] = None,
    max_age: Optional[int] = None,
    store_in_cache: Optional[bool] = None,
    integration: Optional[str] = None
) -> Document:
    """
    Scrape content from a single URL.

    All options are keyword-only. Every option defaults to None, which means
    "use the API's server-side default" rather than an explicit value.

    Parameters:
    - url: str, target URL to scrape
    - formats: List[str], output formats ("markdown", "html", "rawHtml", "screenshot", "links")
    - headers: Dict[str, str], custom HTTP headers
    - include_tags: List[str], HTML tags to include
    - exclude_tags: List[str], HTML tags to exclude
    - only_main_content: bool, extract only main content
    - timeout: int, request timeout in milliseconds
    - wait_for: int, wait time before scraping in milliseconds
    - mobile: bool, use mobile user agent
    - parsers: List[str], content parsers to use
    - actions: List[dict], browser actions to perform
    - location: dict, geographic location settings
    - skip_tls_verification: bool, skip SSL certificate verification
    - remove_base64_images: bool, remove base64 encoded images
    - fast_mode: bool, use faster scraping mode
    - use_mock: str, use mock response for testing
    - block_ads: bool, block advertisements
    - proxy: str, proxy server to use
    - max_age: int, maximum cache age in seconds
    - store_in_cache: bool, store result in cache
    - integration: str, integration identifier

    Returns:
    - Document: scraped content and metadata
    """
Search the web with content extraction, returning relevant results with extracted content formatted for LLM consumption.
def search(
    query: str,
    *,
    sources: Optional[List[str]] = None,
    categories: Optional[List[str]] = None,
    limit: Optional[int] = None,
    tbs: Optional[str] = None,
    location: Optional[str] = None,
    ignore_invalid_urls: Optional[bool] = None,
    timeout: Optional[int] = None,
    scrape_options: Optional[dict] = None,
    integration: Optional[str] = None
) -> SearchData:
    """
    Search the web and extract content from results.

    All options are keyword-only; None means "use the API's server-side default".

    Parameters:
    - query: str, search query
    - sources: List[str], search sources to use
    - categories: List[str], content categories to filter
    - limit: int, maximum number of results
    - tbs: str, time-based search parameters
    - location: str, geographic location for search
    - ignore_invalid_urls: bool, skip invalid URLs in results
    - timeout: int, request timeout in milliseconds
    - scrape_options: dict, options for scraping search results
    - integration: str, integration identifier

    Returns:
    - SearchData: search results with extracted content
    """
Generate a structural map of a website showing available pages and their relationships, useful for understanding site architecture before crawling.
def map(
    url: str,
    *,
    search: Optional[str] = None,
    include_subdomains: Optional[bool] = None,
    limit: Optional[int] = None,
    sitemap: str = "include",
    timeout: Optional[int] = None,
    integration: Optional[str] = None,
    location: Optional[dict] = None
) -> MapData:
    """
    Generate a map of website structure.

    All options are keyword-only. Unlike the other options, `sitemap` has an
    explicit default of "include" rather than None.

    Parameters:
    - url: str, target website URL
    - search: Optional[str], search term to filter URLs
    - include_subdomains: Optional[bool], include subdomain URLs
    - limit: Optional[int], maximum number of URLs to return
    - sitemap: str, sitemap handling ("include", "exclude", "only")
    - timeout: Optional[int], request timeout in milliseconds
    - integration: Optional[str], integration identifier
    - location: Optional[dict], geographic location settings

    Returns:
    - MapData: website structure map with URLs and metadata
    """
from firecrawl import Firecrawl, ScrapeOptions
app = Firecrawl(api_key="your-api-key")

# Simple scraping
result = app.scrape("https://example.com")
print(result.data.content)

# Scraping with options.
# scrape() takes keyword-only parameters (see its signature in this spec),
# so options are passed directly rather than via a positional options object.
# A screenshot is requested through the "screenshot" entry of `formats`.
result = app.scrape(
    "https://example.com",
    formats=["markdown", "html", "screenshot"],
    include_tags=["article", "main"],
    wait_for=2000,
)
from firecrawl import Firecrawl, SearchOptions
app = Firecrawl(api_key="your-api-key")

# Basic search
results = app.search("latest AI developments")
for doc in results.data:
    print(f"Title: {doc.metadata.get('title')}")
    print(f"Content: {doc.content[:200]}...")

# Search with options.
# search() takes keyword-only parameters (see its signature in this spec);
# there is no SearchOptions positional argument.
results = app.search(
    "AI breakthrough",
    limit=10,
    sources=["news"],  # presumably the equivalent of a "news" search type — verify against the API
    location="US",     # NOTE(review): location is a plain string; confirm country codes are accepted
)
from firecrawl import Firecrawl, MapOptions
app = Firecrawl(api_key="your-api-key")
# Generate site map
options = MapOptions(max_depth=3)
site_map = app.map("https://example.com", options)
for page in site_map.data:
print(f"URL: {page.url}")
print(f"Status: {page.status}")class ScrapeOptions:
"""Configuration options for scraping operations"""
formats: Optional[List[str]] # Output formats: ["markdown", "html", "rawHtml", "screenshot", "links"]
include_tags: Optional[List[str]] # HTML tags to include
exclude_tags: Optional[List[str]] # HTML tags to exclude
wait_for: Optional[int] # Wait time in milliseconds
screenshot: Optional[bool] # Capture screenshot
full_page_screenshot: Optional[bool] # Full page screenshot
mobile: Optional[bool] # Use mobile user agent
class ScrapeResponse:
"""Response from scrape operation"""
success: bool
data: Document
class SearchOptions:
"""Configuration options for search operations"""
limit: Optional[int] # Maximum number of results (default: 5)
search_type: Optional[str] # Search type: "web", "news", "academic"
language: Optional[str] # Language code (e.g., "en")
country: Optional[str] # Country code (e.g., "US")
class SearchResponse:
"""Response from search operation"""
success: bool
data: List[Document]
class MapOptions:
"""Configuration options for mapping operations"""
max_depth: Optional[int] # Maximum crawl depth
limit: Optional[int] # Maximum pages to map
ignore_sitemap: Optional[bool] # Ignore sitemap.xml
class MapResponse:
"""Response from map operation"""
success: bool
data: List[dict] # List of page informationAll scraping operations have async equivalents:
import asyncio
from firecrawl import AsyncFirecrawl
async def scrape_async():
app = AsyncFirecrawl(api_key="your-api-key")
# Async scraping
result = await app.scrape("https://example.com")
# Async search
search_results = await app.search("query")
# Async mapping
site_map = await app.map("https://example.com")
asyncio.run(scrape_async())Install with Tessl CLI
npx tessl i tessl/pypi-firecrawl-py