CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-tavily-python

Python wrapper for the Tavily API with search, extract, crawl, and map capabilities

Overview
Eval results
Files

content.mddocs/

Content Operations

Extract content from individual URLs or crawl entire websites with intelligent navigation, content filtering, and structured data extraction capabilities.

Capabilities

Content Extraction

Extract structured content from one or more URLs with options for different output formats and extraction depth levels.

def extract(
    urls: Union[List[str], str],
    include_images: Optional[bool] = None,
    extract_depth: Optional[Literal["basic", "advanced"]] = None,
    format: Optional[Literal["markdown", "text"]] = None,
    timeout: int = 60,
    include_favicon: Optional[bool] = None,
    **kwargs
) -> dict:
    """
    Extract content from single URL or list of URLs.

    Parameters:
    - urls: Single URL string or list of URL strings to extract content from
    - include_images: Include image URLs in extracted content
    - extract_depth: Extraction thoroughness ("basic" for main content, "advanced" for comprehensive)
    - format: Output format ("markdown" for structured text, "text" for plain text)
    - timeout: Request timeout in seconds (max 120)
    - include_favicon: Include website favicon URLs
    - **kwargs: Additional extraction parameters

    Returns:
    Dict containing:
    - results: List of extraction result objects with:
      - url: Source URL
      - content: Extracted content
      - title: Page title
      - score: Content quality score
    - failed_results: List of URLs that failed extraction with error details
    """

Usage Examples:

# Extract from single URL
result = client.extract("https://example.com/article")
print(result['results'][0]['content'])

# Extract from multiple URLs
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]
results = client.extract(
    urls=urls,
    format="markdown",
    extract_depth="advanced",
    include_images=True
)

# Process results and handle failures
for result in results['results']:
    print(f"URL: {result['url']}")
    print(f"Title: {result['title']}")
    print(f"Content: {result['content'][:200]}...")

for failed in results['failed_results']:
    print(f"Failed to extract: {failed['url']} - {failed['error']}")

Website Crawling

Intelligently crawl websites with custom navigation instructions, content filtering, and structured data extraction.

def crawl(
    url: str,
    max_depth: Optional[int] = None,
    max_breadth: Optional[int] = None,
    limit: Optional[int] = None,
    instructions: Optional[str] = None,
    select_paths: Optional[Sequence[str]] = None,
    select_domains: Optional[Sequence[str]] = None,
    exclude_paths: Optional[Sequence[str]] = None,
    exclude_domains: Optional[Sequence[str]] = None,
    allow_external: Optional[bool] = None,
    include_images: Optional[bool] = None,
    extract_depth: Optional[Literal["basic", "advanced"]] = None,
    format: Optional[Literal["markdown", "text"]] = None,
    timeout: int = 60,
    include_favicon: Optional[bool] = None,
    **kwargs
) -> dict:
    """
    Crawl website with intelligent navigation and content extraction.

    Parameters:
    - url: Starting URL for crawling
    - max_depth: Maximum depth to crawl from starting URL
    - max_breadth: Maximum number of pages to crawl per depth level
    - limit: Total maximum number of pages to crawl
    - instructions: Natural language instructions for crawling behavior
    - select_paths: List of path patterns to include (supports wildcards)
    - select_domains: List of domains to crawl
    - exclude_paths: List of path patterns to exclude
    - exclude_domains: List of domains to avoid
    - allow_external: Allow crawling external domains from starting domain
    - include_images: Include image URLs in crawled content
    - extract_depth: Content extraction thoroughness
    - format: Output format for extracted content
    - timeout: Request timeout in seconds (max 120)
    - include_favicon: Include website favicon URLs
    - **kwargs: Additional crawl parameters

    Returns:
    Dict containing crawling results with pages and extracted content
    """

Usage Examples:

# Basic website crawl
crawl_result = client.crawl(
    url="https://docs.python.org",
    max_depth=2,
    limit=20
)

# Advanced crawl with filtering
crawl_result = client.crawl(
    url="https://example.com",
    max_depth=3,
    max_breadth=10,
    instructions="Focus on documentation and tutorial pages",
    select_paths=["/docs/*", "/tutorials/*"],
    exclude_paths=["/admin/*", "/private/*"],
    format="markdown",
    extract_depth="advanced"
)

# Cross-domain crawl
crawl_result = client.crawl(
    url="https://company.com",
    allow_external=True,
    select_domains=["company.com", "docs.company.com"],
    limit=50
)

Advanced Crawling Patterns

Targeted Content Crawling:

# Crawl specific content types
blog_crawl = client.crawl(
    url="https://techblog.com",
    instructions="Only crawl blog posts and articles, skip navigation pages",
    select_paths=["/blog/*", "/articles/*", "/posts/*"],
    exclude_paths=["/tags/*", "/categories/*", "/authors/*"],
    max_depth=2,
    format="markdown"
)

# E-commerce product crawl
product_crawl = client.crawl(
    url="https://store.com",
    instructions="Focus on product pages with descriptions and specifications",
    select_paths=["/products/*", "/items/*"],
    exclude_paths=["/cart/*", "/checkout/*", "/account/*"],
    include_images=True,
    limit=100
)

Research and Documentation Crawling:

# Academic paper crawl
research_crawl = client.crawl(
    url="https://university.edu/research",
    instructions="Crawl research papers and publications, skip administrative pages",
    select_paths=["/papers/*", "/publications/*", "/research/*"],
    extract_depth="advanced",
    max_depth=3
)

# API documentation crawl
docs_crawl = client.crawl(
    url="https://api.example.com/docs",
    instructions="Focus on API reference and tutorial content",
    format="markdown",
    max_depth=4,
    limit=200
)

Crawling Instructions

The instructions parameter accepts natural language descriptions that guide the crawling behavior:

Effective Instruction Examples:

# Content-focused instructions
instructions = "Focus on main content pages, skip navigation, sidebar, and footer links"

# Topic-specific instructions  
instructions = "Only crawl pages related to machine learning and AI, ignore general company pages"

# Quality-focused instructions
instructions = "Prioritize pages with substantial text content, skip image galleries and empty pages"

# Structure-focused instructions
instructions = "Follow documentation hierarchy, crawl systematically through sections and subsections"

Path and Domain Filtering

Path Pattern Examples:

# Include patterns
select_paths = [
    "/docs/*",           # All documentation
    "/api/*/reference",  # API reference pages
    "/blog/2024/*",      # 2024 blog posts
    "*/tutorial*"        # Any tutorial pages
]

# Exclude patterns
exclude_paths = [
    "/admin/*",          # Admin pages
    "/private/*",        # Private content
    "*/download*",       # Download pages
    "*.pdf",            # PDF files
    "*.jpg",            # Image files
]

Domain Management:

# Multi-domain crawling
result = client.crawl(
    url="https://main-site.com",
    allow_external=True,
    select_domains=[
        "main-site.com",
        "docs.main-site.com", 
        "blog.main-site.com",
        "support.main-site.com"
    ],
    exclude_domains=[
        "ads.main-site.com",
        "tracking.main-site.com"
    ]
)

Performance and Limits

Optimization Strategies:

# Balanced crawl for large sites
balanced_crawl = client.crawl(
    url="https://large-site.com",
    max_depth=2,          # Limit depth to avoid going too deep
    max_breadth=15,       # Limit breadth to focus on important pages
    limit=100,            # Overall page limit
    timeout=90            # Longer timeout for complex sites
)

# Fast shallow crawl
quick_crawl = client.crawl(
    url="https://site.com",
    max_depth=1,          # Only immediate links
    limit=20,             # Small page count
    timeout=30            # Quick timeout
)

Error Handling

Content operations include robust error handling for failed extractions and crawling issues:

from tavily import TavilyClient, TimeoutError, BadRequestError

try:
    result = client.crawl("https://example.com", limit=50)
    
    # Process successful results
    for page in result.get('results', []):
        print(f"Crawled: {page['url']}")
    
    # Handle any failed pages
    for failure in result.get('failed_results', []):
        print(f"Failed: {failure['url']} - {failure.get('error', 'Unknown error')}")
        
except TimeoutError:
    print("Crawling operation timed out")
except BadRequestError as e:
    print(f"Invalid crawl parameters: {e}")

Install with Tessl CLI

npx tessl i tessl/pypi-tavily-python

docs

async.md

content.md

hybrid-rag.md

index.md

mapping.md

search.md

tile.json