Python wrapper for the Tavily API with search, extract, crawl, and map capabilities
Map website structure and discover pages without extracting full content — an efficient way to understand site architecture and identify relevant content areas before detailed crawling or extraction operations.
def map(
url: str,
max_depth: int | None = None,
max_breadth: int | None = None,
limit: int | None = None,
instructions: str | None = None,
select_paths: Sequence[str] | None = None,
select_domains: Sequence[str] | None = None,
exclude_paths: Sequence[str] | None = None,
exclude_domains: Sequence[str] | None = None,
allow_external: bool | None = None,
include_images: bool | None = None,
timeout: int = 60,
**kwargs
) -> dict:
"""
Map website structure and discover pages without full content extraction.
Parameters:
- url: Starting URL for mapping
- max_depth: Maximum depth to explore from starting URL
- max_breadth: Maximum number of pages to discover per depth level
- limit: Total maximum number of pages to map
- instructions: Natural language instructions for mapping behavior
- select_paths: List of path patterns to include in mapping
- select_domains: List of domains to explore
- exclude_paths: List of path patterns to exclude from mapping
- exclude_domains: List of domains to avoid
- allow_external: Allow mapping external domains from starting domain
- include_images: Include image URLs in mapping results
- timeout: Request timeout in seconds (max 120)
- **kwargs: Additional mapping parameters
Returns:
Dict containing website structure map with discovered pages and hierarchy
"""Usage Examples:
# Basic website mapping
site_map = client.map(
url="https://docs.python.org",
max_depth=3,
limit=100
)
# Focused documentation mapping
docs_map = client.map(
url="https://api.example.com",
instructions="Map API documentation structure, focus on reference sections",
select_paths=["/docs/*", "/reference/*", "/api/*"],
max_depth=4,
limit=200
)
# Multi-domain site mapping
company_map = client.map(
url="https://company.com",
allow_external=True,
select_domains=[
"company.com",
"docs.company.com",
"support.company.com"
],
exclude_paths=["/admin/*", "/private/*"],
max_depth=2
)

Use mapping to understand site structure before performing expensive crawling operations:
# Map first to understand structure
site_structure = client.map(
url="https://large-company.com",
max_depth=2,
limit=50
)
# Analyze the structure
print("Discovered pages:")
for page in site_structure.get('results', []):
print(f"- {page['url']} (depth: {page.get('depth', 0)})")
# Then crawl specific areas based on mapping results
focused_crawl = client.crawl(
url="https://large-company.com/products",
select_paths=["/products/*", "/solutions/*"],
max_depth=3,
format="markdown"
)

Identify content-rich areas of websites before extraction:
# Map to find content sections
content_map = client.map(
url="https://news-site.com",
instructions="Find main content sections like articles, reports, and analysis",
exclude_paths=["/ads/*", "/widgets/*", "/social/*"],
max_depth=2
)
# Extract content from discovered high-value pages
high_value_pages = [
page['url'] for page in content_map.get('results', [])
if 'article' in page['url'] or 'report' in page['url']
]
content_results = client.extract(
urls=high_value_pages[:10], # Extract from top 10 pages
format="markdown",
extract_depth="advanced"
)

Understand website organization and navigation patterns:
# Comprehensive site mapping
architecture_map = client.map(
url="https://enterprise-site.com",
instructions="Map the complete site structure to understand organization",
max_depth=3,
max_breadth=20,
limit=500
)
# Analyze navigation patterns
pages_by_depth = {}
for page in architecture_map.get('results', []):
depth = page.get('depth', 0)
if depth not in pages_by_depth:
pages_by_depth[depth] = []
pages_by_depth[depth].append(page['url'])
print("Site structure by depth:")
for depth, urls in pages_by_depth.items():
print(f"Depth {depth}: {len(urls)} pages")
for url in urls[:5]: # Show first 5 URLs per depth
print(f" - {url}")Map specific parts of multi-domain organizations:
# Map organization's web presence
org_map = client.map(
url="https://university.edu",
allow_external=True,
select_domains=[
"university.edu", # Main site
"research.university.edu", # Research portal
"library.university.edu", # Library system
"news.university.edu" # News site
],
exclude_domains=[
"admin.university.edu", # Admin systems
"student.university.edu" # Student portals
],
instructions="Map public-facing educational content and research information",
max_depth=2
)

Discover content related to specific topics or themes:
# Map AI/ML content across a tech site
ai_content_map = client.map(
url="https://tech-company.com",
instructions="Find pages related to artificial intelligence, machine learning, and data science",
select_paths=[
"/ai/*",
"/machine-learning/*",
"/data-science/*",
"/blog/*ai*",
"/research/*ml*"
],
max_depth=3,
limit=150
)
# Map specific product documentation
product_docs_map = client.map(
url="https://company.com/products/api-gateway",
instructions="Map all documentation related to the API Gateway product",
select_paths=[
"/products/api-gateway/*",
"/docs/api-gateway/*",
"/guides/api-gateway/*"
],
max_depth=4
)

Map only high-quality content pages:
# Map substantial content pages
quality_map = client.map(
url="https://content-site.com",
instructions="Focus on pages with substantial text content, skip navigation and utility pages",
exclude_paths=[
"/search*", # Search pages
"/tag/*", # Tag pages
"/category/*", # Category pages
"/author/*", # Author pages
"*/print*", # Print versions
"*/amp*" # AMP versions
],
max_depth=2,
limit=200
)

Process and analyze mapping results effectively:
# Comprehensive mapping analysis
site_map = client.map(
url="https://target-site.com",
max_depth=3,
limit=300
)
# Analyze results
results = site_map.get('results', [])
# Group by URL patterns
url_patterns = {}
for page in results:
url = page['url']
path_parts = url.split('/')[3:] # Skip protocol and domain
if path_parts:
pattern = '/' + path_parts[0] + '/*'
if pattern not in url_patterns:
url_patterns[pattern] = []
url_patterns[pattern].append(url)
print("Content organization:")
for pattern, urls in url_patterns.items():
print(f"{pattern}: {len(urls)} pages")
# Find potential high-value targets for extraction
content_candidates = [
page['url'] for page in results
if any(keyword in page['url'].lower()
for keyword in ['article', 'post', 'guide', 'tutorial', 'doc'])
]
print(f"\nFound {len(content_candidates)} potential content pages for extraction")Mapping is more efficient than crawling for site discovery:
# Efficient large site exploration
efficient_map = client.map(
url="https://large-site.com",
max_depth=2, # Shallow but broad exploration
max_breadth=25, # More pages per level
limit=200, # Reasonable total limit
timeout=60 # Standard timeout
)
# Quick site overview
quick_overview = client.map(
url="https://new-site.com",
max_depth=1, # Just immediate links
limit=50, # Small set for overview
timeout=30 # Fast exploration
)

Handle mapping errors and partial results:
from tavily import TavilyClient, TimeoutError, BadRequestError
try:
site_map = client.map("https://example.com", limit=100)
# Process successful mapping
discovered_pages = site_map.get('results', [])
print(f"Successfully mapped {len(discovered_pages)} pages")
# Handle any failed discoveries
failed_mappings = site_map.get('failed_results', [])
if failed_mappings:
print(f"Failed to map {len(failed_mappings)} pages")
except TimeoutError:
print("Mapping operation timed out - partial results may be available")
except BadRequestError as e:
print(f"Invalid mapping parameters: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-tavily-python