Python wrapper for the Tavily API with search, extract, crawl, and map capabilities
Map website structure and discover pages without extracting full content — an efficient way to understand site architecture and identify relevant content areas before detailed crawling or extraction operations.
def map(
url: str,
max_depth: int | None = None,
max_breadth: int | None = None,
limit: int | None = None,
instructions: str | None = None,
select_paths: Sequence[str] | None = None,
select_domains: Sequence[str] | None = None,
exclude_paths: Sequence[str] | None = None,
exclude_domains: Sequence[str] | None = None,
allow_external: bool | None = None,
include_images: bool | None = None,
timeout: int = 60,
**kwargs
) -> dict:
"""
Map website structure and discover pages without full content extraction.
Parameters:
- url: Starting URL for mapping
- max_depth: Maximum depth to explore from starting URL
- max_breadth: Maximum number of pages to discover per depth level
- limit: Total maximum number of pages to map
- instructions: Natural language instructions for mapping behavior
- select_paths: List of path patterns to include in mapping
- select_domains: List of domains to explore
- exclude_paths: List of path patterns to exclude from mapping
- exclude_domains: List of domains to avoid
- allow_external: Allow mapping external domains from starting domain
- include_images: Include image URLs in mapping results
- timeout: Request timeout in seconds (max 120)
- **kwargs: Additional mapping parameters
Returns:
Dict containing website structure map with discovered pages and hierarchy
"""Usage Examples:
# Basic website mapping
site_map = client.map(
url="https://docs.python.org",
max_depth=3,
limit=100
)
# Focused documentation mapping
docs_map = client.map(
url="https://api.example.com",
instructions="Map API documentation structure, focus on reference sections",
select_paths=["/docs/*", "/reference/*", "/api/*"],
max_depth=4,
limit=200
)
# Multi-domain site mapping
company_map = client.map(
url="https://company.com",
allow_external=True,
select_domains=[
"company.com",
"docs.company.com",
"support.company.com"
],
exclude_paths=["/admin/*", "/private/*"],
max_depth=2
)

Use mapping to understand site structure before performing expensive crawling operations:
# Map first to understand structure
site_structure = client.map(
url="https://large-company.com",
max_depth=2,
limit=50
)
# Analyze the structure
print("Discovered pages:")
for page in site_structure.get('results', []):
print(f"- {page['url']} (depth: {page.get('depth', 0)})")
# Then crawl specific areas based on mapping results
focused_crawl = client.crawl(
url="https://large-company.com/products",
select_paths=["/products/*", "/solutions/*"],
max_depth=3,
format="markdown"
)

Identify content-rich areas of websites before extraction:
# Map to find content sections
content_map = client.map(
url="https://news-site.com",
instructions="Find main content sections like articles, reports, and analysis",
exclude_paths=["/ads/*", "/widgets/*", "/social/*"],
max_depth=2
)
# Extract content from discovered high-value pages
high_value_pages = [
page['url'] for page in content_map.get('results', [])
if 'article' in page['url'] or 'report' in page['url']
]
content_results = client.extract(
urls=high_value_pages[:10], # Extract from top 10 pages
format="markdown",
extract_depth="advanced"
)

Understand website organization and navigation patterns:
# Comprehensive site mapping
architecture_map = client.map(
url="https://enterprise-site.com",
instructions="Map the complete site structure to understand organization",
max_depth=3,
max_breadth=20,
limit=500
)
# Analyze navigation patterns
pages_by_depth = {}
for page in architecture_map.get('results', []):
depth = page.get('depth', 0)
if depth not in pages_by_depth:
pages_by_depth[depth] = []
pages_by_depth[depth].append(page['url'])
print("Site structure by depth:")
for depth, urls in pages_by_depth.items():
print(f"Depth {depth}: {len(urls)} pages")
for url in urls[:5]: # Show first 5 URLs per depth
print(f" - {url}")Map specific parts of multi-domain organizations:
# Map organization's web presence
org_map = client.map(
url="https://university.edu",
allow_external=True,
select_domains=[
"university.edu", # Main site
"research.university.edu", # Research portal
"library.university.edu", # Library system
"news.university.edu" # News site
],
exclude_domains=[
"admin.university.edu", # Admin systems
"student.university.edu" # Student portals
],
instructions="Map public-facing educational content and research information",
max_depth=2
)

Discover content related to specific topics or themes:
# Map AI/ML content across a tech site
ai_content_map = client.map(
url="https://tech-company.com",
instructions="Find pages related to artificial intelligence, machine learning, and data science",
select_paths=[
"/ai/*",
"/machine-learning/*",
"/data-science/*",
"/blog/*ai*",
"/research/*ml*"
],
max_depth=3,
limit=150
)
# Map specific product documentation
product_docs_map = client.map(
url="https://company.com/products/api-gateway",
instructions="Map all documentation related to the API Gateway product",
select_paths=[
"/products/api-gateway/*",
"/docs/api-gateway/*",
"/guides/api-gateway/*"
],
max_depth=4
)

Map only high-quality content pages:
# Map substantial content pages
quality_map = client.map(
url="https://content-site.com",
instructions="Focus on pages with substantial text content, skip navigation and utility pages",
exclude_paths=[
"/search*", # Search pages
"/tag/*", # Tag pages
"/category/*", # Category pages
"/author/*", # Author pages
"*/print*", # Print versions
"*/amp*" # AMP versions
],
max_depth=2,
limit=200
)

Process and analyze mapping results effectively:
# Comprehensive mapping analysis
site_map = client.map(
url="https://target-site.com",
max_depth=3,
limit=300
)
# Analyze results
results = site_map.get('results', [])
# Group by URL patterns
url_patterns = {}
for page in results:
url = page['url']
path_parts = url.split('/')[3:] # Skip protocol and domain
if path_parts:
pattern = '/' + path_parts[0] + '/*'
if pattern not in url_patterns:
url_patterns[pattern] = []
url_patterns[pattern].append(url)
print("Content organization:")
for pattern, urls in url_patterns.items():
print(f"{pattern}: {len(urls)} pages")
# Find potential high-value targets for extraction
content_candidates = [
page['url'] for page in results
if any(keyword in page['url'].lower()
for keyword in ['article', 'post', 'guide', 'tutorial', 'doc'])
]
print(f"\nFound {len(content_candidates)} potential content pages for extraction")Mapping is more efficient than crawling for site discovery:
# Efficient large site exploration
efficient_map = client.map(
url="https://large-site.com",
max_depth=2, # Shallow but broad exploration
max_breadth=25, # More pages per level
limit=200, # Reasonable total limit
timeout=60 # Standard timeout
)
# Quick site overview
quick_overview = client.map(
url="https://new-site.com",
max_depth=1, # Just immediate links
limit=50, # Small set for overview
timeout=30 # Fast exploration
)

Handle mapping errors and partial results:
from tavily import TavilyClient, TimeoutError, BadRequestError
try:
site_map = client.map("https://example.com", limit=100)
# Process successful mapping
discovered_pages = site_map.get('results', [])
print(f"Successfully mapped {len(discovered_pages)} pages")
# Handle any failed discoveries
failed_mappings = site_map.get('failed_results', [])
if failed_mappings:
print(f"Failed to map {len(failed_mappings)} pages")
except TimeoutError:
print("Mapping operation timed out - partial results may be available")
except BadRequestError as e:
print(f"Invalid mapping parameters: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-tavily-python