Accurately separates a URL's subdomain, domain, and public suffix using the Public Suffix List
Advanced extraction functionality through the TLDExtract class, providing fine-grained control over caching, suffix list sources, private domain handling, and network behavior. Use this when you need custom configuration beyond the default extract() function.
Main configurable extractor class that allows custom PSL sources, cache management, and extraction behavior.
class TLDExtract:
def __init__(
self,
cache_dir: str | None = None,
suffix_list_urls: Sequence[str] = PUBLIC_SUFFIX_LIST_URLS,
fallback_to_snapshot: bool = True,
include_psl_private_domains: bool = False,
extra_suffixes: Sequence[str] = (),
cache_fetch_timeout: str | float | None = CACHE_TIMEOUT
) -> None:
"""
Create a configurable TLD extractor.
Parameters:
- cache_dir: Directory for caching PSL data (None disables caching)
- suffix_list_urls: URLs to fetch PSL data from, tried in order
- fallback_to_snapshot: Fall back to bundled PSL snapshot if fetch fails
- include_psl_private_domains: Include PSL private domains by default
- extra_suffixes: Additional custom suffixes to recognize
- cache_fetch_timeout: HTTP timeout for PSL fetching (seconds)
"""Core extraction methods that parse URL strings into components.
def __call__(
self,
url: str,
include_psl_private_domains: bool | None = None,
session: requests.Session | None = None
) -> ExtractResult:
"""
Extract components from URL string (alias for extract_str).
Parameters:
- url: URL string to parse
- include_psl_private_domains: Override instance default for private domains
- session: Optional requests.Session for HTTP customization
Returns:
ExtractResult with parsed components
"""
def extract_str(
self,
url: str,
include_psl_private_domains: bool | None = None,
session: requests.Session | None = None
) -> ExtractResult:
"""
Extract components from URL string.
Parameters:
- url: URL string to parse
- include_psl_private_domains: Override instance default for private domains
- session: Optional requests.Session for HTTP customization
Returns:
ExtractResult with parsed components
"""Extract from pre-parsed urllib objects for better performance when you already have parsed URL components.
def extract_urllib(
self,
url: urllib.parse.ParseResult | urllib.parse.SplitResult,
include_psl_private_domains: bool | None = None,
session: requests.Session | None = None
) -> ExtractResult:
"""
Extract from urllib.parse result for better performance.
Parameters:
- url: Result from urllib.parse.urlparse() or urlsplit()
- include_psl_private_domains: Override instance default for private domains
- session: Optional requests.Session for HTTP customization
Returns:
ExtractResult with parsed components
"""Methods for managing PSL data and caching behavior.
def update(
self,
fetch_now: bool = False,
session: requests.Session | None = None
) -> None:
"""
Force refresh of PSL data.
Parameters:
- fetch_now: Fetch immediately rather than on next extraction
- session: Optional requests.Session for HTTP customization
"""
def tlds(self, session: requests.Session | None = None) -> list[str]:
"""
Get the list of TLDs currently used by this extractor.
Parameters:
- session: Optional requests.Session for HTTP customization
Returns:
List of TLD strings, varies based on include_psl_private_domains and extra_suffixes
"""Create an extractor that doesn't use disk caching for environments where disk access is restricted.
import tldextract
# Disable caching entirely
no_cache_extractor = tldextract.TLDExtract(cache_dir=None)
result = no_cache_extractor('http://example.com')

Specify a custom location for PSL data caching.
import tldextract
# Use custom cache directory
custom_cache_extractor = tldextract.TLDExtract(cache_dir='/path/to/custom/cache/')
result = custom_cache_extractor('http://example.com')

Create an extractor that works entirely offline using the bundled PSL snapshot.
import tldextract
# Offline-only extractor
offline_extractor = tldextract.TLDExtract(
suffix_list_urls=(), # No remote URLs
fallback_to_snapshot=True
)
result = offline_extractor('http://example.com')

Use alternative or local PSL data sources.
import tldextract
# Use custom PSL sources
custom_psl_extractor = tldextract.TLDExtract(
suffix_list_urls=[
'file:///path/to/local/suffix_list.dat',
'http://custom.psl.mirror.com/list.dat'
],
fallback_to_snapshot=False
)
result = custom_psl_extractor('http://example.com')

Configure an extractor to always include PSL private domains.
import tldextract
# Always include private domains
private_extractor = tldextract.TLDExtract(include_psl_private_domains=True)
# This will treat blogspot.com as a public suffix
result = private_extractor('waiterrant.blogspot.com')
print(result)
# ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)

Add custom suffixes that aren't in the PSL.
import tldextract
# Add custom internal suffixes
internal_extractor = tldextract.TLDExtract(
extra_suffixes=['internal', 'corp.example.com']
)
result = internal_extractor('subdomain.example.internal')
print(result)
# ExtractResult(subdomain='subdomain', domain='example', suffix='internal', is_private=False)

Configure a timeout for PSL fetching operations.
import tldextract
# Set custom timeout
timeout_extractor = tldextract.TLDExtract(cache_fetch_timeout=10.0)
result = timeout_extractor('http://example.com')
# Can also be set via environment variable
import os
os.environ['TLDEXTRACT_CACHE_TIMEOUT'] = '5.0'
env_extractor = tldextract.TLDExtract()

Optimize performance when working with pre-parsed URLs.
import urllib.parse
import tldextract
extractor = tldextract.TLDExtract()
# Parse once, extract efficiently
parsed_url = urllib.parse.urlparse('http://forums.news.cnn.com/path?query=value')
result = extractor.extract_urllib(parsed_url)
print(result)
# ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)

Use a custom HTTP session for PSL fetching with proxies, authentication, or other customizations.
import requests
import tldextract
# Create session with custom configuration
session = requests.Session()
session.proxies = {'http': 'http://proxy.example.com:8080'}
session.headers.update({'User-Agent': 'MyApp/1.0'})
extractor = tldextract.TLDExtract()
# Use custom session for PSL fetching
result = extractor('http://example.com', session=session)
# Force update with custom session
extractor.update(fetch_now=True, session=session)

The TLDExtract class handles various error conditions gracefully:
- ValueError for impossible configurations (e.g., no data sources)

import tldextract
# This raises ValueError - no way to get PSL data
try:
bad_extractor = tldextract.TLDExtract(
suffix_list_urls=(),
cache_dir=None,
fallback_to_snapshot=False
)
except ValueError as e:
print("Configuration error:", e)extract_urllib() when you already have parsed URLsInstall with Tessl CLI
npx tessl i tessl/pypi-tldextract