Accurately separates a URL's subdomain, domain, and public suffix using the Public Suffix List
npx @tessl/cli install tessl/pypi-tldextract@5.3.0

Accurately separates a URL's subdomain, domain, and public suffix using the Public Suffix List (PSL). This library provides robust URL parsing that handles complex domain structures, including country code TLDs (ccTLDs), generic TLDs (gTLDs), and their exceptions, which naive string splitting cannot parse correctly.
pip install tldextract

import tldextract

For basic usage, all functionality is available through the main module:

from tldextract import extract, TLDExtract, ExtractResult, __version__

import tldextract
# Basic URL extraction
result = tldextract.extract('http://forums.news.cnn.com/')
print(result)
# ExtractResult(subdomain='forums.news', domain='cnn', suffix='com', is_private=False)
# Access individual components
print(f"Subdomain: {result.subdomain}") # 'forums.news'
print(f"Domain: {result.domain}") # 'cnn'
print(f"Suffix: {result.suffix}") # 'com'
# Reconstruct full domain name
print(result.fqdn) # 'forums.news.cnn.com'
# Handle complex TLDs
uk_result = tldextract.extract('http://forums.bbc.co.uk/')
print(uk_result)
# ExtractResult(subdomain='forums', domain='bbc', suffix='co.uk', is_private=False)
# Handle edge cases
ip_result = tldextract.extract('http://127.0.0.1:8080/path')
print(ip_result)
# ExtractResult(subdomain='', domain='127.0.0.1', suffix='', is_private=False)

The tldextract library uses the authoritative Public Suffix List (PSL) to make parsing decisions:
The library automatically fetches and caches the latest PSL data on first use, falling back to a bundled snapshot if network access is unavailable.
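A brief, hedged sketch of how this fetch behavior can be tuned, using the suffix_list_urls and fallback_to_snapshot options documented under the TLDExtract class below: an extractor built with no suffix list URLs never touches the network and relies solely on the bundled snapshot.

import tldextract

# Sketch: rely only on the snapshot bundled with the package. With no URLs
# to fetch and fallback_to_snapshot left at its default of True, no network
# request is ever made.
no_fetch_extract = tldextract.TLDExtract(suffix_list_urls=())
print(no_fetch_extract('http://www.example.com'))
# ExtractResult(subdomain='www', domain='example', suffix='com', is_private=False)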
Core functionality for extracting URL components using the convenience extract() function. This provides the most common use case with sensible defaults.
def extract(
    url: str,
    include_psl_private_domains: bool | None = False,
    session: requests.Session | None = None
) -> ExtractResult
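A hedged usage sketch of this function; the blogspot.com split shown in the comments relies on the PSL's private-domain section described above.

import tldextract

# Default behavior: private PSL entries are not treated as suffixes.
print(tldextract.extract('waiterrant.blogspot.com'))
# ExtractResult(subdomain='waiterrant', domain='blogspot', suffix='com', is_private=False)

# Opting in to private domains changes where the split falls.
print(tldextract.extract('waiterrant.blogspot.com', include_psl_private_domains=True))
# ExtractResult(subdomain='', domain='waiterrant', suffix='blogspot.com', is_private=True)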
Advanced extraction with custom configuration options, including cache settings, custom suffix lists, and private domain handling, through the TLDExtract class.

class TLDExtract:
    def __init__(
        self,
        cache_dir: str | None = None,
        suffix_list_urls: Sequence[str] = PUBLIC_SUFFIX_LIST_URLS,
        fallback_to_snapshot: bool = True,
        include_psl_private_domains: bool = False,
        extra_suffixes: Sequence[str] = (),
        cache_fetch_timeout: str | float | None = CACHE_TIMEOUT
    ) -> None

    def __call__(
        self,
        url: str,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None
    ) -> ExtractResult

    def extract_str(
        self,
        url: str,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None
    ) -> ExtractResult

    def extract_urllib(
        self,
        url: urllib.parse.ParseResult | urllib.parse.SplitResult,
        include_psl_private_domains: bool | None = None,
        session: requests.Session | None = None
    ) -> ExtractResult

    def update(
        self,
        fetch_now: bool = False,
        session: requests.Session | None = None
    ) -> None

    def tlds(self, session: requests.Session | None = None) -> list[str]
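A hedged configuration sketch using the constructor parameters above; the cache path and the extra suffix are illustrative placeholders, not values required by the library.

import tldextract

custom_extract = tldextract.TLDExtract(
    cache_dir='/path/to/your/cache/',      # illustrative cache location
    include_psl_private_domains=True,      # treat private PSL entries as suffixes
    extra_suffixes=['internal.example'],   # hypothetical in-house suffix
)

# The extra suffix participates in matching alongside the PSL entries.
print(custom_extract('http://media.internal.example'))

# Force a re-fetch of the suffix list data (requires network access).
custom_extract.update(fetch_now=True)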
Comprehensive result handling with properties for reconstructing domains, handling IP addresses, and accessing metadata about the extraction process.

@dataclass
class ExtractResult:
    subdomain: str
    domain: str
    suffix: str
    is_private: bool
    registry_suffix: str

    @property
    def fqdn(self) -> str

    @property
    def ipv4(self) -> str

    @property
    def ipv6(self) -> str

    @property
    def registered_domain(self) -> str

    @property
    def reverse_domain_name(self) -> str

    @property
    def top_domain_under_public_suffix(self) -> str

    @property
    def top_domain_under_registry_suffix(self) -> str
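A hedged sketch of the most commonly used properties; the commented values follow from the bbc.co.uk and 127.0.0.1 examples shown earlier.

import tldextract

ext = tldextract.extract('http://forums.bbc.co.uk')
print(ext.registered_domain)  # 'bbc.co.uk' (domain joined with its public suffix)
print(ext.fqdn)               # 'forums.bbc.co.uk' (non-empty only when domain and suffix are both present)
print(ext.suffix)             # 'co.uk'

ip = tldextract.extract('http://127.0.0.1:8080/path')
print(ip.ipv4)                # '127.0.0.1' (set when the domain field is a valid IPv4 address)
print(ip.fqdn)                # '' (no suffix, so there is no fully qualified domain name)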
Command-line tool for URL parsing with options for output formatting, cache management, and PSL updates.

tldextract [options] <url1> [url2] ...

Functions for updating and managing Public Suffix List data globally.
def update(fetch_now: bool = False, session: requests.Session | None = None) -> None

from typing import Sequence
from dataclasses import dataclass, field
import requests
import urllib.parse
# Module attributes
__version__: str
# Constants
PUBLIC_SUFFIX_LIST_URLS: tuple[str, ...]
CACHE_TIMEOUT: str | None
# Functions - detailed in respective sections
# Classes - detailed in respective sections
# ExtractResult dataclass - detailed in Result Processing section
# TLDExtract class - detailed in Configurable Extraction section
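A minimal, hedged sketch of the module-level attributes listed above, assuming they are exposed at the package root as this overview indicates.

import tldextract

# Installed package version.
print(tldextract.__version__)

# Default mirrors consulted when fetching the Public Suffix List
# (assumption: re-exported at the package root, as listed above).
print(tldextract.PUBLIC_SUFFIX_LIST_URLS)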