Yet another URL library - comprehensive URL parsing and manipulation for Python
Performance optimization through configurable LRU caching for encoding/decoding operations. YARL uses caching to improve performance when processing many URLs, especially those with international domain names or complex encoding requirements.
YARL maintains separate LRU caches for different operations to optimize performance:
Deprecated cache types (kept for backwards compatibility): ip_address and host_validate.
Configure cache sizes for different operations to balance memory usage and performance.
def cache_configure(*,
idna_encode_size: int | None = 256,
idna_decode_size: int | None = 256,
ip_address_size: int | None = None,
host_validate_size: int | None = None,
encode_host_size: int | None = None) -> None:
"""
Configure LRU cache sizes for URL processing operations.
Args:
idna_encode_size (int | None): Cache size for IDNA encoding operations
idna_decode_size (int | None): Cache size for IDNA decoding operations
ip_address_size (int | None): DEPRECATED - kept for backwards compatibility
host_validate_size (int | None): DEPRECATED - kept for backwards compatibility
encode_host_size (int | None): Cache size for host encoding operations
Note:
ip_address_size and host_validate_size are deprecated and will be removed
in future versions. They are kept for backwards compatibility only.
Examples:
# Increase cache sizes for high-volume applications
cache_configure(
idna_encode_size=1024,
idna_decode_size=1024,
ip_address_size=512,
host_validate_size=512,
encode_host_size=1024
)
# Reduce memory usage for memory-constrained environments
cache_configure(
idna_encode_size=64,
idna_decode_size=64,
ip_address_size=32,
host_validate_size=32,
encode_host_size=64
)
"""Retrieve statistics about cache performance to monitor effectiveness and tune cache sizes.
def cache_info() -> CacheInfo:
    """Return statistics for every URL-processing cache.

    Returns:
        CacheInfo: Mapping of cache name to its ``functools.lru_cache``
        statistics tuple (hits, misses, maxsize, currsize).

    Examples:
        info = cache_info()
        print(f"IDNA encode hits: {info['idna_encode'].hits}")
        print(f"IDNA encode misses: {info['idna_encode'].misses}")
        print(f"Cache hit ratio: {info['idna_encode'].hits / (info['idna_encode'].hits + info['idna_encode'].misses)}")
    """
class CacheInfo(TypedDict):
    """
    Cache information structure containing statistics for each cache type.

    Each cache entry contains standard functools.lru_cache statistics:
    - hits: Number of cache hits
    - misses: Number of cache misses
    - maxsize: Maximum cache size
    - currsize: Current number of cached items
    """

    idna_encode: _CacheInfo
    idna_decode: _CacheInfo
    ip_address: _CacheInfo  # DEPRECATED - kept for backwards compatibility
    host_validate: _CacheInfo  # DEPRECATED - kept for backwards compatibility
    encode_host: _CacheInfo


# Clear caches to free memory or reset performance counters.
def cache_clear() -> None:
    """
    Clear all URL processing caches.

    Removes all cached entries and resets performance counters.
    Useful for freeing memory or starting fresh performance measurements.

    Examples:
        # Clear caches after processing a large batch
        cache_clear()

        # Reset before performance testing
        cache_clear()
        process_urls(test_urls)
        stats = cache_info()
    """
# Example: inspect cache statistics while processing international domains.
from yarl import URL, cache_info, cache_clear, cache_configure

# Check initial cache state
initial_info = cache_info()
print("Initial cache state:")
for cache_name, stats in initial_info.items():
    print(f" {cache_name}: {stats.hits} hits, {stats.misses} misses")

# Process some URLs with international domains (exercises the IDNA caches)
urls = [
    URL('https://café.example.com/path'),
    URL('https://münchen.de/info'),
    URL('https://москва.рф/news'),
    URL('https://日本.jp/page'),
]

# Check cache statistics after processing
final_info = cache_info()
print("\nAfter processing international domains:")
for cache_name, stats in final_info.items():
    if stats.hits > 0 or stats.misses > 0:
        # The branch above guarantees hits + misses > 0, so division is safe.
        hit_ratio = stats.hits / (stats.hits + stats.misses)
        print(f" {cache_name}: {stats.hits} hits, {stats.misses} misses (hit ratio: {hit_ratio:.2%})")
# Example: configure larger caches for a high-volume application and
# measure the benefit of a warm cache across two passes.
import time

from yarl import URL, cache_configure, cache_info, cache_clear

# Configure larger caches for high-volume application
cache_configure(
    idna_encode_size=2048,
    idna_decode_size=2048,
    ip_address_size=1024,
    host_validate_size=1024,
    encode_host_size=2048,
)


def process_url_batch(urls):
    """Process a batch of URLs and return (parsed URLs, elapsed seconds)."""
    # perf_counter is monotonic and higher-resolution than time.time(),
    # so it is the right clock for measuring elapsed time.
    start_time = time.perf_counter()
    processed_urls = [URL(url_str) for url_str in urls]
    end_time = time.perf_counter()
    return processed_urls, end_time - start_time


# Simulate processing batches of URLs
url_batch = [
    'https://café.example.com/api/v1/users',
    'https://münchen.de/api/search',
    'https://москва.рф/api/news',
] * 100  # Repeat for cache effectiveness

# First pass - populate caches
cache_clear()  # Start fresh
first_batch, first_time = process_url_batch(url_batch)
first_stats = cache_info()

# Second pass - should benefit from caching
second_batch, second_time = process_url_batch(url_batch)
second_stats = cache_info()

print(f"First pass: {first_time:.4f}s")
print(f"Second pass: {second_time:.4f}s")
# Guard against a zero-duration second pass on very fast machines.
if second_time > 0:
    print(f"Speedup: {first_time/second_time:.2f}x")

# Analyze cache effectiveness
for cache_name in first_stats:
    first = first_stats[cache_name]
    second = second_stats[cache_name]
    if second.hits > first.hits:
        print(f"{cache_name}: {second.hits - first.hits} additional hits in second pass")
# Example: configure smaller caches for memory-constrained environments
# and report how full each cache is afterwards.
from yarl import URL, cache_configure, cache_info

# Configure smaller caches for memory-constrained environments
cache_configure(
    idna_encode_size=32,
    idna_decode_size=32,
    ip_address_size=16,
    host_validate_size=16,
    encode_host_size=32,
)

# Process URLs and monitor memory usage
urls_to_process = [
    'https://example.com/api/users',
    'https://test.org/data',
    'https://192.168.1.1:8080/status',
] * 50

processed = [URL(url) for url in urls_to_process]

# Check cache utilization. All maxsizes were configured to integers above,
# so the utilization division is well-defined here.
stats = cache_info()
for cache_name, info in stats.items():
    if info.currsize > 0:
        utilization = info.currsize / info.maxsize
        print(f"{cache_name}: {info.currsize}/{info.maxsize} ({utilization:.1%} full)")
# Example: analyze cache performance for different URL patterns and
# suggest cache-size adjustments.
from yarl import URL, cache_info, cache_clear, cache_configure


def analyze_cache_performance(urls):
    """Analyze cache performance for a given set of URLs."""
    cache_clear()  # Start with empty caches

    # Process URLs twice to see caching benefit
    first_pass = [URL(url) for url in urls]
    first_stats = cache_info()
    second_pass = [URL(url) for url in urls]
    second_stats = cache_info()

    print("Cache Performance Analysis:")
    print("-" * 50)
    for cache_name in first_stats:
        first = first_stats[cache_name]
        second = second_stats[cache_name]
        total_ops = second.hits + second.misses
        if total_ops > 0:
            hit_ratio = second.hits / total_ops
            cache_benefit = second.hits - first.hits
            print(f"{cache_name}:")
            print(f" Total operations: {total_ops}")
            print(f" Hit ratio: {hit_ratio:.1%}")
            print(f" Cache benefit: {cache_benefit} hits saved")
            print(f" Current size: {second.currsize}/{second.maxsize}")
            # Suggest cache size adjustments: a full cache with a poor hit
            # ratio is likely too small; a mostly-empty cache is too large.
            if hit_ratio < 0.5 and second.currsize == second.maxsize:
                print(f" 💡 Consider increasing {cache_name}_size")
            elif second.currsize < second.maxsize * 0.3:
                print(f" 💡 Consider decreasing {cache_name}_size")
            print()


# Test with various URL patterns
test_urls = [
    # International domains (benefit from IDNA caching)
    'https://café.example.com/api',
    'https://münchen.de/search',
    'https://москва.рф/news',
    # IP addresses (benefit from IP validation caching)
    'https://192.168.1.1:8080/status',
    'https://10.0.0.1/api/health',
    # Regular domains (benefit from host validation caching)
    'https://api.example.com/v1/users',
    'https://cdn.example.org/images/logo.png',
] * 20  # Repeat for meaningful cache statistics

analyze_cache_performance(test_urls)
# Example: configure caches from environment variables at application startup.
import os

from yarl import cache_configure, cache_info


def configure_production_cache():
    """Configure caches based on environment and expected load."""
    # Get configuration from environment or use defaults
    idna_encode_size = int(os.environ.get('YARL_IDNA_ENCODE_CACHE', 1024))
    idna_decode_size = int(os.environ.get('YARL_IDNA_DECODE_CACHE', 1024))
    ip_address_size = int(os.environ.get('YARL_IP_ADDRESS_CACHE', 512))
    host_validate_size = int(os.environ.get('YARL_HOST_VALIDATE_CACHE', 512))
    encode_host_size = int(os.environ.get('YARL_ENCODE_HOST_CACHE', 1024))

    cache_configure(
        idna_encode_size=idna_encode_size,
        idna_decode_size=idna_decode_size,
        ip_address_size=ip_address_size,
        host_validate_size=host_validate_size,
        encode_host_size=encode_host_size,
    )

    # Log configuration
    stats = cache_info()
    print("YARL cache configuration:")
    for cache_name, info in stats.items():
        print(f" {cache_name}: maxsize={info.maxsize}")


# Call during application startup
configure_production_cache()

# Tip: use cache_info() to monitor hit ratios and adjust cache sizes accordingly.
# Install with the Tessl CLI:
npx tessl i tessl/pypi-yarl