
tessl/pypi-yarl

Yet another URL library - comprehensive URL parsing and manipulation for Python

Cache Management

Performance optimization through configurable LRU caching for encoding/decoding operations. YARL uses caching to improve performance when processing many URLs, especially those with international domain names or complex encoding requirements.

Cache Types

YARL maintains separate LRU caches for different operations to optimize performance:

  • IDNA Encoding: International domain name encoding (Unicode to ASCII)
  • IDNA Decoding: International domain name decoding (ASCII to Unicode)
  • Host Encoding: Complete host encoding operations

Deprecated Cache Types (kept for backwards compatibility):

  • IP Address Validation: No longer used, functionality moved to host encoding
  • Host Validation: No longer used, functionality moved to host encoding
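Because the deprecated entries still appear in cache statistics for backwards compatibility, monitoring code may want to filter them out. A minimal sketch (the helper name and the plain-dict stand-in are hypothetical; the cache names match the `CacheInfo` keys documented on this page):

```python
# Cache names as reported by cache_info(); the deprecated entries
# still show up in the statistics for backwards compatibility.
ACTIVE = {"idna_encode", "idna_decode", "encode_host"}
DEPRECATED = {"ip_address", "host_validate"}

def active_stats(info: dict) -> dict:
    """Filter a cache_info()-style mapping down to the caches still in use."""
    return {name: stats for name, stats in info.items() if name in ACTIVE}
```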

Capabilities

Cache Configuration

Configure cache sizes for different operations to balance memory usage and performance.

def cache_configure(*,
                   idna_encode_size: int | None = 256,
                   idna_decode_size: int | None = 256,
                   ip_address_size: int | None = None,
                   host_validate_size: int | None = None,
                   encode_host_size: int | None = None) -> None:
    """
    Configure LRU cache sizes for URL processing operations.
    
    Args:
        idna_encode_size (int | None): Cache size for IDNA encoding operations
        idna_decode_size (int | None): Cache size for IDNA decoding operations
        ip_address_size (int | None): DEPRECATED - kept for backwards compatibility
        host_validate_size (int | None): DEPRECATED - kept for backwards compatibility
        encode_host_size (int | None): Cache size for host encoding operations
    
    Note:
        ip_address_size and host_validate_size are deprecated and will be removed
        in future versions. They are kept for backwards compatibility only.
        
    Examples:
        # Increase cache sizes for high-volume applications
        # (the deprecated ip_address_size and host_validate_size
        # parameters are intentionally omitted)
        cache_configure(
            idna_encode_size=1024,
            idna_decode_size=1024,
            encode_host_size=1024
        )
        
        # Reduce memory usage for memory-constrained environments
        cache_configure(
            idna_encode_size=64,
            idna_decode_size=64,
            encode_host_size=64
        )
    """

Cache Information

Retrieve statistics about cache performance to monitor effectiveness and tune cache sizes.

def cache_info() -> CacheInfo:
    """
    Get cache statistics for all URL processing caches.
    
    Returns:
        CacheInfo: Dictionary containing cache statistics for each operation
        
    Examples:
        info = cache_info()
        print(f"IDNA encode hits: {info['idna_encode'].hits}")
        print(f"IDNA encode misses: {info['idna_encode'].misses}")
        print(f"Cache hit ratio: {info['idna_encode'].hits / (info['idna_encode'].hits + info['idna_encode'].misses)}")
    """

class CacheInfo(TypedDict):
    """
    Cache information structure containing statistics for each cache type.
    
    Each cache entry contains standard functools.lru_cache statistics:
    - hits: Number of cache hits
    - misses: Number of cache misses  
    - maxsize: Maximum cache size
    - currsize: Current number of cached items
    """
    idna_encode: _CacheInfo
    idna_decode: _CacheInfo
    ip_address: _CacheInfo
    host_validate: _CacheInfo
    encode_host: _CacheInfo
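Each entry is the same named tuple that `functools.lru_cache` returns from its own `cache_info()` method, so the four fields behave exactly as they do for any `lru_cache`-decorated function. A stdlib-only illustration:

```python
from functools import lru_cache

@lru_cache(maxsize=4)
def square(x: int) -> int:
    return x * x

square(2)   # miss
square(2)   # hit
square(3)   # miss

# Same named tuple shape as the entries yarl's cache_info() returns
stats = square.cache_info()
print(stats.hits, stats.misses, stats.maxsize, stats.currsize)  # 1 2 4 2
```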

Cache Management

Clear caches to free memory or reset performance counters.

def cache_clear() -> None:
    """
    Clear all URL processing caches.
    
    Removes all cached entries and resets performance counters.
    Useful for freeing memory or starting fresh performance measurements.
    
    Examples:
        # Clear caches after processing a large batch
        cache_clear()
        
        # Reset before performance testing
        cache_clear()
        process_urls(test_urls)
        stats = cache_info()
    """

Usage Examples

Basic Cache Management

from yarl import URL, cache_info, cache_clear, cache_configure

# Check initial cache state
initial_info = cache_info()
print("Initial cache state:")
for cache_name, stats in initial_info.items():
    print(f"  {cache_name}: {stats.hits} hits, {stats.misses} misses")

# Process some URLs with international domains
urls = [
    URL('https://café.example.com/path'),
    URL('https://münchen.de/info'),
    URL('https://москва.рф/news'),
    URL('https://日本.jp/page')
]

# Check cache statistics after processing
final_info = cache_info()
print("\nAfter processing international domains:")
for cache_name, stats in final_info.items():
    if stats.hits > 0 or stats.misses > 0:
        hit_ratio = stats.hits / (stats.hits + stats.misses) if (stats.hits + stats.misses) > 0 else 0
        print(f"  {cache_name}: {stats.hits} hits, {stats.misses} misses (hit ratio: {hit_ratio:.2%})")

Performance Optimization

from yarl import URL, cache_configure, cache_info, cache_clear

# Configure larger caches for high-volume application
cache_configure(
    idna_encode_size=2048,
    idna_decode_size=2048,
    encode_host_size=2048
)

def process_url_batch(urls):
    """Process a batch of URLs and return timing info."""
    import time
    
    start_time = time.perf_counter()  # monotonic, high-resolution timer
    processed_urls = [URL(url_str) for url_str in urls]
    end_time = time.perf_counter()
    
    return processed_urls, end_time - start_time

# Simulate processing batches of URLs
url_batch = [
    'https://café.example.com/api/v1/users',
    'https://münchen.de/api/search',
    'https://москва.рф/api/news',
] * 100  # Repeat for cache effectiveness

# First pass - populate caches
cache_clear()  # Start fresh
first_batch, first_time = process_url_batch(url_batch)
first_stats = cache_info()

# Second pass - should benefit from caching
second_batch, second_time = process_url_batch(url_batch)
second_stats = cache_info()

print(f"First pass: {first_time:.4f}s")
print(f"Second pass: {second_time:.4f}s")
print(f"Speedup: {first_time/second_time:.2f}x")

# Analyze cache effectiveness
for cache_name in first_stats:
    first = first_stats[cache_name]
    second = second_stats[cache_name]
    if second.hits > first.hits:
        print(f"{cache_name}: {second.hits - first.hits} additional hits in second pass")

Memory-Constrained Environments

from yarl import URL, cache_configure, cache_info

# Configure smaller caches for memory-constrained environments
cache_configure(
    idna_encode_size=32,
    idna_decode_size=32,
    encode_host_size=32
)

# Process URLs and monitor memory usage
urls_to_process = [
    'https://example.com/api/users',
    'https://test.org/data',
    'https://192.168.1.1:8080/status',
] * 50

processed = [URL(url) for url in urls_to_process]

# Check cache utilization
stats = cache_info()
for cache_name, info in stats.items():
    if info.currsize > 0 and info.maxsize:  # maxsize is None for unbounded caches
        utilization = info.currsize / info.maxsize
        print(f"{cache_name}: {info.currsize}/{info.maxsize} ({utilization:.1%} full)")

Cache Monitoring and Tuning

from yarl import URL, cache_info, cache_clear, cache_configure

def analyze_cache_performance(urls):
    """Analyze cache performance for a given set of URLs."""
    cache_clear()  # Start with empty caches
    
    # Process URLs twice to see caching benefit
    first_pass = [URL(url) for url in urls]
    first_stats = cache_info()
    
    second_pass = [URL(url) for url in urls]  
    second_stats = cache_info()
    
    print("Cache Performance Analysis:")
    print("-" * 50)
    
    for cache_name in first_stats:
        first = first_stats[cache_name]
        second = second_stats[cache_name]
        
        total_ops = second.hits + second.misses
        if total_ops > 0:
            hit_ratio = second.hits / total_ops
            cache_benefit = second.hits - first.hits
            
            print(f"{cache_name}:")
            print(f"  Total operations: {total_ops}")
            print(f"  Hit ratio: {hit_ratio:.1%}")
            print(f"  Cache benefit: {cache_benefit} hits saved")
            print(f"  Current size: {second.currsize}/{second.maxsize}")
            
            # Suggest cache size adjustments
            if hit_ratio < 0.5 and second.currsize == second.maxsize:
                print(f"  💡 Consider increasing {cache_name}_size")
            elif second.currsize < second.maxsize * 0.3:
                print(f"  💡 Consider decreasing {cache_name}_size")
            print()

# Test with various URL patterns
test_urls = [
    # International domains (benefit from IDNA caching)
    'https://café.example.com/api',
    'https://münchen.de/search', 
    'https://москва.рф/news',
    
    # IP address hosts (handled by the host encoding cache)
    'https://192.168.1.1:8080/status',
    'https://10.0.0.1/api/health',
    
    # Regular domains (also benefit from host encoding caching)
    'https://api.example.com/v1/users',
    'https://cdn.example.org/images/logo.png',
] * 20  # Repeat for meaningful cache statistics

analyze_cache_performance(test_urls)

Production Cache Configuration

from yarl import cache_configure, cache_info
import os

def configure_production_cache():
    """Configure caches based on environment and expected load."""
    
    # Get configuration from environment or use defaults
    idna_encode_size = int(os.environ.get('YARL_IDNA_ENCODE_CACHE', 1024))
    idna_decode_size = int(os.environ.get('YARL_IDNA_DECODE_CACHE', 1024))
    encode_host_size = int(os.environ.get('YARL_ENCODE_HOST_CACHE', 1024))
    
    # The deprecated ip_address_size and host_validate_size parameters
    # are omitted; those caches are no longer used.
    cache_configure(
        idna_encode_size=idna_encode_size,
        idna_decode_size=idna_decode_size,
        encode_host_size=encode_host_size
    )
    
    # Log configuration
    stats = cache_info()
    print("YARL cache configuration:")
    for cache_name, info in stats.items():
        print(f"  {cache_name}: maxsize={info.maxsize}")

# Call during application startup
configure_production_cache()

Performance Considerations

When Caching Helps Most

  • International Domain Names: URLs with non-ASCII characters benefit significantly from IDNA caching
  • Repeated URL Processing: Applications that process the same URLs multiple times
  • High-Volume Applications: Web servers, crawlers, or API clients processing many URLs
  • IP Address Heavy Workloads: Applications dealing with many IP-based URLs (served by the host encoding cache)

Cache Size Tuning Guidelines

  • Small Applications: Default sizes (256) are usually sufficient
  • High-Volume Applications: Increase to 1024-2048 for frequently accessed caches
  • Memory-Constrained: Reduce to 32-128 to minimize memory usage
  • Monitoring: Use cache_info() to monitor hit ratios and adjust accordingly

Optimal Hit Ratios

  • >80%: Excellent cache performance, consider current size appropriate
  • 50-80%: Good performance, monitor for opportunities to increase cache size
  • <50%: Poor cache performance, consider increasing cache size or investigating URL patterns
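For automated monitoring, the bands above can be captured in a small helper (the function name and band labels are illustrative, not part of yarl):

```python
def rate_hit_ratio(hits: int, misses: int) -> str:
    """Classify cache performance using the hit-ratio bands above."""
    total = hits + misses
    if total == 0:
        return "no data"
    ratio = hits / total
    if ratio > 0.80:
        return "excellent"   # current size is appropriate
    if ratio >= 0.50:
        return "good"        # monitor; a larger cache may help
    return "poor"            # increase the cache or inspect URL patterns
```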

Install with Tessl CLI

npx tessl i tessl/pypi-yarl
