tessl/pypi-yt-dlp

A feature-rich command-line audio/video downloader forked from youtube-dl

Overview

Eval results

Files

Utility Functions

Name: tessl/pypi-yt-dlp
Author: tessl

Comprehensive utility functions for file handling, data parsing, URL processing, format conversion, and template processing commonly needed when working with media downloads and extraction operations.

Capabilities

File and Path Operations

Functions for sanitizing filenames, handling paths, and managing file system operations.

def sanitize_filename(s, restricted=False, is_id=False):
    """
    Produce a file-system-safe filename by dropping or substituting invalid characters.

    Parameters:
    - s: str, the raw filename
    - restricted: bool, limit the result to ASCII characters
    - is_id: bool, the input is a video ID, so sanitization is more permissive

    Returns:
    str: the sanitized filename
    """

def sanitize_path(s, force=False):
    """
    Clean up the individual components of a file path.

    Parameters:
    - s: str, the path to clean
    - force: bool, sanitize even when the path already exists

    Returns:
    str: the sanitized path
    """

def expand_path(s):
    """
    Resolve a leading ~ (user home) and any environment variables in a path.

    Parameters:
    - s: str, the path to expand

    Returns:
    str: the path after expansion
    """

def shell_quote(args, *, shell_quote_wrapper=None):
    """
    Quote one or more arguments so they can be passed safely to a shell.

    Parameters:
    - args: str|list[str], the argument(s) to quote
    - shell_quote_wrapper: callable|None, optional custom quoting function (keyword-only)

    Returns:
    str: a single string containing the quoted arguments
    """

Data Parsing and Conversion

Functions for parsing various data formats and safely converting between types.

def parse_duration(s):
    """
    Parse a duration string into a number of seconds.

    Supports formats like '1:23:45', '1h23m45s', '3600', and fractional
    values such as '5003.7'.

    Parameters:
    - s: str, duration string

    Returns:
    float|None: duration in seconds (fractional seconds are preserved),
    None if parsing fails
    """

def parse_bytes(s):
    """
    Parse a byte-quantity string into an integer.

    Accepts a number with an optional single-letter binary suffix
    (powers of 1024), e.g. '1.5G', '500M', '1024K', or a bare '2048'.
    Two-letter units such as 'MB' are not accepted here; use
    parse_filesize for those.

    Parameters:
    - s: str, byte size string

    Returns:
    int|None: size in bytes, None if parsing fails
    """

def parse_filesize(s):
    """
    Convert a textual file size into a byte count.

    Parameters:
    - s: str, the file size string

    Returns:
    int|None: the size in bytes, or None when the string cannot be parsed
    """

def parse_resolution(s, *, lenient=False):
    """
    Convert a resolution string into a (width, height) pair.

    Parameters:
    - s: str, a resolution string such as '1920x1080'
    - lenient: bool, enable lenient parsing (keyword-only)

    Returns:
    tuple[int, int]|None: (width, height), or None when the string cannot be parsed
    """

def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
    """
    Convert a value to an integer with optional scaling, without raising on failure.

    Parameters:
    - v: Any, the value to convert
    - scale: int, scaling factor
    - default: Any, returned when the conversion fails
    - get_attr: str|None, name of an attribute to read from v
    - invscale: int, inverse scaling factor

    Returns:
    int|Any: the converted integer, or default
    """

def float_or_none(v, scale=1, invscale=1, default=None):
    """
    Convert a value to a float with optional scaling, without raising on failure.

    Parameters:
    - v: Any, the value to convert
    - scale: float, scaling factor
    - invscale: float, inverse scaling factor
    - default: Any, returned when the conversion fails

    Returns:
    float|Any: the converted float, or default
    """

def str_or_none(v, default=None):
    """
    Convert a value to a string, without raising on failure.

    Parameters:
    - v: Any, the value to convert
    - default: Any, returned when the conversion fails

    Returns:
    str|Any: the converted string, or default
    """

Date and Time Utilities

Functions for parsing, formatting, and manipulating dates and timestamps.

def unified_strdate(date_str, day_first=True):
    """
    Normalize a date string to the unified YYYYMMDD form.

    Parameters:
    - date_str: str, a date written in any of various formats
    - day_first: bool, in ambiguous cases read the day before the month

    Returns:
    str|None: the date as YYYYMMDD, or None when the string cannot be parsed
    """

def unified_timestamp(date_str, day_first=True):
    """
    Convert a date string into a Unix timestamp.

    Parameters:
    - date_str: str, a date written in any of various formats
    - day_first: bool, in ambiguous cases read the day before the month

    Returns:
    int|None: the Unix timestamp, or None when the string cannot be parsed
    """

def formatSeconds(secs, delim=':'):
    """
    Render a number of seconds as a duration string.

    Parameters:
    - secs: int|float, the number of seconds
    - delim: str, separator placed between the time components

    Returns:
    str: the formatted duration, e.g. '1:23:45'
    """

HTML and Web Processing

Functions for processing HTML content and extracting information from web pages.

def clean_html(html):
    """
    Strip HTML tags from content and decode its entities.

    Parameters:
    - html: str, the HTML content

    Returns:
    str: the remaining plain text
    """

def unescapeHTML(s):
    """
    Replace HTML entities in a string with the characters they stand for.

    Parameters:
    - s: str, text containing HTML entities

    Returns:
    str: the decoded text
    """

def extract_attributes(html_element):
    """
    Read the attributes of an HTML element given as a string.

    Parameters:
    - html_element: str, the HTML element

    Returns:
    dict[str, str]: mapping of attribute names to their values
    """

def get_element_by_id(id, html, **kwargs):
    """
    Find the HTML element that carries a given ID.

    Parameters:
    - id: str, the element ID to look for
    - html: str, the HTML content to search in
    - **kwargs: additional options

    Returns:
    str|None: the element's content, or None when no such element exists
    """

Network and URL Utilities

Functions for processing URLs, handling network operations, and managing web requests.

def sanitize_url(url, *, scheme='http'):
    """
    Tidy up a URL.

    Parameters:
    - url: str, the URL to sanitize
    - scheme: str, scheme to assume when the URL has none (keyword-only)

    Returns:
    str: the sanitized URL
    """

def url_basename(url):
    """
    Return the basename (filename part) of a URL.

    Parameters:
    - url: str, the URL to take the basename from

    Returns:
    str: the URL's basename
    """

def urljoin(base, path):
    """
    Combine a base URL and a path into one URL.

    Parameters:
    - base: str, the base URL
    - path: str, the path to append or resolve

    Returns:
    str: the combined URL
    """

def smuggle_url(url, data):
    """
    Embed extra data in a URL so it can be passed along internally.

    Parameters:
    - url: str, the base URL
    - data: dict, the data to embed

    Returns:
    str: the URL carrying the smuggled data
    """

def unsmuggle_url(smug_url, default=None):
    """
    Recover the data that was smuggled into a URL.

    Parameters:
    - smug_url: str, a URL carrying smuggled data
    - default: Any, value to use when the URL carries no data

    Returns:
    tuple[str, Any]: (clean_url, extracted_data)
    """

Format and Output Utilities

Functions for formatting data for display, writing to output streams, and safely traversing nested data structures.

def format_bytes(bytes):
    """
    Render a byte count in human-readable form.

    Parameters:
    - bytes: int, the number of bytes

    Returns:
    str: the formatted size, e.g. '1.5 GB'
    """

def render_table(headers, rows, delim=' ', extra_gap=0, hide_empty=False):
    """
    Build a formatted table as a string.

    Parameters:
    - headers: list[str], the column headers
    - rows: list[list[str]], the table rows
    - delim: str, column delimiter
    - extra_gap: int, additional spacing between columns
    - hide_empty: bool, omit columns that are empty

    Returns:
    str: the rendered table
    """

def write_string(s, out=None, encoding=None):
    """
    Write a string to an output stream using a suitable encoding.

    Parameters:
    - s: str, the text to write
    - out: file-like|None, the target stream (default: stdout)
    - encoding: str|None, the encoding to use
    """

def traverse_obj(obj, *paths, **kwargs):
    """
    Walk a nested object safely, trying one or more alternative paths.

    Parameters:
    - obj: Any, the object to walk
    - *paths: path specifications (strings, tuples, callables)
    - **kwargs: options like 'default', 'expected_type', etc.

    Returns:
    Any: the value found at the path, or the default
    """

Template and String Processing

Classes and functions for sorting formats by preference and for building match filters from filter expressions.

class FormatSorter:
    """
    Sorts formats according to customizable criteria.

    Format selection can take quality, codec preferences, file size,
    and other criteria into account.
    """

    def __init__(self, *args, **kwargs):
        """Set up the sorter with its sorting criteria."""

    def evaluate(self, format_list):
        """
        Order a list of formats by the configured criteria.

        Parameters:
        - format_list: list[dict], the formats to sort

        Returns:
        list[dict]: the formats in sorted order
        """

def match_filter_func(filters, breaking_filters=None):
    """
    Create match filter function from filter expressions.

    Parameters:
    - filters: list[str], filter expressions
    - breaking_filters: list[str]|None, breaking filter expressions;
      optional, may be omitted when there are none

    Returns:
    callable: filter function
    """

Usage Examples

Filename Sanitization

from yt_dlp.utils import sanitize_filename

# Basic sanitization: characters that are not allowed in filenames are
# replaced with look-alike full-width Unicode characters.
unsafe_name = "My Video: Part 1 (2024) <HD>.mp4"
safe_name = sanitize_filename(unsafe_name)
print(f"Safe filename: {safe_name}")
# Output (recent yt-dlp releases; exact substitutes may vary by version):
# Safe filename: My Video： Part 1 (2024) ＜HD＞.mp4

# Restricted ASCII-only sanitization: spaces and special characters become "_"
restricted_name = sanitize_filename(unsafe_name, restricted=True)
print(f"Restricted filename: {restricted_name}")
# Output (recent yt-dlp releases; exact substitutes may vary by version):
# Restricted filename: My_Video_-_Part_1_2024_HD_.mp4

Duration Parsing

from yt_dlp.utils import parse_duration, formatSeconds

# Parse various duration formats
durations = ['1:23:45', '3600', '1h23m45s', '5003.7']
for duration_str in durations:
    seconds = parse_duration(duration_str)
    # Compare against None explicitly: a parsed duration of 0 is valid but falsy.
    formatted = formatSeconds(seconds) if seconds is not None else 'Invalid'
    print(f"{duration_str} -> {seconds}s -> {formatted}")

Data Size Parsing

from yt_dlp.utils import parse_bytes, format_bytes

# Parse byte quantities. parse_bytes takes single-letter binary suffixes
# (K, M, G, ...); strings with two-letter units such as '500MB' are meant
# for parse_filesize instead.
sizes = ['1.5G', '500M', '1024K', '2048']
for size_str in sizes:
    bytes_count = parse_bytes(size_str)
    # Compare against None explicitly: a size of 0 bytes is valid but falsy.
    formatted = format_bytes(bytes_count) if bytes_count is not None else 'Invalid'
    print(f"{size_str} -> {bytes_count} bytes -> {formatted}")

Date Processing

import datetime

from yt_dlp.utils import unified_strdate, unified_timestamp

# Parse dates
dates = ['2024-01-15', 'Jan 15, 2024', '15/01/2024']
for date_str in dates:
    unified = unified_strdate(date_str)
    timestamp = unified_timestamp(date_str)
    # Compare against None explicitly: timestamp 0 (the Unix epoch) is valid but falsy.
    if timestamp is not None:
        # Render in UTC so the output does not depend on the machine's local time zone.
        readable = datetime.datetime.fromtimestamp(
            timestamp, tz=datetime.timezone.utc,
        ).strftime('%Y-%m-%d %H:%M:%S')
        print(f"{date_str} -> {unified} -> {timestamp} -> {readable}")

HTML Processing

from yt_dlp.utils import clean_html, unescapeHTML

# Step 1: decode the entities; step 2: strip the tags from the decoded markup.
html_content = "&lt;p&gt;Video title with &amp;quot;quotes&amp;quot;&lt;/p&gt;"
decoded = unescapeHTML(html_content)
clean_text = clean_html(decoded)

# Print each stage with its label.
for label, value in (("Original", html_content), ("Decoded", decoded), ("Clean", clean_text)):
    print(f"{label}: {value}")

Safe Object Traversal

from yt_dlp.utils import traverse_obj

# Complex nested data, assembled piece by piece
metadata = {'title': 'Example Video', 'stats': {'views': 1000000}}
formats = [
    {'quality': 'high', 'url': 'https://example.com/high.mp4'},
    {'quality': 'low', 'url': 'https://example.com/low.mp4'},
]
data = {'video': {'metadata': metadata, 'formats': formats}}

# Each lookup names the full path; a missing key yields the default instead of raising
title = traverse_obj(data, ('video', 'metadata', 'title'))
views = traverse_obj(data, ('video', 'metadata', 'stats', 'views'))
first_url = traverse_obj(data, ('video', 'formats', 0, 'url'))
missing = traverse_obj(data, ('video', 'missing', 'field'), default='Not found')

for label, value in (
    ('Title', title),
    ('Views', views),
    ('First URL', first_url),
    ('Missing field', missing),
):
    print(f"{label}: {value}")

Table Formatting

from yt_dlp.utils import render_table

# One header row followed by one row per format
headers = ['Format', 'Quality', 'Size', 'Codec']
rows = [
    line.split()
    for line in (
        'mp4 1080p 500MB h264',
        'webm 720p 300MB vp9',
        'mp4 480p 150MB h264',
    )
]

table = render_table(headers, rows, delim=' | ', extra_gap=1)
print(table)

Types

# Date range class for filtering by date
class DateRange:
    """Range of dates, used for filtering by date."""

    def __init__(self, start=None, end=None): ...

    # Alternate constructor, so it must be a classmethod: without the
    # decorator, DateRange.day(some_day) would bind some_day to `cls`
    # and fail with a missing-argument TypeError.
    @classmethod
    def day(cls, day): ...  # Create single-day range
    
# Class for managing configuration
class Config:
    """Configuration management class."""

    def __init__(self):
        """Create the configuration object; takes no arguments."""
    
# Class for sorting formats by preference
class FormatSorter:
    """Format sorting and preference class."""

    def __init__(self, extractor, field_preference=None):
        """Create a sorter from an extractor and an optional field preference."""
    
# Namespace class for configuration values
class Namespace:
    """Configuration namespace class."""

    def __init__(self, **kwargs):
        """Create a namespace from keyword arguments."""

# Memory-efficient lazy list
class LazyList:
    """Lazy list implementation for memory efficiency."""

    def __init__(self, iterable):
        """Wrap the given iterable."""

# List that handles large datasets page by page
class PagedList:
    """Paged list for handling large datasets."""

    def __init__(self, pagefunc, pagesize):
        """Create a paged list from a page function and a page size."""

# Parser for playlist entries
class PlaylistEntries:
    """Playlist entry parser."""

    @staticmethod
    def parse_playlist_items(spec):
        """Parse a playlist item specification (static; no instance required)."""

# Utilities for geographic data
class GeoUtils:
    """Geographic utilities."""

    @staticmethod
    def random_ipv4(code):
        """Static helper taking a code.

        NOTE(review): presumably returns a random IPv4 address for the given
        country code - confirm against the yt-dlp source.
        """

# Utilities for ISO country codes
class ISO3166Utils:
    """ISO country code utilities."""

    @staticmethod
    def short2full(code):
        """Static helper taking a code.

        NOTE(review): presumably maps a short country code to the full
        country name - confirm against the yt-dlp source.
        """

# Sentinel object for "no default value". It is a unique object() instance,
# so check for it by identity (`value is NO_DEFAULT`); this keeps it distinct
# from None, which can then be passed as an ordinary value.
NO_DEFAULT = object()

Install with Tessl CLI