A feature-rich command-line audio/video downloader forked from youtube-dl
Comprehensive utility functions for file handling, data parsing, URL processing, format conversion, and template processing commonly needed when working with media downloads and extraction operations.
Functions for sanitizing filenames, handling paths, and managing file system operations.
def sanitize_filename(s, restricted=False, is_id=False):
"""
Sanitize filename by removing/replacing invalid characters.
Parameters:
- s: str, filename to sanitize
- restricted: bool, use ASCII-only characters
- is_id: bool, treat as video ID (more permissive)
Returns:
str: sanitized filename safe for file system
"""
def sanitize_path(s, force=False):
"""
Sanitize file path by cleaning path components.
Parameters:
- s: str, path to sanitize
- force: bool, apply the sanitization even on non-Windows platforms (by default the path is only rewritten on Windows)
Returns:
str: sanitized path
"""
def expand_path(s):
"""
Expand user path with ~ notation and environment variables.
Parameters:
- s: str, path to expand
Returns:
str: expanded path
"""
def shell_quote(args, *, shell_quote_wrapper=None):
"""
Quote arguments for safe shell execution.
Parameters:
- args: str|list[str], arguments to quote
- shell_quote_wrapper: callable|None, custom quoting function
Returns:
str: quoted arguments string
"""

Functions for parsing various data formats and safely converting between types.
def parse_duration(s):
"""
Parse duration string to seconds.
Supports formats like '1:23:45', '1h23m45s', '3600', etc.
Parameters:
- s: str, duration string
Returns:
float|None: duration in seconds (fractional values such as '5003.7' are preserved), None if parsing fails
"""
def parse_bytes(s):
"""
Parse byte size string to integer.
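Supports a number with an optional single-letter binary suffix, like '1.5G', '500M', '1024K', '2048'.
Strings with full unit names such as '1.5GB' or '500MiB' are not accepted here; use parse_filesize for those.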
Parameters:
- s: str, byte size string
Returns:
int|None: size in bytes, None if parsing fails
"""
def parse_filesize(s):
"""
Parse file size string to integer bytes.
Parameters:
- s: str, file size string
Returns:
int|None: size in bytes, None if parsing fails
"""
def parse_resolution(s, *, lenient=False):
"""
Parse resolution string to a dict of dimensions.
Parameters:
- s: str, resolution string like '1920x1080'
- lenient: bool, allow lenient parsing
Returns:
dict: {'width': int, 'height': int} for 'WxH' strings, {'height': int} for strings like '720p', or an empty dict if parsing fails
"""
def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1):
"""
Safe integer conversion with scaling.
Parameters:
- v: Any, value to convert
- scale: int, scaling factor
- default: Any, default if conversion fails
- get_attr: str|None, attribute to get from v
- invscale: int, inverse scaling factor
Returns:
int|Any: converted integer or default
"""
def float_or_none(v, scale=1, invscale=1, default=None):
"""
Safe float conversion with scaling.
Parameters:
- v: Any, value to convert
- scale: float, scaling factor
- invscale: float, inverse scaling factor
- default: Any, default if conversion fails
Returns:
float|Any: converted float or default
"""
def str_or_none(v, default=None):
"""
Safe string conversion.
Parameters:
- v: Any, value to convert
- default: Any, default if conversion fails
Returns:
str|Any: converted string or default
"""

Functions for parsing, formatting, and manipulating dates and timestamps.
def unified_strdate(date_str, day_first=True):
"""
Parse date string to unified YYYYMMDD format.
Parameters:
- date_str: str, date string in various formats
- day_first: bool, assume day comes before month in ambiguous cases
Returns:
str|None: date in YYYYMMDD format, None if parsing fails
"""
def unified_timestamp(date_str, day_first=True):
"""
Parse date string to Unix timestamp.
Parameters:
- date_str: str, date string in various formats
- day_first: bool, assume day comes before month in ambiguous cases
Returns:
int|None: Unix timestamp, None if parsing fails
"""
def formatSeconds(secs, delim=':'):
"""
Format seconds as duration string.
Parameters:
- secs: int|float, seconds to format
- delim: str, delimiter between time components
Returns:
str: formatted duration (e.g., '1:23:45')
"""

Functions for processing HTML content and extracting information from web pages.
def clean_html(html):
"""
Remove HTML tags and decode entities.
Parameters:
- html: str, HTML content to clean
Returns:
str: cleaned text content
"""
def unescapeHTML(s):
"""
Decode HTML entities in string.
Parameters:
- s: str, string with HTML entities
Returns:
str: decoded string
"""
def extract_attributes(html_element):
"""
Extract attributes from HTML element string.
Parameters:
- html_element: str, HTML element as string
Returns:
dict[str, str]: attribute name-value pairs
"""
def get_element_by_id(id, html, **kwargs):
"""
Extract HTML element by ID.
Parameters:
- id: str, element ID to find
- html: str, HTML content to search
- **kwargs: additional options
Returns:
str|None: element content or None if not found
"""

Functions for processing URLs, handling network operations, and managing web requests.
def sanitize_url(url, *, scheme='http'):
"""
Clean and sanitize URL.
Parameters:
- url: str, URL to sanitize
- scheme: str, default scheme if missing
Returns:
str: sanitized URL
"""
def url_basename(url):
"""
Get basename (filename) from URL.
Parameters:
- url: str, URL to extract basename from
Returns:
str: basename of URL
"""
def urljoin(base, path):
"""
Join base URL with path.
Parameters:
- base: str, base URL
- path: str, path to join
Returns:
str: joined URL
"""
def smuggle_url(url, data):
"""
Encode data into URL for internal passing.
Parameters:
- url: str, base URL
- data: dict, data to encode
Returns:
str: URL with smuggled data
"""
def unsmuggle_url(smug_url, default=None):
"""
Extract smuggled data from URL.
Parameters:
- smug_url: str, URL with smuggled data
- default: Any, default if no data found
Returns:
tuple[str, Any]: (clean_url, extracted_data)
"""

Functions for formatting data for display and managing output streams.
def format_bytes(bytes):
"""
Format byte count for human-readable display.
Parameters:
- bytes: int, byte count
Returns:
str: formatted byte string using binary (1024-based) units (e.g., '1.50GiB')
"""
def render_table(headers, rows, delim=' ', extra_gap=0, hide_empty=False):
"""
Create formatted table string.
Parameters:
- headers: list[str], column headers
- rows: list[list[str]], table rows
- delim: str, column delimiter
- extra_gap: int, extra spacing between columns
- hide_empty: bool, hide empty columns
Returns:
str: formatted table
"""
def write_string(s, out=None, encoding=None):
"""
Write string to output stream with proper encoding.
Parameters:
- s: str, string to write
- out: file-like|None, output stream (default: stdout)
- encoding: str|None, encoding to use
"""
def traverse_obj(obj, *paths, **kwargs):
"""
Safely navigate nested objects with multiple path options.
Parameters:
- obj: Any, object to traverse
- *paths: path specifications (strings, tuples, callables)
- **kwargs: options like 'default', 'expected_type', etc.
Returns:
Any: value at path or default
"""

Functions for processing output templates and manipulating strings.
class FormatSorter:
"""
Advanced format sorting with customizable criteria.
Provides sophisticated format selection based on quality,
codec preferences, file size, and other criteria.
"""
def __init__(self, *args, **kwargs):
"""Initialize format sorter with criteria."""
def evaluate(self, format_list):
"""
Sort formats according to criteria.
Parameters:
- format_list: list[dict], formats to sort
Returns:
list[dict]: sorted formats
"""
def match_filter_func(filters, breaking_filters):
"""
Create match filter function from filter expressions.
Parameters:
- filters: list[str], filter expressions
- breaking_filters: list[str], breaking filter expressions
Returns:
callable: filter function
"""

from yt_dlp.utils import sanitize_filename
# Basic sanitization
unsafe_name = "My Video: Part 1 (2024) <HD>.mp4"
safe_name = sanitize_filename(unsafe_name)
print(f"Safe filename: {safe_name}")
# Output: My Video： Part 1 (2024) ＜HD＞.mp4  (':', '<' and '>' become their full-width forms; '⧸' and '⧹' are used only for '/' and '\')
# Restricted ASCII-only sanitization
restricted_name = sanitize_filename(unsafe_name, restricted=True)
print(f"Restricted filename: {restricted_name}")
# Output: My_Video_Part_1_2024_HD.mp4

from yt_dlp.utils import parse_duration, formatSeconds
# Parse various duration formats
durations = ['1:23:45', '3600', '1h23m45s', '5003.7']
for duration_str in durations:
seconds = parse_duration(duration_str)
formatted = formatSeconds(seconds) if seconds else 'Invalid'
print(f"{duration_str} -> {seconds}s -> {formatted}")

from yt_dlp.utils import parse_bytes, format_bytes
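# Parse file sizes
# NOTE: parse_bytes only accepts single-letter suffixes ('1.5G', '500M', '1024K');
# inputs written as '1.5GB' / '500MB' / '1024KB' yield None here - use parse_filesize for those.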
sizes = ['1.5GB', '500MB', '1024KB', '2048']
for size_str in sizes:
bytes_count = parse_bytes(size_str)
formatted = format_bytes(bytes_count) if bytes_count else 'Invalid'
print(f"{size_str} -> {bytes_count} bytes -> {formatted}")

from yt_dlp.utils import unified_strdate, unified_timestamp
import datetime
# Parse dates
dates = ['2024-01-15', 'Jan 15, 2024', '15/01/2024']
for date_str in dates:
unified = unified_strdate(date_str)
timestamp = unified_timestamp(date_str)
if timestamp:
readable = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
print(f"{date_str} -> {unified} -> {timestamp} -> {readable}")

from yt_dlp.utils import clean_html, unescapeHTML
html_content = "<p>Video title with &quot;quotes&quot;</p>"
decoded = unescapeHTML(html_content)
clean_text = clean_html(decoded)
print(f"Original: {html_content}")
print(f"Decoded: {decoded}")
print(f"Clean: {clean_text}")

from yt_dlp.utils import traverse_obj
# Complex nested data
data = {
'video': {
'metadata': {
'title': 'Example Video',
'stats': {'views': 1000000}
},
'formats': [
{'quality': 'high', 'url': 'https://example.com/high.mp4'},
{'quality': 'low', 'url': 'https://example.com/low.mp4'}
]
}
}
# Safely extract nested values
title = traverse_obj(data, ('video', 'metadata', 'title'))
views = traverse_obj(data, ('video', 'metadata', 'stats', 'views'))
first_url = traverse_obj(data, ('video', 'formats', 0, 'url'))
missing = traverse_obj(data, ('video', 'missing', 'field'), default='Not found')
print(f"Title: {title}")
print(f"Views: {views}")
print(f"First URL: {first_url}")
print(f"Missing field: {missing}")

from yt_dlp.utils import render_table
headers = ['Format', 'Quality', 'Size', 'Codec']
rows = [
['mp4', '1080p', '500MB', 'h264'],
['webm', '720p', '300MB', 'vp9'],
['mp4', '480p', '150MB', 'h264'],
]
table = render_table(headers, rows, delim=' | ', extra_gap=1)
print(table)

# Date range class for filtering by date
class DateRange:
def __init__(self, start=None, end=None): ...
def day(cls, day): ... # Create single-day range
# Configuration management class
class Config:
def __init__(self): ...
# Format sorting and preference class
class FormatSorter:
def __init__(self, extractor, field_preference=None): ...
# Configuration namespace class
class Namespace:
def __init__(self, **kwargs): ...
# Lazy list implementation for memory efficiency
class LazyList:
def __init__(self, iterable): ...
# Paged list for handling large datasets
class PagedList:
def __init__(self, pagefunc, pagesize): ...
# Playlist entry parser
class PlaylistEntries:
@staticmethod
def parse_playlist_items(spec): ...
# Geographic utilities
class GeoUtils:
@staticmethod
def random_ipv4(code): ...
# ISO country code utilities
class ISO3166Utils:
@staticmethod
def short2full(code): ...
# Sentinel object for no default value
NO_DEFAULT = object()

Install with Tessl CLI
npx tessl i tessl/pypi-yt-dlp