Command-line program to download videos from YouTube.com and other video sites
Information extractors are site-specific modules that handle video metadata extraction from over 1000 supported sites. Each extractor understands the specific URL patterns, API interfaces, and data structures for its target site.
Functions for discovering, listing, and managing available extractors.
def gen_extractors():
"""
Return a list of instances of every supported extractor.
The order matters; the first extractor matched handles the URL.
Returns:
list: List of extractor instances
"""
def gen_extractor_classes():
"""
Return a list of supported extractor classes.
The order matters; the first extractor matched handles the URL.
Returns:
list: List of extractor classes
"""
def list_extractors(age_limit):
"""
Return a list of extractors suitable for the given age limit,
sorted by extractor ID.
Parameters:
- age_limit (int): Age limit for content filtering
Returns:
list: List of suitable extractor instances
"""
def get_info_extractor(ie_name):
"""
Returns the info extractor class with the given name.
Parameters:
- ie_name (str): Extractor name (without 'IE' suffix)
Returns:
class: Extractor class
"""

Base class that all site-specific extractors inherit from, providing common functionality and interfaces.
class InfoExtractor:
def __init__(self, downloader=None):
"""
Base class for information extractors.
Parameters:
- downloader: YoutubeDL instance
"""
def suitable(self, url):
"""
Check if the extractor is suitable for the given URL.
Parameters:
- url (str): URL to check
Returns:
bool: True if suitable, False otherwise
"""
def extract(self, url):
"""
Extract information from the given URL.
Parameters:
- url (str): URL to extract from
Returns:
dict: Extracted information dictionary
"""
def _real_extract(self, url):
"""
Actual extraction logic (implemented by subclasses).
Parameters:
- url (str): URL to extract from
Returns:
dict: Extracted information dictionary
"""

Utility methods available to all extractors for common operations.
def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
"""
Download webpage content.
Parameters:
- url_or_request: URL string or Request object
- video_id (str): Video identifier for error reporting
- note (str): Progress note to display
- errnote (str): Error note for failures
- fatal (bool): Whether to raise error on failure
- tries (int): Number of retry attempts
- timeout (float): Request timeout
- encoding (str): Character encoding
- data: POST data
- headers (dict): HTTP headers
- query (dict): URL query parameters
Returns:
str: Webpage content
"""
def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
"""
Download and parse JSON data.
Parameters:
- url_or_request: URL string or Request object
- video_id (str): Video identifier
- note (str): Progress note
- errnote (str): Error note
- transform_source (callable): Function to transform JSON source
- fatal (bool): Whether to raise error on failure
- encoding (str): Character encoding
- data: POST data
- headers (dict): HTTP headers
- query (dict): URL query parameters
Returns:
dict: Parsed JSON data
"""
def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0, group=None):
"""
Search for regex pattern in HTML string.
Parameters:
- pattern (str): Regex pattern
- string (str): HTML string to search
- name (str): Description for error messages
- default: Default value if not found
- fatal (bool): Whether to raise error if not found
- flags (int): Regex flags
- group (int/str): Capture group to return
Returns:
str: Matched text
"""

Standard format for information returned by extractors.
InfoDict = {
'id': str, # Video identifier
'title': str, # Video title
'url': str, # Video URL (for single videos)
'ext': str, # File extension
'format': str, # Format description
'format_id': str, # Format identifier
'uploader': str, # Video uploader name
'uploader_id': str, # Uploader identifier
'uploader_url': str, # Uploader profile URL
'upload_date': str, # Upload date (YYYYMMDD format)
'timestamp': int, # Upload timestamp (Unix)
'duration': int, # Duration in seconds
'view_count': int, # View count
'like_count': int, # Like count
'dislike_count': int, # Dislike count
'description': str, # Video description
'tags': list, # List of tags
'thumbnail': str, # Thumbnail URL
'thumbnails': list, # List of thumbnail dictionaries
'subtitles': dict, # Subtitle tracks
'automatic_captions': dict, # Auto-generated captions
'formats': list, # List of available formats
'playlist': str, # Playlist title (for playlist entries)
'playlist_id': str, # Playlist identifier
'playlist_index': int, # Position in playlist
'webpage_url': str, # Original webpage URL
'webpage_url_basename': str, # Basename of webpage URL
'extractor': str, # Extractor name
'extractor_key': str, # Extractor key
}

Structure for individual video/audio format information.
FormatDict = {
'format_id': str, # Unique format identifier
'url': str, # Direct media URL
'ext': str, # File extension
'width': int, # Video width
'height': int, # Video height
'resolution': str, # Resolution string
'fps': float, # Frames per second
'vcodec': str, # Video codec
'vbr': float, # Video bitrate
'acodec': str, # Audio codec
'abr': float, # Audio bitrate
'asr': int, # Audio sample rate
'filesize': int, # File size in bytes
'tbr': float, # Total bitrate
'protocol': str, # Download protocol
'preference': int, # Format preference (-1 to 100)
'quality': int, # Quality metric
'format_note': str, # Additional format info
'language': str, # Language code
'http_headers': dict, # Required HTTP headers
}

youtube-dl includes extractors for over 1000 sites. Some notable ones include:
from youtube_dl import list_extractors
# Get all extractors
extractors = list_extractors(age_limit=18)
for extractor in extractors:
    print(f"{extractor.IE_NAME}: {extractor.IE_DESC}")

from youtube_dl.extractor import get_info_extractor
# Get YouTube extractor class
YoutubeIE = get_info_extractor('Youtube')
extractor = YoutubeIE()

from youtube_dl import YoutubeDL
ydl_opts = {'quiet': True}
with YoutubeDL(ydl_opts) as ydl:
    info = ydl.extract_info('https://www.youtube.com/watch?v=dQw4w9WgXcQ', download=False)
    print(f"Title: {info['title']}")
    print(f"Duration: {info['duration']} seconds")
    print(f"Uploader: {info['uploader']}")
    # List available formats
    for fmt in info['formats']:
        print(f"Format: {fmt['format_id']} - {fmt['ext']} - {fmt.get('height', 'audio')}p")

Install with Tessl CLI
npx tessl i tessl/pypi-youtube-dl