tessl/pypi-pytube

Python library for downloading YouTube videos with comprehensive stream management and metadata extraction capabilities.

—

Pending

Overview

Eval results

Files

Caption and Subtitle Support

Name: tessl/pypi-pytube
Author: tessl

Extract caption tracks from YouTube videos and convert them to .srt format, with support for multiple languages and automatically generated subtitle tracks.

Capabilities

Caption Class

Represents an individual caption track with language-specific subtitle data and conversion capabilities.

class Caption:
    def __init__(self, caption_track: Dict):
        """
        Initialize a Caption object.
        
        Args:
            caption_track (dict): Caption track metadata dictionary
        """

Caption Properties

Access caption track information and content.

@property
def url(self) -> str:
    """Get the URL for downloading the caption track."""

@property
def name(self) -> str:
    """Get the human-readable name of the caption track (e.g., 'English', 'Spanish')."""

@property
def code(self) -> str:
    """Get the language code for the caption track (e.g., 'en', 'es', 'fr')."""

@property
def xml_captions(self) -> str:
    """Get the raw XML caption data from YouTube."""

@property
def json_captions(self) -> dict:
    """Get the parsed JSON caption data."""

Caption Conversion

Convert caption data between formats.

def generate_srt_captions(self) -> str:
    """
    Convert the caption track to SRT (SubRip) format.
    
    Returns:
        str: Caption content in SRT format with timestamps and text
    """

Caption Download

Download caption files with various format options.

def download(
    self,
    title: str,
    srt: bool = True,
    output_path: Optional[str] = None,
    filename_prefix: Optional[str] = None
) -> str:
    """
    Download the caption track to a file.
    
    Args:
        title (str): Base filename for the caption file
        srt (bool): Convert to SRT format (default: True)
        output_path (str, optional): Directory to save the file
        filename_prefix (str, optional): Prefix to add to filename
        
    Returns:
        str: Path to the downloaded caption file
    """

Static Caption Utilities

Utility methods for caption format conversion.

@staticmethod
def float_to_srt_time_format(d: float) -> str:
    """
    Convert a float timestamp to SRT time format.
    
    Args:
        d (float): Time in seconds as a float
        
    Returns:
        str: Time in SRT format (HH:MM:SS,mmm)
    """

@staticmethod  
def xml_caption_to_srt(xml_captions: str) -> str:
    """
    Convert XML caption data to SRT format.
    
    Args:
        xml_captions (str): Raw XML caption content
        
    Returns:
        str: Caption content converted to SRT format
    """

CaptionQuery Class

Query interface for caption collections providing dictionary-like access to caption tracks by language code.

class CaptionQuery:
    def __init__(self, captions: List[Caption]):
        """
        Initialize CaptionQuery with a list of caption tracks.
        
        Args:
            captions (List[Caption]): List of available caption tracks
        """

Caption Access

Access caption tracks by language code and iterate through available captions.

def __getitem__(self, lang_code: str) -> Caption:
    """
    Get caption track by language code.
    
    Args:
        lang_code (str): Language code (e.g., 'en', 'es', 'fr')
        
    Returns:
        Caption: Caption track for the specified language
        
    Raises:
        KeyError: If language code is not found
    """

def __len__(self) -> int:
    """
    Get the number of available caption tracks.
    
    Returns:
        int: Number of caption tracks
    """

def __iter__(self) -> Iterator[Caption]:
    """
    Iterate through all available caption tracks.
    
    Returns:
        Iterator[Caption]: Iterator over caption tracks
    """

Deprecated Methods

Legacy methods maintained for backward compatibility.

def get_by_language_code(self, lang_code: str) -> Optional[Caption]:
    """
    Get caption track by language code.
    
    **DEPRECATED**: Use dictionary-style access with captions[lang_code] instead.
    
    Args:
        lang_code (str): Language code (e.g., 'en', 'es')
        
    Returns:
        Caption or None: Caption track for the specified language
    """

def all(self) -> List[Caption]:
    """
    Get all the results represented by this query as a list.
    
    **DEPRECATED**: CaptionQuery can be treated as a dictionary/iterable directly.
    
    Returns:
        List[Caption]: All caption tracks
    """

Usage Examples

Basic Caption Download

from pytube import YouTube

# Get video with captions
yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

# Check available caption tracks
print("Available captions:")
for caption in yt.captions:
    print(f"- {caption.name} ({caption.code})")

# Download English captions
if 'en' in yt.captions:
    caption = yt.captions['en']
    caption.download(title=yt.title)
    print(f"Downloaded captions: {caption.name}")

SRT Format Conversion

from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

# Get English captions and convert to SRT
if 'en' in yt.captions:
    caption = yt.captions['en']
    
    # Generate SRT content
    srt_content = caption.generate_srt_captions()
    
    # Save to custom file
    with open('custom_captions.srt', 'w', encoding='utf-8') as f:
        f.write(srt_content)
    
    print("SRT file created: custom_captions.srt")

Multiple Language Downloads

from pytube import YouTube
import os

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

# Create captions directory
captions_dir = "captions"
os.makedirs(captions_dir, exist_ok=True)

# Download all available caption tracks
for caption in yt.captions:
    try:
        file_path = caption.download(
            title=yt.title,
            output_path=captions_dir,
            filename_prefix=f"{caption.code}_"
        )
        print(f"Downloaded {caption.name}: {file_path}")
    except Exception as e:
        print(f"Failed to download {caption.name}: {e}")

Caption Content Analysis

from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

if 'en' in yt.captions:
    caption = yt.captions['en']
    
    # Get raw caption data
    xml_data = caption.xml_captions
    json_data = caption.json_captions
    
    print(f"XML data length: {len(xml_data)} characters")
    print(f"JSON entries: {len(json_data.get('events', []))}")
    
    # Convert to SRT and analyze
    srt_content = caption.generate_srt_captions()
    srt_lines = srt_content.split('\n')
    subtitle_count = srt_content.count('\n\n') + 1
    
    print(f"SRT content: {len(srt_lines)} lines")
    print(f"Number of subtitles: {subtitle_count}")

Custom SRT Processing

from pytube import YouTube
import re

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

if 'en' in yt.captions:
    caption = yt.captions['en']
    srt_content = caption.generate_srt_captions()
    
    # Extract all subtitle text (remove timestamps and numbering)
    subtitle_pattern = r'\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n(.+?)(?=\n\n|\n\d+\n|\Z)'
    matches = re.findall(subtitle_pattern, srt_content, re.DOTALL)
    
    all_text = ' '.join(match.replace('\n', ' ') for match in matches)
    print(f"Full transcript: {all_text[:200]}...")

Error Handling

from pytube import YouTube

yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')

# Check if captions are available
if not yt.captions:
    print("No captions available for this video")
else:
    print(f"Found {len(yt.captions)} caption tracks")

    # Try the preferred languages in priority order; None if none is present
    preferred_languages = ['en', 'en-US', 'en-GB']
    selected_caption = next(
        (yt.captions[lang] for lang in preferred_languages if lang in yt.captions),
        None,
    )

    if selected_caption is None:
        # Fall back to first available caption (the query is non-empty here)
        selected_caption = next(iter(yt.captions))
        print(f"Using fallback caption: {selected_caption.name}")

    # A single guarded download covers both the preferred and the fallback
    # track, so a failure on either path is reported instead of crashing.
    try:
        selected_caption.download(title=yt.title)
        print(f"Downloaded captions: {selected_caption.name}")
    except Exception as e:
        print(f"Download failed: {e}")

Time-based Caption Extraction

from pytube import YouTube
import json

def extract_captions_for_timerange(caption, start_seconds, end_seconds):
    """Collect the caption events that overlap a time window.

    Args:
        caption: Object whose ``json_captions`` attribute is a dict that may
            contain an ``'events'`` list of caption event dicts.
        start_seconds: Start of the window, in seconds.
        end_seconds: End of the window, in seconds.

    Returns:
        list: One ``{'start', 'end', 'text'}`` dict per matching event, in
        event order. ``start`` and ``end`` are in seconds; ``text`` is the
        concatenation of the event's ``'utf8'`` segments with surrounding
        whitespace stripped.
    """
    overlapping = []
    for entry in caption.json_captions.get('events', []):
        # Events without both timing fields cannot be placed on the timeline.
        if 'tStartMs' not in entry or 'dDurationMs' not in entry:
            continue

        begin_ms = entry['tStartMs']
        length_ms = entry['dDurationMs']
        begin = begin_ms / 1000
        finish = (begin_ms + length_ms) / 1000

        # Skip events that do not intersect the requested window; an event
        # that merely touches a window boundary does not count as overlap.
        if not (begin < end_seconds and finish > start_seconds):
            continue

        # Timing-only events carry no text segments and are ignored.
        if 'segs' not in entry:
            continue

        pieces = [segment.get('utf8', '') for segment in entry['segs']]
        overlapping.append({
            'start': begin,
            'end': finish,
            'text': ''.join(pieces).strip()
        })

    return overlapping

# Usage
yt = YouTube('https://www.youtube.com/watch?v=9bZkp7q19f0')
if 'en' in yt.captions:
    caption = yt.captions['en']
    
    # Get captions for first 60 seconds
    timerange_captions = extract_captions_for_timerange(caption, 0, 60)
    
    for cap in timerange_captions:
        print(f"{cap['start']:.1f}s - {cap['end']:.1f}s: {cap['text']}")

Types

# `Any` is required by both aliases below; without it they raise NameError.
from typing import Any, Dict, List, Optional, Iterator

# Caption track metadata structure (the dict passed to Caption.__init__)
CaptionTrackDict = Dict[str, Any]

# JSON caption event structure (presumably one entry of the 'events' list in
# Caption.json_captions, as used in the examples above; verify against pytube)
CaptionEvent = Dict[str, Any]

Install with Tessl CLI