tessl/pypi-youtube-transcript-api

Python API for retrieving YouTube video transcripts and subtitles without browser automation

Overview

Eval results

Files

Output Formatters

Name: tessl/pypi-youtube-transcript-api
Author: tessl

Classes for converting transcript data into various output formats. Supports JSON, plain text, SRT subtitles, WebVTT, and pretty-printed formats for different use cases.

Capabilities

Base Formatter Class

Abstract base class defining the formatter interface. All concrete formatters inherit from this class.

class Formatter:
    """Base class defining the interface shared by all transcript formatters.

    Concrete formatters inherit from this class and override both methods.
    Calling either method on the base class itself raises
    ``NotImplementedError``.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Format a single transcript.

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Formatter-specific options

        Returns:
            str: Formatted transcript string

        Raises:
            NotImplementedError: Must be implemented by subclasses
        """
        # The base class has no output format of its own; fail loudly rather
        # than silently returning None to a caller that expects a string.
        raise NotImplementedError(
            "A subclass of Formatter must implement its own "
            ".format_transcript() method."
        )

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Format multiple transcripts.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Formatter-specific options

        Returns:
            str: Formatted transcripts string

        Raises:
            NotImplementedError: Must be implemented by subclasses
        """
        raise NotImplementedError(
            "A subclass of Formatter must implement its own "
            ".format_transcripts() method."
        )

JSON Formatter

Converts transcript data to JSON format for programmatic processing and data interchange.

class JSONFormatter(Formatter):
    """Formatter that converts transcript data to JSON strings.

    Intended for programmatic processing and data interchange. Keyword
    arguments given to either method are passed on to ``json.dumps()``.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Convert transcript to JSON string.

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Passed to json.dumps() (indent, ensure_ascii, etc.)

        Returns:
            str: JSON representation of transcript data
        """

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Convert multiple transcripts to JSON array string.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Passed to json.dumps() (same options as
                format_transcript)

        Returns:
            str: JSON array of transcript data
        """

Text Formatter

Converts transcripts to plain text with no timestamps. Useful for text analysis and content extraction.

class TextFormatter(Formatter):
    """Formatter that converts transcripts to plain text without timestamps.

    Useful for text analysis and content extraction.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Convert transcript to plain text (no timestamps).

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: Plain text with lines separated by newlines
        """

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Convert multiple transcripts to plain text.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: Plain text with transcripts separated by triple newlines
        """

Pretty Print Formatter

Human-readable formatted output using Python's pprint module for debugging and inspection.

class PrettyPrintFormatter(Formatter):
    """Formatter that produces human-readable output via the pprint module.

    Intended for debugging and inspection. Keyword arguments given to either
    method are passed on to ``pprint.pformat()``.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Pretty print transcript data.

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Passed to pprint.pformat()

        Returns:
            str: Pretty formatted transcript representation
        """

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Pretty print multiple transcripts.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Passed to pprint.pformat()

        Returns:
            str: Pretty formatted list of transcripts
        """

SRT Formatter

Creates SRT (SubRip) subtitle files compatible with video players and subtitle software.

class SRTFormatter(Formatter):
    """Formatter that produces SRT (SubRip) subtitle text.

    The output is meant to be written to ``.srt`` files for use with video
    players and subtitle software.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Convert transcript to SRT subtitle format.

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: SRT formatted subtitles with sequence numbers and timestamps
        """

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Convert multiple transcripts to SRT format.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: Combined SRT formatted subtitles
        """

WebVTT Formatter

Creates WebVTT subtitle files for web video players and HTML5 video elements.

class WebVTTFormatter(Formatter):
    """Formatter that produces WebVTT subtitle text.

    The output is meant for web video players and HTML5 video elements.
    """

    def format_transcript(self, transcript, **kwargs) -> str:
        """
        Convert transcript to WebVTT subtitle format.

        Args:
            transcript (FetchedTranscript): Transcript to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: WebVTT formatted subtitles with WEBVTT header
        """

    def format_transcripts(self, transcripts, **kwargs) -> str:
        """
        Convert multiple transcripts to WebVTT format.

        Args:
            transcripts (List[FetchedTranscript]): Transcripts to format
            **kwargs: Unused; accepted to match the Formatter interface

        Returns:
            str: Combined WebVTT formatted subtitles
        """

Formatter Loader

Utility class for loading formatters by type string. Provides a convenient interface for dynamic formatter selection.

class FormatterLoader:
    """Utility class for selecting a formatter by its type string.

    Provides a convenient interface for dynamic formatter selection, e.g.
    when the desired output format is only known at runtime.
    """

    # Maps each supported type string to the formatter class it selects.
    TYPES = {
        "json": JSONFormatter,
        "pretty": PrettyPrintFormatter,
        "text": TextFormatter,
        "webvtt": WebVTTFormatter,
        "srt": SRTFormatter,
    }

    def load(self, formatter_type: str = "pretty") -> Formatter:
        """
        Load formatter by type string.

        Args:
            formatter_type (str): Formatter type name. Defaults to "pretty"

        Returns:
            Formatter: Formatter instance

        Raises:
            FormatterLoader.UnknownFormatterType: Invalid formatter type
                (presumably any name that is not a key of TYPES)
        """

    # Nested in FormatterLoader, so callers reference it as
    # FormatterLoader.UnknownFormatterType.
    class UnknownFormatterType(Exception):
        def __init__(self, formatter_type):
            """
            Exception for invalid formatter types.

            Args:
                formatter_type (str): The invalid formatter type
            """

Usage Examples

Basic Formatting

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter, TextFormatter

# Fetch the transcript once; both formatters below reuse it.
transcript = YouTubeTranscriptApi().fetch('dQw4w9WgXcQ')

# Structured output: the transcript as a JSON string.
print(JSONFormatter().format_transcript(transcript))

# Plain-text output: transcript text only, no timestamps.
print(TextFormatter().format_transcript(transcript))

Subtitle File Creation

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import SRTFormatter, WebVTTFormatter

transcript = YouTubeTranscriptApi().fetch('dQw4w9WgXcQ')

# Pair each subtitle file with the formatter that produces its content:
# SRT first, then WebVTT.
subtitle_targets = (
    ('subtitles.srt', SRTFormatter()),
    ('subtitles.vtt', WebVTTFormatter()),
)

for path, formatter in subtitle_targets:
    content = formatter.format_transcript(transcript)
    # Write as UTF-8 so non-ASCII caption text is stored consistently.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)

Using FormatterLoader

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import FormatterLoader

transcript = YouTubeTranscriptApi().fetch('dQw4w9WgXcQ')
loader = FormatterLoader()

# Resolve every formatter from its type string at runtime.
for format_type in ('json', 'text', 'srt', 'webvtt', 'pretty'):
    output = loader.load(format_type).format_transcript(transcript)
    print(f"=== {format_type.upper()} ===")
    # Long outputs are cut to a 200-character preview followed by an ellipsis.
    if len(output) > 200:
        print(output[:200] + "...")
    else:
        print(output)
    print()

JSON Formatting with Options

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import JSONFormatter
import json

transcript = YouTubeTranscriptApi().fetch('dQw4w9WgXcQ')
formatter = JSONFormatter()

# Extra keyword arguments are handed to json.dumps().
# Readable variant: indented, with non-ASCII characters left unescaped.
print(formatter.format_transcript(transcript, indent=2, ensure_ascii=False))

# Compact variant: separators without any surrounding whitespace.
print(formatter.format_transcript(transcript, separators=(',', ':')))

Multiple Transcripts

from youtube_transcript_api import YouTubeTranscriptApi
from youtube_transcript_api.formatters import TextFormatter

api = YouTubeTranscriptApi()

# Fetch one transcript per video; a failed fetch is reported and skipped so
# the remaining videos are still processed.
transcripts = []
for video_id in ('dQw4w9WgXcQ', 'jNQXAC9IVRw'):
    try:
        transcripts.append(api.fetch(video_id))
    except Exception as e:
        print(f"Failed to fetch {video_id}: {e}")

# Format everything that was fetched in a single call; skip when every
# fetch failed.
if transcripts:
    print(TextFormatter().format_transcripts(transcripts))

Types

from typing import List
from youtube_transcript_api._transcripts import FetchedTranscript

# Formatter interface types. Both are plain aliases of built-in types, so
# they document intent only and perform no validation.
FormatterType = str  # One of: "json", "text", "pretty", "srt", "webvtt"
FormatterKwargs = dict  # Formatter-specific keyword arguments (the **kwargs of the format methods)

Install with Tessl CLI

npx tessl i tessl/pypi-youtube-transcript-api

docs