tessl/pypi-youtube-transcript-api

Python API for retrieving YouTube video transcripts and subtitles without browser automation

Overview

Eval results

Files

Data Structures

Name: tessl/pypi-youtube-transcript-api
Author: tessl

Core data classes for representing transcript metadata, collections, and content. These structures provide the foundation for all transcript operations in the library.

Capabilities

TranscriptList

Container for all available transcripts for a specific video. Provides methods to search and filter transcripts by language and type (manual vs. generated).

class TranscriptList:
    """
    Container for all transcripts available for a single video.

    Provides lookup by language code, optionally restricted to manually
    created or automatically generated transcripts, and iteration over every
    transcript in the list.

    Example:
        transcript_list = YouTubeTranscriptApi().list('dQw4w9WgXcQ')
        transcript = transcript_list.find_transcript(['en'])
    """

    def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
        """
        Internal constructor. Do not call directly; obtain instances through
        YouTubeTranscriptApi.list().

        Args:
            video_id (str): ID of the video this list belongs to
            manually_created_transcripts (dict): Manually created transcripts, keyed by language code
            generated_transcripts (dict): Automatically generated transcripts, keyed by language code
            translation_languages (list): Languages available for translation
        """

    def find_transcript(self, language_codes):
        """
        Find a transcript, trying the given language codes in priority order.
        Manually created transcripts are preferred over generated ones.

        Args:
            language_codes (Iterable[str]): Language codes, most preferred first

        Returns:
            Transcript: The first matching transcript

        Raises:
            NoTranscriptFound: None of the requested languages has a transcript
        """

    def find_generated_transcript(self, language_codes):
        """
        Find an automatically generated transcript, trying the given language
        codes in priority order.

        Args:
            language_codes (Iterable[str]): Language codes, most preferred first

        Returns:
            Transcript: The first matching generated transcript

        Raises:
            NoTranscriptFound: None of the requested languages has a generated transcript
        """

    def find_manually_created_transcript(self, language_codes):
        """
        Find a manually created transcript, trying the given language codes in
        priority order.

        Args:
            language_codes (Iterable[str]): Language codes, most preferred first

        Returns:
            Transcript: The first matching manually created transcript

        Raises:
            NoTranscriptFound: None of the requested languages has a manual transcript
        """

    def __iter__(self):
        """
        Iterate over every transcript in the list: manually created ones
        first, then generated ones.

        Yields:
            Transcript: Each available transcript
        """

    @property
    def video_id(self):
        """str: ID of the video this transcript list belongs to"""

Transcript

Metadata and fetching interface for an individual transcript. Represents a specific language version of a video's subtitles.

class Transcript:
    """
    Metadata and fetching interface for one transcript of a video.

    Each instance represents a single language version of the video's
    subtitles. The content itself is only loaded when fetch() is called.

    Example:
        transcript = transcript_list.find_transcript(['en'])
        fetched = transcript.fetch()
    """

    def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
        """
        Internal constructor. Do not call directly; obtain instances through
        the TranscriptList methods.

        Args:
            http_client: Client used for HTTP requests (presumably when fetch()
                is called; not described on this page, confirm against the library)
            video_id (str): ID of the video this transcript belongs to
            url: Location of the transcript data (presumably the address fetch()
                requests; confirm against the library)
            language (str): Human-readable language name
            language_code (str): Language code (e.g., 'en', 'es', 'fr')
            is_generated (bool): True if automatically generated, False if manually created
            translation_languages (list): Languages this transcript can be translated into
        """

    def fetch(self, preserve_formatting=False):
        """
        Load the actual transcript content.

        Args:
            preserve_formatting (bool, optional): Keep HTML formatting tags in
                the snippet text. Defaults to False

        Returns:
            FetchedTranscript: The transcript content together with its timing data

        Raises:
            PoTokenRequired: A PO token is required for this video
            YouTubeRequestFailed: The HTTP request failed
        """

    def translate(self, language_code):
        """
        Create a translated version of this transcript.

        The result is a new Transcript; call fetch() on it to load the
        translated content.

        Args:
            language_code (str): Language code of the translation target

        Returns:
            Transcript: New transcript object for the translated version

        Raises:
            NotTranslatable: This transcript cannot be translated
            TranslationLanguageNotAvailable: The requested language is not available
        """

    @property
    def video_id(self):
        """str: ID of the video this transcript belongs to"""

    @property
    def language(self):
        """str: Human-readable language name"""

    @property
    def language_code(self):
        """str: Language code (e.g., 'en', 'es', 'fr')"""

    @property
    def is_generated(self):
        """bool: True if automatically generated, False if manually created"""

    @property
    def translation_languages(self):
        """list: Languages this transcript can be translated into"""

    @property
    def is_translatable(self):
        """bool: True if this transcript can be translated"""

FetchedTranscript

Complete transcript data with timing information. Contains the actual subtitle content as a sequence of time-stamped text snippets.

class FetchedTranscript:
    """
    Complete transcript content with timing information.

    Holds the subtitle text as a sequence of time-stamped snippets and
    supports iteration, indexing and len().

    Example:
        transcript = YouTubeTranscriptApi().fetch('dQw4w9WgXcQ')
        first_snippet = transcript[0]
    """

    def __init__(self, snippets, video_id, language, language_code, is_generated):
        """
        Fetched transcript with content. Instances are created by Transcript.fetch().

        Args:
            snippets (List[FetchedTranscriptSnippet]): The transcript content
            video_id (str): ID of the video the transcript belongs to
            language (str): Human-readable language name
            language_code (str): Language code
            is_generated (bool): Whether the transcript was automatically generated
        """

    def to_raw_data(self):
        """
        Convert the transcript to plain dictionaries for serialization.

        Returns:
            List[Dict]: One dictionary per snippet with the keys 'text',
                'start' and 'duration',
                e.g. {'text': '...', 'start': 0.0, 'duration': 3.84}
        """

    def __iter__(self):
        """
        Iterate over the transcript snippets.

        Yields:
            FetchedTranscriptSnippet: Each text snippet with its timing
        """

    def __getitem__(self, index):
        """
        Access a snippet by its index.

        Args:
            index (int): Snippet index

        Returns:
            FetchedTranscriptSnippet: The snippet at that index
        """

    def __len__(self):
        """
        Get the number of snippets.

        Returns:
            int: Number of transcript snippets
        """

    @property
    def snippets(self):
        """List[FetchedTranscriptSnippet]: All transcript snippets"""

    @property
    def video_id(self):
        """str: ID of the video this transcript belongs to"""

    @property
    def language(self):
        """str: Human-readable language name"""

    @property
    def language_code(self):
        """str: Language code"""

    @property
    def is_generated(self):
        """bool: True if automatically generated"""

FetchedTranscriptSnippet

Individual text segment with precise timing information. Represents a single subtitle entry with start time and duration.

class FetchedTranscriptSnippet:
    """
    Single subtitle entry: a piece of text with its start time and duration.

    The time at which a snippet leaves the screen is start + duration.
    """

    def __init__(self, text, start, duration):
        """
        Single transcript snippet with timing.

        Args:
            text (str): Transcript text content
            start (float): Start timestamp in seconds
            duration (float): Duration in seconds. This is how long the snippet
                is displayed on screen, not how long the speech lasts
        """

    @property
    def text(self):
        """str: Transcript text content"""

    @property
    def start(self):
        """float: Start timestamp in seconds"""

    @property
    def duration(self):
        """float: Duration in seconds (screen display time, not speech duration)"""

Usage Examples

Working with TranscriptList

# NoTranscriptFound must be imported as well: the except clauses below would
# otherwise fail with NameError exactly when a transcript is missing.
from youtube_transcript_api import NoTranscriptFound, YouTubeTranscriptApi

# List every transcript that is available for the video.
api = YouTubeTranscriptApi()
transcript_list = api.list('dQw4w9WgXcQ')

# Print all available transcripts
print(f"Available transcripts for {transcript_list.video_id}:")
for transcript in transcript_list:
    print(f"  {transcript.language_code}: {transcript.language}")
    print(f"    Generated: {transcript.is_generated}")
    print(f"    Translatable: {transcript.is_translatable}")

# Find specific transcript types; each lookup raises NoTranscriptFound when
# none of the requested languages is available for that transcript type.
try:
    manual_en = transcript_list.find_manually_created_transcript(['en'])
    print(f"Found manual English transcript: {manual_en.language}")
except NoTranscriptFound:
    print("No manual English transcript available")

try:
    auto_es = transcript_list.find_generated_transcript(['es'])
    print(f"Found generated Spanish transcript: {auto_es.language}")
except NoTranscriptFound:
    print("No generated Spanish transcript available")

Working with Transcript Objects

from youtube_transcript_api import TranslationLanguageNotAvailable, YouTubeTranscriptApi

api = YouTubeTranscriptApi()
transcript_list = api.list('dQw4w9WgXcQ')
# Language codes are tried in priority order; manual transcripts are preferred.
transcript = transcript_list.find_transcript(['en'])

print("Transcript info:")
print(f"  Video: {transcript.video_id}")
print(f"  Language: {transcript.language} ({transcript.language_code})")
print(f"  Generated: {transcript.is_generated}")
print(f"  Translatable: {transcript.is_translatable}")

# Fetch content
fetched = transcript.fetch()
print(f"Fetched {len(fetched)} snippets")

# Translate if possible. is_translatable only says that the transcript can be
# translated at all; the specific target language may still be missing, in
# which case translate() raises TranslationLanguageNotAvailable.
if transcript.is_translatable:
    try:
        french = transcript.translate('fr')
    except TranslationLanguageNotAvailable:
        print("French translation is not available for this transcript")
    else:
        french_content = french.fetch()
        print(f"Translated to French: {len(french_content)} snippets")

Working with FetchedTranscript

from itertools import islice

from youtube_transcript_api import YouTubeTranscriptApi

api = YouTubeTranscriptApi()
transcript = api.fetch('dQw4w9WgXcQ')

# Basic information
print(f"Video: {transcript.video_id}")
print(f"Language: {transcript.language}")
print(f"Total snippets: {len(transcript)}")

# Iterate through content, showing only the first 5 snippets.
# islice stops after exactly five items (or earlier for shorter transcripts);
# an enumerate() loop that prints before testing `i >= 5` would show six.
for snippet in islice(transcript, 5):
    end_time = snippet.start + snippet.duration
    print(f"[{snippet.start:.2f}-{end_time:.2f}s] {snippet.text}")

# Access specific snippets
first_snippet = transcript[0]
print(f"First snippet: '{first_snippet.text}' at {first_snippet.start}s")

# Convert to raw data for serialization
raw_data = transcript.to_raw_data()
print(f"Raw format: {raw_data[0]}")  # {'text': '...', 'start': 0.0, 'duration': 3.84}

Types

from typing import List, Dict, Iterator, Iterable
from dataclasses import dataclass

# Internal translation language type
@dataclass
class _TranslationLanguage:
    """
    Internal record describing one translation language.

    NOTE(review): presumably the element type of the translation_languages
    lists; this page does not state it, so confirm against the library.
    """

    # Presumably the human-readable language name, by analogy with
    # Transcript.language; confirm against the library.
    language: str
    # Presumably the language code accepted by Transcript.translate()
    # (e.g. 'fr'); confirm against the library.
    language_code: str

Install with Tessl CLI

npx tessl i tessl/pypi-youtube-transcript-api

docs