0
# Data Structures
1
2
Core data classes for representing transcript metadata, collections, and content. These structures provide the foundation for all transcript operations in the library.
3
4
## Capabilities
5
6
### TranscriptList
7
8
Container for all available transcripts for a specific video. Provides methods to search and filter transcripts by language and type (manual vs. generated).
9
10
```python { .api }
11
class TranscriptList:
12
def __init__(self, video_id, manually_created_transcripts, generated_transcripts, translation_languages):
13
"""
14
Internal constructor. Use YouTubeTranscriptApi.list() to create instances.
15
16
Args:
17
video_id (str): Video ID this list belongs to
18
manually_created_transcripts (dict): Manual transcripts by language code
19
generated_transcripts (dict): Generated transcripts by language code
20
translation_languages (list): Available translation languages
21
"""
22
23
def find_transcript(self, language_codes):
24
"""
25
Find transcript with language priority. Prefers manual over generated.
26
27
Args:
28
language_codes (Iterable[str]): Language codes in priority order
29
30
Returns:
31
Transcript: First matching transcript found
32
33
Raises:
34
NoTranscriptFound: No transcript found for any requested language
35
"""
36
37
def find_generated_transcript(self, language_codes):
38
"""
39
Find automatically generated transcript.
40
41
Args:
42
language_codes (Iterable[str]): Language codes in priority order
43
44
Returns:
45
Transcript: First matching generated transcript
46
47
Raises:
48
NoTranscriptFound: No generated transcript found
49
"""
50
51
def find_manually_created_transcript(self, language_codes):
52
"""
53
Find manually created transcript.
54
55
Args:
56
language_codes (Iterable[str]): Language codes in priority order
57
58
Returns:
59
Transcript: First matching manual transcript
60
61
Raises:
62
NoTranscriptFound: No manual transcript found
63
"""
64
65
def __iter__(self):
66
"""
67
Iterate over all transcripts (manual first, then generated).
68
69
Yields:
70
Transcript: Each available transcript
71
"""
72
73
@property
74
def video_id(self):
75
"""str: Video ID this transcript list belongs to"""
76
```
77
78
### Transcript
79
80
Metadata and fetching interface for an individual transcript. Represents a specific language version of a video's subtitles.
81
82
```python { .api }
83
class Transcript:
84
def __init__(self, http_client, video_id, url, language, language_code, is_generated, translation_languages):
85
"""
86
Internal constructor. Access via TranscriptList methods.
87
"""
88
89
def fetch(self, preserve_formatting=False):
90
"""
91
Load the actual transcript content.
92
93
Args:
94
preserve_formatting (bool, optional): Keep HTML formatting tags. Defaults to False
95
96
Returns:
97
FetchedTranscript: Transcript with content and timing data
98
99
Raises:
100
PoTokenRequired: PO token required for this video
101
YouTubeRequestFailed: HTTP request failed
102
"""
103
104
def translate(self, language_code):
105
"""
106
Create translated version of this transcript.
107
108
Args:
109
language_code (str): Target language code for translation
110
111
Returns:
112
Transcript: New transcript object for translated version
113
114
Raises:
115
NotTranslatable: This transcript cannot be translated
116
TranslationLanguageNotAvailable: Requested language not available
117
"""
118
119
@property
120
def video_id(self):
121
"""str: Video ID this transcript belongs to"""
122
123
@property
124
def language(self):
125
"""str: Human-readable language name"""
126
127
@property
128
def language_code(self):
129
"""str: Language code (e.g., 'en', 'es', 'fr')"""
130
131
@property
132
def is_generated(self):
133
"""bool: True if automatically generated, False if manually created"""
134
135
@property
136
def translation_languages(self):
137
"""list: Available languages for translation"""
138
139
@property
140
def is_translatable(self):
141
"""bool: True if this transcript can be translated"""
142
```
143
144
### FetchedTranscript
145
146
Complete transcript data with timing information. Contains the actual subtitle content as a sequence of time-stamped text snippets.
147
148
```python { .api }
149
class FetchedTranscript:
150
def __init__(self, snippets, video_id, language, language_code, is_generated):
151
"""
152
Fetched transcript with content. Created by Transcript.fetch().
153
154
Args:
155
snippets (List[FetchedTranscriptSnippet]): Transcript content
156
video_id (str): Video ID
157
language (str): Language name
158
language_code (str): Language code
159
is_generated (bool): Whether auto-generated
160
"""
161
162
def to_raw_data(self):
163
"""
164
Convert to raw dictionary format for serialization.
165
166
Returns:
167
List[Dict]: List of snippet dictionaries with text, start, duration
168
"""
169
170
def __iter__(self):
171
"""
172
Iterate over transcript snippets.
173
174
Yields:
175
FetchedTranscriptSnippet: Each text snippet with timing
176
"""
177
178
def __getitem__(self, index):
179
"""
180
Access snippet by index.
181
182
Args:
183
index (int): Snippet index
184
185
Returns:
186
FetchedTranscriptSnippet: Snippet at index
187
"""
188
189
def __len__(self):
190
"""
191
Get number of snippets.
192
193
Returns:
194
int: Number of transcript snippets
195
"""
196
197
@property
198
def snippets(self):
199
"""List[FetchedTranscriptSnippet]: All transcript snippets"""
200
201
@property
202
def video_id(self):
203
"""str: Video ID this transcript belongs to"""
204
205
@property
206
def language(self):
207
"""str: Human-readable language name"""
208
209
@property
210
def language_code(self):
211
"""str: Language code"""
212
213
@property
214
def is_generated(self):
215
"""bool: True if automatically generated"""
216
```
217
218
### FetchedTranscriptSnippet
219
220
Individual text segment with precise timing information. Represents a single subtitle entry with start time and duration.
221
222
```python { .api }
223
class FetchedTranscriptSnippet:
224
def __init__(self, text, start, duration):
225
"""
226
Single transcript snippet with timing.
227
228
Args:
229
text (str): Transcript text content
230
start (float): Start timestamp in seconds
231
duration (float): Duration in seconds (screen display time, not speech duration)
232
"""
233
234
@property
235
def text(self):
236
"""str: Transcript text content"""
237
238
@property
239
def start(self):
240
"""float: Start timestamp in seconds"""
241
242
@property
243
def duration(self):
244
"""float: Duration in seconds (screen display time)"""
245
```
246
247
## Usage Examples
248
249
### Working with TranscriptList
250
251
```python
252
from youtube_transcript_api import YouTubeTranscriptApi, NoTranscriptFound
253
254
api = YouTubeTranscriptApi()
255
transcript_list = api.list('dQw4w9WgXcQ')
256
257
# Print all available transcripts
258
print(f"Available transcripts for {transcript_list.video_id}:")
259
for transcript in transcript_list:
260
print(f" {transcript.language_code}: {transcript.language}")
261
print(f" Generated: {transcript.is_generated}")
262
print(f" Translatable: {transcript.is_translatable}")
263
264
# Find specific transcript types
265
try:
266
manual_en = transcript_list.find_manually_created_transcript(['en'])
267
print(f"Found manual English transcript: {manual_en.language}")
268
except NoTranscriptFound:
269
print("No manual English transcript available")
270
271
try:
272
auto_es = transcript_list.find_generated_transcript(['es'])
273
print(f"Found generated Spanish transcript: {auto_es.language}")
274
except NoTranscriptFound:
275
print("No generated Spanish transcript available")
276
```
277
278
### Working with Transcript Objects
279
280
```python
281
from youtube_transcript_api import YouTubeTranscriptApi
282
283
api = YouTubeTranscriptApi()
284
transcript_list = api.list('dQw4w9WgXcQ')
285
transcript = transcript_list.find_transcript(['en'])
286
287
print("Transcript info:")
288
print(f" Video: {transcript.video_id}")
289
print(f" Language: {transcript.language} ({transcript.language_code})")
290
print(f" Generated: {transcript.is_generated}")
291
print(f" Translatable: {transcript.is_translatable}")
292
293
# Fetch content
294
fetched = transcript.fetch()
295
print(f"Fetched {len(fetched)} snippets")
296
297
# Translate if possible
298
if transcript.is_translatable:
299
french = transcript.translate('fr')
300
french_content = french.fetch()
301
print(f"Translated to French: {len(french_content)} snippets")
302
```
303
304
### Working with FetchedTranscript
305
306
```python
307
from youtube_transcript_api import YouTubeTranscriptApi
308
309
api = YouTubeTranscriptApi()
310
transcript = api.fetch('dQw4w9WgXcQ')
311
312
# Basic information
313
print(f"Video: {transcript.video_id}")
314
print(f"Language: {transcript.language}")
315
print(f"Total snippets: {len(transcript)}")
316
317
# Iterate through content
318
for i, snippet in enumerate(transcript):
319
end_time = snippet.start + snippet.duration
320
print(f"[{snippet.start:.2f}-{end_time:.2f}s] {snippet.text}")
321
322
if i >= 4:  # Show first 5 snippets
323
break
324
325
# Access specific snippets
326
first_snippet = transcript[0]
327
print(f"First snippet: '{first_snippet.text}' at {first_snippet.start}s")
328
329
# Convert to raw data for serialization
330
raw_data = transcript.to_raw_data()
331
print(f"Raw format: {raw_data[0]}") # {'text': '...', 'start': 0.0, 'duration': 3.84}
332
```
333
334
## Types
335
336
```python { .api }
337
from typing import List, Dict, Iterator, Iterable
338
from dataclasses import dataclass
339
340
# Internal translation language type
341
@dataclass
342
class _TranslationLanguage:
343
language: str
344
language_code: str
345
```