0
# I/O Operations
1
2
The TorchVision I/O module provides efficient image and video reading, writing, and processing capabilities, with support for multiple formats and backends. It offers both high-level convenience functions and low-level streaming interfaces for a variety of multimedia formats.
3
4
## Capabilities
5
6
### Image I/O
7
8
#### Image Reading Functions
9
10
Functions for reading various image formats into tensors.
11
12
```python { .api }
13
def read_image(path: str, mode: str = 'RGB') -> torch.Tensor:
14
"""
15
Read image file and return as tensor.
16
17
Args:
18
path (str): Path to image file
19
mode (str): Image mode ('RGB', 'GRAY', 'UNCHANGED')
20
- RGB: Convert to 3-channel RGB
21
- GRAY: Convert to 1-channel grayscale
22
- UNCHANGED: Keep original format
23
24
Returns:
25
torch.Tensor: Image tensor of shape (C, H, W) with values in [0, 255]
26
"""
27
28
def decode_image(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
29
"""
30
Decode image from bytes tensor.
31
32
Args:
33
input (torch.Tensor): 1-D tensor containing encoded image bytes
34
mode (str): Image mode for decoding
35
36
Returns:
37
torch.Tensor: Decoded image tensor
38
"""
39
40
def decode_jpeg(input: torch.Tensor, mode: str = 'RGB', device: str = 'cpu') -> torch.Tensor:
41
"""
42
Decode JPEG image from bytes.
43
44
Args:
45
input (torch.Tensor): 1-D tensor containing JPEG bytes
46
mode (str): Image mode ('RGB', 'GRAY', 'UNCHANGED')
47
device (str): Device to place output tensor ('cpu' or 'cuda')
48
49
Returns:
50
torch.Tensor: Decoded JPEG image tensor
51
"""
52
53
def decode_png(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
54
"""
55
Decode PNG image from bytes.
56
57
Args:
58
input (torch.Tensor): 1-D tensor containing PNG bytes
59
mode (str): Image mode for decoding
60
61
Returns:
62
torch.Tensor: Decoded PNG image tensor
63
"""
64
65
def decode_gif(input: torch.Tensor) -> torch.Tensor:
66
"""
67
Decode GIF image from bytes.
68
69
Args:
70
input (torch.Tensor): 1-D tensor containing GIF bytes
71
72
Returns:
73
torch.Tensor: Decoded GIF tensor of shape (N, C, H, W) for a GIF with N frames, or (C, H, W) when the GIF contains a single image
74
"""
75
76
def decode_webp(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
77
"""
78
Decode WebP image from bytes.
79
80
Args:
81
input (torch.Tensor): 1-D tensor containing WebP bytes
82
mode (str): Image mode for decoding
83
84
Returns:
85
torch.Tensor: Decoded WebP image tensor
86
"""
87
88
def decode_avif(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
89
"""
90
Decode AVIF image from bytes.
91
92
Args:
93
input (torch.Tensor): 1-D tensor containing AVIF bytes
94
mode (str): Image mode for decoding
95
96
Returns:
97
torch.Tensor: Decoded AVIF image tensor
98
"""
99
100
def decode_heic(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
101
"""
102
Decode HEIC image from bytes.
103
104
Args:
105
input (torch.Tensor): 1-D tensor containing HEIC bytes
106
mode (str): Image mode for decoding
107
108
Returns:
109
torch.Tensor: Decoded HEIC image tensor
110
"""
111
```
112
113
#### Image Writing Functions
114
115
Functions for encoding and writing tensors as image files.
116
117
```python { .api }
118
def write_jpeg(input: torch.Tensor, filename: str, quality: int = 75) -> None:
119
"""
120
Write tensor as JPEG file.
121
122
Args:
123
input (torch.Tensor): Image tensor of shape (C, H, W) with values in [0, 255]
124
filename (str): Output file path
125
quality (int): JPEG quality (1-100, higher is better quality)
126
"""
127
128
def write_png(input: torch.Tensor, filename: str, compression_level: int = 6) -> None:
129
"""
130
Write tensor as PNG file.
131
132
Args:
133
input (torch.Tensor): Image tensor of shape (C, H, W) with values in [0, 255]
134
filename (str): Output file path
135
compression_level (int): PNG compression level (0-9, higher is smaller file)
136
"""
137
138
def encode_jpeg(input: torch.Tensor, quality: int = 75) -> torch.Tensor:
139
"""
140
Encode tensor to JPEG bytes.
141
142
Args:
143
input (torch.Tensor): Image tensor of shape (C, H, W) with values in [0, 255]
144
quality (int): JPEG quality (1-100)
145
146
Returns:
147
torch.Tensor: 1-D tensor containing JPEG bytes
148
"""
149
150
def encode_png(input: torch.Tensor, compression_level: int = 6) -> torch.Tensor:
151
"""
152
Encode tensor to PNG bytes.
153
154
Args:
155
input (torch.Tensor): Image tensor of shape (C, H, W) with values in [0, 255]
156
compression_level (int): PNG compression level (0-9)
157
158
Returns:
159
torch.Tensor: 1-D tensor containing PNG bytes
160
"""
161
```
162
163
#### File I/O Functions
164
165
Low-level file reading and writing functions.
166
167
```python { .api }
168
def read_file(path: str) -> torch.Tensor:
169
"""
170
Read file contents into bytes tensor.
171
172
Args:
173
path (str): Path to file
174
175
Returns:
176
torch.Tensor: 1-D tensor containing file bytes
177
"""
178
179
def write_file(filename: str, data: torch.Tensor) -> None:
180
"""
181
Write bytes tensor to file.
182
183
Args:
184
filename (str): Output file path
185
data (torch.Tensor): 1-D tensor containing bytes to write
186
"""
187
```
188
189
#### Image Reading Modes
190
191
Constants for specifying image reading modes.
192
193
```python { .api }
194
class ImageReadMode:
195
"""Image reading mode constants."""
196
UNCHANGED: int = 0 # Keep original format and channels
197
GRAY: int = 1 # Convert to single-channel grayscale
198
GRAY_ALPHA: int = 2 # Convert to grayscale with alpha channel
199
RGB: int = 3 # Convert to 3-channel RGB
200
RGB_ALPHA: int = 4 # Convert to RGB with alpha channel
201
```
202
203
### Video I/O
204
205
#### High-Level Video Functions
206
207
Convenient functions for reading and writing video files.
208
209
```python { .api }
210
def read_video(filename: str, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
211
"""
212
Read video file and return video frames, audio frames, and info.
213
214
Args:
215
filename (str): Path to video file
216
start_pts (float): Start time for reading (in pts_unit)
217
end_pts (float, optional): End time for reading (in pts_unit)
218
pts_unit (str): Time unit ('pts' for presentation timestamp, 'sec' for seconds)
219
220
Returns:
221
tuple: (video_frames, audio_frames, video_info)
222
- video_frames (torch.Tensor): Video tensor of shape (T, H, W, C)
223
- audio_frames (torch.Tensor): Audio tensor of shape (K, L), where K is the number of channels and L is the number of samples
224
- video_info (dict): Metadata for the video and audio streams; can contain the keys 'video_fps' (float) and 'audio_fps' (int)
225
"""
226
227
def read_video_timestamps(filename: str, pts_unit: str = 'pts') -> tuple:
228
"""
229
Read video timestamps without loading frame data.
230
231
Args:
232
filename (str): Path to video file
233
pts_unit (str): Time unit for timestamps
234
235
Returns:
236
tuple: (video_pts, video_fps)
237
- video_pts (list): List of presentation timestamps
238
- video_fps (float): Video frame rate
239
"""
240
241
def write_video(filename: str, video_array: torch.Tensor, fps: float, video_codec: str = 'libx264', options=None) -> None:
242
"""
243
Write video tensor to file.
244
245
Args:
246
filename (str): Output video file path
247
video_array (torch.Tensor): Video tensor of shape (T, H, W, C) with values in [0, 255]
248
fps (float): Frame rate for output video
249
video_codec (str): Video codec to use ('libx264', 'mpeg4', etc.)
250
options (dict, optional): Additional encoding options
251
"""
252
```
253
254
#### Video Reader Class
255
256
Streaming video reader for efficient frame-by-frame processing.
257
258
```python { .api }
259
class VideoReader:
260
"""
261
Video reader for streaming video data frame by frame.
262
263
Args:
264
path (str): Path to video file
265
stream (str): Stream type ('video' or 'audio')
266
"""
267
268
def __init__(self, path: str, stream: str = 'video'): ...
269
270
def get_metadata(self) -> dict:
271
"""
272
Get video metadata information.
273
274
Returns:
275
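dict: Per-stream metadata keyed by stream type, e.g. {'video': {'duration': [...], 'fps': [...]}, 'audio': {'duration': [...], 'framerate': [...]}}; each value is a list with one entry per stream of that type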
276
"""
277
278
def set_current_stream(self, stream: str) -> None:
279
"""
280
Set current stream for reading.
281
282
Args:
283
stream (str): Stream type ('video' or 'audio')
284
"""
285
286
def seek(self, time_s: float) -> None:
287
"""
288
Seek to specific time in video.
289
290
Args:
291
time_s (float): Time in seconds to seek to
292
"""
293
294
def next(self) -> dict:
295
"""
296
Get next frame from video stream.
297
298
Returns:
299
dict: Frame data including 'data' tensor and 'pts' timestamp
300
"""
301
302
def __iter__(self):
303
"""Iterator interface for frame-by-frame reading."""
304
return self
305
306
def __next__(self) -> dict:
307
"""Get next frame in iterator."""
308
```
309
310
#### Low-Level Video Functions
311
312
Internal functions for advanced video processing.
313
314
```python { .api }
315
def _read_video_from_file(filename: str, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
316
"""
317
Internal video reading from file.
318
319
Args:
320
filename (str): Path to video file
321
start_pts (float): Start time
322
end_pts (float, optional): End time
323
pts_unit (str): Time unit
324
325
Returns:
326
tuple: (video_frames, audio_frames, video_info)
327
"""
328
329
def _read_video_timestamps_from_file(filename: str, pts_unit: str = 'pts') -> tuple:
330
"""
331
Internal timestamp reading from file.
332
333
Args:
334
filename (str): Path to video file
335
pts_unit (str): Time unit
336
337
Returns:
338
tuple: (video_pts, video_fps)
339
"""
340
341
def _read_video_from_memory(video_data: torch.Tensor, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
342
"""
343
Read video from memory buffer.
344
345
Args:
346
video_data (torch.Tensor): Video data bytes
347
start_pts (float): Start time
348
end_pts (float, optional): End time
349
pts_unit (str): Time unit
350
351
Returns:
352
tuple: (video_frames, audio_frames, video_info)
353
"""
354
355
def _read_video_timestamps_from_memory(video_data: torch.Tensor, pts_unit: str = 'pts') -> tuple:
356
"""
357
Read timestamps from memory buffer.
358
359
Args:
360
video_data (torch.Tensor): Video data bytes
361
pts_unit (str): Time unit
362
363
Returns:
364
tuple: (video_pts, video_fps)
365
"""
366
367
def _probe_video_from_file(filename: str) -> dict:
368
"""
369
Probe video file for metadata without reading frames.
370
371
Args:
372
filename (str): Path to video file
373
374
Returns:
375
dict: Video metadata
376
"""
377
378
def _probe_video_from_memory(video_data: torch.Tensor) -> dict:
379
"""
380
Probe video data for metadata without reading frames.
381
382
Args:
383
video_data (torch.Tensor): Video data bytes
384
385
Returns:
386
dict: Video metadata
387
"""
388
```
389
390
#### Video Metadata Classes
391
392
Classes for representing video metadata and timing information.
393
394
```python { .api }
395
class VideoMetaData:
396
"""
397
Container for video metadata information.
398
399
Attributes:
400
has_video (bool): Whether video stream is present
401
has_audio (bool): Whether audio stream is present
402
video_duration (float): Video duration in seconds
403
video_fps (float): Video frame rate
404
audio_sample_rate (int): Audio sample rate
405
video_codec (str): Video codec name
406
audio_codec (str): Audio codec name
407
"""
408
409
has_video: bool
410
has_audio: bool
411
video_duration: float
412
video_fps: float
413
audio_sample_rate: int
414
video_codec: str
415
audio_codec: str
416
417
class Timebase:
418
"""
419
Video timebase information for timestamp conversion.
420
421
Attributes:
422
numerator (int): Timebase numerator
423
denominator (int): Timebase denominator
424
"""
425
426
numerator: int
427
denominator: int
428
```
429
430
#### Video Backend Flags
431
432
Runtime flags indicating video decoding capabilities.
433
434
```python { .api }
435
_HAS_CPU_VIDEO_DECODER: bool # Whether CPU video decoder is available
436
_HAS_GPU_VIDEO_DECODER: bool # Whether GPU video decoder is available
437
_HAS_VIDEO_OPT: bool # Whether video optimization is available
438
```
439
440
## Usage Examples
441
442
### Basic Image Reading and Writing
443
444
```python
445
import torchvision.io as io
446
import torch
447
448
# Decode the source file into a 3-channel tensor
source_path = 'input.jpg'
image = io.read_image(source_path, mode='RGB')
print(f"Image shape: {image.shape}")  # (C, H, W)
print(f"Image dtype: {image.dtype}")  # torch.uint8

# Save the same tensor once as JPEG and once as PNG
io.write_jpeg(image, 'output.jpg', quality=95)
io.write_png(image, 'output.png', compression_level=3)

# Re-read the same file using the other reading modes
gray_image = io.read_image(source_path, mode='GRAY')  # (1, H, W)
unchanged_image = io.read_image(source_path, mode='UNCHANGED')  # Original format
460
```
461
462
### Image Encoding and Decoding
463
464
```python
465
import torchvision.io as io
466
import torch
467
468
# Load the still-encoded file contents as a 1-D bytes tensor
image_bytes = io.read_file('input.jpg')
print(f"File size: {image_bytes.shape[0]} bytes")

# Turn the encoded bytes into an image tensor
image = io.decode_jpeg(image_bytes, mode='RGB')

# Re-encode the decoded image in both supported output formats
encoded_jpeg = io.encode_jpeg(image, quality=90)
encoded_png = io.encode_png(image, compression_level=6)

# Persist each encoded buffer under its own file name
encoded_outputs = (
    ('output_encoded.jpg', encoded_jpeg),
    ('output_encoded.png', encoded_png),
)
for out_path, payload in encoded_outputs:
    io.write_file(out_path, payload)
482
```
483
484
### Multi-Format Image Support
485
486
```python
487
import torchvision.io as io
488
489
# Support for various image formats
formats = ['jpg', 'png', 'gif', 'webp']

for fmt in formats:
    # Keep the try body down to the one call that can fail. This is a
    # best-effort demo, so a failure is reported and the loop moves on.
    try:
        image = io.read_image(f'input.{fmt}')
    except Exception as e:
        print(f"Error reading {fmt}: {e}")
        continue

    print(f"Successfully read {fmt}: {image.shape}")

    if fmt == 'gif':
        # A multi-frame GIF decodes to (N, C, H, W); a single-image GIF
        # decodes to (C, H, W), where shape[0] would be the channel count.
        num_frames = image.shape[0] if image.ndim == 4 else 1
        print(f"GIF frames: {num_frames}")
505
```
506
507
### Basic Video Reading
508
509
```python
510
import torchvision.io as io
511
512
# Read entire video. pts_unit='sec' is passed explicitly because torchvision
# warns that the default unit ('pts') gives wrong results.
video_frames, audio_frames, video_info = io.read_video('input.mp4', pts_unit='sec')

print(f"Video shape: {video_frames.shape}")  # (T, H, W, C)
print(f"Audio shape: {audio_frames.shape}")  # (K, L): K channels, L samples
print(f"Video info: {video_info}")

# Read specific time range (5-10 seconds)
video_frames, audio_frames, info = io.read_video(
    'input.mp4',
    start_pts=5,
    end_pts=10,
    pts_unit='sec',
)

# Get video timestamps without loading frames
video_pts, video_fps = io.read_video_timestamps('input.mp4', pts_unit='sec')
print(f"Video FPS: {video_fps}")
print(f"Number of frames: {len(video_pts)}")
531
```
532
533
### Streaming Video Processing
534
535
```python
536
import torchvision.io as io
537
import torch
538
539
# Create video reader for streaming
reader = io.VideoReader('large_video.mp4', 'video')

# Get metadata. Each entry is a list with one value per stream of that type,
# so index 0 selects the first video stream.
metadata = reader.get_metadata()
print(f"Duration: {metadata['video']['duration'][0]} seconds")
print(f"FPS: {metadata['video']['fps'][0]}")

# Process video frame by frame
frame_count = 0
for frame_data in reader:
    frame = frame_data['data']  # Shape: (C, H, W)
    pts = frame_data['pts']     # Presentation timestamp

    if frame_count == 0:
        # The reader's metadata carries no width/height entries, so the
        # resolution is taken from the first decoded frame instead.
        _, height, width = frame.shape
        print(f"Resolution: {width}x{height}")

    # Process frame here
    # For example, apply transforms or run inference

    frame_count += 1
    if frame_count >= 100:  # Process only first 100 frames
        break

print(f"Processed {frame_count} frames")

# Seek to specific time and continue reading
reader.seek(30.0)  # Seek to 30 seconds
# VideoReader is advanced through the iterator protocol (__next__), so use
# the built-in next(); it raises StopIteration if no frame is left.
frame_data = next(reader)
print(f"Frame at 30s has timestamp: {frame_data['pts']}")
567
```
568
569
### Video Writing
570
571
```python
572
import torchvision.io as io
573
import torch
574
575
# Build a random clip laid out as (frames, height, width, channels)
num_frames, height, width, channels = 100, 480, 640, 3
video_data = torch.randint(
    0, 256, (num_frames, height, width, channels), dtype=torch.uint8
)
frame_rate = 30.0

# Default encoder settings
io.write_video('output.mp4', video_data, fps=frame_rate)

# Explicit codec plus encoder options tuned for high quality
high_quality_options = {'crf': '18', 'preset': 'slow'}
io.write_video(
    'output_hq.mp4',
    video_data,
    fps=frame_rate,
    video_codec='libx264',
    options=high_quality_options,
)

# Same clip written with an alternative codec
io.write_video('output_fast.mp4', video_data, fps=frame_rate, video_codec='mpeg4')
597
```
598
599
### Video Processing Pipeline
600
601
```python
602
import torchvision.io as io
603
import torchvision.transforms as transforms
604
import torch
605
606
def process_video_batch(input_path, output_path, transform=None):
    """
    Read a video, optionally transform every frame, and write the result.

    Args:
        input_path (str): Path of the video file to read.
        output_path (str): Path of the video file to write.
        transform (callable, optional): Called on each (C, H, W) float frame
            with values in [0, 1]; must return a (C, H, W) tensor. Frames are
            written unchanged when this is None.
    """
    # pts_unit='sec' avoids the warning torchvision emits for the default 'pts'.
    # The audio track is not needed here, so it is discarded.
    video_frames, _, info = io.read_video(input_path, pts_unit='sec')

    # Convert from (T, H, W, C) to (T, C, H, W) for transforms
    video_frames = video_frames.permute(0, 3, 1, 2).float() / 255.0

    # Compare against None explicitly so that any callable is accepted,
    # whatever its truthiness.
    if transform is not None:
        video_frames = torch.stack([transform(frame) for frame in video_frames])

    # Convert back to (T, H, W, C) and uint8 for writing. Transforms such as
    # ColorJitter can push values outside [0, 1]; without the clamp those
    # values would wrap around in uint8. Rounding avoids the off-by-one that
    # plain truncation can introduce after the float round trip.
    video_frames = video_frames.permute(0, 2, 3, 1)
    video_frames = (video_frames.clamp(0.0, 1.0) * 255).round().to(torch.uint8)

    # Write processed video
    io.write_video(output_path, video_frames, fps=info['video_fps'])

# Define processing pipeline. Normalize is deliberately not part of it: it
# produces zero-centred values meant as model input, which cannot be written
# back as a viewable uint8 video.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
])

# Process video
process_video_batch('input.mp4', 'processed.mp4', transform)
640
```
641
642
### Memory-Efficient Video Processing
643
644
```python
645
import torchvision.io as io
646
import torch
647
648
def _flip_batch_to_uint8(frames):
    """Horizontally flip a list of (C, H, W) frames; return (B, H, W, C) uint8."""
    # Assumes the reader yields frames with values in [0, 255] -- TODO confirm
    batch_tensor = torch.stack(frames).float() / 255.0  # (B, C, H, W) in [0, 1]

    # Apply batch processing here (e.g., model inference)
    # A horizontal flip reverses the width axis, which is dim 3 of (B, C, H, W)
    processed = torch.flip(batch_tensor, dims=[3])

    # Convert straight back to uint8 so finished batches are not held as
    # float32; round instead of truncating to avoid off-by-one values.
    processed = (processed * 255).round().clamp(0, 255).to(torch.uint8)
    return processed.permute(0, 2, 3, 1)


def process_large_video(input_path, output_path, batch_size=32):
    """
    Process a video in fixed-size batches of frames and write the result.

    Frames are decoded one at a time and processed ``batch_size`` at a time.
    Finished batches are kept as uint8 until the final write, so the output
    video still has to fit in memory.

    Args:
        input_path (str): Path of the video file to read.
        output_path (str): Path of the video file to write.
        batch_size (int): Number of frames processed together.

    Raises:
        RuntimeError: If no frame could be read from ``input_path``.
    """
    reader = io.VideoReader(input_path, 'video')
    fps = reader.get_metadata()['video']['fps'][0]

    processed_batches = []
    batch = []

    for frame_data in reader:
        batch.append(frame_data['data'])

        # Process batch when full
        if len(batch) == batch_size:
            processed_batches.append(_flip_batch_to_uint8(batch))
            batch = []

    # Process remaining frames
    if batch:
        processed_batches.append(_flip_batch_to_uint8(batch))

    if not processed_batches:
        raise RuntimeError(f"No video frames could be read from {input_path!r}")

    # Join the per-batch results into one (T, H, W, C) tensor
    all_frames = torch.cat(processed_batches)

    # Write output video
    io.write_video(output_path, all_frames, fps=fps)

# Process video in batches
process_large_video('large_input.mp4', 'large_output.mp4', batch_size=16)
689
```