Computer vision library for PyTorch with datasets, model architectures, and image/video transforms.
TorchVision I/O module provides efficient image and video reading, writing, and processing capabilities with support for multiple formats and backends. It offers both high-level convenience functions and low-level streaming interfaces for various multimedia formats.
Functions for reading various image formats into tensors.
def read_image(path: str, mode: str = 'RGB') -> torch.Tensor:
    """Read an image file from disk and return it as a tensor.

    Args:
        path (str): Path to the image file.
        mode (str): Conversion mode applied while decoding:
            - 'RGB': convert to 3-channel RGB
            - 'GRAY': convert to 1-channel grayscale
            - 'UNCHANGED': keep the file's original channel layout

    Returns:
        torch.Tensor: Image tensor of shape (C, H, W) with values in
        [0, 255] (torch.uint8).
    """
def decode_image(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """Decode an encoded image held in a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor of encoded image bytes
            (e.g. as returned by ``read_file``).
        mode (str): Conversion mode applied during decoding.

    Returns:
        torch.Tensor: Decoded image tensor.
    """
def decode_jpeg(input: torch.Tensor, mode: str = 'RGB', device: str = 'cpu') -> torch.Tensor:
    """Decode a JPEG image from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing JPEG bytes.
        mode (str): Conversion mode ('RGB', 'GRAY', 'UNCHANGED').
        device (str): Device on which the output tensor is placed
            ('cpu' or 'cuda').

    Returns:
        torch.Tensor: Decoded JPEG image tensor.
    """
def decode_png(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """Decode a PNG image from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing PNG bytes.
        mode (str): Conversion mode applied during decoding.

    Returns:
        torch.Tensor: Decoded PNG image tensor.
    """
def decode_gif(input: torch.Tensor) -> torch.Tensor:
    """Decode a GIF image (all frames) from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing GIF bytes.

    Returns:
        torch.Tensor: Decoded frames as a tensor of shape (N, C, H, W),
        one entry per GIF frame.
    """
def decode_webp(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """Decode a WebP image from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing WebP bytes.
        mode (str): Conversion mode applied during decoding.

    Returns:
        torch.Tensor: Decoded WebP image tensor.
    """
def decode_avif(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """Decode an AVIF image from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing AVIF bytes.
        mode (str): Conversion mode applied during decoding.

    Returns:
        torch.Tensor: Decoded AVIF image tensor.
    """
def decode_heic(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """Decode a HEIC image from a 1-D bytes tensor.

    Args:
        input (torch.Tensor): 1-D tensor containing HEIC bytes.
        mode (str): Conversion mode applied during decoding.

    Returns:
        torch.Tensor: Decoded HEIC image tensor.
    """

Functions for encoding and writing tensors as image files.
def write_jpeg(input: torch.Tensor, filename: str, quality: int = 75) -> None:
    """Encode a (C, H, W) image tensor as JPEG and write it to disk.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with
            values in [0, 255].
        filename (str): Output file path.
        quality (int): JPEG quality, 1-100 (higher means better quality).
    """
def write_png(input: torch.Tensor, filename: str, compression_level: int = 6) -> None:
    """Encode a (C, H, W) image tensor as PNG and write it to disk.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with
            values in [0, 255].
        filename (str): Output file path.
        compression_level (int): PNG compression level, 0-9 (higher
            means a smaller file).
    """
def encode_jpeg(input: torch.Tensor, quality: int = 75) -> torch.Tensor:
    """Encode a (C, H, W) image tensor into in-memory JPEG bytes.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with
            values in [0, 255].
        quality (int): JPEG quality (1-100).

    Returns:
        torch.Tensor: 1-D tensor containing the JPEG bytes.
    """
def encode_png(input: torch.Tensor, compression_level: int = 6) -> torch.Tensor:
    """Encode a (C, H, W) image tensor into in-memory PNG bytes.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with
            values in [0, 255].
        compression_level (int): PNG compression level (0-9).

    Returns:
        torch.Tensor: 1-D tensor containing the PNG bytes.
    """

Low-level file reading and writing functions.
def read_file(path: str) -> torch.Tensor:
    """Read a file's raw contents into a 1-D bytes tensor.

    Args:
        path (str): Path to the file.

    Returns:
        torch.Tensor: 1-D tensor containing the file's bytes
        (so ``result.shape[0]`` is the file size in bytes).
    """
def write_file(filename: str, data: torch.Tensor) -> None:
    """Write a 1-D bytes tensor to disk.

    Args:
        filename (str): Output file path.
        data (torch.Tensor): 1-D tensor containing the bytes to write.
    """

Constants for specifying image reading modes.
class ImageReadMode:
    """Enumeration-style constants selecting how an image is decoded."""

    UNCHANGED: int = 0   # Keep original format and channel count
    GRAY: int = 1        # Convert to single-channel grayscale
    GRAY_ALPHA: int = 2  # Convert to grayscale plus an alpha channel
    RGB: int = 3         # Convert to 3-channel RGB
    RGB_ALPHA: int = 4  # Convert to RGB with alpha channel

Convenient functions for reading and writing video files.
def read_video(filename: str, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
    """Decode a video file into video frames, audio frames, and metadata.

    Args:
        filename (str): Path to the video file.
        start_pts (float): Start time for reading, expressed in ``pts_unit``.
        end_pts (float, optional): End time, expressed in ``pts_unit``.
            NOTE(review): None presumably means "read to the end of the
            stream" -- confirm against the implementation.
        pts_unit (str): Time unit: 'pts' (presentation timestamps) or
            'sec' (seconds).

    Returns:
        tuple: (video_frames, audio_frames, video_info)
            - video_frames (torch.Tensor): Video tensor of shape (T, H, W, C).
            - audio_frames (torch.Tensor): Audio tensor of shape (T, C).
            - video_info (dict): Metadata including fps (under the
              'video_fps' key), duration, etc.
    """
def read_video_timestamps(filename: str, pts_unit: str = 'pts') -> tuple:
    """List a video's frame timestamps without decoding any frame data.

    Args:
        filename (str): Path to the video file.
        pts_unit (str): Time unit for the returned timestamps.

    Returns:
        tuple: (video_pts, video_fps)
            - video_pts (list): Per-frame presentation timestamps.
            - video_fps (float): Video frame rate.
    """
def write_video(filename: str, video_array: torch.Tensor, fps: float, video_codec: str = 'libx264', options=None) -> None:
    """Encode a (T, H, W, C) video tensor and write it to disk.

    Args:
        filename (str): Output video file path.
        video_array (torch.Tensor): Video tensor of shape (T, H, W, C)
            with values in [0, 255].
        fps (float): Frame rate of the output video.
        video_codec (str): Codec to use ('libx264', 'mpeg4', etc.).
        options (dict, optional): Extra encoder options passed through to
            the codec (e.g. {'crf': '18', 'preset': 'slow'}).
    """

Streaming video reader for efficient frame-by-frame processing.
class VideoReader:
    """Streaming reader that yields video (or audio) data frame by frame.

    Avoids loading the whole file into memory; frames are produced one at
    a time via ``next()`` or by iterating the reader directly.

    Args:
        path (str): Path to the video file.
        stream (str): Stream type to read ('video' or 'audio').
    """

    def __init__(self, path: str, stream: str = 'video'): ...

    def get_metadata(self) -> dict:
        """Return container metadata.

        Returns:
            dict: Metadata including duration, fps, resolution, and
            codec information.
        """

    def set_current_stream(self, stream: str) -> None:
        """Select which stream subsequent reads come from.

        Args:
            stream (str): Stream type ('video' or 'audio').
        """

    def seek(self, time_s: float) -> None:
        """Move the read position to a specific time.

        Args:
            time_s (float): Target time, in seconds.
        """

    def next(self) -> dict:
        """Return the next frame from the current stream.

        Returns:
            dict: Frame data with a 'data' tensor and a 'pts' timestamp.
        """

    def __iter__(self):
        """Support frame-by-frame iteration (``for frame in reader``)."""
        return self

    def __next__(self) -> dict:
        """Get next frame in iterator."""

Internal functions for advanced video processing.
def _read_video_from_file(filename: str, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
    """Internal: decode a video directly from a file path.

    Args:
        filename (str): Path to the video file.
        start_pts (float): Start time, in ``pts_unit``.
        end_pts (float, optional): End time, in ``pts_unit``.
        pts_unit (str): Time unit.

    Returns:
        tuple: (video_frames, audio_frames, video_info)
    """
def _read_video_timestamps_from_file(filename: str, pts_unit: str = 'pts') -> tuple:
    """Internal: read frame timestamps from a file without decoding frames.

    Args:
        filename (str): Path to the video file.
        pts_unit (str): Time unit.

    Returns:
        tuple: (video_pts, video_fps)
    """
def _read_video_from_memory(video_data: torch.Tensor, start_pts: float = 0, end_pts: float = None, pts_unit: str = 'pts') -> tuple:
    """Internal: decode a video from an in-memory bytes buffer.

    Args:
        video_data (torch.Tensor): Encoded video bytes.
        start_pts (float): Start time, in ``pts_unit``.
        end_pts (float, optional): End time, in ``pts_unit``.
        pts_unit (str): Time unit.

    Returns:
        tuple: (video_frames, audio_frames, video_info)
    """
def _read_video_timestamps_from_memory(video_data: torch.Tensor, pts_unit: str = 'pts') -> tuple:
    """Internal: read frame timestamps from an in-memory bytes buffer.

    Args:
        video_data (torch.Tensor): Encoded video bytes.
        pts_unit (str): Time unit.

    Returns:
        tuple: (video_pts, video_fps)
    """
def _probe_video_from_file(filename: str) -> dict:
    """Internal: probe a video file for metadata without decoding frames.

    Args:
        filename (str): Path to the video file.

    Returns:
        dict: Video metadata.
    """
def _probe_video_from_memory(video_data: torch.Tensor) -> dict:
    """Internal: probe in-memory video bytes for metadata without decoding frames.

    Args:
        video_data (torch.Tensor): Encoded video bytes.

    Returns:
        dict: Video metadata.
    """

Classes for representing video metadata and timing information.
class VideoMetaData:
    """Container for per-file video/audio metadata.

    Attributes:
        has_video (bool): Whether a video stream is present.
        has_audio (bool): Whether an audio stream is present.
        video_duration (float): Video duration in seconds.
        video_fps (float): Video frame rate.
        audio_sample_rate (int): Audio sample rate.
        video_codec (str): Video codec name.
        audio_codec (str): Audio codec name.
    """

    has_video: bool
    has_audio: bool
    video_duration: float
    video_fps: float
    audio_sample_rate: int
    video_codec: str
    audio_codec: str
class Timebase:
    """Rational timebase (numerator/denominator) used for timestamp conversion.

    Attributes:
        numerator (int): Timebase numerator.
        denominator (int): Timebase denominator.
    """

    numerator: int
    denominator: int

Runtime flags indicating video decoding capabilities.
_HAS_CPU_VIDEO_DECODER: bool  # True when the CPU video decoder backend is available
_HAS_GPU_VIDEO_DECODER: bool  # True when the GPU video decoder backend is available
_HAS_VIDEO_OPT: bool  # Whether video optimization is available

import torchvision.io as io
import torch

# Read an image file directly into a (C, H, W) tensor.
image = io.read_image('input.jpg', mode='RGB')
print(f"Image shape: {image.shape}")  # (C, H, W)
print(f"Image dtype: {image.dtype}")  # torch.uint8

# Write the tensor back out in different formats.
io.write_jpeg(image, 'output.jpg', quality=95)
io.write_png(image, 'output.png', compression_level=3)

# Read with different conversion modes.
gray_image = io.read_image('input.jpg', mode='GRAY')  # (1, H, W)
unchanged_image = io.read_image('input.jpg', mode='UNCHANGED')  # Original format

import torchvision.io as io
import torch

# Read raw file bytes into a 1-D tensor.
image_bytes = io.read_file('input.jpg')
print(f"File size: {image_bytes.shape[0]} bytes")

# Decode the in-memory bytes.
image = io.decode_jpeg(image_bytes, mode='RGB')

# Re-encode the decoded image to bytes.
encoded_jpeg = io.encode_jpeg(image, quality=90)
encoded_png = io.encode_png(image, compression_level=6)

# Persist the encoded bytes without re-encoding.
io.write_file('output_encoded.jpg', encoded_jpeg)
io.write_file('output_encoded.png', encoded_png)

import torchvision.io as io
# Round-trip several supported image formats.
formats = ['jpg', 'png', 'gif', 'webp']
for fmt in formats:
    try:
        image = io.read_image(f'input.{fmt}')
        print(f"Successfully read {fmt}: {image.shape}")
        # GIF decodes every frame: shape is (N, C, H, W) for N frames.
        if fmt == 'gif':
            print(f"GIF frames: {image.shape[0]}")
    except Exception as e:  # Demo only: report and continue on any failure
        print(f"Error reading {fmt}: {e}")

import torchvision.io as io
# Decode an entire video (plus its audio) in one call.
video_frames, audio_frames, video_info = io.read_video('input.mp4')
print(f"Video shape: {video_frames.shape}")  # (T, H, W, C)
print(f"Audio shape: {audio_frames.shape}")  # (T, C)
print(f"Video info: {video_info}")

# Read only a specific time range (here: 5-10 seconds).
video_frames, audio_frames, info = io.read_video(
    'input.mp4',
    start_pts=5,
    end_pts=10,
    pts_unit='sec'
)

# List timestamps without decoding any frame data.
video_pts, video_fps = io.read_video_timestamps('input.mp4')
print(f"Video FPS: {video_fps}")
print(f"Number of frames: {len(video_pts)}")

import torchvision.io as io
import torch

# Create a streaming reader so the whole video never sits in memory.
reader = io.VideoReader('large_video.mp4', 'video')

# Inspect container metadata before reading any frames.
metadata = reader.get_metadata()
print(f"Duration: {metadata['video']['duration'][0]} seconds")
print(f"FPS: {metadata['video']['fps'][0]}")
print(f"Resolution: {metadata['video']['width'][0]}x{metadata['video']['height'][0]}")

# Iterate frame by frame.
frame_count = 0
for frame_data in reader:
    frame = frame_data['data']  # Shape: (C, H, W)
    pts = frame_data['pts']  # Presentation timestamp
    # Process the frame here, e.g. apply transforms or run inference.
    frame_count += 1
    if frame_count >= 100:  # Stop after the first 100 frames
        break
print(f"Processed {frame_count} frames")

# Seek, then resume reading from the new position.
reader.seek(30.0)  # Seek to 30 seconds
frame_data = reader.next()
print(f"Frame at 30s has timestamp: {frame_data['pts']}")

import torchvision.io as io
import torch

# Create synthetic video data (100 frames, 480x640, RGB).
video_data = torch.randint(0, 256, (100, 480, 640, 3), dtype=torch.uint8)

# Write with the default codec and settings.
io.write_video('output.mp4', video_data, fps=30.0)

# Write with a custom codec and encoder options.
io.write_video(
    'output_hq.mp4',
    video_data,
    fps=30.0,
    video_codec='libx264',
    options={'crf': '18', 'preset': 'slow'}  # High quality settings
)

# Write with a different codec.
io.write_video(
    'output_fast.mp4',
    video_data,
    fps=30.0,
    video_codec='mpeg4'
)

import torchvision.io as io
import torchvision.transforms as transforms
import torch
def process_video_batch(input_path, output_path, transform=None):
    """Apply an optional per-frame transform to a whole video and rewrite it.

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write.
        transform: Optional callable applied to each (C, H, W) float frame.
            NOTE(review): the result is rescaled with ``* 255`` and cast to
            uint8 below, so the transform must keep values in [0, 1]
            (e.g. Normalize pushes values outside that range and would
            produce a corrupted output video).
    """
    # Decode the full video into memory.
    video_frames, audio_frames, info = io.read_video(input_path)
    # (T, H, W, C) uint8 -> (T, C, H, W) float in [0, 1] for transforms.
    video_frames = video_frames.permute(0, 3, 1, 2).float() / 255.0
    # Apply the transform frame by frame, if provided.
    if transform:
        processed_frames = []
        for frame in video_frames:
            processed_frame = transform(frame)
            processed_frames.append(processed_frame)
        video_frames = torch.stack(processed_frames)
    # Back to (T, H, W, C) uint8 for the video writer.
    video_frames = video_frames.permute(0, 2, 3, 1)
    video_frames = (video_frames * 255).byte()
    # Write the processed video, preserving the input frame rate.
    io.write_video(output_path, video_frames, fps=info['video_fps'])
# Define the per-frame processing pipeline.
# NOTE(review): Normalize produces values outside [0, 1], but
# process_video_batch rescales with * 255 and casts to uint8, so the
# written frames would wrap around. Drop Normalize (or undo it before
# writing) for a visually correct output video.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
# Process video
process_video_batch('input.mp4', 'processed.mp4', transform)

import torchvision.io as io
import torch
def process_large_video(input_path, output_path, batch_size=32):
    """Stream a video and process its frames in fixed-size batches.

    Batching bounds the memory used per processing step (though the
    processed frames are still all accumulated before writing).

    Args:
        input_path: Path of the video to read.
        output_path: Path of the video to write.
        batch_size: Number of frames processed per batch.
    """
    reader = io.VideoReader(input_path, 'video')
    metadata = reader.get_metadata()
    fps = metadata['video']['fps'][0]
    processed_frames = []
    batch = []
    for frame_data in reader:
        frame = frame_data['data'].float() / 255.0  # Normalize to [0, 1]
        batch.append(frame)
        # Process a batch once it is full.
        if len(batch) == batch_size:
            batch_tensor = torch.stack(batch)
            # Apply batch processing here (e.g., model inference).
            # Placeholder: flip along dim 2 of the (B, C, H, W) batch --
            # that is the height axis, i.e. a VERTICAL flip (dims=[3]
            # would be horizontal).
            processed_batch = torch.flip(batch_tensor, dims=[2])
            processed_frames.extend(processed_batch)
            batch = []
    # Flush any frames left over after the last full batch.
    if batch:
        batch_tensor = torch.stack(batch)
        processed_batch = torch.flip(batch_tensor, dims=[2])
        processed_frames.extend(processed_batch)
    # Stack, rescale to uint8, and convert to (T, H, W, C) for writing.
    all_frames = torch.stack(processed_frames)
    all_frames = (all_frames * 255).byte().permute(0, 2, 3, 1)  # (T, H, W, C)
    # Write the output video at the source frame rate.
    io.write_video(output_path, all_frames, fps=fps)
# Process video in batches
process_large_video('large_input.mp4', 'large_output.mp4', batch_size=16)

Install with Tessl CLI
npx tessl i tessl/pypi-torchvision