CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-torchvision

Computer vision library for PyTorch with datasets, model architectures, and image/video transforms.

Overview
Eval results
Files

docs/io.md

I/O Operations

TorchVision I/O module provides efficient image and video reading, writing, and processing capabilities with support for multiple formats and backends. It offers both high-level convenience functions and low-level streaming interfaces for various multimedia formats.

Capabilities

Image I/O

Image Reading Functions

Functions for reading various image formats into tensors.

def read_image(path: str, mode: str = 'RGB') -> torch.Tensor:
    """
    Load the image stored at *path* and decode it into a uint8 tensor.

    Args:
        path (str): Location of the image file on disk
        mode (str): Desired color conversion:
            'RGB' decodes to three channels, 'GRAY' to a single
            grayscale channel, and 'UNCHANGED' keeps whatever channel
            layout the file already has

    Returns:
        torch.Tensor: Pixel data shaped (C, H, W), values in [0, 255]
    """

def decode_image(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """
    Decode an in-memory encoded image into a pixel tensor.

    Generic entry point for any format the dedicated decoders support
    (JPEG, PNG, GIF, WebP, ...) — presumably the format is detected from
    the byte content itself; confirm against the implementation.

    Args:
        input (torch.Tensor): 1-D tensor containing encoded image bytes,
            e.g. as returned by ``read_file``
        mode (str): Image mode for decoding ('RGB', 'GRAY', 'UNCHANGED')

    Returns:
        torch.Tensor: Decoded image tensor of shape (C, H, W)
    """

def decode_jpeg(input: torch.Tensor, mode: str = 'RGB', device: str = 'cpu') -> torch.Tensor:
    """
    Decode JPEG image from bytes.

    Args:
        input (torch.Tensor): 1-D tensor containing JPEG bytes, e.g. as
            produced by ``read_file`` or ``encode_jpeg``
        mode (str): Image mode ('RGB', 'GRAY', 'UNCHANGED')
        device (str): Device to place output tensor ('cpu' or 'cuda');
            NOTE(review): the 'cuda' path presumably requires a build with
            GPU JPEG decoding support — confirm before relying on it

    Returns:
        torch.Tensor: Decoded JPEG image tensor of shape (C, H, W)
    """

def decode_png(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """
    Decode PNG image from bytes.

    Args:
        input (torch.Tensor): 1-D tensor containing PNG bytes, e.g. as
            produced by ``read_file`` or ``encode_png``
        mode (str): Image mode for decoding ('RGB', 'GRAY', 'UNCHANGED')

    Returns:
        torch.Tensor: Decoded PNG image tensor of shape (C, H, W)
    """

def decode_gif(input: torch.Tensor) -> torch.Tensor:
    """
    Decode GIF image from bytes.

    Unlike the single-image decoders, the result carries a leading frame
    dimension because GIFs may be animated.

    Args:
        input (torch.Tensor): 1-D tensor containing GIF bytes

    Returns:
        torch.Tensor: Decoded GIF frames tensor of shape (N, C, H, W),
            where N is the number of frames in the file
    """

def decode_webp(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """
    Decode WebP image from bytes.

    Args:
        input (torch.Tensor): 1-D tensor containing WebP bytes
        mode (str): Image mode for decoding ('RGB', 'GRAY', 'UNCHANGED')

    Returns:
        torch.Tensor: Decoded WebP image tensor of shape (C, H, W)
    """

def decode_avif(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """
    Decode AVIF image from bytes.

    NOTE(review): AVIF support presumably depends on how the library was
    built — confirm availability before depending on this decoder.

    Args:
        input (torch.Tensor): 1-D tensor containing AVIF bytes
        mode (str): Image mode for decoding ('RGB', 'GRAY', 'UNCHANGED')

    Returns:
        torch.Tensor: Decoded AVIF image tensor of shape (C, H, W)
    """

def decode_heic(input: torch.Tensor, mode: str = 'RGB') -> torch.Tensor:
    """
    Decode HEIC image from bytes.

    NOTE(review): HEIC support presumably depends on how the library was
    built — confirm availability before depending on this decoder.

    Args:
        input (torch.Tensor): 1-D tensor containing HEIC bytes
        mode (str): Image mode for decoding ('RGB', 'GRAY', 'UNCHANGED')

    Returns:
        torch.Tensor: Decoded HEIC image tensor of shape (C, H, W)
    """

Image Writing Functions

Functions for encoding and writing tensors as image files.

def write_jpeg(input: torch.Tensor, filename: str, quality: int = 75) -> None:
    """
    Encode a tensor as JPEG and write it to a file.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with values
            in [0, 255]
        filename (str): Output file path; overwritten if it already exists
            — NOTE(review): confirm overwrite semantics
        quality (int): JPEG quality (1-100, higher is better quality and
            larger file size)
    """

def write_png(input: torch.Tensor, filename: str, compression_level: int = 6) -> None:
    """
    Encode a tensor as PNG (lossless) and write it to a file.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with values
            in [0, 255]
        filename (str): Output file path
        compression_level (int): PNG compression level (0-9, higher is a
            smaller file at the cost of encode time; 0 is uncompressed)
    """

def encode_jpeg(input: torch.Tensor, quality: int = 75) -> torch.Tensor:
    """
    Compress an image tensor into an in-memory JPEG byte stream.

    Args:
        input (torch.Tensor): Image of shape (C, H, W) holding uint8
            values in [0, 255]
        quality (int): JPEG quality factor, 1-100; larger values trade
            file size for fidelity

    Returns:
        torch.Tensor: 1-D tensor holding the encoded JPEG bytes
    """

def encode_png(input: torch.Tensor, compression_level: int = 6) -> torch.Tensor:
    """
    Encode a tensor to an in-memory PNG (lossless) byte stream.

    Args:
        input (torch.Tensor): Image tensor of shape (C, H, W) with values
            in [0, 255]
        compression_level (int): PNG compression level (0-9, higher is a
            smaller file at the cost of encode time)

    Returns:
        torch.Tensor: 1-D tensor containing PNG bytes
    """

File I/O Functions

Low-level file reading and writing functions.

def read_file(path: str) -> torch.Tensor:
    """
    Read raw file contents into a bytes tensor, with no decoding.

    The result is suitable for the ``decode_*`` functions.

    Args:
        path (str): Path to file

    Returns:
        torch.Tensor: 1-D uint8 tensor containing the file bytes
    """

def write_file(filename: str, data: torch.Tensor) -> None:
    """
    Write a bytes tensor to a file verbatim, with no encoding.

    Counterpart of ``read_file``; use with the output of the ``encode_*``
    functions.

    Args:
        filename (str): Output file path
        data (torch.Tensor): 1-D uint8 tensor containing bytes to write
    """

Image Reading Modes

Constants for specifying image reading modes.

class ImageReadMode:
    """Enumeration-style constants selecting how decoded images are converted.

    Passed to the read/decode functions to control the number and meaning
    of output channels.
    """
    UNCHANGED: int = 0    # Keep original format and channels
    GRAY: int = 1         # Convert to single-channel grayscale
    GRAY_ALPHA: int = 2   # Convert to grayscale with alpha channel
    RGB: int = 3          # Convert to 3-channel RGB
    RGB_ALPHA: int = 4    # Convert to RGB with alpha channel

Video I/O

High-Level Video Functions

Convenient functions for reading and writing video files.

def read_video(filename: str, start_pts: float = 0, end_pts: float | None = None, pts_unit: str = 'pts') -> tuple:
    """
    Read video file and return video frames, audio frames, and info.

    Args:
        filename (str): Path to video file
        start_pts (float): Start time for reading (in pts_unit)
        end_pts (float | None): End time for reading (in pts_unit);
            None reads to the end of the stream
        pts_unit (str): Time unit ('pts' for presentation timestamp, 'sec' for seconds)

    Returns:
        tuple: (video_frames, audio_frames, video_info)
            - video_frames (torch.Tensor): Video tensor of shape (T, H, W, C)
            - audio_frames (torch.Tensor): Audio tensor of shape (T, C) —
              NOTE(review): upstream torchvision documents audio frames as
              (channels, samples); confirm the layout documented here
            - video_info (dict): Video metadata, including a 'video_fps' key
              (see the usage examples below)
    """

def read_video_timestamps(filename: str, pts_unit: str = 'pts') -> tuple:
    """
    Read video timestamps without decoding frame data.

    Useful for cheaply counting frames or planning seeks before a full read.

    Args:
        filename (str): Path to video file
        pts_unit (str): Time unit for timestamps ('pts' or 'sec')

    Returns:
        tuple: (video_pts, video_fps)
            - video_pts (list): List of presentation timestamps, one per frame
            - video_fps (float): Video frame rate
    """

def write_video(filename: str, video_array: torch.Tensor, fps: float, video_codec: str = 'libx264', options: dict | None = None) -> None:
    """
    Write a video tensor to a container file.

    Args:
        filename (str): Output video file path
        video_array (torch.Tensor): Video tensor of shape (T, H, W, C) with
            uint8 values in [0, 255]
        fps (float): Frame rate for output video
        video_codec (str): Video codec to use ('libx264', 'mpeg4', etc.)
        options (dict, optional): Additional encoder options, passed through
            to the backend (e.g. {'crf': '18'} for libx264)
    """

Video Reader Class

Streaming video reader for efficient frame-by-frame processing.

class VideoReader:
    """
    Streaming video reader that yields one frame at a time.

    Avoids loading the whole video into memory, unlike ``read_video``;
    supports seeking and switching between video and audio streams.

    Args:
        path (str): Path to video file
        stream (str): Stream type to read first ('video' or 'audio')
    """
    
    def __init__(self, path: str, stream: str = 'video'): ...
    
    def get_metadata(self) -> dict:
        """
        Get container metadata without decoding frames.

        Returns:
            dict: Metadata including duration, fps, resolution, codec info
                (see the usage examples for the nested dict-of-lists layout,
                e.g. metadata['video']['fps'][0])
        """
    
    def set_current_stream(self, stream: str) -> None:
        """
        Select which stream subsequent reads come from.

        Args:
            stream (str): Stream type ('video' or 'audio')
        """
    
    def seek(self, time_s: float) -> None:
        """
        Seek so that the next frame returned is at or after the given time.

        Args:
            time_s (float): Time in seconds to seek to
        """
    
    def next(self) -> dict:
        """
        Decode and return the next frame from the current stream.

        Returns:
            dict: Frame data with a 'data' tensor (C, H, W) and a 'pts'
                presentation timestamp
        """
    
    def __iter__(self):
        """Iterator interface for frame-by-frame reading."""
        return self
    
    def __next__(self) -> dict:
        """Return the next frame dict; raises StopIteration at end of stream
        — NOTE(review): StopIteration behavior assumed from the iterator
        protocol, confirm against the implementation."""

Low-Level Video Functions

Internal functions for advanced video processing.

def _read_video_from_file(filename: str, start_pts: float = 0, end_pts: float | None = None, pts_unit: str = 'pts') -> tuple:
    """
    Internal video reading from file (private API; prefer ``read_video``).

    Args:
        filename (str): Path to video file
        start_pts (float): Start time (in pts_unit)
        end_pts (float | None): End time; None reads to the end
        pts_unit (str): Time unit ('pts' or 'sec')

    Returns:
        tuple: (video_frames, audio_frames, video_info)
    """

def _read_video_timestamps_from_file(filename: str, pts_unit: str = 'pts') -> tuple:
    """
    Internal timestamp reading from file (private API; prefer
    ``read_video_timestamps``).

    Args:
        filename (str): Path to video file
        pts_unit (str): Time unit ('pts' or 'sec')

    Returns:
        tuple: (video_pts, video_fps)
    """

def _read_video_from_memory(video_data: torch.Tensor, start_pts: float = 0, end_pts: float | None = None, pts_unit: str = 'pts') -> tuple:
    """
    Read video from an in-memory buffer instead of a file (private API).

    Args:
        video_data (torch.Tensor): 1-D tensor of encoded video bytes
        start_pts (float): Start time (in pts_unit)
        end_pts (float | None): End time; None reads to the end
        pts_unit (str): Time unit ('pts' or 'sec')

    Returns:
        tuple: (video_frames, audio_frames, video_info)
    """

def _read_video_timestamps_from_memory(video_data: torch.Tensor, pts_unit: str = 'pts') -> tuple:
    """
    Read timestamps from an in-memory buffer (private API).

    Args:
        video_data (torch.Tensor): 1-D tensor of encoded video bytes
        pts_unit (str): Time unit ('pts' or 'sec')

    Returns:
        tuple: (video_pts, video_fps)
    """

def _probe_video_from_file(filename: str) -> dict:
    """
    Probe a video file for metadata without decoding frames (private API).

    Args:
        filename (str): Path to video file

    Returns:
        dict: Video metadata (see ``VideoMetaData`` for the fields involved)
    """

def _probe_video_from_memory(video_data: torch.Tensor) -> dict:
    """
    Probe in-memory video data for metadata without decoding frames
    (private API).

    Args:
        video_data (torch.Tensor): 1-D tensor of encoded video bytes

    Returns:
        dict: Video metadata (see ``VideoMetaData`` for the fields involved)
    """

Video Metadata Classes

Classes for representing video metadata and timing information.

class VideoMetaData:
    """
    Container for video metadata information, as produced by the probe
    functions above.

    When a stream is absent (``has_video`` / ``has_audio`` is False), the
    corresponding fields are presumably unset or zeroed — confirm against
    the implementation before relying on them.

    Attributes:
        has_video (bool): Whether video stream is present
        has_audio (bool): Whether audio stream is present
        video_duration (float): Video duration in seconds
        video_fps (float): Video frame rate
        audio_sample_rate (int): Audio sample rate in Hz
        video_codec (str): Video codec name
        audio_codec (str): Audio codec name
    """
    
    has_video: bool
    has_audio: bool
    video_duration: float
    video_fps: float
    audio_sample_rate: int
    video_codec: str
    audio_codec: str

class Timebase:
    """
    Video timebase information for timestamp conversion.

    A pts value converts to seconds as ``pts * numerator / denominator``.

    Attributes:
        numerator (int): Timebase numerator
        denominator (int): Timebase denominator (nonzero)
    """
    
    numerator: int
    denominator: int

Video Backend Flags

Runtime flags indicating video decoding capabilities.

# Capability flags determined by how the library was built/installed;
# check these before using the corresponding decoder paths.
_HAS_CPU_VIDEO_DECODER: bool  # Whether CPU video decoder is available
_HAS_GPU_VIDEO_DECODER: bool  # Whether GPU video decoder is available
_HAS_VIDEO_OPT: bool          # Whether video optimization is available

Usage Examples

Basic Image Reading and Writing

import torchvision.io as io
import torch

# Read image from file; decoded pixels arrive as uint8 in [0, 255]
image = io.read_image('input.jpg', mode='RGB')
print(f"Image shape: {image.shape}")  # (C, H, W)
print(f"Image dtype: {image.dtype}")  # torch.uint8

# Write image to file (JPEG is lossy; PNG is lossless)
io.write_jpeg(image, 'output.jpg', quality=95)
io.write_png(image, 'output.png', compression_level=3)

# Read with different modes
gray_image = io.read_image('input.jpg', mode='GRAY')  # (1, H, W)
unchanged_image = io.read_image('input.jpg', mode='UNCHANGED')  # Original format

Image Encoding and Decoding

import torchvision.io as io
import torch

# Read file as bytes (no decoding yet)
image_bytes = io.read_file('input.jpg')
print(f"File size: {image_bytes.shape[0]} bytes")

# Decode image from bytes
image = io.decode_jpeg(image_bytes, mode='RGB')

# Encode image back to bytes (in memory; nothing touches disk here)
encoded_jpeg = io.encode_jpeg(image, quality=90)
encoded_png = io.encode_png(image, compression_level=6)

# Write encoded bytes to file verbatim
io.write_file('output_encoded.jpg', encoded_jpeg)
io.write_file('output_encoded.png', encoded_png)

Multi-Format Image Support

import torchvision.io as io

# Support for various image formats
# NOTE(review): which formats actually decode depends on the installed
# build (e.g. webp support) — hence the try/except below
formats = ['jpg', 'png', 'gif', 'webp']

for fmt in formats:
    try:
        # Read image
        image = io.read_image(f'input.{fmt}')
        print(f"Successfully read {fmt}: {image.shape}")
        
        # For GIF, handle multiple frames
        if fmt == 'gif':
            # GIF returns (N, C, H, W) for N frames
            print(f"GIF frames: {image.shape[0]}")
            
    except Exception as e:
        print(f"Error reading {fmt}: {e}")

Basic Video Reading

import torchvision.io as io

# Read entire video into memory — fine for short clips; use VideoReader
# (below) for long videos
video_frames, audio_frames, video_info = io.read_video('input.mp4')

print(f"Video shape: {video_frames.shape}")  # (T, H, W, C)
print(f"Audio shape: {audio_frames.shape}")  # (T, C)
print(f"Video info: {video_info}")

# Read specific time range (5-10 seconds); pts_unit='sec' makes the
# bounds seconds rather than raw presentation timestamps
video_frames, audio_frames, info = io.read_video(
    'input.mp4', 
    start_pts=5, 
    end_pts=10, 
    pts_unit='sec'
)

# Get video timestamps without loading frames
video_pts, video_fps = io.read_video_timestamps('input.mp4')
print(f"Video FPS: {video_fps}")
print(f"Number of frames: {len(video_pts)}")

Streaming Video Processing

import torchvision.io as io
import torch

# Create video reader for streaming (decodes lazily, frame by frame)
reader = io.VideoReader('large_video.mp4', 'video')

# Get metadata; note the nested dict-of-lists layout: each field is a
# list with one entry per stream, hence the [0] indexing
metadata = reader.get_metadata()
print(f"Duration: {metadata['video']['duration'][0]} seconds")
print(f"FPS: {metadata['video']['fps'][0]}")
print(f"Resolution: {metadata['video']['width'][0]}x{metadata['video']['height'][0]}")

# Process video frame by frame
frame_count = 0
for frame_data in reader:
    frame = frame_data['data']  # Shape: (C, H, W)
    pts = frame_data['pts']     # Presentation timestamp
    
    # Process frame here
    # For example, apply transforms or run inference
    
    frame_count += 1
    if frame_count >= 100:  # Process only first 100 frames
        break

print(f"Processed {frame_count} frames")

# Seek to specific time and continue reading
reader.seek(30.0)  # Seek to 30 seconds
frame_data = reader.next()
print(f"Frame at 30s has timestamp: {frame_data['pts']}")

Video Writing

import torchvision.io as io
import torch

# Create synthetic video data (100 frames, 480x640, RGB);
# write_video expects uint8 frames in (T, H, W, C) layout
video_data = torch.randint(0, 256, (100, 480, 640, 3), dtype=torch.uint8)

# Write video with default settings
io.write_video('output.mp4', video_data, fps=30.0)

# Write with custom codec and options (passed through to the encoder;
# lower crf = higher quality for libx264)
io.write_video(
    'output_hq.mp4',
    video_data, 
    fps=30.0,
    video_codec='libx264',
    options={'crf': '18', 'preset': 'slow'}  # High quality settings
)

# Write with different codec
io.write_video(
    'output_fast.mp4',
    video_data,
    fps=30.0,
    video_codec='mpeg4'
)

Video Processing Pipeline

import torchvision.io as io
import torchvision.transforms as transforms
import torch

def process_video_batch(input_path, output_path, transform=None):
    """
    Read a video, apply an optional per-frame transform, and write the result.

    Args:
        input_path (str): Path of the source video.
        output_path (str): Path of the video to write.
        transform (callable, optional): Transform applied to each frame as a
            float (C, H, W) tensor scaled to [0, 1].

    NOTE(review): transforms such as Normalize produce values outside
    [0, 1]; the clamp below keeps the uint8 cast from wrapping around,
    but the written video will not visually match the source.
    """
    # Read video; frames arrive as uint8 (T, H, W, C)
    video_frames, audio_frames, info = io.read_video(input_path)
    
    # Convert from (T, H, W, C) to (T, C, H, W) and scale to [0, 1] for transforms
    video_frames = video_frames.permute(0, 3, 1, 2).float() / 255.0
    
    # Apply transforms if provided
    if transform:
        processed_frames = []
        for frame in video_frames:
            processed_frame = transform(frame)
            processed_frames.append(processed_frame)
        video_frames = torch.stack(processed_frames)
    
    # Convert back to (T, H, W, C); clamp to [0, 1] first so the uint8 cast
    # cannot wrap around when a transform leaves the displayable range
    # (bug fix: previously (x * 255).byte() overflowed for x outside [0, 1])
    video_frames = video_frames.permute(0, 2, 3, 1)
    video_frames = (video_frames.clamp(0.0, 1.0) * 255).byte()
    
    # Write processed video at the source frame rate
    io.write_video(output_path, video_frames, fps=info['video_fps'])

# Define processing pipeline
# NOTE(review): Normalize shifts values outside [0, 1], so the re-encoded
# uint8 video will look distorted — drop Normalize from the pipeline if
# the output is meant to be viewable.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ColorJitter(brightness=0.2, contrast=0.2),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Process video
process_video_batch('input.mp4', 'processed.mp4', transform)

Memory-Efficient Video Processing

import torchvision.io as io
import torch

def process_large_video(input_path, output_path, batch_size=32):
    """
    Process a large video in fixed-size batches to bound peak memory usage.

    Streams frames with ``VideoReader`` instead of loading the whole video,
    applies an example per-batch transform, and writes the result.

    Args:
        input_path (str): Source video path.
        output_path (str): Destination video path.
        batch_size (int): Number of frames decoded and processed per batch.

    Raises:
        ValueError: If no frames could be decoded from the input.
    """
    reader = io.VideoReader(input_path, 'video')
    metadata = reader.get_metadata()
    fps = metadata['video']['fps'][0]
    
    processed_frames = []
    batch = []
    
    for frame_data in reader:
        frame = frame_data['data'].float() / 255.0  # Normalize to [0, 1]
        batch.append(frame)
        
        # Process batch when full
        if len(batch) == batch_size:
            batch_tensor = torch.stack(batch)
            
            # Apply batch processing here (e.g., model inference).
            # Example transform: horizontal flip. Frames are (B, C, H, W),
            # so the width axis is dim 3 (bug fix: dims=[2] flipped the
            # height axis, i.e. a vertical flip, contradicting the intent)
            processed_batch = torch.flip(batch_tensor, dims=[3])
            
            processed_frames.extend(processed_batch)
            batch = []
    
    # Process remaining frames
    if batch:
        batch_tensor = torch.stack(batch)
        processed_batch = torch.flip(batch_tensor, dims=[3])
        processed_frames.extend(processed_batch)
    
    # Guard the empty case: torch.stack on an empty list raises an opaque
    # RuntimeError; fail with a clear message instead
    if not processed_frames:
        raise ValueError(f"no frames decoded from {input_path}")
    
    # Stack all processed frames and convert back to uint8
    all_frames = torch.stack(processed_frames)
    all_frames = (all_frames * 255).byte().permute(0, 2, 3, 1)  # (T, H, W, C)
    
    # Write output video
    io.write_video(output_path, all_frames, fps=fps)

# Process video in batches of 16 frames to keep peak memory low
process_large_video('large_input.mp4', 'large_output.mp4', batch_size=16)

Install with Tessl CLI

npx tessl i tessl/pypi-torchvision

docs

datasets.md

index.md

io.md

models.md

ops.md

transforms.md

tv_tensors.md

utils.md

tile.json