Computer vision library for PyTorch with datasets, model architectures, and image/video transforms.
npx @tessl/cli install tessl/pypi-torchvision@0.23.0

TorchVision is a computer vision library for PyTorch that provides datasets, model architectures, and computer vision transforms. It offers a comprehensive toolkit for building computer vision applications with pre-trained models, data loading utilities, and image/video processing capabilities.
pip install torchvision

import torchvision
from torchvision import datasets, models, transforms, utils, io, ops, tv_tensors

Common patterns:
import torchvision.transforms as transforms
import torchvision.models as models
from torchvision.datasets import CIFAR10, ImageNet

import torch
import torchvision.transforms as transforms
from torchvision import models, datasets
from torch.utils.data import DataLoader
# Load a pre-trained model
model = models.resnet50(weights='DEFAULT')
model.eval()
# Create transform pipeline
transform = transforms.Compose([
transforms.Resize(256),
transforms.CenterCrop(224),
transforms.ToTensor(),
transforms.Normalize(mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225])
])
# Load dataset
dataset = datasets.CIFAR10(root='./data', train=False,
download=True, transform=transform)
dataloader = DataLoader(dataset, batch_size=32, shuffle=False)
# Inference
with torch.no_grad():
    for images, labels in dataloader:
        outputs = model(images)
        predictions = torch.argmax(outputs, dim=1)
        break

TorchVision is organized into several key modules:
Core TorchVision configuration functions for backend management.
def set_image_backend(backend: str) -> None:
    """Set the image loading backend ('PIL' or 'accimage')."""
def get_image_backend() -> str:
    """Get the current image backend."""
def set_video_backend(backend: str) -> None:
    """Set the video decoding backend ('pyav', 'video_reader', or 'cuda')."""
def get_video_backend() -> str:
    """Get the current video backend."""
def disable_beta_transforms_warning() -> None:
    """Disable beta transforms warning (legacy compatibility function)."""

Comprehensive collection of computer vision datasets with automatic downloading and preprocessing capabilities. Includes image classification, object detection, segmentation, and video datasets.
class VisionDataset:
    """Base class for all vision datasets."""
class ImageFolder(VisionDataset):
    """Data loader for image classification datasets in folder format."""
class CIFAR10(VisionDataset):
    """CIFAR-10 dataset."""
class ImageNet(VisionDataset):
    """ImageNet dataset."""
class CocoDetection(VisionDataset):
    """COCO dataset for object detection."""

Pre-trained neural network models for various computer vision tasks including classification, object detection, instance segmentation, semantic segmentation, and video understanding.
def get_model(name: str, **config) -> torch.nn.Module:
    """Get model by name with configuration."""
def list_models() -> list[str]:
    """List all available models."""
def resnet50(weights=None, progress: bool = True, **kwargs) -> torch.nn.Module:
    """ResNet-50 model."""
def fasterrcnn_resnet50_fpn(weights=None, progress: bool = True, **kwargs) -> torch.nn.Module:
    """Faster R-CNN with ResNet-50-FPN backbone."""

Image and video preprocessing and augmentation operations. Includes both v1 (PIL/tensor) and v2 (multi-tensor) APIs for different data types.
class Compose:
    """Composes several transforms together."""
class Resize:
    """Resize image to given size."""
class ToTensor:
    """Convert PIL Image or numpy array to tensor."""
class Normalize:
    """Normalize tensor with mean and std."""
class RandomHorizontalFlip:
    """Randomly flip image horizontally."""

Visualization utilities and tensor operations for working with images, bounding boxes, masks, and keypoints.
def make_grid(tensor, nrow: int = 8, padding: int = 2, normalize: bool = False):
    """Make a grid of images."""
def save_image(tensor, fp, nrow: int = 8, padding: int = 2, normalize: bool = False):
    """Save tensor as image file."""
def draw_bounding_boxes(image, boxes, labels=None, colors=None, fill: bool = False, width: int = 1):
    """Draw bounding boxes on image."""

Image and video input/output operations with support for multiple formats and backends.
def read_image(path: str, mode: str = 'RGB'):
    """Read image file to tensor."""
def write_jpeg(input, filename: str, quality: int = 75):
    """Write tensor as JPEG file."""
def read_video(filename: str, start_pts: float = 0, end_pts=None, pts_unit: str = 'pts'):
    """Read video file."""
class VideoReader:
    """Video reader for streaming video data."""

Low-level operations for object detection, segmentation, and specialized neural network layers.
def nms(boxes, scores, iou_threshold: float):
    """Non-maximum suppression."""
def roi_align(input, boxes, output_size, spatial_scale: float = 1.0, sampling_ratio: int = -1, aligned: bool = False):
    """RoI Align operation."""
def box_iou(boxes1, boxes2):
    """Calculate IoU between box sets."""
class FeaturePyramidNetwork(torch.nn.Module):
    """Feature Pyramid Network."""

Enhanced tensor types that preserve metadata and semantics through transformations, supporting images, videos, bounding boxes, masks, and keypoints.
class Image(torch.Tensor):
    """Image tensor type with metadata."""
class BoundingBoxes(torch.Tensor):
    """Bounding box tensor with format and canvas size."""
class Mask(torch.Tensor):
    """Segmentation mask tensor type."""
class Video(torch.Tensor):
    """Video tensor type for temporal data."""

__version__: str # TorchVision version string (0.23.0)
git_version: str # Git commit hash