CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-fastai

fastai simplifies training fast and accurate neural nets using modern best practices

Pending
Overview
Eval results
Files

docs/data-loading.md

Data Loading and Processing

Comprehensive data loading system built around the DataBlock API and transform pipelines. Provides flexible, composable data processing for all fastai domains.

Capabilities

DataLoaders

Main class for managing training and validation data with integrated transforms.

class DataLoaders:
    """
    Container for train/valid DataLoader pairs.
    
    Parameters:
    - *loaders: DataLoader instances (typically train, valid)
    - path: Base path for saving/loading
    - device: Device to place data on
    """
    def __init__(self, *loaders, path='.', device=None): ...
    
    @classmethod
    def from_dblock(cls, dblock, source, path='.', **kwargs):
        """
        Create DataLoaders from DataBlock.
        
        Parameters:
        - dblock: DataBlock defining data processing
        - source: Data source (path, list, etc.)
        - path: Base path
        - **kwargs: Additional arguments forwarded to DataLoader creation
        
        Returns:
        - DataLoaders instance
        """
    
    def show_batch(self, b=None, max_n=9, ctxs=None, show=True, **kwargs):
        """
        Display a batch of data.
        
        Parameters:
        - b: Batch to display (presumably a fresh batch is drawn when None — confirm)
        - max_n: Maximum number of items to show
        - ctxs: Plot contexts to render into, if any
        - show: Whether to actually render the batch
        """
    
    @property
    def train(self):
        """Training DataLoader (presumably the first loader passed to the constructor — confirm)."""
    
    @property  
    def valid(self):
        """Validation DataLoader (presumably the second loader passed to the constructor — confirm)."""
    
    def one_batch(self):
        """Get one batch from training data."""
    
    def save(self, file='data_loaders.pkl'):
        """Save DataLoaders to disk (pickle format implied by the default filename)."""
    
    @classmethod
    def load(cls, path, file='data_loaders.pkl'):
        """Load DataLoaders previously written by `save` from `path`/`file`."""

DataBlock API

Flexible API for constructing data processing pipelines from modular components.

class DataBlock:
    """
    Flexible data processing pipeline constructor.
    
    Parameters:
    - blocks: Transform blocks for inputs and targets
    - dl_type: DataLoader type to use
    - getters: Functions to extract data from source
    - n_inp: Number of input elements
    - item_tfms: Item-level transforms
    - batch_tfms: Batch-level transforms
    - **kwargs: Additional DataLoader arguments
    """
    def __init__(self, blocks=(TransformBlock,), dl_type=None, getters=None, 
                 n_inp=None, item_tfms=None, batch_tfms=None, **kwargs): ...
    
    def dataloaders(self, source, path='.', verbose=False, **kwargs):
        """
        Create DataLoaders from data source.
        
        Parameters:
        - source: Data source
        - path: Base path
        - verbose: Show processing information
        - **kwargs: DataLoader arguments
        
        Returns:
        - DataLoaders instance
        """
    
    def datasets(self, source, verbose=False, **kwargs):
        """Create datasets without DataLoaders (i.e. the transformed items, not yet batched)."""
    
    def summary(self, source, **kwargs):
        """Show summary of data processing pipeline — useful for debugging block/getter configuration."""

Transform Blocks

Building blocks for different data types in the DataBlock API.

class TransformBlock:
    """
    Base class for transform blocks.
    
    Parameters:
    - type_tfms: Transforms applied when items are first created
    - item_tfms: Item-level transforms (applied per item)
    - batch_tfms: Batch-level transforms (applied to whole batches)
    - dl_type: DataLoader type to use for this block, if any
    - dls_kwargs: Extra keyword arguments passed through to DataLoaders creation
    """
    
    def __init__(self, type_tfms=None, item_tfms=None, batch_tfms=None, 
                 dl_type=None, dls_kwargs=None): ...

class ImageBlock(TransformBlock):
    """
    Transform block for image data.
    
    Parameters:
    - cls: Image class used to open/represent items (default PILImage)
    """
    
    def __init__(self, cls=PILImage): ...

class CategoryBlock(TransformBlock):
    """
    Transform block for categorical labels.
    
    Parameters:
    - vocab: Predefined category vocabulary (presumably inferred from data when None — confirm)
    - sort: Whether to sort the vocabulary
    - add_na: Whether to add an extra "not available" category — confirm semantics
    """
    
    def __init__(self, vocab=None, sort=True, add_na=False): ...

class MultiCategoryBlock(TransformBlock):
    """
    Transform block for multi-label categorical data.
    
    Parameters:
    - encoded: Whether labels are already one-hot/multi-hot encoded — confirm
    - vocab: Predefined category vocabulary
    - add_na: Whether to add an extra "not available" category
    """
    
    def __init__(self, encoded=False, vocab=None, add_na=False): ...

class RegressionBlock(TransformBlock):
    """Transform block for regression targets."""

class MaskBlock(TransformBlock):
    """
    Transform block for segmentation masks.
    
    Parameters:
    - codes: Class codes for mask pixel values (presumably label names per value — confirm)
    """
    
    def __init__(self, codes=None): ...

class PointBlock(TransformBlock):
    """Transform block for point/keypoint data."""

class BBoxBlock(TransformBlock):
    """Transform block for bounding boxes."""

class BBoxLblBlock(TransformBlock):
    """Transform block for labeled bounding boxes."""

Data Splitting

Functions and classes for splitting data into train/validation sets.

class RandomSplitter:
    """
    Random train/validation split.
    
    Parameters:
    - valid_pct: Fraction of items assigned to the validation set
    - seed: Random seed for reproducible splits
    """
    
    def __init__(self, valid_pct=0.2, seed=None): ...
    
    def __call__(self, o):
        """
        Split data randomly.
        
        Parameters:
        - o: Data items to split
        
        Returns:
        - Train indices, validation indices
        """

class TrainTestSplitter:
    """
    Split based on test set.
    
    Parameters:
    - test_name: Name identifying the test subset — presumably a folder or column name; confirm
    - valid_name: Name identifying the validation subset — presumably a folder or column name; confirm
    """
    
    def __init__(self, test_name='test', valid_name='valid'): ...

def RandomSubsetSplitter(valid_pct=0.2, n=None, **kwargs):
    """
    Random subset splitter for large datasets.
    
    Parameters:
    - valid_pct: Fraction of the subset assigned to validation
    - n: Size of the subset to sample — presumably total items used; confirm
    """

def FuncSplitter(func):
    """Split based on function result (func presumably returns truthy for validation items — confirm)."""

def MaskSplitter(mask):
    """Split based on boolean mask (one entry per item)."""

def FileSplitter(fname):
    """Split based on filenames in text file (the listed files presumably form the validation set — confirm)."""

def GrandparentSplitter(train_name='train', valid_name='valid'):
    """Split based on grandparent folder names (e.g. train/<class>/img.jpg vs valid/<class>/img.jpg)."""

def IndexSplitter(valid_idx):
    """Split based on specific indices (valid_idx are the validation indices)."""

File and Dataset Utilities

Utilities for working with files and external datasets.

def get_files(path, extensions=None, recurse=True, folders=None, followlinks=True):
    """
    Get list of files with optional filtering.
    
    Parameters:
    - path: Directory path
    - extensions: File extensions to include (all files when None)
    - recurse: Search subdirectories
    - folders: Folder names to include/exclude
    - followlinks: Follow symbolic links while recursing
    
    Returns:
    - List of Path objects
    """

def get_image_files(path, recurse=True, folders=None):
    """Get image files from directory (presumably get_files with image extensions preset — confirm)."""

def get_text_files(path, recurse=True, folders=None):
    """Get text files from directory (presumably get_files with text extensions preset — confirm)."""

def untar_data(url, dest=None, c_key='data', force_download=False, extract=True):
    """
    Download and extract fastai datasets.
    
    Parameters:
    - url: Dataset URL or URLs enum value
    - dest: Destination directory (derived from the c_key config path when None — confirm)
    - c_key: Config key for base path
    - force_download: Re-download if exists
    - extract: Extract after download
    
    Returns:
    - Path to extracted data
    """

class URLs:
    """Predefined dataset URLs, intended to be passed to `untar_data`."""
    PETS = 'https://s3.amazonaws.com/fast-ai-imageclas/oxford-iiit-pet.tgz'   # Oxford-IIIT Pet images
    MNIST = 'https://s3.amazonaws.com/fast-ai-sample/mnist_png.tgz'           # MNIST sample (PNG)
    CIFAR = 'https://s3.amazonaws.com/fast-ai-sample/cifar10.tgz'             # CIFAR-10 sample
    IMDB = 'https://s3.amazonaws.com/fast-ai-nlp/imdb.tgz'                    # IMDb reviews (NLP)
    # ... many more dataset URLs

def download_url(url, dest=None, timeout=None, show_progress=True):
    """
    Download file from URL.
    
    Parameters:
    - url: URL to fetch
    - dest: Destination file path
    - timeout: Network timeout
    - show_progress: Display a progress indicator
    """

def fastai_path():
    """Get fastai data directory path."""

Transforms

Core transform classes for data preprocessing.

class Transform:
    """
    Base class for transforms.
    
    Parameters:
    - enc: Encode function (forward transform) — presumably; confirm
    - dec: Decode function (inverse transform, e.g. for display) — presumably; confirm
    - split_idx: Restrict application to a given split (train/valid) — confirm semantics
    - order: Position of this transform within a pipeline — confirm semantics
    """
    
    def __init__(self, enc=None, dec=None, split_idx=None, order=None): ...
    
    def __call__(self, x, **kwargs): ...

class ToTensor(Transform):
    """Convert to tensor."""

class IntToFloatTensor(Transform):
    """Convert integer tensor to float."""

class Normalize(Transform):
    """
    Normalize with mean and standard deviation.
    
    Parameters:
    - mean: Per-channel means (presumably computed from data when None — confirm)
    - std: Per-channel standard deviations
    - axes: Axes over which statistics apply — confirm semantics
    """
    
    def __init__(self, mean=None, std=None, axes=None): ...

class CategoryMap(Transform):
    """
    Map categories to integers.
    
    Parameters:
    - vocab: Predefined vocabulary (inferred when None — confirm)
    - add_na: Add a "not available" category
    - sort: Sort the vocabulary
    """
    
    def __init__(self, vocab=None, add_na=False, sort=True): ...

class MultiCategoryMap(Transform):
    """
    Map multi-categories to multi-hot encoding.
    
    Parameters:
    - vocab: Predefined vocabulary
    - add_na: Add a "not available" category
    - c2i: Category-to-index mapping — presumably overrides the vocab-derived mapping; confirm
    """
    
    def __init__(self, vocab=None, add_na=False, c2i=None): ...

class Resize(Transform):
    """
    Resize images to specified size.
    
    Parameters:
    - size: Target size
    - method: Resize method (default 'crop')
    - pad_mode: Padding mode when padding is needed (default 'reflection')
    """
    
    def __init__(self, size, method='crop', pad_mode='reflection'): ...

TfmdLists and Datasets

Advanced data containers with integrated transforms.

class TfmdLists:
    """
    Lists with integrated transform pipeline.
    
    Parameters:
    - items: Underlying items
    - tfms: Transforms applied lazily to items
    - use_list: Force list storage of items — confirm semantics
    - do_setup: Run transform setup on creation
    - split_idx: Split index this list belongs to (train/valid) — confirm semantics
    - train_setup: Set up transforms using only the training split — confirm
    - splits: Index lists defining train/valid subsets
    - types: Expected item types — confirm semantics
    - verbose: Show processing information
    """
    
    def __init__(self, items, tfms, use_list=None, do_setup=True, split_idx=None, 
                 train_setup=True, splits=None, types=None, verbose=False): ...
    
    def subset(self, i):
        """Get subset by index."""
    
    def new_empty(self):
        """Create new empty instance (same transforms, no items)."""

class Datasets:
    """
    Multiple TfmdLists that create tuples.
    
    Parameters:
    - items: Underlying items shared by all transform lists
    - tfms: Per-element transform pipelines
    - tls: Pre-built TfmdLists to use instead of items/tfms — confirm
    - n_inp: Number of tuple elements treated as inputs (the rest are targets)
    - dl_type: DataLoader type to use
    - **kwargs: Additional arguments
    """
    
    def __init__(self, items, tfms=None, tls=None, n_inp=None, dl_type=None, **kwargs): ...
    
    def subset(self, i):
        """Get subset by split index."""
    
    @property
    def train(self):
        """Training dataset (subset for the training split)."""
    
    @property 
    def valid(self):
        """Validation dataset (subset for the validation split)."""

Install with Tessl CLI

npx tessl i tessl/pypi-fastai

docs

callbacks.md

collaborative-filtering.md

core-training.md

data-loading.md

index.md

interpretation.md

medical.md

metrics-losses.md

tabular.md

text.md

vision.md

tile.json