CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-autogluon

AutoGluon automates machine learning tasks, enabling you to easily achieve strong predictive performance in your applications.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

multimodal.mddocs/

Multimodal Machine Learning

Automated machine learning for heterogeneous data combining text, images, and tabular features. MultiModalPredictor supports diverse tasks including classification, regression, object detection, named entity recognition, semantic matching, and feature extraction using state-of-the-art foundation models.

Capabilities

MultiModalPredictor Class

Main predictor class for multimodal data that automatically handles different data modalities and task types with minimal configuration.

class MultiModalPredictor:
    """Automated ML predictor for heterogeneous (text/image/tabular) data.

    API stub: signatures and docstrings only — implementation lives in
    the autogluon.multimodal package.
    """

    def __init__(
        self,
        label: str | None = None,
        problem_type: str | None = None,
        query: str | None = None,
        response: str | None = None,
        match_label = None,  # label value (str or int) — type varies by dataset, so left unannotated
        presets: str | None = None,
        eval_metric = None,  # metric name (str) or custom scorer object — TODO confirm exact type
        hyperparameters: dict | None = None,
        path: str | None = None,
        verbosity: int = 2,
        num_classes: int | None = None,
        classes: list | None = None,
        warn_if_exist: bool = True,
        enable_progress_bar: bool | None = None,
        pretrained: bool = True,
        validation_metric: str | None = None,
        sample_data_path: str | None = None,
        use_ensemble: bool = False,
        ensemble_size: int = 2,
        ensemble_mode: str = "one_shot"
    ):
        """
        Initialize MultiModalPredictor for automated multimodal machine learning.

        Parameters:
        - label: Name of target column to predict
        - problem_type: Problem type ('binary', 'multiclass', 'regression', 'object_detection',
          'ner', 'text_similarity', 'image_similarity', 'image_text_similarity',
          'feature_extraction', 'zero_shot_image_classification', 'few_shot_classification',
          'semantic_segmentation')
        - query: Column name for query data in semantic matching tasks
        - response: Column name for response data in semantic matching tasks
        - match_label: Label indicating positive matches in semantic matching
        - presets: Quality presets ('best_quality', 'high_quality', 'medium_quality')
        - eval_metric: Evaluation metric for model selection
        - hyperparameters: Custom hyperparameter configurations
        - path: Directory to save models and artifacts
        - verbosity: Logging verbosity level (0-4)
        - num_classes: Number of classes for object detection
        - classes: Class names for object detection
        - warn_if_exist: Whether to warn if save path exists
        - enable_progress_bar: Show training progress bars
        - pretrained: Use pretrained model weights
        - validation_metric: Metric for validation and early stopping
        - sample_data_path: Path to sample data for inference shape
        - use_ensemble: Enable ensemble learning
        - ensemble_size: Number of models in ensemble
        - ensemble_mode: Ensemble construction mode ('one_shot', 'sequential')
        """

Model Training

Train multimodal models on heterogeneous data with automatic preprocessing and model selection.

def fit(
    self,
    train_data,
    presets: str | None = None,
    tuning_data = None,
    max_num_tuning_data: int | None = None,
    id_mappings: dict | None = None,
    time_limit: int | None = None,
    save_path: str | None = None,
    hyperparameters = None,  # dict, str preset name, or list — TODO confirm accepted forms
    column_types: dict | None = None,
    holdout_frac: float | None = None,
    teacher_predictor = None,  # MultiModalPredictor instance (or path) for distillation — project type
    seed: int = 0,
    standalone: bool = True,
    hyperparameter_tune_kwargs: dict | None = None,
    clean_ckpts: bool = True,
    predictions: list | None = None,
    labels = None,
    predictors: list | None = None
):
    """
    Fit MultiModalPredictor on multimodal training data.

    Parameters:
    - train_data: Training data (DataFrame with text, images, tabular columns)
    - presets: Quality/speed presets
    - tuning_data: Validation data for hyperparameter tuning
    - max_num_tuning_data: Maximum tuning samples for object detection
    - id_mappings: ID-to-content mappings for semantic matching
    - time_limit: Maximum training time in seconds
    - save_path: Directory to save models
    - hyperparameters: Custom hyperparameter configurations
    - column_types: Manual column type specifications
    - holdout_frac: Fraction of data for validation
    - teacher_predictor: Teacher model for knowledge distillation
    - seed: Random seed for reproducibility
    - standalone: Save complete model for offline deployment
    - hyperparameter_tune_kwargs: HPO configuration
    - clean_ckpts: Clean intermediate checkpoints
    - predictions: Pre-computed predictions for ensemble
    - labels: Pre-computed labels for ensemble
    - predictors: Pre-trained predictors for ensemble

    Returns:
    MultiModalPredictor: Fitted predictor instance
    """

Prediction

Generate predictions for multimodal data across different task types.

def predict(
    self,
    data,
    candidate_data = None,
    id_mappings: dict | None = None,
    as_pandas: bool | None = None,
    realtime: bool = False,
    save_results: bool | None = None,
    **kwargs
):
    """
    Generate predictions for multimodal data.

    Parameters:
    - data: Input data (DataFrame, dict, list, or file path)
    - candidate_data: Candidate data for semantic matching/retrieval
    - id_mappings: ID-to-content mappings
    - as_pandas: Return results as pandas DataFrame/Series
      (None lets the implementation pick a task-appropriate default)
    - realtime: Use realtime inference optimization
    - save_results: Save prediction results to disk
    - **kwargs: Additional arguments (e.g., as_coco for object detection)

    Returns:
    Predictions in format appropriate for the task type
    """

def predict_proba(
    self,
    data,
    candidate_data = None,
    id_mappings: dict | None = None,
    as_pandas: bool | None = None,
    as_multiclass: bool = True,
    realtime: bool = False
):
    """
    Generate prediction probabilities for classification tasks.

    Parameters:
    - data: Input data
    - candidate_data: Candidate data for retrieval tasks
    - id_mappings: ID-to-content mappings
    - as_pandas: Return results as pandas DataFrame
      (None lets the implementation pick a task-appropriate default)
    - as_multiclass: Return all class probabilities vs positive class only
    - realtime: Use realtime inference optimization

    Returns:
    Prediction probabilities as DataFrame or numpy array
    """

Feature Extraction

Extract embeddings and features from multimodal data for downstream tasks.

def extract_embedding(
    self,
    data,
    id_mappings: dict | None = None,
    return_masks: bool = False,
    as_tensor: bool = False,
    as_pandas: bool = False,
    realtime: bool = False,
    signature: str | None = None
):
    """
    Extract feature embeddings from multimodal data.

    Parameters:
    - data: Input data (DataFrame, dict, or list)
    - id_mappings: ID-to-content mappings
    - return_masks: Return attention masks for missing data
    - as_tensor: Return PyTorch tensors
    - as_pandas: Return pandas DataFrame
    - realtime: Use realtime inference optimization
    - signature: Signature type for semantic matching ('query' or 'response')

    Returns:
    Feature embeddings as numpy array, tensor, or DataFrame
    """

Model Evaluation

Evaluate multimodal model performance with task-specific metrics.

def evaluate(
    self,
    data,
    query_data: list = None,
    response_data: list = None,
    id_mappings: dict = None,
    metrics: list = None,
    chunk_size: int = 1024,
    similarity_type: str = "cosine",
    cutoffs: list = [1, 5, 10],
    label: str = None,
    return_pred: bool = False,
    realtime: bool = False,
    eval_tool: str = None,
    predictions: list = None,
    labels = None
):
    """
    Evaluate multimodal model performance.
    
    Parameters:
    - data: Test data (DataFrame, dict, list, or annotation file path)
    - query_data: Query data for ranking evaluation
    - response_data: Response data for ranking evaluation  
    - id_mappings: ID-to-content mappings
    - metrics: List of evaluation metrics
    - chunk_size: Batch size for similarity computation
    - similarity_type: Similarity function ('cosine', 'dot_prod')
    - cutoffs: Cutoff values for ranking metrics
    - label: Label column name
    - return_pred: Return individual predictions
    - realtime: Use realtime inference
    - eval_tool: Evaluation tool for object detection ('pycocotools', 'torchmetrics')
    - predictions: Pre-computed predictions
    - labels: Pre-computed labels
    
    Returns:
    dict: Evaluation metrics and optionally predictions
    """

Model Management

Save, load, and export multimodal models for deployment.

def save(self, path: str, standalone: bool = True):
    """
    Save trained predictor to disk.

    Parameters:
    - path: Directory to save predictor
    - standalone: Save complete model for offline deployment
      (presumably bundles pretrained weights so loading needs no downloads — verify)
    """

@classmethod
def load(
    cls,
    path: str,
    resume: bool = False,
    verbosity: int = 3
):
    """
    Load saved predictor from disk.

    Parameters:
    - path: Directory containing saved predictor
    - resume: Resume training from checkpoint
    - verbosity: Logging verbosity level

    Returns:
    MultiModalPredictor: Loaded predictor instance
    """

def export_onnx(
    self,
    data,
    path: str | None = None,
    batch_size: int | None = None,
    verbose: bool = False,
    opset_version: int = 16,
    truncate_long_and_double: bool = False
):
    """
    Export model to ONNX format for deployment.

    Parameters:
    - data: Sample data for tracing
    - path: Export path (if None, returns bytes)
    - batch_size: Batch size for export
    - verbose: Verbose export logging
    - opset_version: ONNX opset version
    - truncate_long_and_double: Truncate precision for compatibility
      (presumably int64->int32 / float64->float32 for TensorRT-style backends — verify)

    Returns:
    Export path or ONNX model bytes
    """

def optimize_for_inference(self, providers: list | None = None):
    """
    Optimize model for faster inference using ONNX runtime.

    Parameters:
    - providers: ONNX execution providers (None presumably selects defaults — verify)

    Returns:
    Optimized ONNX module for inference
    """

Advanced Features

Advanced functionality for specialized use cases and model analysis.

def fit_summary(self, verbosity: int = 0, show_plot: bool = False):
    """
    Display training summary and model information.

    Parameters:
    - verbosity: Detail level (0-4); higher values print more detail
    - show_plot: Show training plots

    Returns:
    dict: Training summary information
    """

def list_supported_models(self, pretrained: bool = True):
    """
    List supported models for the current problem type.

    Parameters:
    - pretrained: Show only models with pretrained weights

    Returns:
    list: Available model names
    """

def dump_model(self, save_path: str | None = None):
    """
    Export model weights and configs to local directory.

    Parameters:
    - save_path: Directory to save model files
      (None presumably falls back to the predictor's own path — verify)
    """

def set_num_gpus(self, num_gpus: int):
    """
    Set number of GPUs for training/inference.

    Parameters:
    - num_gpus: Number of GPUs to use (0 presumably forces CPU — verify)
    """

Properties

Access model and training information through properties.

# Read-only accessors exposing model and training metadata (stubs).

@property
def problem_type(self) -> str:
    """Type of ML problem (classification, object_detection, etc.)"""

@property
def label(self) -> str:
    """Name of target label column"""

@property
def eval_metric(self) -> str:
    """Evaluation metric used for model selection"""

@property
def class_labels(self) -> list:
    """Original class label names for classification"""

@property
def positive_class(self):
    """Positive class label for binary classification (type matches the label column)"""

@property
def total_parameters(self) -> int:
    """Total number of model parameters"""

@property
def trainable_parameters(self) -> int:
    """Number of trainable (non-frozen) model parameters"""

@property
def model_size(self) -> float:
    """Model size in megabytes"""

Usage Examples

Text and Image Classification

# Illustrative end-to-end example: multiclass classification over
# mixed image / text / numerical columns.
from autogluon.multimodal import MultiModalPredictor
import pandas as pd

# Prepare multimodal dataset
data = pd.DataFrame({
    'image_path': ['img1.jpg', 'img2.jpg', 'img3.jpg'],
    'text_content': ['Product description 1', 'Product description 2', 'Product description 3'],
    'price': [10.5, 25.0, 15.5],
    'category': ['A', 'B', 'A']
})

# Train multimodal classifier
predictor = MultiModalPredictor(
    label='category',
    problem_type='multiclass',
    presets='high_quality'
)

predictor.fit(
    data,
    time_limit=3600,
    column_types={
        'image_path': 'image_path',
        'text_content': 'text',
        'price': 'numerical'
    }
)

# Make predictions
# NOTE: test_data is not defined above — assumed to be a DataFrame with the
# same feature columns as `data` (illustrative snippet only).
predictions = predictor.predict(test_data)
probabilities = predictor.predict_proba(test_data)

# Extract embeddings
embeddings = predictor.extract_embedding(test_data)
print(f"Embedding shape: {embeddings.shape}")

Object Detection

# Object detection on image data
detector = MultiModalPredictor(
    problem_type='object_detection',
    presets='medium_quality',  
    classes=['person', 'car', 'bicycle'],  # Target classes
    path='./detection_models'
)

# Train on COCO-format data
detector.fit(
    train_data='train_annotations.json',  # COCO format
    time_limit=7200
)

# Predict bounding boxes on a directory of images
detections = detector.predict(
    'test_images/',
    save_results=True,
    as_coco=True  # Return COCO format results
)

# Evaluate with COCO metrics
# NOTE(review): assumes the returned dict contains a 'map' key when
# eval_tool='pycocotools' — confirm against the autogluon docs.
metrics = detector.evaluate(
    'test_annotations.json',
    eval_tool='pycocotools'
)
print(f"mAP: {metrics['map']:.3f}")

Semantic Text Matching

# Text similarity for semantic matching
matcher = MultiModalPredictor(
    problem_type='text_similarity',
    query='question',
    response='answer',
    match_label='relevant'
)

# Train on query-response pairs
# NOTE: qa_data is not defined above — assumed to be a DataFrame with
# 'question', 'answer', and 'relevant' columns (illustrative snippet only).
matcher.fit(qa_data, time_limit=1800)

# Find similar documents
query_data = ['What is machine learning?']
candidate_data = [
    'ML is a subset of AI',
    'Python is a programming language', 
    'Deep learning uses neural networks'
]

# Scores each candidate against the query; argmax picks the best match
similarities = matcher.predict(
    data=query_data,
    candidate_data=candidate_data
)
print("Most similar:", candidate_data[similarities.argmax()])

Install with Tessl CLI

npx tessl i tessl/pypi-autogluon

docs

core.md

features.md

index.md

multimodal.md

tabular.md

timeseries.md

tile.json