tessl/pypi-flaml

A fast library for automated machine learning and tuning

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Online Learning

Name: tessl/pypi-flaml
Author: tessl

Automated online learning system using Vowpal Wabbit with multiple model management, adaptive resource allocation, and real-time model selection. The online learning module is designed for streaming data scenarios where models need to continuously adapt to new information.

Capabilities

AutoVW Class

Main class for automated online learning with Vowpal Wabbit, managing multiple models simultaneously and selecting the best performer dynamically.

class AutoVW:
    """
    Automated online learning with Vowpal Wabbit.

    Manages multiple models simultaneously and selects the best performer
    dynamically. This is an interface listing: method bodies are omitted.
    """

    def __init__(self, max_live_model_num, search_space, init_config={},
                 min_resource_lease="auto", automl_runner_args={}, scheduler_args={},
                 model_select_policy="threshold_loss_ucb", metric="mae_clipped",
                 random_seed=None, model_selection_mode="min", cb_coef=None):
        """
        Initialize AutoVW for automated online learning.

        Only ``max_live_model_num`` and ``search_space`` are required; every
        other argument has the default shown in the signature.

        Args:
            max_live_model_num (int): Maximum number of 'live' models to maintain
            search_space (dict): Hyperparameter search space including both tunable 
                                and fixed hyperparameters
            init_config (dict): Initial partial or full configuration
            min_resource_lease (str or float): Minimum resource lease for models ('auto' or float)
            automl_runner_args (dict): Configuration for OnlineTrialRunner
            scheduler_args (dict): Configuration for scheduler
            model_select_policy (str): Model selection policy ('threshold_loss_ucb',
                                'loss_ucb', 'min_loss', 'random')
            metric (str): Loss function metric ('mae_clipped', 'mae', 'mse',
                                'absolute_loss', 'squared_loss', 'cb_loss')
            random_seed (int): Random seed for reproducibility (None by default)
            model_selection_mode (str): Optimization mode ('min' or 'max')
            cb_coef (float): Sample complexity bound coefficient (None by default)

        Note:
            ``init_config``, ``automl_runner_args`` and ``scheduler_args`` default
            to ``{}`` literals. Python evaluates a default once, so every call that
            omits one of them receives the same dict object. Pass a fresh dict if
            it may be modified.
            NOTE(review): whether the implementation mutates these dicts is not
            visible in this listing - confirm.
        """

    def predict(self, data_sample):
        """
        Make prediction on a data sample.

        Args:
            data_sample: Input data sample in VW format or structured format

        Returns:
            Prediction value from the selected model
        """

    def learn(self, data_sample):
        """
        Update models with new data sample.

        Args:
            data_sample: Training data sample with features and label
        """

Class Constants

class AutoVW:
    # Class-level constants only; the constructor and methods are listed separately.
    WARMSTART_NUM = 100  # Number of warmstart samples
    AUTOMATIC = "_auto"  # Automatic configuration identifier
    # Same string as the "interactions" key used in the search_space usage examples.
    VW_INTERACTION_ARG_NAME = "interactions"  # VW interactions argument name

Supporting Classes

VowpalWabbitTrial

Individual Vowpal Wabbit trial representing a single model configuration.

class VowpalWabbitTrial:
    """
    Individual VW model trial in online learning system.
    Manages a single VW model instance with specific hyperparameters.

    Summary listing only; the method-level interface appears under
    "Advanced Trial Management".
    """

OnlineTrialRunner

Manages execution and coordination of multiple online learning trials.

class OnlineTrialRunner:
    """
    Manages execution of online learning trials.
    Coordinates multiple VW models and handles resource allocation.

    Summary listing only; the method-level interface appears under
    "Advanced Trial Management".
    """

Utility Functions

def get_ns_feature_dim_from_vw_example(vw_example: str) -> dict:
    """
    Extract namespace feature dimensions from VW example.

    Interface listing only; the implementation is not shown here.

    Args:
        vw_example (str): Vowpal Wabbit format example string

    Returns:
        dict: Dictionary mapping namespace to feature dimensions
    """

Usage Examples

Basic Online Learning Setup

from flaml import AutoVW

# Define search space for hyperparameters.
# Each entry maps a hyperparameter name to a {"_type", "_value"} spec.
# presumably "_value" is [low, high] for "loguniform" and the list of options
# for "choice" - TODO confirm.
# NOTE(review): FLAML's own AutoVW examples build the space from flaml.tune
# sampling domains (e.g. loguniform); confirm this dict-based spec is accepted.
search_space = {
    "learning_rate": {"_type": "loguniform", "_value": [0.001, 1.0]},
    "l1": {"_type": "loguniform", "_value": [1e-10, 1.0]},
    "l2": {"_type": "loguniform", "_value": [1e-10, 1.0]},
    # Each option is a set of two-letter strings; the empty set is one option.
    "interactions": {"_type": "choice", "_value": [set(), {"ab"}, {"ac"}, {"ab", "ac"}]}
}

# Initialize AutoVW; arguments not passed here keep their signature defaults
autovw = AutoVW(
    max_live_model_num=5,
    search_space=search_space,
    init_config={"learning_rate": 0.1},
    metric="mae_clipped",
    random_seed=42
)

# Simulate streaming data.
# `streaming_data` is not defined in this snippet: supply any iterable of samples.
for i, data_sample in enumerate(streaming_data):
    # Make prediction (before this sample is used for learning)
    prediction = autovw.predict(data_sample)
    
    # Update models with new sample
    autovw.learn(data_sample)
    
    # Progress report every 1000 samples, including the very first one (i == 0)
    if i % 1000 == 0:
        print(f"Processed {i} samples, latest prediction: {prediction}")

Advanced Configuration with Custom Policies

from flaml import AutoVW

# Advanced search space with multiple hyperparameters.
# Each entry maps a hyperparameter name to a {"_type", "_value"} spec.
# NOTE(review): confirm this dict-based spec (rather than flaml.tune sampling
# domains) is accepted by AutoVW.
search_space = {
    "learning_rate": {"_type": "loguniform", "_value": [0.0001, 1.0]},
    "power_t": {"_type": "uniform", "_value": [0.0, 1.0]},
    "l1": {"_type": "loguniform", "_value": [1e-10, 1.0]}, 
    "l2": {"_type": "loguniform", "_value": [1e-10, 1.0]},
    "interactions": {"_type": "choice", "_value": [
        set(), {"ab"}, {"ac"}, {"bc"}, {"ab", "ac"}, {"ab", "bc"}, {"ac", "bc"}
    ]},
    "bit_precision": {"_type": "choice", "_value": [18, 20, 22, 24]}
}

# Custom runner and scheduler arguments.
# automl_runner_args is documented on AutoVW as configuration for OnlineTrialRunner.
# NOTE(review): the individual keys below are not described in this document -
# verify them against the runner/scheduler implementation.
automl_runner_args = {
    "champion_test_policy": "loss_ucb",
    "remove_worse": True
}

# scheduler_args is documented on AutoVW as configuration for the scheduler.
scheduler_args = {
    "resource_dimension": "sample_size",
    "max_resource": 10000,
    "reduction_factor": 2
}

# Initialize with advanced configuration
autovw = AutoVW(
    max_live_model_num=10,
    search_space=search_space,
    init_config={"learning_rate": 0.05, "l1": 1e-6},
    min_resource_lease=100,  # numeric lease instead of the "auto" default
    automl_runner_args=automl_runner_args,
    scheduler_args=scheduler_args,
    model_select_policy="threshold_loss_ucb",  # same as the signature default
    metric="mae",  # Mean absolute error
    cb_coef=0.1,  # Confidence bound coefficient
    random_seed=123
)

Integration with Data Streams

import pandas as pd
from flaml import AutoVW

# Search space for regression task (three log-uniform hyperparameters)
search_space = {
    "learning_rate": {"_type": "loguniform", "_value": [0.001, 0.5]},
    "l1": {"_type": "loguniform", "_value": [1e-8, 0.1]},
    "l2": {"_type": "loguniform", "_value": [1e-8, 0.1]}
}

# Module-level instance; process_csv_stream below reads it as a global.
autovw = AutoVW(
    max_live_model_num=3,
    search_space=search_space,
    metric="mse",
    model_selection_mode="min"  # "min" is also the signature default
)

# Process streaming CSV data
def process_csv_stream(csv_file):
    """Stream a CSV file through the module-level ``autovw`` instance.

    The file is read in chunks of 1000 rows. Each row is rendered as a VW
    example of the form ``<target> |features <column>:<value> ...``; a
    prediction is requested first and the same example is then used to update
    the models.

    Args:
        csv_file: Anything ``pandas.read_csv`` accepts (path or buffer). The
            data must contain a ``target`` column; all other columns are
            emitted as features.

    Yields:
        tuple: ``(prediction, target)`` where ``prediction`` was made before
        the models learned from that row.
    """
    for chunk in pd.read_csv(csv_file, chunksize=1000):
        # The column set is the same for every row of a chunk, so filter it once
        # per chunk instead of once per row.
        feature_cols = [col for col in chunk.columns if col != 'target']
        for _, row in chunk.iterrows():
            # Convert to VW format: label |features feature1:value1 feature2:value2
            features = " ".join(f"{col}:{row[col]}" for col in feature_cols)
            vw_sample = f"{row['target']} |features {features}"

            # Get prediction before updating
            pred = autovw.predict(vw_sample)

            # Update model
            autovw.learn(vw_sample)

            yield pred, row['target']

# Use with streaming data.
# process_csv_stream is a generator; list() drains it, so the whole file is
# processed here and every (prediction, target) pair is kept in memory.
predictions_and_actuals = list(process_csv_stream("streaming_data.csv"))

Multi-Class Classification Online Learning

from flaml import AutoVW

# Search space for multi-class classification
search_space = {
    "learning_rate": {"_type": "loguniform", "_value": [0.01, 1.0]},
    "oaa": {"_type": "choice", "_value": [3, 5, 10]},  # One-Against-All classes
    "loss_function": {"_type": "choice", "_value": ["logistic", "hinge"]}
}

# Initialize for classification.
# init_config pins "oaa" to 3, one of the choices listed in the search space.
autovw_classifier = AutoVW(
    max_live_model_num=4,
    search_space=search_space,
    init_config={"oaa": 3},
    metric="absolute_loss",
    random_seed=456
)

# Example with categorical features
def create_vw_multiclass_sample(features, label):
    """Convert features to VW multi-class format.

    Args:
        features (dict): Feature name -> value. A ``str`` value is treated as
            categorical and emitted as ``<name>_<value>:1``; any other value is
            emitted as ``<name>:<value>``.
        label: Class label, written verbatim before the ``|features`` namespace.

    Returns:
        str: ``"<label> |features <tok> <tok> ..."`` with surrounding whitespace
        stripped (so an empty ``features`` dict yields ``"<label> |features"``).
    """
    # Build all tokens first and join once instead of growing a string with +=.
    tokens = [
        f"{key}_{value}:1" if isinstance(value, str) else f"{key}:{value}"
        for key, value in features.items()
    ]
    return f"{label} |features {' '.join(tokens)}".strip()

# Process multi-class data
sample_features = {"age": 25, "category": "A", "score": 0.8}
sample_label = 2  # Class label

# Format the sample once, then predict first and learn on the same sample.
vw_sample = create_vw_multiclass_sample(sample_features, sample_label)
prediction = autovw_classifier.predict(vw_sample)
autovw_classifier.learn(vw_sample)

Contextual Bandit Learning

from flaml import AutoVW

# Search space for contextual bandits.
# NOTE(review): in Vowpal Wabbit, cb_explore_adf normally takes multi-line
# action-dependent-feature examples, while the samples built below are
# single-line - confirm this option is intended here.
search_space = {
    "learning_rate": {"_type": "loguniform", "_value": [0.001, 0.1]},
    "cb_explore_adf": {"_type": "choice", "_value": [True]},  # single option, i.e. fixed
    "epsilon": {"_type": "uniform", "_value": [0.01, 0.3]}
}

# Initialize for contextual bandit
autovw_cb = AutoVW(
    max_live_model_num=5,
    search_space=search_space,
    metric="cb_loss",
    model_selection_mode="min"  # "min" is also the signature default
)

def create_cb_sample(context, action, cost, probability):
    """Create contextual bandit VW format sample.

    Args:
        context (dict): Feature name -> value, written into the ``context``
            namespace as ``name:value`` pairs.
        action: Identifier of the action that was taken.
        cost: Cost observed for that action.
        probability: Probability with which the action was chosen.

    Returns:
        str: ``"<action>:<cost>:<probability> |context <name>:<value> ..."``
    """
    # VW's contextual-bandit label is ordered action:cost:probability.
    label = f"{action}:{cost}:{probability}"
    features = " ".join(f"{k}:{v}" for k, v in context.items())
    return f"{label} |context {features}"

# Example contextual bandit interaction
context = {"user_age": 30, "day_of_week": 2, "weather": 1}
action = 1  # Action taken
cost = 0.5  # Cost observed (lower is better)
probability = 0.2  # Probability of taking this action

# Learn from the logged (context, action, cost, probability) interaction
cb_sample = create_cb_sample(context, action, cost, probability)
autovw_cb.learn(cb_sample)

# For prediction, provide context without action/cost: the example starts
# directly at the namespace bar, with no label in front of it.
prediction_context = "|context user_age:25 day_of_week:3 weather:0"
predicted_action = autovw_cb.predict(prediction_context)

Model Selection Policies

Available Policies

- threshold_loss_ucb: Threshold-based selection with upper confidence bounds
- loss_ucb: Loss-based selection with confidence bounds
- min_loss: Select the model with the minimum observed loss
- random: Random model selection (baseline)

Metrics

- mae_clipped: Mean absolute error with clipping
- mae: Mean absolute error
- mse: Mean squared error
- absolute_loss: Absolute loss (for classification)
- squared_loss: Squared loss
- cb_loss: Contextual bandit loss

Advanced Trial Management

Lower-level components for managing individual Vowpal Wabbit trials and online trial execution.

class VowpalWabbitTrial:
    """Individual Vowpal Wabbit trial with specific hyperparameters.

    Interface listing only; method bodies are omitted.
    """

    def __init__(self, config: dict, trial_id=None):
        """
        Initialize VW trial.

        Args:
            config (dict): VW hyperparameter configuration
            trial_id (str): Unique trial identifier (None by default)
        """

    def train_eval(self, data_sample: str, eval_only: bool = False) -> dict:
        """
        Train and/or evaluate on data sample.

        Args:
            data_sample (str): VW-formatted data sample
            eval_only (bool): Only evaluate without training

        Returns:
            dict: Performance metrics
        """

    def predict(self, data_sample):
        """
        Make prediction on data sample.

        Args:
            data_sample: Sample to predict on; presumably the same VW-formatted
                string accepted by ``train_eval`` - TODO confirm.
        """

    @property
    def config(self) -> dict:
        """dict: Trial configuration"""

    @property
    def trial_id(self) -> str:
        """str: Trial identifier"""

class OnlineTrialRunner:
    """Manager for running multiple online learning trials.

    Interface listing only; method bodies are omitted.
    """

    def __init__(self, search_space: dict, max_live_model_num: int = 5, **kwargs):
        """
        Initialize online trial runner.

        Args:
            search_space (dict): Hyperparameter search space
            max_live_model_num (int): Maximum concurrent models (5 by default)
            **kwargs: Additional configuration
        """

    def step(self, data_sample: str) -> dict:
        """
        Process one data sample across all active trials.

        Args:
            data_sample (str): VW-formatted data sample

        Returns:
            dict: Aggregated results from all trials
        """

    def get_best_trial(self):
        """Get currently best performing trial."""

    def suggest_trial(self):
        """Suggest new trial configuration."""

    def remove_trial(self, trial_id):
        """
        Remove trial from active set.

        Args:
            trial_id: Identifier of the trial to remove; presumably the value
                exposed as ``VowpalWabbitTrial.trial_id`` - TODO confirm.
        """

Integration Features

- Vowpal Wabbit Backend: Leverages VW's efficient online learning algorithms
- Multi-Model Management: Maintains multiple models with different hyperparameters
- Adaptive Selection: Dynamic model selection based on performance
- Resource Management: Intelligent allocation of computational resources
- Streaming Data Support: Designed for continuous data streams
- Multiple Task Support: Regression, classification, and contextual bandits
- Hyperparameter Optimization: Automated search over the hyperparameter space