tessl/pypi-vectorbt

Python library for backtesting and analyzing trading strategies at scale

—

Pending

Overview

Eval results

Files

Label Generation for Machine Learning

Name: tessl/pypi-vectorbt
Author: tessl

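Look-ahead analysis tools that generate labels from future price movements, for training machine learning models on financial time series data. The labels module provides several methods for creating target variables for supervised learning applications in quantitative finance. Because every label is computed from data that lies after the row it is attached to, labels should be used only as training targets, never as model features or live trading signals.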

Capabilities

Future Statistical Measures

Generators for statistical measures computed over future time windows, commonly used for regression and forecasting tasks.

class FMEAN:
    """
    Label generator for the future mean.

    Averages the values that fall inside a forward-looking window, giving a
    target for predicting upcoming average prices or returns.
    """

    @classmethod
    def run(cls, close, window, **kwargs):
        """
        Compute future-mean labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, size of the forward-looking window
        - pct_change: bool, use percentage change (default: False);
          not a named parameter, so it is supplied through **kwargs

        Returns:
        FMEAN: Label generator exposing the result as the fmean attribute
        """

class FSTD:
    """
    Label generator for the future standard deviation.

    Measures the dispersion of the values inside a forward-looking window,
    giving a target for volatility prediction and risk modeling.
    """

    @classmethod
    def run(cls, close, window, **kwargs):
        """
        Compute future-standard-deviation labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, size of the forward-looking window
        - pct_change: bool, use percentage change (default: False);
          supplied through **kwargs
        - ddof: int, degrees of freedom (default: 1);
          supplied through **kwargs

        Returns:
        FSTD: Label generator exposing the result as the fstd attribute
        """

class FMIN:
    """
    Label generator for the future minimum.

    Picks the lowest value inside a forward-looking window, giving a target
    for support level prediction and drawdown analysis.
    """

    @classmethod
    def run(cls, close, window, **kwargs):
        """
        Compute future-minimum labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, size of the forward-looking window
        - pct_change: bool, use percentage change from current
          (default: False); supplied through **kwargs

        Returns:
        FMIN: Label generator exposing the result as the fmin attribute
        """

class FMAX:
    """
    Label generator for the future maximum.

    Picks the highest value inside a forward-looking window, giving a target
    for resistance level prediction and profit target analysis.
    """

    @classmethod
    def run(cls, close, window, **kwargs):
        """
        Compute future-maximum labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, size of the forward-looking window
        - pct_change: bool, use percentage change from current
          (default: False); supplied through **kwargs

        Returns:
        FMAX: Label generator exposing the result as the fmax attribute
        """

Fixed and Mean-Based Labels

Simple labeling methods for basic classification and regression tasks.

class FIXLB:
    """
    Label generator for fixed labels.

    Emits the same constant label for every time period, which is handy for
    baseline models and control experiments.
    """

    @classmethod
    def run(cls, shape, value=1, **kwargs):
        """
        Produce fixed labels.

        Parameters:
        - shape: tuple, output shape (n_rows, n_cols)
        - value: scalar, the constant label value (signature default: 1)
        - dtype: data type for labels; supplied through **kwargs

        Returns:
        FIXLB: Label generator holding the fixed labels
        """

class MEANLB:
    """
    Label generator driven by the mean.

    Labels are derived from how far values deviate from a mean, which suits
    mean reversion strategies and anomaly detection.
    """

    @classmethod
    def run(cls, close, window, threshold=0, **kwargs):
        """
        Produce mean-based labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, rolling window used for the mean calculation
        - threshold: float, threshold for label generation
          (signature default: 0)
        - above: bool, label when above mean (default: True);
          supplied through **kwargs

        Returns:
        MEANLB: Label generator holding the mean-based labels
        """

Lexicographic and Ranking Labels

Advanced labeling methods for ranking and relative performance analysis.

class LEXLB:
    """
    Label generator based on lexicographic ordering.

    Labels come from ordering several criteria lexicographically, which suits
    multi-objective optimization and ranking problems.
    """

    @classmethod
    def run(cls, *args, **kwargs):
        """
        Produce lexicographic labels.

        Parameters:
        - args: sequence of arrays to compare lexicographically
        - descending: bool, use descending order (default: False);
          supplied through **kwargs

        Returns:
        LEXLB: Label generator holding the lexicographic rankings
        """

Trend-Based Labels

Sophisticated trend analysis and classification for directional predictions.

class TRENDLB:
    """
    Label generator based on trends.

    Examines price trends across various time horizons and produces labels
    for trend direction, strength, and continuation patterns.
    """

    @classmethod
    def run(cls, close, window=20, mode='binary', **kwargs):
        """
        Produce trend-based labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, trend analysis window (signature default: 20)
        - mode: str, trend mode (see TrendMode enum)
          NOTE(review): the default is the string 'binary' while TrendMode
          is an IntEnum; confirm how mode strings map to its members
        - min_pct_change: float, minimum change for trend (default: 0.01);
          supplied through **kwargs
        - smooth_window: int, smoothing window for trend (default: None);
          supplied through **kwargs

        Returns:
        TRENDLB: Label generator holding the trend labels
        """

class TrendMode(IntEnum):
    """
    Enumeration of the trend calculation modes for TRENDLB.

    Each member names one method of calculating and categorizing trends in
    financial time series data.
    """
    # Simple up/down binary classification
    Binary = 0
    # Binary with continuation signals
    BinaryCont = 1
    # Binary with continuation and saturation
    BinaryContSat = 2
    # Percentage change-based trends
    PctChange = 3
    # Normalized percentage change trends
    PctChangeNorm = 4

Binary Outcome Labels

Specialized generators for binary classification tasks in trading applications.

class BOLB:
    """
    Label generator for binary outcomes.

    Produces binary labels for classification tasks, for example
    profitable/unprofitable trades or directional movements.
    """

    @classmethod
    def run(cls, close, window, threshold=0, **kwargs):
        """
        Produce binary outcome labels.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - window: int, forward-looking window for the outcome
        - threshold: float, threshold for binary classification
          (signature default: 0)
        - return_type: str, type of return calculation ('simple', 'log');
          supplied through **kwargs
        - min_periods: int, minimum periods for valid calculation;
          supplied through **kwargs

        Returns:
        BOLB: Label generator holding the binary outcome labels
        """

Usage Examples

Basic Future Labels

import vectorbt as vbt
import pandas as pd

# Fetch AAPL history and keep the closing prices
data = vbt.YFData.download("AAPL", start="2020-01-01", end="2023-01-01")
close = data.get("Close")

# Future mean over a 5-period window, with its label values
future_mean = vbt.FMEAN.run(close, window=5)
mean_labels = future_mean.fmean

# Future standard deviation over a 10-period window, with its label values
future_std = vbt.FSTD.run(close, window=10)
std_labels = future_std.fstd

# Future extremes over a 20-period window, requested as percentage changes
future_min = vbt.FMIN.run(close, window=20, pct_change=True)
future_max = vbt.FMAX.run(close, window=20, pct_change=True)
min_labels = future_min.fmin  # Future minimum % change
max_labels = future_max.fmax  # Future maximum % change

Trend Analysis Labels

# Binary mode over a 20-period window
trend_binary = vbt.TRENDLB.run(close, window=20, mode='binary')
binary_trends = trend_binary.trend

# Percentage-change mode with a 2% minimum change
trend_pct = vbt.TRENDLB.run(
    close, window=20, mode='pct_change', min_pct_change=0.02
)
pct_trends = trend_pct.trend

# Binary-with-continuation mode, smoothed over a 5-period window
trend_smooth = vbt.TRENDLB.run(
    close, window=20, mode='binary_cont', smooth_window=5
)
smooth_trends = trend_smooth.trend

Classification Labels for ML

# Profitable-trade labels: 10-day forward window, 5% profit threshold
profitable_trades = vbt.BOLB.run(
    close, window=10, threshold=0.05, return_type='simple'
)
profit_labels = profitable_trades.labels  # True for profitable periods

# Mean reversion labels: 20-day rolling mean, 2% deviation threshold,
# labeling when above the mean
mean_reversion = vbt.MEANLB.run(close, window=20, threshold=0.02, above=True)
reversion_labels = mean_reversion.labels  # True when above mean

Multi-Asset Label Generation

# Fetch closing prices for several assets at once
symbols = ["AAPL", "GOOGL", "MSFT", "TSLA"]
data = vbt.YFData.download(symbols, start="2020-01-01", end="2023-01-01")
close = data.get("Close")

# Per-asset label containers, keyed by symbol
future_returns, trend_labels = {}, {}

for symbol in symbols:
    asset_close = close[symbol]

    # Future return labels: 5-period future mean as a percentage change
    fmean_result = vbt.FMEAN.run(asset_close, window=5, pct_change=True)
    future_returns[symbol] = fmean_result.fmean

    # Trend labels: binary mode over a 20-period window
    trend_result = vbt.TRENDLB.run(asset_close, window=20, mode='binary')
    trend_labels[symbol] = trend_result.trend

# One column per symbol
future_returns_df = pd.DataFrame(future_returns)
trend_labels_df = pd.DataFrame(trend_labels)

Labels for Strategy Development

# Future maximum return over three look-ahead horizons
short_term = vbt.FMAX.run(close, window=5, pct_change=True)    # 5-day max return
medium_term = vbt.FMAX.run(close, window=20, pct_change=True)  # 20-day max return
long_term = vbt.FMAX.run(close, window=60, pct_change=True)    # 60-day max return

# Gather the three horizons side by side
horizon_labels = pd.DataFrame(dict(
    short_max=short_term.fmax,
    medium_max=medium_term.fmax,
    long_max=long_term.fmax,
))

# Flag rows whose future maximum exceeds the horizon's threshold
horizon_labels['short_profitable'] = horizon_labels['short_max'].gt(0.03)
horizon_labels['medium_profitable'] = horizon_labels['medium_max'].gt(0.10)
horizon_labels['long_profitable'] = horizon_labels['long_max'].gt(0.25)

Advanced ML Pipeline

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Generate features (indicators)
# NOTE(review): the feature dict below assumes `close` is a single-asset
# Series (one column per feature) - confirm before running after the
# multi-asset example, which rebinds `close`.
ma_20 = vbt.MA.run(close, 20).ma
ma_50 = vbt.MA.run(close, 50).ma
rsi = vbt.RSI.run(close, 14).rsi
macd = vbt.MACD.run(close)

# Create feature matrix
features = pd.DataFrame({
    'ma_ratio': ma_20 / ma_50,
    'rsi': rsi,
    'macd': macd.macd,
    'macd_signal': macd.signal,
    'returns_5d': close.pct_change(5),
    'volatility': close.rolling(20).std()
})

# Generate labels
target = vbt.BOLB.run(
    close,
    window=10,
    threshold=0.05,  # 5% profit in next 10 days
    return_type='simple'
).labels

# Prepare data for ML: drop rows where any feature is still warming up,
# then drop rows whose label is missing
X = features.dropna()
y = target.reindex(X.index).dropna()

# Align X and y on the rows that survived both filters
common_index = X.index.intersection(y.index)
X = X.loc[common_index]
y = y.loc[common_index]

# Train-test split
# shuffle=False keeps the rows in chronological order, so the model is
# trained on the earliest 80% and evaluated on the most recent 20%.
# The default shuffled split would mix later rows into the training set,
# and with forward-looking labels that leaks future information and
# inflates the test score.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)

# Train model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Train Score: {train_score:.3f}")
print(f"Test Score: {test_score:.3f}")

Custom Label Generators

class CustomVolatilityLabel:
    """
    Custom label for volatility regime classification.

    Compares short-term rolling volatility with long-term rolling volatility
    and buckets their ratio into three regimes: 0 (low), 1 (high) and
    2 (very high).
    """

    @classmethod
    def run(cls, close, short_window=5, long_window=20, threshold=1.5):
        """
        Classify every value of `close` into a volatility regime.

        Parameters:
        - close: pd.Series or pd.DataFrame, price data
        - short_window: int, rolling window for short-term volatility
        - long_window: int, rolling window for long-term volatility
        - threshold: float, ratio above which volatility counts as high;
          1.5 times this value marks very high volatility

        Returns:
        Integer labels (0, 1 or 2) with the same index as `close`, and the
        same columns (DataFrame) or name (Series). Values whose ratio is
        NaN, such as the warm-up rows of the rolling windows, are labeled 0.
        """
        # Calculate short and long-term volatility
        short_vol = close.rolling(short_window).std()
        long_vol = close.rolling(long_window).std()

        # Volatility ratio
        vol_ratio = short_vol / long_vol

        # Classify regime element-wise so that both Series and DataFrame
        # inputs work. Comparisons against NaN are False, which leaves
        # undefined ratios in the low-volatility bucket.
        labels = (vol_ratio > threshold).astype("int64")  # 0 low, 1 high
        # Applied last so that it overrides the "high" label.
        return labels.mask(vol_ratio > threshold * 1.5, 2)  # Very high

# Use custom label generator with its default settings
# (short_window=5, long_window=20, threshold=1.5)
vol_labels = CustomVolatilityLabel.run(close)

Install with Tessl CLI