tessl/pypi-skl2onnx

Convert scikit-learn models to ONNX format for cross-platform inference and deployment


Helper Utilities

Investigation and integration utilities for debugging conversions: comparing outputs between scikit-learn and ONNX models, analyzing pipeline structures, and integrating custom ONNX graphs. These utilities support development, testing, and troubleshooting of ONNX conversions.

Capabilities

Investigation and Debugging

Tools for analyzing conversion processes, collecting intermediate results, and debugging conversion issues.

def collect_intermediate_steps(model, X=None, target_opset=None):
    """
    Collect intermediate outputs during conversion process for debugging.
    
    Provides detailed information about shape inference, operator creation,
    and conversion steps to help diagnose conversion issues.
    
    Parameters:
    - model: scikit-learn model to analyze
    - X: array-like, sample input data for type inference (optional)
    - target_opset: int, target ONNX opset version (optional)
    
    Returns:
    - dict: Detailed conversion information including:
      - 'shapes': Shape inference results for each step
      - 'operators': Generated ONNX operators
      - 'variables': Variable names and types
      - 'topology': Model topology structure
    """

def compare_objects(sklearn_output, onnx_output, decimal=5):
    """
    Compare outputs between scikit-learn and ONNX models.
    
    Validates conversion accuracy by comparing predictions from original
    sklearn model with converted ONNX model outputs.
    
    Parameters:
    - sklearn_output: array-like, output from sklearn model
    - onnx_output: array-like, output from ONNX model
    - decimal: int, number of decimal places for comparison (default 5)
    
    Returns:
    - bool: True if outputs match within specified precision
    
    Raises:
    - AssertionError: If outputs don't match within tolerance
    - ValueError: If output shapes or types are incompatible
    """

def enumerate_pipeline_models(model):
    """
    Enumerate all models within a pipeline or ensemble.
    
    Recursively discovers all sub-models in complex pipelines,
    feature unions, and ensemble models for analysis or debugging.
    
    Parameters:
    - model: scikit-learn model, pipeline, or ensemble
    
    Returns:
    - list: List of tuples (model_name, model_instance, path)
      where path indicates the location within the pipeline structure
    """

Integration Utilities

Functions for integrating custom ONNX graphs and extending existing models.

def add_onnx_graph(onx, to_add, inputs, outputs):
    """
    Add a custom ONNX graph to an existing ONNX model.
    
    Enables integration of custom operators or preprocessing/postprocessing
    steps by merging ONNX graphs while maintaining proper variable connections.
    
    Parameters:
    - onx: ModelProto, existing ONNX model
    - to_add: GraphProto or ModelProto, graph/model to add
    - inputs: list, input variable names for connection
    - outputs: list, output variable names for connection
    
    Returns:
    - ModelProto: Modified ONNX model with integrated graph
    
    Raises:
    - ValueError: If input/output connections are invalid
    - TypeError: If graph types are incompatible
    """

Performance and Benchmarking

Utilities for measuring and comparing performance between sklearn and ONNX models.

def measure_time(stmt, context, repeat=10, number=50, div_by_number=False):
    """
    Measure execution time for model operations.
    
    Provides accurate timing measurements for comparing sklearn vs ONNX
    model performance, including statistical analysis of multiple runs.
    
    Parameters:
    - stmt: str, statement to time (e.g., 'model.predict(X)')
    - context: dict, variable context dictionary for statement execution
    - repeat: int, number of timing runs for statistical analysis (default 10)
    - number: int, number of executions per timing run (default 50)
    - div_by_number: bool, divide timing results by number of executions (default False)
    
    Returns:
    - dict: Timing results including:
      - 'average': Average execution time
      - 'deviation': Standard deviation
      - 'min_exec': Minimum execution time
      - 'max_exec': Maximum execution time
      - 'repeat': Number of repeat runs
      - 'number': Number of executions per run
    """

Usage Examples

Debugging Conversion Issues

from skl2onnx.helpers.investigate import collect_intermediate_steps
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

# Create model
X, y = make_classification(n_samples=100, n_features=10, random_state=42)
model = RandomForestClassifier(n_estimators=5, random_state=42)
model.fit(X, y)

# Collect detailed conversion information
debug_info = collect_intermediate_steps(model, X, target_opset=18)

# Analyze the results
print("Shape inference results:")
for step, shapes in debug_info['shapes'].items():
    print(f"  {step}: {shapes}")

print("\nGenerated operators:")
for i, op in enumerate(debug_info['operators']):
    print(f"  {i}: {op.op_type} ({op.inputs} -> {op.outputs})")

print("\nVariable information:")
for name, var_info in debug_info['variables'].items():
    print(f"  {name}: {var_info}")

Validating Conversion Accuracy

from skl2onnx.helpers.investigate import compare_objects
from skl2onnx import to_onnx
import onnxruntime as rt
import numpy as np

# Convert model; cast to float32 and disable ZipMap so the probability
# output comes back as a plain array rather than a list of dictionaries
onnx_model = to_onnx(model, X.astype(np.float32), options={'zipmap': False})

# Get sklearn predictions
sklearn_pred = model.predict_proba(X)

# Get ONNX predictions (the second output holds the probabilities)
sess = rt.InferenceSession(onnx_model.SerializeToString(),
                           providers=['CPUExecutionProvider'])
input_name = sess.get_inputs()[0].name
onnx_pred = sess.run(None, {input_name: X.astype(np.float32)})[1]

# Compare outputs
try:
    match = compare_objects(sklearn_pred, onnx_pred, decimal=4)
    print("Conversion validated: outputs match within tolerance")
except AssertionError as e:
    print(f"Conversion issue detected: {e}")

Analyzing Pipeline Structure

from skl2onnx.helpers.investigate import enumerate_pipeline_models
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.ensemble import RandomForestClassifier

# Create complex pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=5)),
    ('classifier', RandomForestClassifier(n_estimators=10))
])
pipeline.fit(X, y)

# Enumerate all models in pipeline
models = enumerate_pipeline_models(pipeline)

print("Pipeline structure:")
for name, model_instance, path in models:
    print(f"  {path}: {name} ({type(model_instance).__name__})")

Adding Custom ONNX Operations

from skl2onnx.helpers.integration import add_onnx_graph
from skl2onnx import to_onnx
import onnx
from onnx import helper, TensorProto

# Convert base model
base_model = to_onnx(model, X)

# Create custom preprocessing graph
custom_inputs = [helper.make_tensor_value_info('input', TensorProto.FLOAT, [None, 10])]
custom_outputs = [helper.make_tensor_value_info('processed', TensorProto.FLOAT, [None, 10])]

# Custom operation: multiply by constant
multiply_node = helper.make_node(
    'Mul',
    inputs=['input', 'scale_factor'],
    outputs=['processed'],
    name='custom_scaling'
)

# Create scale factor initializer
scale_factor = helper.make_tensor(
    'scale_factor',
    TensorProto.FLOAT,
    [1],
    [2.0]  # Scale factor value
)

custom_graph = helper.make_graph(
    [multiply_node],
    'custom_preprocessing',
    custom_inputs,
    custom_outputs,
    [scale_factor]
)

# Integrate custom graph with base model
enhanced_model = add_onnx_graph(
    base_model,
    custom_graph,
    inputs=['input'],
    outputs=['processed']
)

Performance Benchmarking

from skl2onnx.tutorial import measure_time
import onnxruntime as rt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from skl2onnx import to_onnx

# Create and train model
X_test = np.random.randn(1000, 10).astype(np.float32)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_test[:100], np.random.randint(0, 2, 100))

# Convert to ONNX
onnx_model = to_onnx(model, X_test[:1])
sess = rt.InferenceSession(onnx_model.SerializeToString(),
                           providers=['CPUExecutionProvider'])
input_name = sess.get_inputs()[0].name

# Measure sklearn performance
sklearn_context = {
    'model': model,
    'X_test': X_test
}
sklearn_times = measure_time(
    'model.predict_proba(X_test)',
    context=sklearn_context,
    number=10,
    repeat=5
)

# Measure ONNX performance
onnx_context = {
    'sess': sess,
    'input_name': input_name,
    'X_test': X_test
}
onnx_times = measure_time(
    'sess.run(None, {input_name: X_test})',
    context=onnx_context,
    number=10,
    repeat=5
)

print(f"Sklearn average time: {sklearn_times['average']:.4f}s (±{sklearn_times['deviation']:.4f})")
print(f"ONNX average time: {onnx_times['average']:.4f}s (±{onnx_times['deviation']:.4f})")
print(f"Speedup: {sklearn_times['average'] / onnx_times['average']:.2f}x")

Advanced Pipeline Analysis

# Analyze complex nested pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Create complex pipeline with column transformer
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), [0, 1, 2]),
    ('cat', OneHotEncoder(), [3, 4])
])

complex_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier())
])
# Fit so nested transformers are instantiated before enumeration
# (columns 3-4 are treated as categorical here purely for illustration)
complex_pipeline.fit(X, y)

# Enumerate all components
all_models = enumerate_pipeline_models(complex_pipeline)

print("Complex pipeline analysis:")
for name, instance, path in all_models:
    print(f"  {path}: {name}")
    if hasattr(instance, 'get_params'):
        # Nested sub-estimator parameters contain '__' (e.g. 'num__with_mean');
        # they never end with it, so filter on containment
        key_params = {k: v for k, v in instance.get_params().items()
                      if '__' not in k and not callable(v)}
        print(f"    Key parameters: {key_params}")

Debugging Guidelines

Common Investigation Patterns

  1. Shape Mismatches: Use collect_intermediate_steps to trace shape inference
  2. Type Errors: Check data type consistency with compare_objects
  3. Pipeline Issues: Use enumerate_pipeline_models to understand structure
  4. Performance Problems: Use measure_time for systematic benchmarking
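The tolerance idea behind pattern 2 can be shown with plain NumPy: scikit-learn computes in float64 while most ONNX runtimes execute in float32, so exact equality routinely fails even for correct conversions. This sketch uses only NumPy (no skl2onnx call) to show why a decimal tolerance is the right comparison.

```python
import numpy as np

# sklearn predictions are typically float64; an ONNX runtime usually
# returns float32. Simulate that round-trip precision loss.
sklearn_out = np.array([0.123456789, 0.876543211, 0.000012345])
onnx_out = sklearn_out.astype(np.float32).astype(np.float64)

# Exact equality fails because float32 cannot represent these values exactly
print(bool(np.array_equal(sklearn_out, onnx_out)))  # False

# ...but the values agree to 5 decimal places, which is the kind of
# check compare_objects performs with decimal=5
np.testing.assert_almost_equal(sklearn_out, onnx_out, decimal=5)
```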

Troubleshooting Tips

  • Enable verbose logging during conversion for detailed information
  • Compare intermediate outputs at each pipeline stage
  • Validate with simple test cases before complex scenarios
  • Check ONNX opset compatibility for target deployment environment
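The second tip, comparing intermediate outputs at each pipeline stage, can be done with standard scikit-learn before any conversion is attempted; a surprising shape or dtype at one of these checkpoints usually explains a downstream conversion error. Everything below is plain scikit-learn and NumPy (no skl2onnx assumption).

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=100, n_features=10, random_state=42)
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(k=5)),
    ('clf', LogisticRegression(max_iter=1000)),
])
pipe.fit(X, y)

# Walk every transform stage and record what it emits; a wrong shape or
# dtype at one of these checkpoints pinpoints where a conversion diverges.
data = X
stages = []
for name, step in pipe.steps[:-1]:
    data = step.transform(data)
    stages.append((name, data.shape, data.dtype))
    print(f"{name}: shape={data.shape}, dtype={data.dtype}")
```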

Integration Best Practices

  • Test custom graphs separately before integration
  • Validate variable connections between graph components
  • Consider performance implications of additional operations
  • Document custom modifications for maintainability

Install with Tessl CLI

npx tessl i tessl/pypi-skl2onnx
