Core foundational classes and utilities for the aiSSEMBLE platform, providing authentication, metadata management, configuration, file storage, and policy management capabilities.
—
Complete lifecycle tracking for machine learning training processes with structured metadata capture. The Training BOM (Bill of Materials) provides comprehensive documentation of ML training workflows including dataset information, feature engineering details, model specifications, and integration with MLflow for experiment tracking and reproducibility.
The main data model for capturing comprehensive training workflow metadata with nested information structures for datasets, features, and models, designed for integration with MLflow tracking systems.
class TrainingBOM(BaseModel):
"""
Represent a Bill of Materials for model training.
Attributes:
- id: str - Training identifier
- start_time: str - Training start timestamp
- end_time: str - Training end timestamp
- dataset_info: DatasetInfo - Dataset information
- feature_info: FeatureInfo - Feature engineering information
- model_info: ModelInfo - Model information
- mlflow_params: Dict - MLflow parameters
- mlflow_metrics: Dict - MLflow metrics
"""
id: str
start_time: str
end_time: str
dataset_info: DatasetInfo
feature_info: FeatureInfo
model_info: ModelInfo
mlflow_params: Dict
mlflow_metrics: DictCaptures essential metadata about training datasets including origin sources and size metrics for comprehensive data lineage and reproducibility tracking.
class TrainingBOM.DatasetInfo(BaseModel):
"""
Represents training dataset information for the Bill of Materials.
Attributes:
- origin: str - Dataset origin
- size: int - Dataset size (default: 0)
"""
origin: str
size: int = 0Comprehensive tracking of feature engineering and selection processes including original feature sets and selected subsets for model training transparency and reproducibility.
class TrainingBOM.FeatureInfo(BaseModel):
"""
Represents feature engineering/selection information for the Bill of Materials.
Attributes:
- original_features: List[str] - Original features (default: empty list)
- selected_features: List[str] - Selected features (default: empty list)
"""
original_features: List[str] = []
selected_features: List[str] = []Structured capture of model specifications including type classification and architectural details for comprehensive model documentation and version tracking.
class TrainingBOM.ModelInfo(BaseModel):
"""
Represents training model information for the Bill of Materials.
Attributes:
- type: str - Model type
- architecture: str - Model architecture
"""
type: str
architecture: strfrom aissemble_core_bom.training_bom import TrainingBOM
from datetime import datetime

# Create dataset information
dataset_info = TrainingBOM.DatasetInfo(
    origin="s3://ml-data-bucket/training-data-v2.0.parquet",
    size=1500000,
)

# Define feature engineering details
feature_info = TrainingBOM.FeatureInfo(
    original_features=["age", "income", "credit_score", "employment_years", "debt_ratio"],
    selected_features=["age", "income", "credit_score", "employment_years"],
)

# Specify model information
model_info = TrainingBOM.ModelInfo(
    type="RandomForestClassifier",
    architecture="ensemble_tree_based",
)

# Assemble the complete training BOM for this run.
# NOTE: start_time/end_time are both taken "now" here purely for illustration;
# a real run would record them before and after training.
training_bom = TrainingBOM(
    id="training-run-2024-09-05-001",
    start_time=datetime.now().isoformat(),
    end_time=datetime.now().isoformat(),
    dataset_info=dataset_info,
    feature_info=feature_info,
    model_info=model_info,
    mlflow_params={
        "n_estimators": 100,
        "max_depth": 10,
        "min_samples_split": 2,
        "random_state": 42,
    },
    mlflow_metrics={
        "accuracy": 0.87,
        "precision": 0.84,
        "recall": 0.89,
        "f1_score": 0.86,
        "auc_roc": 0.91,
    },
)

print(f"Created Training BOM: {training_bom.id}")
print(f"Dataset origin: {training_bom.dataset_info.origin}")
print(f"Selected features: {training_bom.feature_info.selected_features}")
print(f"Model type: {training_bom.model_info.type}")
print(f"Training accuracy: {training_bom.mlflow_metrics['accuracy']}")

import mlflow
from aissemble_core_bom.training_bom import TrainingBOM


def create_training_bom_from_mlflow(run_id: str) -> TrainingBOM:
    """Create a Training BOM from MLflow run data.

    Args:
        run_id: Identifier of an existing MLflow run.

    Returns:
        A TrainingBOM populated from the run's logged parameters and metrics.
    """
    # Get MLflow run information
    run = mlflow.get_run(run_id)

    # Extract parameters and metrics. MLflow stores all params as strings,
    # so numeric and list-valued params must be parsed back out below.
    params = run.data.params
    metrics = run.data.metrics

    return TrainingBOM(
        id=f"mlflow-{run_id}",
        # run.info start/end times are epoch values; stringified as-is here.
        start_time=str(run.info.start_time),
        end_time=str(run.info.end_time),
        dataset_info=TrainingBOM.DatasetInfo(
            origin=params.get("dataset_path", "unknown"),
            size=int(params.get("dataset_size", 0)),
        ),
        feature_info=TrainingBOM.FeatureInfo(
            # Feature lists are logged as comma-separated strings; an absent
            # or empty param yields an empty list rather than [""].
            original_features=params.get("original_features", "").split(",") if params.get("original_features") else [],
            selected_features=params.get("selected_features", "").split(",") if params.get("selected_features") else [],
        ),
        model_info=TrainingBOM.ModelInfo(
            type=params.get("model_type", "unknown"),
            architecture=params.get("model_architecture", "unknown"),
        ),
        mlflow_params=params,
        mlflow_metrics=metrics,
    )


# Usage example
with mlflow.start_run() as run:
    # Training code here...
    mlflow.log_param("dataset_path", "s3://bucket/data.parquet")
    mlflow.log_param("model_type", "XGBoostClassifier")
    mlflow.log_metric("accuracy", 0.92)

    # Create BOM after training
    bom = create_training_bom_from_mlflow(run.info.run_id)

from aissemble_core_bom.training_bom import TrainingBOM
from datetime import datetime
import json
class TrainingWorkflowTracker:
"""Utility class for comprehensive training workflow tracking"""
def __init__(self):
self.start_time = None
self.dataset_info = None
self.feature_info = None
self.model_info = None
self.mlflow_params = {}
self.mlflow_metrics = {}
def start_training(self, training_id: str):
"""Initialize training session"""
self.training_id = training_id
self.start_time = datetime.now().isoformat()
print(f"Started training session: {training_id}")
def register_dataset(self, origin: str, size: int):
"""Register dataset information"""
self.dataset_info = TrainingBOM.DatasetInfo(origin=origin, size=size)
print(f"Registered dataset from: {origin}")
def register_features(self, original_features: list, selected_features: list):
"""Register feature engineering information"""
self.feature_info = TrainingBOM.FeatureInfo(
original_features=original_features,
selected_features=selected_features
)
print(f"Registered {len(selected_features)} selected features from {len(original_features)} original")
def register_model(self, model_type: str, architecture: str):
"""Register model information"""
self.model_info = TrainingBOM.ModelInfo(type=model_type, architecture=architecture)
print(f"Registered model: {model_type}")
def log_parameter(self, key: str, value):
"""Log training parameter"""
self.mlflow_params[key] = value
def log_metric(self, key: str, value: float):
"""Log training metric"""
self.mlflow_metrics[key] = value
def finalize_training(self) -> TrainingBOM:
"""Create final Training BOM"""
end_time = datetime.now().isoformat()
bom = TrainingBOM(
id=self.training_id,
start_time=self.start_time,
end_time=end_time,
dataset_info=self.dataset_info,
feature_info=self.feature_info,
model_info=self.model_info,
mlflow_params=self.mlflow_params,
mlflow_metrics=self.mlflow_metrics
)
print(f"Training completed: {self.training_id}")
return bom
# Example usage
tracker = TrainingWorkflowTracker()
# Initialize training
tracker.start_training("fraud-detection-v3.2")
# Register components
tracker.register_dataset("s3://fraud-data/transactions-2024.parquet", 2500000)
tracker.register_features(
original_features=["amount", "merchant", "timestamp", "location", "card_type", "user_age"],
selected_features=["amount", "merchant", "timestamp", "location"]
)
tracker.register_model("GradientBoostingClassifier", "tree_ensemble")
# Log training parameters
tracker.log_parameter("n_estimators", 200)
tracker.log_parameter("learning_rate", 0.1)
tracker.log_parameter("max_depth", 8)
# Log training metrics
tracker.log_metric("accuracy", 0.94)
tracker.log_metric("precision", 0.91)
tracker.log_metric("recall", 0.96)
# Finalize and get BOM
final_bom = tracker.finalize_training()
# Serialize BOM for storage or transmission
bom_json = final_bom.model_dump_json(indent=2)
print("Training BOM JSON:", bom_json)Install with Tessl CLI
npx tessl i tessl/pypi-aissemble-foundation-core-python