Debug and profile training jobs with real-time monitoring, tensor inspection, and performance analysis.
Configuration for capturing tensor data during training.
class DebuggerHookConfig:
"""
Debugger hook configuration for tensor capture.
Parameters:
s3_output_path: str - S3 path for debug data (required)
hook_parameters: Optional[Dict[str, str]] - Hook parameters
collection_configs: Optional[List[CollectionConfig]] - Collection configurations
Hook Parameters:
save_interval: str - Save interval (e.g., "100" for every 100 steps)
- Higher interval = less data, lower overhead
start_step: str - Start step for collection
- Begin capturing at this step number
end_step: str - End step for collection
- Stop capturing at this step number
save_all: str - Save all tensors ("true" or "false")
- "true": Capture all tensors (large data volume)
- "false": Capture only specified collections
include_regex: str - Regex for tensors to include
- Example: ".*weight.*" for all weight tensors
reductions: str - Reductions to compute
- Options: "mean", "std", "min", "max", "abs_mean", "abs_max"
- Comma-separated: "mean,std,min,max"
Usage:
Configure tensor data capture for debugging during training.
Capture weights, gradients, activations, etc.
Notes:
- Capturing tensors adds training overhead (5-15%)
- Storage costs for captured data
- Use save_interval to balance detail vs cost
- Analyze with SageMaker Debugger Insights
"""Usage:
from sagemaker.core.debugger import DebuggerHookConfig, CollectionConfig
# Basic configuration
hook_config = DebuggerHookConfig(
s3_output_path="s3://my-bucket/debug-output",
hook_parameters={
"save_interval": "100", # Every 100 steps
"start_step": "0",
"end_step": "5000"
}
)
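Once a job has run with the hook enabled, the captured tensors in S3 can be inspected offline. A minimal sketch using the open-source smdebug library (a separate package, assumed installed; the S3 path is illustrative):
from smdebug.trials import create_trial

# Point a Trial at the debug output path written by the hook
trial = create_trial("s3://my-bucket/debug-output")
print(trial.tensor_names())  # All captured tensor names

# Inspect one tensor across its saved steps (names depend on the framework/model)
tname = trial.tensor_names()[0]
for step in trial.tensor(tname).steps():
    print(step, trial.tensor(tname).value(step))
This is the programmatic route; the notes above also mention SageMaker Debugger Insights for a managed view.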
# With collection configs for specific tensors
collection_configs = [
CollectionConfig(
name="weights",
parameters={
"save_interval": "500", # Less frequent for weights
"include_regex": ".*weight.*",
"reductions": "mean,std,min,max"
}
),
CollectionConfig(
name="gradients",
parameters={
"save_interval": "100",
"include_regex": ".*gradient.*",
"reductions": "abs_mean,abs_max" # Useful for gradient monitoring
}
),
CollectionConfig(
name="losses",
parameters={
"save_interval": "10", # Frequent for loss
"include_regex": ".*loss.*"
}
)
]
hook_config = DebuggerHookConfig(
s3_output_path="s3://my-bucket/debug-output",
collection_configs=collection_configs
)
# Use with ModelTrainer
from sagemaker.train import ModelTrainer
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
debugger_hook_config=hook_config # Enable debugging
)
trainer.train(input_data_config=[train_data])
# Tensor data saved to S3, analyze with Debugger Insights

class CollectionConfig:
"""
Configuration for specific tensor collections.
Parameters:
name: str - Collection name (required)
Built-in collections:
- "weights": Model weights/parameters
- "gradients": Gradient tensors
- "biases": Bias parameters
- "losses": Loss values
- "metrics": Evaluation metrics
- "outputs": Layer outputs/activations
- "sm_metrics": SageMaker metrics
parameters: Optional[Dict[str, str]] - Collection parameters
Collection Parameters:
save_interval: str - Save frequency (steps)
start_step: str - Start step
end_step: str - End step
save_histogram: str - Save histogram data ("true"/"false")
include_regex: str - Include regex pattern
reductions: str - Reductions to compute
Notes:
- Built-in collections have pre-configured regex
- Custom collections require include_regex
- Histograms useful for weight/gradient distributions
- Reductions reduce data volume
"""Usage:
# Comprehensive tensor monitoring
collections = [
# Monitor weights infrequently
CollectionConfig(
name="weights",
parameters={
"save_interval": "1000",
"save_histogram": "true",
"reductions": "mean,std"
}
),
# Monitor gradients frequently (detect vanishing/exploding)
CollectionConfig(
name="gradients",
parameters={
"save_interval": "50",
"reductions": "abs_mean,abs_max,abs_min"
}
),
# Monitor loss every step
CollectionConfig(
name="losses",
parameters={
"save_interval": "1"
}
),
# Monitor specific layers
CollectionConfig(
name="attention_weights",
parameters={
"include_regex": ".*attention.*weight.*",
"save_interval": "500"
}
)
]

Debugger rules for automatic issue detection during training.
class Rule:
"""
Debugger rule for automatic issue detection.
Parameters:
name: str - Rule name (required)
image_uri: str - Rule container image URI (required)
- Get from: get_rule_container_image_uri(region)
instance_type: str - Instance type for rule evaluation (default: "ml.t3.medium")
volume_size_in_gb: int - EBS volume size (default: 30)
container_arguments: Optional[Dict[str, str]] - Rule parameters
container_entrypoint: Optional[List[str]] - Container entrypoint
rule_parameters: Optional[Dict[str, str]] - Rule parameters
Built-in Rules:
VanishingGradient: Detect vanishing gradients
- Parameters: threshold (default: 0.0000001)
ExplodingTensor: Detect exploding tensors
- Parameters: threshold (default: 1000000)
LossNotDecreasing: Detect non-decreasing loss
- Parameters: patience, min_delta
Overfit: Detect overfitting
- Parameters: patience_train, patience_validation
Overtraining: Detect overtraining
- Parameters: patience, threshold_train, threshold_validation
SimilarAcrossRuns: Compare across runs
TensorVariance: Check tensor variance
UnchangedTensor: Detect unchanged tensors
CheckInputImages: Validate input images
NLPSequenceRatio: Check NLP sequence ratios
ClassImbalance: Detect class imbalance
- Parameters: threshold_imbalance
Notes:
- Rules run in parallel with training
- Automatically trigger if conditions met
- Can stop training on rule trigger
- Results in CloudWatch and S3
"""Usage:
from sagemaker.core.debugger import Rule, get_rule_container_image_uri
# Get rule container for region
rule_image_uri = get_rule_container_image_uri("us-west-2")
# Create built-in rules
rules = [
Rule(
name="VanishingGradient",
image_uri=rule_image_uri,
instance_type="ml.t3.medium",
rule_parameters={
"threshold": "0.0000001"
}
),
Rule(
name="ExplodingTensor",
image_uri=rule_image_uri,
instance_type="ml.t3.medium",
rule_parameters={
"threshold": "1000000"
}
),
Rule(
name="LossNotDecreasing",
image_uri=rule_image_uri,
instance_type="ml.t3.medium",
rule_parameters={
"tensor_regex": ".*loss.*",
"patience": "5", # Wait 5 steps before triggering
"min_delta": "0.01" # Minimum improvement
}
),
Rule(
name="Overfit",
image_uri=rule_image_uri,
instance_type="ml.t3.medium",
rule_parameters={
"patience_train": "5",
"patience_validation": "10"
}
),
Rule(
name="UnchangedTensor",
image_uri=rule_image_uri,
instance_type="ml.t3.medium",
rule_parameters={
"tensor_regex": ".*weight.*",
"num_steps": "100" # Trigger if unchanged for 100 steps
}
)
]
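ClassImbalance (listed above) exposes a threshold_imbalance parameter; a hedged sketch appending it to the list — the value shown is illustrative, not a documented default:
rules.append(
    Rule(
        name="ClassImbalance",
        image_uri=rule_image_uri,
        instance_type="ml.t3.medium",
        rule_parameters={"threshold_imbalance": "10"}  # Illustrative value, not a documented default
    )
)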
# Use with ModelTrainer
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
debugger_hook_config=hook_config,
debugger_rules=rules # Enable automatic detection
)
trainer.train(input_data_config=[train_data])
# Check rule evaluation status
# Rules automatically evaluated during training
# Triggered rules logged to CloudWatch

Custom Rule:
# Create custom rule with specific logic
custom_rule = Rule(
name="CustomGradientCheck",
image_uri="123456789012.dkr.ecr.us-west-2.amazonaws.com/my-custom-rule:latest",
instance_type="ml.m5.xlarge",
rule_parameters={
"threshold": "0.001",
"tensor_regex": ".*gradient.*",
"check_interval": "100",
"alert_on_trigger": "true"
}
)
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
debugger_rules=[custom_rule]
)

class ProfilerConfig:
"""
Profiler configuration for performance analysis.
Parameters:
s3_output_path: str - S3 path for profiling data (required)
profiling_interval_in_milliseconds: int - Profiling interval (default: 500)
- Range: 100-5000 milliseconds
- Lower interval = more granular data, higher overhead
profiling_parameters: Optional[Dict[str, str]] - Profiling parameters
disable_profiler: bool - Disable profiler (default: False)
Profiling Parameters:
DetailedProfilingConfig: str - Detailed profiling settings
- CPU, GPU, memory profiling
DataloaderProfilingConfig: str - Dataloader profiling
- Data loading performance
PythonProfilingConfig: str - Python profiling
- Python code execution
SMDataParallelProfilingConfig: str - Data parallel profiling
- SageMaker distributed data parallel
HorovodProfilingConfig: str - Horovod profiling
- Horovod distributed training
Usage:
Configure performance profiling for training jobs.
Identify bottlenecks in training loop.
Notes:
- Profiling adds 2-10% overhead
- Generates interactive HTML report
- Useful for optimizing training performance
- Disable for production training after optimization
"""Usage:
from sagemaker.core.debugger import ProfilerConfig
# Basic profiling
profiler_config = ProfilerConfig(
s3_output_path="s3://my-bucket/profiler-output",
profiling_interval_in_milliseconds=500 # Sample every 500ms
)
# Comprehensive profiling
detailed_profiler_config = ProfilerConfig(
s3_output_path="s3://my-bucket/profiler-output",
profiling_interval_in_milliseconds=500,
profiling_parameters={
"DetailedProfilingConfig": "true",
"DataloaderProfilingConfig": "true",
"PythonProfilingConfig": "true"
}
)
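After a job has been tuned, profiling can be switched off for production runs with the disable_profiler flag documented above (the output path shown is illustrative):
# Production runs: keep the config but turn profiling off
production_profiler_config = ProfilerConfig(
    s3_output_path="s3://my-bucket/profiler-output",
    disable_profiler=True
)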
# Use with ModelTrainer
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
profiler_config=detailed_profiler_config
)
trainer.train(input_data_config=[train_data])
# After training: download profiler report
from sagemaker.core.s3 import S3Downloader
S3Downloader.download(
s3_uri="s3://my-bucket/profiler-output/profiler-reports/profiler-report.html",
local_path="./"
)
# Open profiler-report.html in browser for interactive analysis

Selective Profiling:
import json

from sagemaker.core.debugger import (
ProfilerConfig,
DetailedProfilingConfig,
DataloaderProfilingConfig
)
# Profile only specific training steps
profiler_config = ProfilerConfig(
s3_output_path="s3://my-bucket/profiler-output",
profiling_parameters={
"DetailedProfilingConfig": json.dumps({
"start_step": 1000,
"num_steps": 10 # Profile 10 steps starting at step 1000
}),
"DataloaderProfilingConfig": json.dumps({
"start_step": 1000,
"num_steps": 10
})
}
)

class ProfilerRule:
"""
Profiler rule for performance issue detection.
Built-in Profiler Rules:
ProfilerReport: Generate comprehensive profiling report
- Overview of system resource usage
- Recommendations for improvement
CPUBottleneck: Detect CPU bottlenecks
- Threshold: cpu_threshold (default: 90%)
GPUMemoryIncrease: Detect GPU memory growth
- Detect memory leaks
IOBottleneck: Detect I/O bottlenecks
- Slow data loading
LoadBalancing: Check load balancing
- Imbalanced GPU utilization
LowGPUUtilization: Detect low GPU usage
- Threshold: threshold (default: 70%)
- Window: window (default: 500 steps)
OverallSystemUsage: Monitor system resources
- CPU, GPU, memory, disk, network
BatchSize: Suggest optimal batch size
- Based on GPU memory usage
MaxInitializationTime: Check initialization time
- Threshold: max_initialization_time_ms
StepOutlier: Detect step outliers
- Unusually slow steps
Notes:
- Rules run continuously during training
- Generate recommendations automatically
- Can trigger actions (alerts, stop training)
"""Usage:
from sagemaker.core.debugger import ProfilerRule, get_rule_container_image_uri
# Get rule container image
rule_image = get_rule_container_image_uri("us-west-2")
# Create profiler rules
profiler_rules = [
ProfilerRule(
name="ProfilerReport",
image_uri=rule_image,
instance_type="ml.t3.medium"
),
ProfilerRule(
name="LowGPUUtilization",
image_uri=rule_image,
instance_type="ml.t3.medium",
rule_parameters={
"threshold": "70", # Alert if GPU < 70%
"window": "500", # Check over 500 steps
"patience": "3" # Trigger after 3 violations
}
),
ProfilerRule(
name="IOBottleneck",
image_uri=rule_image,
instance_type="ml.t3.medium",
rule_parameters={
"threshold": "50" # Alert if I/O wait > 50%
}
),
ProfilerRule(
name="BatchSize",
image_uri=rule_image,
instance_type="ml.t3.medium"
),
ProfilerRule(
name="CPUBottleneck",
image_uri=rule_image,
instance_type="ml.t3.medium",
rule_parameters={
"cpu_threshold": "90" # Alert if CPU > 90%
}
)
]
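The other built-in profiler rules listed above can be attached the same way; a sketch that relies on their defaults, since their tunable parameters are not documented here:
profiler_rules += [
    ProfilerRule(name="GPUMemoryIncrease", image_uri=rule_image, instance_type="ml.t3.medium"),
    ProfilerRule(name="StepOutlier", image_uri=rule_image, instance_type="ml.t3.medium")
]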
# Apply to training
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
profiler_config=profiler_config,
profiler_rules=profiler_rules
)
trainer.train(input_data_config=[train_data])
# Check profiler rule results
# Recommendations in S3 profiler output path

class TensorBoardOutputConfig:
"""
TensorBoard output configuration.
Parameters:
s3_output_path: str - S3 path for TensorBoard data (required)
container_local_output_path: str - Container path for logs (default: "/opt/ml/output/tensorboard")
Usage:
Configure TensorBoard logging during training.
Logs automatically synced to S3.
Notes:
- Training code must write TensorBoard logs to local path
- Logs synced to S3 during and after training
- View with: tensorboard --logdir=s3://path (reading directly from S3 requires TensorBoard's S3 filesystem support)
- Requires TensorBoard in training container
"""Usage:
from sagemaker.core.debugger import TensorBoardOutputConfig
# Configure TensorBoard
tensorboard_config = TensorBoardOutputConfig(
s3_output_path="s3://my-bucket/tensorboard-logs",
container_local_output_path="/opt/ml/output/tensorboard"
)
# Use with ModelTrainer
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute
)
trainer.with_tensorboard_output_config(tensorboard_config)
# Training code must include TensorBoard logging
# Example in train.py:
# from torch.utils.tensorboard import SummaryWriter
# writer = SummaryWriter('/opt/ml/output/tensorboard')
# writer.add_scalar('Loss/train', loss, epoch)
trainer.train(input_data_config=[train_data])
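Pointing TensorBoard straight at the S3 logdir (shown in the comments below) requires TensorBoard's S3 filesystem support; if that is unavailable, the logs can be downloaded first — a minimal sketch reusing the S3Downloader helper from the profiler example (paths are illustrative):
from sagemaker.core.s3 import S3Downloader

# Download the synced TensorBoard logs, then point TensorBoard at the local copy
S3Downloader.download(
    s3_uri="s3://my-bucket/tensorboard-logs",
    local_path="./tb-logs/"
)
# Then: tensorboard --logdir=./tb-logs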
# View TensorBoard locally
# tensorboard --logdir=s3://my-bucket/tensorboard-logs

Training Script with TensorBoard:
# train.py
from torch.utils.tensorboard import SummaryWriter

def train():
    # TensorBoard writer -- path must match container_local_output_path above
    tb_path = '/opt/ml/output/tensorboard'
    writer = SummaryWriter(tb_path)
    for epoch in range(num_epochs):
        # Training loop
        train_loss = train_epoch(model, train_loader)
        val_loss, val_acc = validate(model, val_loader)
        # Log to TensorBoard
        writer.add_scalar('Loss/train', train_loss, epoch)
        writer.add_scalar('Loss/validation', val_loss, epoch)
        writer.add_scalar('Accuracy/validation', val_acc, epoch)
        # Log learning rate
        writer.add_scalar('LearningRate', optimizer.param_groups[0]['lr'], epoch)
        # Log histograms
        for name, param in model.named_parameters():
            writer.add_histogram(f'Parameters/{name}', param, epoch)
            if param.grad is not None:
                writer.add_histogram(f'Gradients/{name}', param.grad, epoch)
    writer.close()

if __name__ == '__main__':
    train()

def get_rule_container_image_uri(region: str) -> str:
"""
Get debugger/profiler rule container image URI for region.
Parameters:
region: str - AWS region (required)
- Example: "us-west-2", "us-east-1", "eu-west-1"
Returns:
str: Container image URI for rules in specified region
Format: "{account}.dkr.ecr.{region}.amazonaws.com/sagemaker-debugger-rules:latest"
Raises:
ValueError: If region not supported
Usage:
Get appropriate container URI for debugger/profiler rules.
Required for creating Rule and ProfilerRule instances.
Notes:
- Container varies by region
- Always use for rule image_uri
- Updated automatically by SDK
"""def framework_name(image_uri: str) -> str:
"""
Extract framework name from training image URI.
Parameters:
image_uri: str - Training container image URI (required)
Returns:
str: Framework name
- "pytorch", "tensorflow", "mxnet", "xgboost", "huggingface", etc.
Raises:
ValueError: If framework cannot be determined
Usage:
Determine framework for debugger configuration.
Automatic framework detection for rules.
Example:
image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:2.0-gpu-py310"
framework = framework_name(image) # Returns: "pytorch"
"""from sagemaker.core.debugger import DebuggerHookConfig, Rule, get_rule_container_image_uri
# Comprehensive debugging setup
hook_config = DebuggerHookConfig(
s3_output_path="s3://my-bucket/debug",
hook_parameters={
"save_interval": "10",
"include_regex": ".*" # Capture all tensors
}
)
# All recommended rules
rule_image = get_rule_container_image_uri("us-west-2")
debug_rules = [
Rule(name="VanishingGradient", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="ExplodingTensor", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="LossNotDecreasing", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="Overfit", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="UnchangedTensor", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="TensorVariance", image_uri=rule_image, instance_type="ml.t3.medium"),
Rule(name="ClassImbalance", image_uri=rule_image, instance_type="ml.t3.medium")
]
# Train with full debugging
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=compute,
debugger_hook_config=hook_config,
debugger_rules=debug_rules
)
try:
    trainer.train(input_data_config=[train_data])
except RuntimeError as e:
    print(f"Training failed: {e}")

# Check which rules triggered
job = trainer._latest_training_job
rule_statuses = job.rule_evaluation_statuses
for rule_status in rule_statuses:
    if rule_status['RuleEvaluationStatus'] == 'IssuesFound':
        print(f"Rule triggered: {rule_status['RuleName']}")
        print(f"  Details: {rule_status['StatusDetails']}")

from sagemaker.core.debugger import DebuggerHookConfig, CollectionConfig
# Monitor different tensor groups with different frequencies
collections = [
# Weights: infrequent, with histograms
CollectionConfig(
name="weights",
parameters={
"save_interval": "1000",
"save_histogram": "true",
"reductions": "mean,std,min,max"
}
),
# Gradients: frequent, detect vanishing/exploding
CollectionConfig(
name="gradients",
parameters={
"save_interval": "100",
"reductions": "abs_mean,abs_max"
}
),
# Losses: every step for curve
CollectionConfig(
name="losses",
parameters={
"save_interval": "1"
}
),
# Biases: infrequent
CollectionConfig(
name="biases",
parameters={
"save_interval": "1000"
}
),
# Layer outputs: sample periodically
CollectionConfig(
name="outputs",
parameters={
"save_interval": "500",
"include_regex": ".*relu_output.*"
}
),
# Custom metrics from training code
CollectionConfig(
name="sm_metrics",
parameters={
"save_interval": "10"
}
)
]
hook_config = DebuggerHookConfig(
s3_output_path="s3://my-bucket/debug",
collection_configs=collections
)

from sagemaker.core.s3 import S3Downloader
import os
import webbrowser
# After training completes
job_name = trainer._latest_training_job.training_job_name
profiler_report_path = f"s3://my-bucket/profiler-output/{job_name}/profiler-reports/profiler-report.html"
# Download profiler report
local_path = S3Downloader.download(
s3_uri=profiler_report_path,
local_path="./profiler-reports/"
)
# Open in browser for interactive analysis
webbrowser.open('file://' + os.path.abspath('./profiler-reports/profiler-report.html'))
# Report includes:
# - System resource utilization (CPU, GPU, memory)
# - Step time breakdown
# - Dataloader performance
# - Python profiling results
# - Recommendations for optimization

# Profile distributed training
profiler_config = ProfilerConfig(
s3_output_path="s3://bucket/profiler",
profiling_interval_in_milliseconds=500,
profiling_parameters={
"DetailedProfilingConfig": "true",
"SMDataParallelProfilingConfig": "true" # For SageMaker distributed
# Or "HorovodProfilingConfig": "true" # For Horovod
}
)
trainer = ModelTrainer(
training_image="pytorch-image",
role=role,
compute=Compute(
instance_type="ml.p3.16xlarge",
instance_count=4 # 4-instance distributed training
),
distributed=Torchrun(process_count_per_node=8),
profiler_config=profiler_config
)
# Profile shows:
# - Per-instance metrics
# - Communication overhead
# - Load balancing across instances
# - Synchronization bottlenecks

Troubleshooting:
Hook Not Registered: Confirm the training image supports the debugger hook (AWS framework images include it; custom images need the hook library installed) and that debugger_hook_config was passed to ModelTrainer.
Tensor Not Found: Check that include_regex matches the tensor name and that start_step/end_step cover the step being queried; built-in collections only capture their pre-configured tensors.
Rule Evaluation Failed: Verify the rule image URI came from get_rule_container_image_uri for the training region and that rule_parameters (e.g., tensor_regex) match tensors that are actually captured.
Profiler Report Empty: Ensure disable_profiler is False and the ProfilerReport rule is attached; the report is written to the profiler output path after the job finishes.
Storage Costs High: Increase save_interval, use reductions instead of full tensors, narrow include_regex, and avoid save_all="true".
TensorBoard Not Showing Data: The training code must write logs to container_local_output_path and TensorBoard must be installed in the container; logs sync to S3 during and after training.