AMDSMI is a Python library for monitoring and managing AMD GPUs and CPUs, with programmatic access to hardware metrics.
This document covers advanced usage patterns, edge cases, and complex scenarios for the AMDSMI library.
Many AMDSMI features are hardware-dependent. Always handle cases where features may not be available:
import amdsmi
from amdsmi import AmdSmiLibraryException
def safe_get_feature(device, feature_func, *args, **kwargs):
    """Call *feature_func* on *device*, mapping "unsupported" to None.

    An AmdSmiLibraryException whose message contains NOT_SUPPORTED (or whose
    error code equals 0x10000000) is treated as "feature unavailable on this
    hardware" and converted to a None return; any other library error is
    re-raised for the caller to handle.
    """
    try:
        return feature_func(device, *args, **kwargs)
    except AmdSmiLibraryException as exc:
        unsupported = (
            "NOT_SUPPORTED" in str(exc)
            or exc.get_error_code() == 0x10000000
        )
        if unsupported:
            return None
        raise
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
device = devices[0]
# Try to get partition info (may not be supported)
partition = safe_get_feature(
device,
amdsmi.amdsmi_get_gpu_compute_partition
)
if partition is None:
print("GPU partitioning not supported on this device")
else:
print(f"Compute partition: {partition}")
finally:
    amdsmi.amdsmi_shut_down()

Coordinate operations across multiple GPUs:
import amdsmi
import threading
class MultiGPUCoordinator:
    """Coordinate operations across multiple GPUs.

    Initializes the AMDSMI library on construction and keeps one
    threading.Lock per device so concurrent callers cannot interleave
    operations on the same GPU. Call shutdown() when finished, or use the
    instance as a context manager.
    """

    def __init__(self):
        amdsmi.amdsmi_init()
        self.devices = amdsmi.amdsmi_get_processor_handles()
        # One lock per device index, guarding access to that device.
        self.locks = {i: threading.Lock() for i in range(len(self.devices))}

    def execute_on_all(self, func, *args, **kwargs):
        """Execute *func* on every device.

        Returns:
            List of (device_index, result, error) tuples; exactly one of
            result/error is non-None in each entry. A failure on one GPU
            does not stop execution on the others.
        """
        results = []
        for i, device in enumerate(self.devices):
            with self.locks[i]:
                try:
                    result = func(device, *args, **kwargs)
                    results.append((i, result, None))
                except Exception as e:
                    results.append((i, None, e))
        return results

    def execute_on_device(self, device_index, func, *args, **kwargs):
        """Execute *func* on the device at *device_index*.

        Raises:
            IndexError: if device_index is out of range. Negative indices
                are rejected explicitly — the original check only tested
                the upper bound, so e.g. -1 would silently wrap around to
                the last device.
        """
        if not 0 <= device_index < len(self.devices):
            raise IndexError(f"Device index {device_index} out of range")
        with self.locks[device_index]:
            return func(self.devices[device_index], *args, **kwargs)

    def shutdown(self):
        """Shut down the library."""
        amdsmi.amdsmi_shut_down()

    # Context-manager support (backward-compatible addition) so callers can
    # write `with MultiGPUCoordinator() as coord:` and get guaranteed cleanup.
    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.shutdown()
        return False
# Usage
coordinator = MultiGPUCoordinator()
try:
# Get activity on all GPUs
activities = coordinator.execute_on_all(amdsmi.amdsmi_get_gpu_activity)
for device_idx, activity, error in activities:
if error:
print(f"GPU {device_idx}: Error - {error}")
else:
print(f"GPU {device_idx}: {activity['gfx_activity']}%")
finally:
    coordinator.shutdown()

Create a context manager for automatic initialization and cleanup:
import amdsmi
from contextlib import contextmanager
@contextmanager
def amdsmi_context(flags=amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS):
    """Context manager for AMDSMI initialization.

    Calls amdsmi_init(flags) on entry and guarantees amdsmi_shut_down()
    on exit, even if the managed block raises.

    Args:
        flags: Initialization flags; defaults to GPU-only initialization.

    Yields:
        None. Call amdsmi module functions directly inside the block.
    """
    amdsmi.amdsmi_init(flags)
    try:
        yield
    finally:
        amdsmi.amdsmi_shut_down()
# Usage
with amdsmi_context():
    handles = amdsmi.amdsmi_get_processor_handles()
    for device in handles:
        activity = amdsmi.amdsmi_get_gpu_activity(device)
        print(f"Activity: {activity}")

Implement retry logic for transient errors:
import amdsmi
import time
from amdsmi import AmdSmiRetryException, AmdSmiLibraryException
def retry_on_error(func, max_retries=3, delay=1.0, *args, **kwargs):
    """Call *func*, retrying only on AmdSmiRetryException.

    Args:
        func: Callable invoked as func(*args, **kwargs).
        max_retries: Maximum number of attempts.
        delay: Base sleep in seconds between attempts. The wait grows
            linearly with the attempt number (delay, 2*delay, 3*delay, ...).
        *args, **kwargs: Forwarded to *func*.

    Returns:
        Whatever *func* returns on the first successful attempt.

    Raises:
        AmdSmiRetryException: if every attempt raised it.
        AmdSmiLibraryException: immediately, without retrying.
        Exception: "Max retries exceeded", only reachable when
            max_retries < 1 so the loop body never ran.
    """
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except AmdSmiRetryException:
            if attempt < max_retries - 1:
                # Linear backoff — the original comment claimed this was
                # exponential, but delay * (attempt + 1) grows linearly.
                time.sleep(delay * (attempt + 1))
                continue
            raise
        except AmdSmiLibraryException:
            # Non-retryable library error: fail fast.
            raise
    raise Exception("Max retries exceeded")
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
# Retry getting activity if it fails
activity = retry_on_error(
amdsmi.amdsmi_get_gpu_activity,
max_retries=3,
device=devices[0]
)
print(f"Activity: {activity}")
finally:
    amdsmi.amdsmi_shut_down()

Many AMDSMI functions return "N/A" for unavailable metrics:
import amdsmi
def safe_get_numeric_metric(metric_value, default=0):
    """Coerce a metric value to a number, falling back on *default*.

    "N/A" and None both mean the hardware did not report the metric;
    anything else is passed through if already numeric, otherwise parsed
    with float(). Unparseable values also yield *default*.
    """
    if metric_value is None or metric_value == "N/A":
        return default
    # Numbers pass through unchanged — no float() round-trip for ints.
    if isinstance(metric_value, (int, float)):
        return metric_value
    try:
        return float(metric_value)
    except (TypeError, ValueError):
        return default
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
activity = amdsmi.amdsmi_get_gpu_activity(devices[0])
# Handle N/A values
gfx_activity = safe_get_numeric_metric(activity.get('gfx_activity'), 0)
umc_activity = safe_get_numeric_metric(activity.get('umc_activity'), 0)
print(f"GFX: {gfx_activity}%, UMC: {umc_activity}%")
finally:
    amdsmi.amdsmi_shut_down()

Handle temperature conversions (millidegrees to Celsius):
import amdsmi
def get_temperature_celsius(device, sensor_type, metric):
    """Get temperature in Celsius, handling conversion.

    Args:
        device: AMDSMI processor handle.
        sensor_type: An AmdSmiTemperatureType sensor selector.
        metric: An AmdSmiTemperatureMetric (e.g. CURRENT).

    Returns:
        Temperature as a float in degrees Celsius.
    """
    temp_millidegrees = amdsmi.amdsmi_get_temp_metric(
        device, sensor_type, metric
    )
    # Temperature is returned in millidegrees Celsius, so scale down by 1000.
    return temp_millidegrees / 1000.0
def get_all_temperatures(device):
    """Get all available temperature readings.

    Probes the edge, hotspot, junction, and VRAM sensors; sensors the
    hardware does not expose are silently skipped.

    Args:
        device: AMDSMI processor handle.

    Returns:
        Dict mapping sensor name (str) to temperature in Celsius (float).
    """
    temperatures = {}
    sensor_types = [
        amdsmi.AmdSmiTemperatureType.EDGE,
        amdsmi.AmdSmiTemperatureType.HOTSPOT,
        amdsmi.AmdSmiTemperatureType.JUNCTION,
        amdsmi.AmdSmiTemperatureType.VRAM,
    ]
    for sensor_type in sensor_types:
        try:
            temp_c = get_temperature_celsius(
                device,
                sensor_type,
                amdsmi.AmdSmiTemperatureMetric.CURRENT
            )
            temperatures[sensor_type.name] = temp_c
        except Exception:
            # Bug fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit. Sensor may not be available.
            pass
    return temperatures
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
temps = get_all_temperatures(devices[0])
for sensor, temp in temps.items():
print(f"{sensor}: {temp:.1f}°C")
finally:
    amdsmi.amdsmi_shut_down()

Handle memory size conversions:
import amdsmi
def format_bytes(bytes_value):
    """Render a byte count as a human-readable string (e.g. "1.50 MB")."""
    # Walk up the binary-prefix ladder until the value drops below 1024.
    value = bytes_value
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if value < 1024.0:
            return f"{value:.2f} {unit}"
        value /= 1024.0
    # Anything that exhausted the ladder is reported in petabytes.
    return f"{value:.2f} PB"
def get_memory_info(device):
    """Get comprehensive memory information.

    Args:
        device: AMDSMI processor handle.

    Returns:
        Dict with raw byte counts ('total', 'used', 'free'), a
        'usage_percent' float, and human-readable '*_formatted' strings.
    """
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    total = vram['vram_total']
    used = vram['vram_used']
    free = total - used
    # Bug fix: guard against a zero-sized VRAM report (e.g. virtual
    # functions or unsupported queries) instead of raising ZeroDivisionError.
    usage_percent = (used / total) * 100 if total else 0.0
    return {
        'total': total,
        'used': used,
        'free': free,
        'usage_percent': usage_percent,
        'total_formatted': format_bytes(total),
        'used_formatted': format_bytes(used),
        'free_formatted': format_bytes(free),
    }
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
mem_info = get_memory_info(devices[0])
print(f"Total: {mem_info['total_formatted']}")
print(f"Used: {mem_info['used_formatted']}")
print(f"Free: {mem_info['free_formatted']}")
print(f"Usage: {mem_info['usage_percent']:.1f}%")
finally:
    amdsmi.amdsmi_shut_down()

Safely configure GPU partitioning with proper error handling:
import amdsmi
from amdsmi import (
AmdSmiComputePartitionType,
AmdSmiMemoryPartitionType,
AmdSmiLibraryException
)
def configure_partitioning_safe(device, compute_part, memory_part):
    """Safely configure GPU partitioning with error handling.

    Args:
        device: AMDSMI processor handle.
        compute_part: Desired AmdSmiComputePartitionType.
        memory_part: Desired AmdSmiMemoryPartitionType.

    Returns:
        (success, message) tuple. success is False when partitioning is
        unsupported, the requested memory mode is unavailable, or the set
        calls fail; unexpected library errors propagate.
    """
    # Probe support by reading the current partition state; an explicit
    # NOT_SUPPORTED error means this hardware cannot partition at all.
    # (The original bound the results to unused locals; the return values
    # are irrelevant — only success/failure of the calls matters.)
    try:
        amdsmi.amdsmi_get_gpu_compute_partition(device)
        amdsmi.amdsmi_get_gpu_memory_partition(device)
    except AmdSmiLibraryException as e:
        if "NOT_SUPPORTED" in str(e):
            return False, "Partitioning not supported on this device"
        raise
    # Check available modes
    try:
        mem_config = amdsmi.amdsmi_get_gpu_memory_partition_config(device)
        available_modes = mem_config.get('partition_caps', [])
        if memory_part.name not in available_modes:
            return False, f"Memory partition mode {memory_part.name} not available"
    except Exception:
        # Bug fix: was a bare `except:`, which also caught
        # KeyboardInterrupt/SystemExit. The capability query is
        # best-effort — skip the check if it cannot be answered.
        pass
    # Attempt to set partitions
    try:
        amdsmi.amdsmi_set_gpu_compute_partition(device, compute_part)
        amdsmi.amdsmi_set_gpu_memory_partition(device, memory_part)
        return True, "Partitioning configured (GPU reset may be required)"
    except AmdSmiLibraryException as e:
        return False, f"Failed to configure: {e}"
# Usage
amdsmi.amdsmi_init()
try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        ok, message = configure_partitioning_safe(
            handles[0],
            AmdSmiComputePartitionType.DPX,
            AmdSmiMemoryPartitionType.NPS2,
        )
        print(f"Result: {message}")
finally:
    amdsmi.amdsmi_shut_down()