This document covers advanced usage patterns, edge cases, and complex scenarios for the AMDSMI library.
Many AMDSMI features are hardware-dependent. Always handle cases where features may not be available:
import amdsmi
from amdsmi import AmdSmiLibraryException
def safe_get_feature(device, feature_func, *args, **kwargs):
    """Call an AMDSMI query, returning None when the feature is unsupported.

    Args:
        device: Processor handle passed as the first argument to
            ``feature_func``.
        feature_func: Callable invoked as
            ``feature_func(device, *args, **kwargs)``.
        *args: Extra positional arguments forwarded to ``feature_func``.
        **kwargs: Extra keyword arguments forwarded to ``feature_func``.

    Returns:
        Whatever ``feature_func`` returns, or ``None`` when the library
        reports the feature as not supported. A ``feature_func`` that itself
        returns ``None`` is indistinguishable from the unsupported case.

    Raises:
        AmdSmiLibraryException: For any library error other than
            NOT_SUPPORTED; the original exception is re-raised unchanged.
    """
    # AMDSMI_STATUS_NOT_SUPPORTED is 2 in the amdsmi_status_t enum. The
    # numeric comparison is only a fallback for the message check below.
    not_supported_code = 2
    try:
        return feature_func(device, *args, **kwargs)
    except AmdSmiLibraryException as e:
        if "NOT_SUPPORTED" in str(e) or e.get_error_code() == not_supported_code:
            return None
        raise
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
device = devices[0]
# Try to get partition info (may not be supported)
partition = safe_get_feature(
device,
amdsmi.amdsmi_get_gpu_compute_partition
)
if partition is None:
print("GPU partitioning not supported on this device")
else:
print(f"Compute partition: {partition}")
finally:
amdsmi.amdsmi_shut_down()

Coordinate operations across multiple GPUs:
import amdsmi
import threading
class MultiGPUCoordinator:
    """Run callables against every discovered GPU, one lock per device.

    Construction initializes the AMDSMI library and enumerates processor
    handles; call :meth:`shutdown` (or use the instance in a ``with``
    statement) to release the library again.

    Attributes:
        devices: Processor handles returned by
            ``amdsmi.amdsmi_get_processor_handles()``.
        locks: Mapping of device index to the ``threading.Lock`` held while
            a callable runs against that device.
    """

    def __init__(self):
        amdsmi.amdsmi_init()
        try:
            self.devices = amdsmi.amdsmi_get_processor_handles()
        except Exception:
            # Enumeration failed after a successful init: undo the init so
            # the caller is not left with a library it cannot shut down.
            amdsmi.amdsmi_shut_down()
            raise
        self.locks = {i: threading.Lock() for i in range(len(self.devices))}

    def __enter__(self):
        """Return the coordinator so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Shut the library down on leaving the ``with`` block."""
        self.shutdown()
        return False  # never suppress the caller's exception

    def execute_on_all(self, func, *args, **kwargs):
        """Call ``func(device, *args, **kwargs)`` on every device in order.

        Returns:
            A list of ``(index, result, error)`` tuples, one per device.
            Exactly one of ``result`` / ``error`` is meaningful: on success
            ``error`` is ``None``; on failure ``result`` is ``None`` and
            ``error`` is the exception that was raised.
        """
        results = []
        for i, device in enumerate(self.devices):
            with self.locks[i]:
                try:
                    result = func(device, *args, **kwargs)
                except Exception as e:
                    # Deliberately broad: one failing device must not stop
                    # the sweep; the error is reported back to the caller.
                    results.append((i, None, e))
                else:
                    results.append((i, result, None))
        return results

    def execute_on_device(self, device_index, func, *args, **kwargs):
        """Call ``func`` on the device at ``device_index`` and return its result.

        Raises:
            IndexError: If ``device_index`` is negative or not smaller than
                the number of devices.
        """
        # Reject negatives explicitly: the locks are keyed by non-negative
        # index, so list-style negative indexing is not supported.
        if not 0 <= device_index < len(self.devices):
            raise IndexError(f"Device index {device_index} out of range")
        with self.locks[device_index]:
            return func(self.devices[device_index], *args, **kwargs)

    def shutdown(self):
        """Shut down the library."""
        amdsmi.amdsmi_shut_down()
# Usage
coordinator = MultiGPUCoordinator()
try:
# Get activity on all GPUs
activities = coordinator.execute_on_all(amdsmi.amdsmi_get_gpu_activity)
for device_idx, activity, error in activities:
if error:
print(f"GPU {device_idx}: Error - {error}")
else:
print(f"GPU {device_idx}: {activity['gfx_activity']}%")
finally:
coordinator.shutdown()

Create a context manager for automatic initialization and cleanup:
import amdsmi
from contextlib import contextmanager
@contextmanager
def amdsmi_context(flags=amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS):
    """Keep AMDSMI initialized for the duration of a ``with`` block.

    Args:
        flags: Value passed straight to ``amdsmi.amdsmi_init``; defaults to
            ``AmdSmiInitFlags.INIT_AMD_GPUS``.

    Yields:
        None. The block is entered only after initialization has returned.
    """
    amdsmi.amdsmi_init(flags)
    try:
        yield None
    finally:
        # Reached both on normal exit and when the with-body raises.
        amdsmi.amdsmi_shut_down()
# Usage
with amdsmi_context():
devices = amdsmi.amdsmi_get_processor_handles()
for device in devices:
activity = amdsmi.amdsmi_get_gpu_activity(device)
print(f"Activity: {activity}")

Implement retry logic for transient errors:
import amdsmi
import time
from amdsmi import AmdSmiRetryException, AmdSmiLibraryException
def retry_on_error(func, max_retries=3, delay=1.0, *args, **kwargs):
    """Call ``func`` and retry it while it raises ``AmdSmiRetryException``.

    Args:
        func: Callable to invoke as ``func(*args, **kwargs)``.
        max_retries: Total number of attempts (not additional retries).
            Must be at least 1.
        delay: Base pause in seconds. The pause after attempt ``k``
            (1-based) is ``delay * k``, i.e. it grows linearly.
        *args: Positional arguments for ``func``. Because they follow
            ``max_retries`` and ``delay`` in the signature, those two must
            also be given positionally when ``*args`` is used.
        **kwargs: Keyword arguments forwarded verbatim to ``func``.

    Returns:
        The value returned by the first successful call.

    Raises:
        ValueError: If ``max_retries`` is less than 1.
        AmdSmiRetryException: If the final attempt still asks for a retry.
        Exception: Any other exception from ``func`` propagates immediately
            without retrying.
    """
    if max_retries < 1:
        # Previously the loop body never ran and a generic "Max retries
        # exceeded" was raised although func had not been called at all.
        raise ValueError("max_retries must be at least 1")
    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except AmdSmiRetryException:
            if attempt == max_retries - 1:
                raise  # out of attempts: surface the library's own error
            time.sleep(delay * (attempt + 1))  # linear backoff
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
# Retry getting activity if it fails
activity = retry_on_error(
    # Bind the handle in a closure: keyword arguments are forwarded verbatim
    # to the wrapped call, and amdsmi_get_gpu_activity has no `device`
    # parameter (its argument is named `processor_handle`).
    lambda: amdsmi.amdsmi_get_gpu_activity(devices[0]),
    max_retries=3,
)
print(f"Activity: {activity}")
finally:
amdsmi.amdsmi_shut_down()

Many AMDSMI functions return "N/A" for unavailable metrics:
import amdsmi
def safe_get_numeric_metric(metric_value, default=0):
    """Coerce a metric reading to a number, substituting ``default`` if unusable.

    Args:
        metric_value: Raw reading; may be a number, a numeric string,
            the placeholder ``"N/A"``, or ``None``.
        default: Value returned when the reading is missing or cannot be
            converted.

    Returns:
        ``metric_value`` unchanged when it is already an ``int`` or
        ``float``, ``float(metric_value)`` when that conversion succeeds,
        and ``default`` otherwise.
    """
    unavailable = metric_value == "N/A" or metric_value is None
    if unavailable:
        return default
    if not isinstance(metric_value, (int, float)):
        try:
            converted = float(metric_value)
        except (ValueError, TypeError):
            converted = default
        return converted
    # Already numeric: hand it back as-is, without forcing it to float.
    return metric_value
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
activity = amdsmi.amdsmi_get_gpu_activity(devices[0])
# Handle N/A values
gfx_activity = safe_get_numeric_metric(activity.get('gfx_activity'), 0)
umc_activity = safe_get_numeric_metric(activity.get('umc_activity'), 0)
print(f"GFX: {gfx_activity}%, UMC: {umc_activity}%")
finally:
amdsmi.amdsmi_shut_down()

Handle temperature conversions (millidegrees to Celsius):
import amdsmi
def get_temperature_celsius(device, sensor_type, metric):
    """Read one temperature metric and scale it by 1/1000.

    Args:
        device: Processor handle to query.
        sensor_type: Sensor selector passed to ``amdsmi_get_temp_metric``.
        metric: Metric selector passed to ``amdsmi_get_temp_metric``.

    Returns:
        The reading divided by 1000.0.

    NOTE(review): this treats the raw reading as millidegrees Celsius.
    Confirm the unit against the installed AMD SMI version before relying
    on the result being degrees Celsius.
    """
    raw_reading = amdsmi.amdsmi_get_temp_metric(device, sensor_type, metric)
    millidegrees_per_degree = 1000.0
    return raw_reading / millidegrees_per_degree
def get_all_temperatures(device):
    """Collect the current temperature from each known sensor type.

    Sensors that cannot be read are skipped rather than reported, so the
    result may contain fewer entries than sensor types tried (possibly none).

    Args:
        device: Processor handle to query.

    Returns:
        Dict mapping the sensor enum member's ``name`` to the value returned
        by ``get_temperature_celsius`` for that sensor.
    """
    sensor_types = (
        amdsmi.AmdSmiTemperatureType.EDGE,
        amdsmi.AmdSmiTemperatureType.HOTSPOT,
        amdsmi.AmdSmiTemperatureType.JUNCTION,
        amdsmi.AmdSmiTemperatureType.VRAM,
    )
    temperatures = {}
    for sensor_type in sensor_types:
        try:
            temperatures[sensor_type.name] = get_temperature_celsius(
                device,
                sensor_type,
                amdsmi.AmdSmiTemperatureMetric.CURRENT,
            )
        except (amdsmi.AmdSmiLibraryException, TypeError):
            # Library error: the sensor is not available on this device.
            # TypeError: presumably a non-numeric placeholder such as "N/A"
            # came back and could not be scaled. Anything else (including
            # KeyboardInterrupt) is a real problem and must propagate.
            continue
    return temperatures
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
temps = get_all_temperatures(devices[0])
for sensor, temp in temps.items():
print(f"{sensor}: {temp:.1f}°C")
finally:
amdsmi.amdsmi_shut_down()

Handle memory size conversions:
import amdsmi
def format_bytes(bytes_value):
    """Render a byte count with two decimals and a 1024-based unit suffix.

    Args:
        bytes_value: Size in bytes.

    Returns:
        A string such as ``"1.50 KB"``. The value is divided by 1024 until
        it drops below 1024 or the units B, KB, MB, GB, TB are exhausted;
        anything larger is expressed in PB.
    """
    units = ('B', 'KB', 'MB', 'GB', 'TB')
    scaled = bytes_value
    position = 0
    while position < len(units):
        if scaled < 1024.0:
            return f"{scaled:.2f} {units[position]}"
        scaled /= 1024.0
        position += 1
    # Fell off the end of the table: report whatever is left in petabytes.
    return f"{scaled:.2f} PB"
def get_memory_info(device):
    """Summarize VRAM usage for one device.

    Args:
        device: Processor handle to query.

    Returns:
        Dict with:
            ``total`` / ``used`` / ``free``: raw values as reported by
                ``amdsmi_get_gpu_vram_usage`` (``free`` is total minus used).
            ``usage_percent``: used as a percentage of total; ``0.0`` when
                total is zero.
            ``total_formatted`` / ``used_formatted`` / ``free_formatted``:
                human-readable sizes produced by ``format_bytes``.
    """
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    total = vram['vram_total']
    used = vram['vram_used']
    free = total - used

    # amdsmi_get_gpu_vram_usage reports megabytes, while format_bytes expects
    # a byte count, so scale before formatting. The raw keys stay untouched
    # for existing callers.
    bytes_per_mb = 1024 * 1024

    return {
        'total': total,
        'used': used,
        'free': free,
        # Guard against a device that reports no VRAM at all.
        'usage_percent': (used / total) * 100 if total else 0.0,
        'total_formatted': format_bytes(total * bytes_per_mb),
        'used_formatted': format_bytes(used * bytes_per_mb),
        'free_formatted': format_bytes(free * bytes_per_mb),
    }
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
mem_info = get_memory_info(devices[0])
print(f"Total: {mem_info['total_formatted']}")
print(f"Used: {mem_info['used_formatted']}")
print(f"Free: {mem_info['free_formatted']}")
print(f"Usage: {mem_info['usage_percent']:.1f}%")
finally:
amdsmi.amdsmi_shut_down()

Safely configure GPU partitioning with proper error handling:
import amdsmi
from amdsmi import (
AmdSmiComputePartitionType,
AmdSmiMemoryPartitionType,
AmdSmiLibraryException
)
def configure_partitioning_safe(device, compute_part, memory_part):
    """Try to set compute and memory partition modes, reporting the outcome.

    Args:
        device: Processor handle to configure.
        compute_part: Compute partition mode passed to
            ``amdsmi_set_gpu_compute_partition``.
        memory_part: Memory partition mode passed to
            ``amdsmi_set_gpu_memory_partition``; its ``name`` is compared
            against the advertised ``partition_caps`` when those are known.

    Returns:
        ``(success, message)``. ``success`` is False when partitioning is
        unsupported, the requested memory mode is not advertised, or either
        setter fails. If the compute setter succeeds and the memory setter
        then fails, the compute change is not rolled back.

    Raises:
        AmdSmiLibraryException: If the initial support probe fails for a
            reason other than NOT_SUPPORTED.
    """
    # Support probe: the current values are not needed, only whether the
    # getters work on this device.
    try:
        amdsmi.amdsmi_get_gpu_compute_partition(device)
        amdsmi.amdsmi_get_gpu_memory_partition(device)
    except AmdSmiLibraryException as e:
        if "NOT_SUPPORTED" in str(e):
            return False, "Partitioning not supported on this device"
        raise

    # Best-effort capability check: if the config cannot be queried, or it
    # does not list capabilities, go ahead and let the setter decide.
    try:
        mem_config = amdsmi.amdsmi_get_gpu_memory_partition_config(device)
    except AmdSmiLibraryException:
        available_modes = None
    else:
        available_modes = mem_config.get('partition_caps')
    if available_modes is not None and memory_part.name not in available_modes:
        return False, f"Memory partition mode {memory_part.name} not available"

    # Attempt to set partitions
    try:
        amdsmi.amdsmi_set_gpu_compute_partition(device, compute_part)
        amdsmi.amdsmi_set_gpu_memory_partition(device, memory_part)
    except AmdSmiLibraryException as e:
        return False, f"Failed to configure: {e}"
    return True, "Partitioning configured (GPU reset may be required)"
# Usage
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
if devices:
success, message = configure_partitioning_safe(
devices[0],
AmdSmiComputePartitionType.DPX,
AmdSmiMemoryPartitionType.NPS2
)
print(f"Result: {message}")
finally:
amdsmi.amdsmi_shut_down()