This document provides real-world usage examples for the AMD SMI Python library (amdsmi).

The first example builds a GPU monitoring dashboard that refreshes every two seconds:
import amdsmi
import time

def monitor_gpu_dashboard():
    """Continuously monitor GPU metrics and display a dashboard."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        while True:
            print("\n" + "=" * 80)
            print("GPU Monitoring Dashboard")
            print("=" * 80)
            for i, device in enumerate(devices):
                # Get device info
                asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
                print(f"\nGPU {i}: {asic_info['market_name']}")
                print("-" * 80)

                # Utilization
                activity = amdsmi.amdsmi_get_gpu_activity(device)
                print("  Utilization:")
                print(f"    GFX: {activity['gfx_activity']}%")
                print(f"    Memory: {activity['umc_activity']}%")
                print(f"    Multimedia: {activity['mm_activity']}%")

                # Temperature
                temp = amdsmi.amdsmi_get_temp_metric(
                    device,
                    amdsmi.AmdSmiTemperatureType.EDGE,
                    amdsmi.AmdSmiTemperatureMetric.CURRENT
                )
                print(f"  Temperature: {temp / 1000.0:.1f}°C")

                # Power
                power = amdsmi.amdsmi_get_power_info(device)
                print(f"  Power: {power['current_socket_power'] / 1000000.0:.2f}W")
                print(f"  Average Power: {power['average_socket_power'] / 1000000.0:.2f}W")

                # Memory
                vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
                used_gb = vram['vram_used'] / (1024**3)
                total_gb = vram['vram_total'] / (1024**3)
                usage_pct = (vram['vram_used'] / vram['vram_total']) * 100
                print(f"  VRAM: {used_gb:.2f} GB / {total_gb:.2f} GB ({usage_pct:.1f}%)")

                # Clocks
                gfx_clock = amdsmi.amdsmi_get_clock_info(
                    device, amdsmi.AmdSmiClkType.GFX
                )
                mem_clock = amdsmi.amdsmi_get_clock_info(
                    device, amdsmi.AmdSmiClkType.MEM
                )
                print("  Clocks:")
                print(f"    GFX: {gfx_clock.get('current_clk', 'N/A')} MHz")
                print(f"    Memory: {mem_clock.get('current_clk', 'N/A')} MHz")

                # Processes
                processes = amdsmi.amdsmi_get_gpu_process_list(device)
                if processes:
                    print(f"  Processes: {len(processes)}")
                    for proc in processes[:5]:  # Show first 5
                        print(f"    PID {proc.get('pid', 'N/A')}: "
                              f"{proc.get('mem_usage', 0) / (1024**2):.2f} MB")

            time.sleep(2)  # Update every 2 seconds
    except KeyboardInterrupt:
        print("\nMonitoring stopped.")
    finally:
        amdsmi.amdsmi_shut_down()

if __name__ == "__main__":
    monitor_gpu_dashboard()
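For headless hosts, the same calls can feed a log file instead of a live display. Below is a minimal sketch, assuming the same unit conversions as the dashboard above; the log_metrics_csv helper, its file name, and its sampling parameters are illustrative, not part of the library:

import csv
import time

import amdsmi

def log_metrics_csv(path="gpu_metrics.csv", samples=30, interval=2.0):
    """Write one CSV row per GPU per sampling interval."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        with open(path, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["timestamp", "gpu", "gfx_util_pct", "temp_c", "power_w"])
            for _ in range(samples):
                now = time.time()
                for i, device in enumerate(devices):
                    activity = amdsmi.amdsmi_get_gpu_activity(device)
                    temp = amdsmi.amdsmi_get_temp_metric(
                        device,
                        amdsmi.AmdSmiTemperatureType.EDGE,
                        amdsmi.AmdSmiTemperatureMetric.CURRENT,
                    )
                    power = amdsmi.amdsmi_get_power_info(device)
                    writer.writerow([
                        now,
                        i,
                        activity["gfx_activity"],
                        temp / 1000.0,  # Millidegrees to °C, as in the dashboard
                        power["current_socket_power"] / 1000000.0,
                    ])
                time.sleep(interval)
    finally:
        amdsmi.amdsmi_shut_down()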
Monitor and balance workloads across multiple GPUs:

import amdsmi

def find_least_utilized_gpu():
    """Find the GPU with the lowest GFX utilization.

    Assumes the caller has already initialized the library, so the
    returned handle stays valid until amdsmi_shut_down() is called.
    """
    devices = amdsmi.amdsmi_get_processor_handles()
    if not devices:
        return None
    best_device = None
    lowest_util = float("inf")  # Handles the case where every GPU is at 100%
    for device in devices:
        activity = amdsmi.amdsmi_get_gpu_activity(device)
        gfx_util = activity.get('gfx_activity', 0)
        if isinstance(gfx_util, (int, float)) and gfx_util < lowest_util:
            lowest_util = gfx_util
            best_device = device
    return best_device

def get_gpu_memory_availability(device):
    """Get available GPU memory in GB."""
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    return (vram['vram_total'] - vram['vram_used']) / (1024**3)

# Usage: keep the library initialized for as long as the handle is in use
amdsmi.amdsmi_init()
try:
    device = find_least_utilized_gpu()
    if device:
        available_mem = get_gpu_memory_availability(device)
        print(f"Available memory: {available_mem:.2f} GB")
finally:
    amdsmi.amdsmi_shut_down()
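Utilization alone can be misleading for placement, since a lightly loaded GPU may still lack free VRAM. The variant below checks both conditions; the pick_device_for_job helper and its two thresholds are illustrative, and it assumes amdsmi_init() has already been called so the returned handle remains valid:

import amdsmi

def pick_device_for_job(min_free_gb=4.0, max_gfx_util=50):
    """Return the first GPU meeting both thresholds, or None."""
    for device in amdsmi.amdsmi_get_processor_handles():
        activity = amdsmi.amdsmi_get_gpu_activity(device)
        # Treat a missing reading as busy rather than idle
        if activity.get('gfx_activity', 100) > max_gfx_util:
            continue
        vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
        free_gb = (vram['vram_total'] - vram['vram_used']) / (1024**3)
        if free_gb < min_free_gb:
            continue
        return device
    return None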
Monitor GPU temperatures and alert on high temperatures:

import amdsmi

def check_thermal_status(device, warning_temp=80, critical_temp=90):
    """Check GPU thermal status and return a summary dict."""
    temp = amdsmi.amdsmi_get_temp_metric(
        device,
        amdsmi.AmdSmiTemperatureType.EDGE,
        amdsmi.AmdSmiTemperatureMetric.CURRENT
    )
    temp_c = temp / 1000.0

    # Get the critical threshold; fall back to the default if unsupported
    try:
        critical = amdsmi.amdsmi_get_temp_metric(
            device,
            amdsmi.AmdSmiTemperatureType.EDGE,
            amdsmi.AmdSmiTemperatureMetric.CRITICAL
        )
        critical_c = critical / 1000.0
    except amdsmi.AmdSmiException:
        critical_c = critical_temp

    status = "OK"
    if temp_c >= critical_c:
        status = "CRITICAL"
    elif temp_c >= warning_temp:
        status = "WARNING"

    return {
        'temperature': temp_c,
        'critical_threshold': critical_c,
        'status': status
    }

def monitor_thermal_all_gpus():
    """Monitor thermal status for all GPUs."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        for i, device in enumerate(devices):
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            thermal = check_thermal_status(device)
            print(f"GPU {i} ({asic_info['market_name']}):")
            print(f"  Temperature: {thermal['temperature']:.1f}°C")
            print(f"  Status: {thermal['status']}")
            if thermal['status'] != "OK":
                print(f"  ⚠️ Alert: Temperature is {thermal['status']}")
    finally:
        amdsmi.amdsmi_shut_down()
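The edge sensor is only one measurement point; many ASICs also expose junction (hotspot) and VRAM sensors that typically run hotter. The sketch below probes several sensor types and skips any the hardware does not report; the JUNCTION and VRAM members are assumed from the library's temperature-type enum, so verify them against your amdsmi version:

import amdsmi

# Sensor availability varies by ASIC; unreadable sensors are skipped.
SENSORS = {
    "edge": amdsmi.AmdSmiTemperatureType.EDGE,
    "junction": amdsmi.AmdSmiTemperatureType.JUNCTION,  # assumed enum member
    "vram": amdsmi.AmdSmiTemperatureType.VRAM,          # assumed enum member
}

def read_all_temps(device):
    """Return a dict of sensor name -> °C for every readable sensor."""
    temps = {}
    for name, sensor in SENSORS.items():
        try:
            raw = amdsmi.amdsmi_get_temp_metric(
                device, sensor, amdsmi.AmdSmiTemperatureMetric.CURRENT
            )
            temps[name] = raw / 1000.0
        except amdsmi.AmdSmiException:
            pass  # Sensor not present on this ASIC
    return temps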
Track and analyze GPU power consumption:

import amdsmi
import time
from collections import deque

class PowerTracker:
    """Track GPU power consumption over time."""

    def __init__(self, device, window_size=60):
        self.device = device
        self.power_history = deque(maxlen=window_size)
        self.start_time = time.time()

    def sample(self):
        """Take a power sample."""
        power = amdsmi.amdsmi_get_power_info(self.device)
        power_w = power['current_socket_power'] / 1000000.0
        self.power_history.append(power_w)
        return power_w

    def get_stats(self):
        """Get power statistics."""
        if not self.power_history:
            return None
        return {
            'current': self.power_history[-1],
            'average': sum(self.power_history) / len(self.power_history),
            'min': min(self.power_history),
            'max': max(self.power_history),
            'samples': len(self.power_history)
        }

    def estimate_energy(self, duration_seconds):
        """Estimate energy consumption in kWh."""
        if not self.power_history:
            return None
        avg_power_w = sum(self.power_history) / len(self.power_history)
        energy_wh = (avg_power_w * duration_seconds) / 3600
        return energy_wh / 1000  # Convert Wh to kWh

# Usage
amdsmi.amdsmi_init()
try:
    devices = amdsmi.amdsmi_get_processor_handles()
    if devices:
        tracker = PowerTracker(devices[0])
        # Sample once per second for 60 seconds
        for _ in range(60):
            tracker.sample()
            time.sleep(1)
        stats = tracker.get_stats()
        print("Power Statistics:")
        print(f"  Current: {stats['current']:.2f}W")
        print(f"  Average: {stats['average']:.2f}W")
        print(f"  Min: {stats['min']:.2f}W")
        print(f"  Max: {stats['max']:.2f}W")
        print(f"  Estimated Energy (1 min): {tracker.estimate_energy(60):.4f} kWh")
finally:
    amdsmi.amdsmi_shut_down()
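To attribute energy to a specific job rather than a fixed time window, the tracker can sample from a background thread while the workload runs. The track_power_during helper below is an illustrative sketch that reuses the PowerTracker class above and assumes amdsmi calls may safely be made from a second thread:

import threading
import time

def track_power_during(workload, tracker, interval=1.0):
    """Sample power in a background thread while workload() runs."""
    stop = threading.Event()

    def sampler():
        while not stop.is_set():
            tracker.sample()
            stop.wait(interval)  # Sleep, but wake immediately on stop

    thread = threading.Thread(target=sampler, daemon=True)
    start = time.time()
    thread.start()
    try:
        workload()
    finally:
        stop.set()
        thread.join()
    # Energy estimate (kWh) over the actual workload duration
    return tracker.estimate_energy(time.time() - start)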
Monitor which processes are using GPU resources:

import amdsmi

def monitor_gpu_processes():
    """Monitor processes using GPU resources."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        for i, device in enumerate(devices):
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            print(f"\nGPU {i}: {asic_info['market_name']}")
            print("-" * 60)

            # Get process list
            processes = amdsmi.amdsmi_get_gpu_process_list(device)
            if not processes:
                print("  No processes using GPU")
                continue

            print(f"  Processes: {len(processes)}")
            print(f"  {'PID':<10} {'Memory (MB)':<15} {'Type':<10}")
            print("-" * 60)

            total_memory = 0
            for proc in processes:
                pid = proc.get('pid', 'N/A')
                mem_mb = proc.get('mem_usage', 0) / (1024**2)
                proc_type = proc.get('type', 'N/A')
                total_memory += mem_mb
                print(f"  {pid:<10} {mem_mb:<15.2f} {proc_type:<10}")

            print(f"\n  Total Memory Used: {total_memory:.2f} MB")
    finally:
        amdsmi.amdsmi_shut_down()
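A bare PID is rarely enough when triaging a busy GPU. On Linux hosts the PID can be resolved to a command name through /proc; the helper below is illustrative and returns "unknown" for exited or inaccessible processes:

def process_name(pid):
    """Best-effort lookup of a process name from /proc (Linux only)."""
    try:
        with open(f"/proc/{pid}/comm") as f:
            return f.read().strip()
    except (OSError, TypeError):
        return "unknown"  # Process exited, permission denied, or pid is 'N/A'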
Generate a comprehensive device information report:

import amdsmi

def generate_device_report():
    """Generate a comprehensive device information report."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        for i, device in enumerate(devices):
            print(f"\n{'='*80}")
            print(f"Device {i} Report")
            print(f"{'='*80}\n")

            # ASIC Information
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            print("ASIC Information:")
            print(f"  Market Name: {asic_info.get('market_name', 'N/A')}")
            print(f"  Vendor: {asic_info.get('vendor_name', 'N/A')}")
            print(f"  Device ID: {asic_info.get('device_id', 'N/A')}")

            # Driver Information
            try:
                driver_info = amdsmi.amdsmi_get_gpu_driver_info(device)
                print("\nDriver Information:")
                for key, value in driver_info.items():
                    print(f"  {key}: {value}")
            except amdsmi.AmdSmiException:
                print("\nDriver Information: Not available")

            # VBIOS Information
            try:
                vbios_info = amdsmi.amdsmi_get_gpu_vbios_info(device)
                print("\nVBIOS Information:")
                for key, value in vbios_info.items():
                    print(f"  {key}: {value}")
            except amdsmi.AmdSmiException:
                print("\nVBIOS Information: Not available")

            # Memory Information
            vram_info = amdsmi.amdsmi_get_gpu_vram_info(device)
            print("\nMemory Information:")
            print(f"  VRAM Type: {vram_info.get('vram_type', 'N/A')}")
            print(f"  VRAM Vendor: {vram_info.get('vram_vendor', 'N/A')}")
            print(f"  Bit Width: {vram_info.get('bit_width', 'N/A')} bits")

            # UUID and BDF
            try:
                uuid = amdsmi.amdsmi_get_gpu_device_uuid(device)
                bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
                print("\nDevice Identifiers:")
                print(f"  UUID: {uuid}")
                print(f"  BDF: {bdf}")
            except amdsmi.AmdSmiException:
                print("\nDevice Identifiers: Not available")
    finally:
        amdsmi.amdsmi_shut_down()
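For fleet inventory, a machine-readable report is often more useful than formatted text. The sketch below collects the same fields into a JSON document; the device_report_dict helper is illustrative, and default=str is used because some returned values may not be directly JSON-serializable:

import json

import amdsmi

def device_report_dict():
    """Collect per-device info as a JSON-serializable structure."""
    amdsmi.amdsmi_init()
    try:
        report = []
        for device in amdsmi.amdsmi_get_processor_handles():
            entry = {"asic": amdsmi.amdsmi_get_gpu_asic_info(device)}
            for key, getter in (
                ("driver", amdsmi.amdsmi_get_gpu_driver_info),
                ("vbios", amdsmi.amdsmi_get_gpu_vbios_info),
                ("vram", amdsmi.amdsmi_get_gpu_vram_info),
            ):
                try:
                    entry[key] = getter(device)
                except amdsmi.AmdSmiException:
                    entry[key] = None  # Field not supported on this device
            report.append(entry)
        return report
    finally:
        amdsmi.amdsmi_shut_down()

print(json.dumps(device_report_dict(), indent=2, default=str))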