AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Real-time event monitoring, GPU reset detection, and system state change notifications. Essential for responsive monitoring applications and automated system management.
Get real-time notifications of GPU events and state changes.
amdsmi_status_t amdsmi_get_gpu_event_notification(int timeout_ms,
uint32_t* num_elem,
amdsmi_evt_notification_data_t* data);

Event Notification Data Structure:
typedef struct {
amdsmi_processor_handle processor_handle; // GPU that generated the event
amdsmi_evt_notification_type_t event; // Event type
char message[AMDSMI_MAX_STRING_LENGTH]; // Event message
} amdsmi_evt_notification_data_t;

Event Types:
typedef enum {
AMDSMI_EVT_NOTIF_VMFAULT, // VM fault event
AMDSMI_EVT_NOTIF_FIRST, // First event marker
AMDSMI_EVT_NOTIF_LAST, // Last event marker
AMDSMI_EVT_NOTIF_GPU_PRE_RESET, // GPU pre-reset notification
AMDSMI_EVT_NOTIF_GPU_POST_RESET // GPU post-reset notification
} amdsmi_evt_notification_type_t;

Configure which events to monitor by setting notification masks.
amdsmi_status_t amdsmi_set_gpu_event_notification_mask(amdsmi_processor_handle processor_handle,
uint64_t mask);

Event Mask Bits:
#define AMDSMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - AMDSMI_EVT_NOTIF_FIRST))

Usage Example:
// Enable GPU reset notifications
uint64_t mask = AMDSMI_EVENT_MASK_FROM_INDEX(AMDSMI_EVT_NOTIF_GPU_PRE_RESET) |
AMDSMI_EVENT_MASK_FROM_INDEX(AMDSMI_EVT_NOTIF_GPU_POST_RESET);
amdsmi_status_t status = amdsmi_set_gpu_event_notification_mask(gpu_handle, mask);
if (status == AMDSMI_STATUS_SUCCESS) {
printf("Event notifications enabled for GPU resets\n");
// Monitor for events with 5 second timeout
amdsmi_evt_notification_data_t events[10];
uint32_t num_events = 10;
status = amdsmi_get_gpu_event_notification(5000, &num_events, events);
if (status == AMDSMI_STATUS_SUCCESS) {
for (uint32_t i = 0; i < num_events; i++) {
printf("Event: %d, Message: %s\n",
events[i].event, events[i].message);
}
} else if (status == AMDSMI_STATUS_INTERRUPT) {
printf("Event monitoring timed out\n");
}
}

import amdsmi
import time

gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
if gpu_handles:
    gpu_handle = gpu_handles[0]
    try:
        # Enable event notifications for GPU resets
        reset_mask = (amdsmi.AmdSmiEventType.GPU_PRE_RESET |
                      amdsmi.AmdSmiEventType.GPU_POST_RESET)
        amdsmi.amdsmi_set_gpu_event_notification_mask(gpu_handle, reset_mask)
        print("Monitoring GPU events... (Press Ctrl+C to stop)")
        while True:
            try:
                # Wait for events with 2 second timeout
                events = amdsmi.amdsmi_get_gpu_event_notification(2000)
                for event in events:
                    print(f"GPU Event: {event['event']}")
                    print(f"Message: {event['message']}")
                    print(f"GPU Handle: {event['processor_handle']}")
                    print("-" * 40)
            except amdsmi.AmdSmiTimeoutException:
                # Timeout is normal - continue monitoring
                continue
    except KeyboardInterrupt:
        print("Event monitoring stopped")
    except amdsmi.AmdSmiException as e:
        print(f"Event monitoring error: {e}")

use amdsmi::{set_gpu_event_notification_mask, get_gpu_event_notification};
use amdsmi::{EventNotificationType, EvtNotificationData};
use std::time::Duration;
// Set up event monitoring
let reset_events = EventNotificationType::GpuPreReset as u64 |
EventNotificationType::GpuPostReset as u64;
match set_gpu_event_notification_mask(gpu_handle, reset_events) {
Ok(_) => {
println!("Event monitoring enabled");
// Monitor events in a loop
loop {
match get_gpu_event_notification(Duration::from_secs(2)) {
Ok(events) => {
for event in events {
println!("GPU Event: {:?}", event.event);
println!("Message: {}", event.message);
println!("---");
}
},
Err(amdsmi::AmdsmiError::Timeout) => {
// Timeout is normal, continue monitoring
continue;
},
Err(e) => {
println!("Event monitoring error: {:?}", e);
break;
}
}
}
},
Err(e) => println!("Failed to enable event monitoring: {:?}", e),
}

Install with Tessl CLI
npx tessl i tessl/go-amdsmi