AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
This section covers NUMA affinity, PCIe topology, XGMI links, reliability features, and error monitoring. These APIs are essential for multi-GPU systems and high-reliability computing environments.
Get information about GPU interconnect topology and nearest neighbor relationships.
amdsmi_status_t amdsmi_get_link_topology_nearest(amdsmi_processor_handle processor_handle,
amdsmi_link_type_t link_type,
amdsmi_topology_t* topology);

Link Types:
typedef enum {
AMDSMI_LINK_TYPE_PCIE, // PCIe links
AMDSMI_LINK_TYPE_XGMI, // XGMI (high-speed interconnect) links
AMDSMI_LINK_TYPE_UNKNOWN // Unknown link type
} amdsmi_link_type_t;

Get Reliability, Availability, and Serviceability (RAS) feature status.
amdsmi_status_t amdsmi_get_gpu_ras_feature_info(amdsmi_processor_handle processor_handle,
amdsmi_gpu_block_t block,
amdsmi_ras_feature_t* ras_feature);

GPU Blocks:
typedef enum {
AMDSMI_GPU_BLOCK_UMC, // Unified Memory Controller
AMDSMI_GPU_BLOCK_SDMA, // System DMA
AMDSMI_GPU_BLOCK_GFX, // Graphics block
AMDSMI_GPU_BLOCK_MMHUB, // Memory Management Hub
AMDSMI_GPU_BLOCK_ATHUB, // Address Translation Hub
AMDSMI_GPU_BLOCK_PCIE_BIF, // PCIe Bus Interface
AMDSMI_GPU_BLOCK_HDP, // Host Data Path
AMDSMI_GPU_BLOCK_XGMI_WAFL, // XGMI Wide Area Fabric Link
AMDSMI_GPU_BLOCK_DF, // Data Fabric
AMDSMI_GPU_BLOCK_SMN, // System Management Network
AMDSMI_GPU_BLOCK_SEM, // SEM block
AMDSMI_GPU_BLOCK_MP0, // MP0 block
AMDSMI_GPU_BLOCK_MP1, // MP1 block
AMDSMI_GPU_BLOCK_FUSE // Fuse block
} amdsmi_gpu_block_t;

Monitor error counts for different GPU blocks to assess system health.
amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle,
amdsmi_gpu_block_t block,
amdsmi_error_count_t* ec);

Error Count Structure:
typedef struct {
uint64_t correctable_count; // Correctable error count
uint64_t uncorrectable_count; // Uncorrectable error count
} amdsmi_error_count_t;

Get NUMA node affinity information for optimal memory allocation.
amdsmi_status_t amdsmi_get_gpu_numa_affinity(amdsmi_processor_handle processor_handle,
uint32_t* numa_node);

import amdsmi
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
for i, gpu_handle in enumerate(gpu_handles):
print(f"GPU {i}:")
# Get NUMA affinity
try:
numa_node = amdsmi.amdsmi_get_gpu_numa_affinity(gpu_handle)
print(f" NUMA Node: {numa_node}")
except amdsmi.AmdSmiException:
print(" NUMA affinity not available")
# Check RAS features for UMC block
try:
ras_info = amdsmi.amdsmi_get_gpu_ras_feature_info(gpu_handle,
amdsmi.AmdSmiGpuBlock.UMC)
print(f" UMC RAS enabled: {ras_info['ras_ecc_enabled']}")
except amdsmi.AmdSmiException:
print(" RAS information not available")
# Get error counts
try:
ecc_count = amdsmi.amdsmi_get_gpu_ecc_count(gpu_handle,
amdsmi.AmdSmiGpuBlock.UMC)
print(f" ECC Errors - Correctable: {ecc_count['correctable_count']}, "
f"Uncorrectable: {ecc_count['uncorrectable_count']}")
except amdsmi.AmdSmiException:
print(" ECC error counts not available")

use amdsmi::{get_gpu_numa_affinity, get_gpu_ras_feature_info, get_gpu_ecc_count};
use amdsmi::{GpuBlock, ErrorCount};
// Get topology and reliability information
match get_gpu_numa_affinity(gpu_handle) {
Ok(numa_node) => println!("NUMA Node: {}", numa_node),
Err(_) => println!("NUMA affinity not available"),
}
// Check RAS features
match get_gpu_ras_feature_info(gpu_handle, GpuBlock::UMC) {
Ok(ras_info) => println!("UMC RAS enabled: {}", ras_info.ras_ecc_enabled),
Err(_) => println!("RAS information not available"),
}
// Monitor error counts
match get_gpu_ecc_count(gpu_handle, GpuBlock::UMC) {
Ok(ErrorCount { correctable_count, uncorrectable_count }) => {
println!("ECC Errors - Correctable: {}, Uncorrectable: {}",
correctable_count, uncorrectable_count);
},
Err(_) => println!("ECC error counts not available"),
}

Install with Tessl CLI
npx tessl i tessl/go-amdsmi