AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling. These capabilities are critical for understanding memory health and optimizing memory-intensive workloads.
Monitor GPU memory usage across different memory types and regions.
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
amdsmi_memory_type_t mem_type,
amdsmi_memory_usage_t* usage);
Memory Types:
typedef enum {
AMDSMI_MEMORY_TYPE_VRAM, // Video RAM (on-device memory)
AMDSMI_MEMORY_TYPE_VIS_VRAM, // Visible VRAM (CPU accessible)
AMDSMI_MEMORY_TYPE_GTT // Graphics Translation Table
} amdsmi_memory_type_t;
Memory Usage Structure:
typedef struct {
uint64_t used; // Used memory in bytes
uint64_t total; // Total memory in bytes
} amdsmi_memory_usage_t;
Get total memory capacity for different memory types.
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
amdsmi_memory_type_t mem_type,
uint64_t* total);
Monitor memory reliability by checking for bad memory pages.
amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,
uint32_t* num_pages,
amdsmi_retired_page_record_t* info);
Retired Page Record Structure:
typedef struct {
uint64_t page_address; // Physical address of retired page
uint64_t page_size; // Size of retired page in bytes
amdsmi_memory_page_status_t status; // Page retirement status
} amdsmi_retired_page_record_t;
Page Status Types:
typedef enum {
AMDSMI_MEM_PAGE_STATUS_RESERVED, // Page is reserved (retired) and not available for use
AMDSMI_MEM_PAGE_STATUS_PENDING, // Page is marked bad and will be reserved at the next window
AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Page could not be reserved
} amdsmi_memory_page_status_t;
Get and manage GPU memory partitioning information.
amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,
char* memory_partition,
uint32_t len);
amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
amdsmi_memory_partition_type_t partition_type);
Memory Partition Types:
typedef enum {
AMDSMI_MEMORY_PARTITION_UNKNOWN, // Unknown partition
AMDSMI_MEMORY_PARTITION_NPS1, // 1 NUMA node per socket
AMDSMI_MEMORY_PARTITION_NPS2, // 2 NUMA nodes per socket
AMDSMI_MEMORY_PARTITION_NPS4, // 4 NUMA nodes per socket
AMDSMI_MEMORY_PARTITION_NPS8 // 8 NUMA nodes per socket
} amdsmi_memory_partition_type_t;
import amdsmi
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
if gpu_handles:
gpu_handle = gpu_handles[0]
# Get VRAM usage
vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
amdsmi.AmdSmiMemoryType.VRAM)
vram_used_gb = vram_usage['used'] / (1024**3)
vram_total_gb = vram_usage['total'] / (1024**3)
vram_percent = (vram_usage['used'] / vram_usage['total']) * 100
print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")
# Get visible VRAM usage
vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
amdsmi.AmdSmiMemoryType.VIS_VRAM)
print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")
# Check for bad pages
try:
bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)
if bad_pages:
print(f"Found {len(bad_pages)} bad memory pages")
for page in bad_pages:
print(f" Address: 0x{page['page_address']:X}, Status: {page['status']}")
else:
print("No bad memory pages detected")
except amdsmi.AmdSmiException:
print("Bad page information not available")
import "github.com/ROCm/amdsmi"
// Get memory information for each GPU
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
// Get memory usage
memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)
memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)
memPercent := float64(memUsed) / float64(memTotal) * 100
fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n",
i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)
// Get busy memory percentage
memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)
fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)
}
use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};
use amdsmi::{MemoryType, MemoryUsage};
// Get comprehensive memory information
let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;
let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));
let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));
let vram_percent = (vram_usage.used as f64 / vram_usage.total as f64) * 100.0;
println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)",
vram_used_gb, vram_total_gb, vram_percent);
// Check memory health
match get_gpu_bad_page_info(gpu_handle) {
Ok(bad_pages) => {
if bad_pages.is_empty() {
println!("No bad memory pages detected");
} else {
println!("Found {} bad memory pages", bad_pages.len());
for page in bad_pages {
println!(" Address: 0x{:X}, Status: {:?}",
page.page_address, page.status);
}
}
},
Err(e) => println!("Could not get bad page info: {:?}", e),
}
Install with Tessl CLI
npx tessl i tessl/go-amdsmi