tessl/go-amdsmi

AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring

—

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview

Eval results

Files

Memory Management

Name: tessl/go-amdsmi
Author: tessl

This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling. These capabilities are critical for understanding memory health and for optimizing memory-intensive workloads.

Capabilities

Memory Usage Information

Monitor GPU memory usage across different memory types and regions.

// Query the memory usage of one GPU for a single memory type.
//
//   processor_handle  handle identifying the GPU to query
//   mem_type          memory region to report on (an amdsmi_memory_type_t value)
//   usage             pointer to an amdsmi_memory_usage_t; presumably filled in
//                     by the call on success (output parameter) - confirm
//
// Returns an amdsmi_status_t status code; the individual status values are
// not listed in this section.
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
                                           amdsmi_memory_type_t mem_type,
                                           amdsmi_memory_usage_t* usage);

Memory Types:

// Memory regions that can be selected through the mem_type parameter of the
// memory usage and total-capacity queries.
typedef enum {
    AMDSMI_MEMORY_TYPE_VRAM,        // Video RAM (on-device memory)
    AMDSMI_MEMORY_TYPE_VIS_VRAM,    // Visible VRAM (the CPU-accessible part of VRAM)
    AMDSMI_MEMORY_TYPE_GTT          // Graphics Translation Table memory
} amdsmi_memory_type_t;

Memory Usage Structure:

// Used/total pair for one memory type; this is the type of the `usage`
// parameter of amdsmi_get_gpu_memory_usage. Both fields are byte counts.
typedef struct {
    uint64_t used;          // Used memory in bytes
    uint64_t total;         // Total memory in bytes
} amdsmi_memory_usage_t;

Total Memory Capacity

Get total memory capacity for different memory types.

// Query the total capacity of one GPU for a single memory type.
//
//   processor_handle  handle identifying the GPU to query
//   mem_type          memory region to report on (an amdsmi_memory_type_t value)
//   total             pointer to a uint64_t; presumably receives the capacity
//                     on success - confirm. Units are presumably bytes, as in
//                     amdsmi_memory_usage_t - confirm.
//
// Returns an amdsmi_status_t status code.
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
                                           amdsmi_memory_type_t mem_type,
                                           uint64_t* total);

Bad Page Information

Monitor memory reliability by checking for bad memory pages.

// Retrieve the bad (retired) memory page records of one GPU.
//
//   processor_handle  handle identifying the GPU to query
//   num_pages         pointer to a page count. NOTE(review): presumably an
//                     in/out value (capacity of `info` on input, number of
//                     records on output) - confirm against the amdsmi header.
//   info              pointer to amdsmi_retired_page_record_t storage for the
//                     per-page records
//
// Returns an amdsmi_status_t status code.
amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,
                                            uint32_t* num_pages,
                                            amdsmi_retired_page_record_t* info);

Retired Page Record Structure:

// One bad-page record; this is the element type of the `info` parameter of
// amdsmi_get_gpu_bad_page_info.
typedef struct {
    uint64_t page_address;          // Physical address of retired page
    uint64_t page_size;             // Size of retired page in bytes
    amdsmi_memory_page_status_t status; // Page retirement status (see amdsmi_memory_page_status_t)
} amdsmi_retired_page_record_t;

Page Status Types:

// Retirement state of a memory page, as carried in the `status` field of
// amdsmi_retired_page_record_t.
//
// NOTE(review): this page originally described RESERVED as "reserved but
// usable". The upstream amdsmi.h appears to document RESERVED as reserved and
// NOT available for use, and UNRESERVABLE as "unable to reserve this page" -
// confirm against the header before relying on either description.
typedef enum {
    AMDSMI_MEM_PAGE_STATUS_RESERVED,    // Page reserved (availability: see note above)
    AMDSMI_MEM_PAGE_STATUS_PENDING,     // Page pending retirement
    AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Page could not be reserved (see note above)
} amdsmi_memory_page_status_t;

Memory Partitioning

Get and manage GPU memory partitioning information.

// Read the memory partitioning information of one GPU as text.
//
//   processor_handle  handle identifying the GPU to query
//   memory_partition  caller-provided character buffer for the result
//   len               presumably the size of the memory_partition buffer in
//                     bytes - confirm
//
// Returns an amdsmi_status_t status code.
amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,
                                                    char* memory_partition,
                                                    uint32_t len);

// Set the memory partition mode of one GPU.
//
//   processor_handle  handle identifying the GPU to reconfigure
//   partition_type    requested mode (an amdsmi_memory_partition_type_t value)
//
// Returns an amdsmi_status_t status code.
// NOTE(review): required privileges and whether a reset/reload is needed for
// the change to take effect are not stated here - confirm before use.
amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
                                               amdsmi_memory_partition_type_t partition_type);

Memory Partition Types:

// Memory partition modes accepted by amdsmi_set_gpu_memory_partition.
// The NPSn names give the number of NUMA nodes per socket.
typedef enum {
    AMDSMI_MEMORY_PARTITION_UNKNOWN,    // Unknown partition
    AMDSMI_MEMORY_PARTITION_NPS1,       // 1 NUMA node per socket
    AMDSMI_MEMORY_PARTITION_NPS2,       // 2 NUMA nodes per socket
    AMDSMI_MEMORY_PARTITION_NPS4,       // 4 NUMA nodes per socket
    AMDSMI_MEMORY_PARTITION_NPS8        // 8 NUMA nodes per socket
} amdsmi_memory_partition_type_t;

Language Interface Examples

Python

import amdsmi

# Enumerate the AMD GPUs and report memory information for the first one, if any.
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
if gpu_handles:
    gpu_handle = gpu_handles[0]

    # Get VRAM usage ('used' and 'total' are byte counts)
    vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
                                                   amdsmi.AmdSmiMemoryType.VRAM)
    vram_used_gb = vram_usage['used'] / (1024**3)
    vram_total_gb = vram_usage['total'] / (1024**3)
    # Guard the percentage: a reported total of 0 would otherwise raise
    # ZeroDivisionError and abort the rest of the report.
    if vram_usage['total']:
        vram_percent = (vram_usage['used'] / vram_usage['total']) * 100
    else:
        vram_percent = 0.0

    print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")

    # Get visible VRAM usage
    vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
                                                 amdsmi.AmdSmiMemoryType.VIS_VRAM)
    print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")

    # Check for bad pages; the query is treated as optional because it can
    # raise AmdSmiException when the information is unavailable.
    try:
        bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)
        if bad_pages:
            print(f"Found {len(bad_pages)} bad memory pages")
            for page in bad_pages:
                print(f"  Address: 0x{page['page_address']:X}, Status: {page['status']}")
        else:
            print("No bad memory pages detected")
    except amdsmi.AmdSmiException:
        print("Bad page information not available")

Go

import "github.com/ROCm/amdsmi"

// Get memory information for each GPU
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
    // Get memory usage
    memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)
    memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)
    memPercent := float64(memUsed) / float64(memTotal) * 100
    
    fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n", 
               i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)
    
    // Get busy memory percentage
    memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)
    fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)
}

Rust

use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};
use amdsmi::{MemoryType, MemoryUsage};

// Get comprehensive memory information (byte counts are converted for display)
let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;
let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));
let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));
// Guard the percentage: with a reported total of 0 the division would
// produce NaN or inf in the printed output.
let vram_total = vram_usage.total as f64;
let vram_percent = if vram_total > 0.0 {
    (vram_usage.used as f64 / vram_total) * 100.0
} else {
    0.0
};

println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)",
         vram_used_gb, vram_total_gb, vram_percent);

// Check memory health: report a failed query, an empty result, or each
// retired page in turn.
match get_gpu_bad_page_info(gpu_handle) {
    Err(e) => println!("Could not get bad page info: {:?}", e),
    Ok(bad_pages) if bad_pages.is_empty() => {
        println!("No bad memory pages detected");
    }
    Ok(bad_pages) => {
        println!("Found {} bad memory pages", bad_pages.len());
        for page in bad_pages {
            println!("  Address: 0x{:X}, Status: {:?}",
                     page.page_address, page.status);
        }
    }
}

Memory Management Best Practices

- Regular Monitoring: Monitor VRAM usage to prevent out-of-memory conditions.
- Memory Health: Check for bad pages regularly, especially in data center environments.
- Visible VRAM: Monitor visible VRAM to spot CPU-GPU data transfer bottlenecks.
- Memory Partitioning: Understand how NUMA topology affects memory access patterns.
- Error Handling: Bad page queries may fail on older hardware or drivers, so handle the error instead of assuming the call succeeds.