CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/go-amdsmi

AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/memory.md

Memory Management

This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling, all of which are critical for understanding memory health and optimizing memory-intensive workloads.

Capabilities

Memory Usage Information

Monitor GPU memory usage across different memory types and regions.

amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
                                           amdsmi_memory_type_t mem_type,
                                           amdsmi_memory_usage_t* usage);

Memory Types:

typedef enum {
    AMDSMI_MEMORY_TYPE_VRAM,        // Video RAM (on-device memory)
    AMDSMI_MEMORY_TYPE_VIS_VRAM,    // Visible VRAM (CPU accessible)
    AMDSMI_MEMORY_TYPE_GTT          // Graphics Translation Table (system memory mapped for GPU access)
} amdsmi_memory_type_t;

Memory Usage Structure:

typedef struct {
    uint64_t used;          // Used memory in bytes
    uint64_t total;         // Total memory in bytes
} amdsmi_memory_usage_t;

Total Memory Capacity

Get total memory capacity for different memory types.

amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
                                           amdsmi_memory_type_t mem_type,
                                           uint64_t* total);

Bad Page Information

Monitor memory reliability by checking for bad memory pages.

amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,
                                            uint32_t* num_pages,
                                            amdsmi_retired_page_record_t* info);

Retired Page Record Structure:

typedef struct {
    uint64_t page_address;          // Physical address of retired page
    uint64_t page_size;             // Size of retired page in bytes
    amdsmi_memory_page_status_t status; // Page retirement status
} amdsmi_retired_page_record_t;

Page Status Types:

typedef enum {
    AMDSMI_MEM_PAGE_STATUS_RESERVED,    // Page is reserved (retired) and not available for use
    AMDSMI_MEM_PAGE_STATUS_PENDING,     // Page is marked bad and is pending reservation
    AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Page could not be reserved
} amdsmi_memory_page_status_t;

Memory Partitioning

Get and manage GPU memory partitioning information.

amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,
                                                    char* memory_partition,
                                                    uint32_t len);
amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
                                               amdsmi_memory_partition_type_t partition_type);

Memory Partition Types:

typedef enum {
    AMDSMI_MEMORY_PARTITION_UNKNOWN,    // Unknown partition
    AMDSMI_MEMORY_PARTITION_NPS1,       // 1 NUMA node per socket
    AMDSMI_MEMORY_PARTITION_NPS2,       // 2 NUMA nodes per socket
    AMDSMI_MEMORY_PARTITION_NPS4,       // 4 NUMA nodes per socket
    AMDSMI_MEMORY_PARTITION_NPS8        // 8 NUMA nodes per socket
} amdsmi_memory_partition_type_t;

Language Interface Examples

Python

import amdsmi

gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
if gpu_handles:
    gpu_handle = gpu_handles[0]
    
    # Get VRAM usage
    vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle, 
                                                   amdsmi.AmdSmiMemoryType.VRAM)
    vram_used_gb = vram_usage['used'] / (1024**3)
    vram_total_gb = vram_usage['total'] / (1024**3)
    vram_percent = (vram_usage['used'] / vram_usage['total']) * 100
    
    print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")
    
    # Get visible VRAM usage
    vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
                                                 amdsmi.AmdSmiMemoryType.VIS_VRAM)
    print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")
    
    # Check for bad pages
    try:
        bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)
        if bad_pages:
            print(f"Found {len(bad_pages)} bad memory pages")
            for page in bad_pages:
                print(f"  Address: 0x{page['page_address']:X}, Status: {page['status']}")
        else:
            print("No bad memory pages detected")
    except amdsmi.AmdSmiException:
        print("Bad page information not available")

Go

import "github.com/ROCm/amdsmi" // the imported package is named goamdsmi

// Get memory information for each GPU
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
    // Get memory usage
    memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)
    memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)
    memPercent := float64(memUsed) / float64(memTotal) * 100
    
    fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n", 
               i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)
    
    // Get busy memory percentage
    memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)
    fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)
}

Rust

use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};
use amdsmi::{MemoryType, MemoryUsage};

// Get comprehensive memory information
let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;
let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));
let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));
let vram_percent = (vram_usage.used as f64 / vram_usage.total as f64) * 100.0;

println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)", 
         vram_used_gb, vram_total_gb, vram_percent);

// Check memory health
match get_gpu_bad_page_info(gpu_handle) {
    Ok(bad_pages) => {
        if bad_pages.is_empty() {
            println!("No bad memory pages detected");
        } else {
            println!("Found {} bad memory pages", bad_pages.len());
            for page in bad_pages {
                println!("  Address: 0x{:X}, Status: {:?}", 
                         page.page_address, page.status);
            }
        }
    },
    Err(e) => println!("Could not get bad page info: {:?}", e),
}

Memory Management Best Practices

  1. Regular Monitoring: Monitor VRAM usage to prevent out-of-memory conditions
  2. Memory Health: Check for bad pages regularly, especially in data center environments
  3. Visible VRAM: Monitor visible VRAM for CPU-GPU data transfer bottlenecks
  4. Memory Partitioning: Understand NUMA topology impacts on memory access patterns
  5. Error Handling: Bad page queries may fail on older hardware or drivers; treat such a failure as "information unavailable" rather than as a fatal error

Install with Tessl CLI

npx tessl i tessl/go-amdsmi

docs

cpu-management.md

device-info.md

events.md

gpu-performance.md

index.md

initialization.md

memory.md

performance-control.md

power-thermal.md

topology-ras.md

tile.json