CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/go-amdsmi

AMD System Management Interface (AMD SMI) Go library for unified GPU and CPU management and monitoring

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/power-thermal.md

Power and Thermal Management

This section covers power consumption monitoring, thermal sensors, fan speed monitoring, and power limit management for AMD GPUs and CPUs. These APIs are essential for system thermal management and power optimization.

Capabilities

Power Information

Get power consumption data for a processor: the current and average socket power, together with the minimum and maximum socket power limits.

/**
 * Retrieve power consumption data for a processor.
 *
 * @param processor_handle  Handle of the processor to query.
 * @param power_info        Output: receives the power readings and limits
 *                          (see amdsmi_power_info_t).
 * @return An amdsmi_status_t; the usage example treats AMDSMI_STATUS_SUCCESS
 *         as the only case in which *power_info may be read.
 */
amdsmi_status_t amdsmi_get_power_info(amdsmi_processor_handle processor_handle,
                                     amdsmi_power_info_t* power_info);

Power Information Structure:

/*
 * Power readings and limits filled in by amdsmi_get_power_info().
 * Every field is documented here in watts.
 * NOTE(review): confirm field set and units against the amdsmi.h shipped with
 * the installed AMD SMI release.
 */
typedef struct {
    uint64_t current_socket_power;      // Current socket power (W)
    uint64_t average_socket_power;      // Average socket power (W)
    uint64_t max_socket_power_limit;    // Maximum power limit (W)
    uint64_t min_socket_power_limit;    // Minimum power limit (W)
} amdsmi_power_info_t;

Usage Example:

amdsmi_power_info_t power_info;
amdsmi_status_t status = amdsmi_get_power_info(gpu_handle, &power_info);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Power Status:\n");
    printf("  Current: %lu W\n", power_info.current_socket_power);
    printf("  Average: %lu W\n", power_info.average_socket_power);
    printf("  Max Limit: %lu W\n", power_info.max_socket_power_limit);
    printf("  Min Limit: %lu W\n", power_info.min_socket_power_limit);
}

Temperature Monitoring

Monitor various temperature sensors across the GPU die and components.

/**
 * Read one temperature metric from one sensor of a processor.
 *
 * @param processor_handle  Handle of the processor to query.
 * @param sensor_type       Which sensor to read (amdsmi_temperature_type_t).
 * @param metric            Which value to read for that sensor
 *                          (amdsmi_temperature_metric_t).
 * @param temperature       Output: receives the reading. The usage examples in
 *                          this document divide it by 1000, i.e. they treat it
 *                          as millidegrees Celsius.
 *                          NOTE(review): confirm the unit against the installed
 *                          AMD SMI release.
 * @return An amdsmi_status_t; AMDSMI_STATUS_SUCCESS indicates *temperature is valid.
 */
amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,
                                      amdsmi_temperature_type_t sensor_type,
                                      amdsmi_temperature_metric_t metric,
                                      int64_t* temperature);

Temperature Sensor Types:

/*
 * Selects which temperature sensor amdsmi_get_temp_metric() reads.
 * No explicit values are assigned, so enumerators number sequentially from 0
 * in declaration order.
 */
typedef enum {
    AMDSMI_TEMP_TYPE_EDGE,              // Edge temperature sensor
    AMDSMI_TEMP_TYPE_JUNCTION,          // Junction temperature sensor
    AMDSMI_TEMP_TYPE_MEMORY,            // Memory temperature sensor
    AMDSMI_TEMP_TYPE_HBM_0,             // HBM instance 0
    AMDSMI_TEMP_TYPE_HBM_1,             // HBM instance 1
    AMDSMI_TEMP_TYPE_HBM_2,             // HBM instance 2
    AMDSMI_TEMP_TYPE_HBM_3,             // HBM instance 3
    AMDSMI_TEMP_TYPE_PLX                // PLX sensor
} amdsmi_temperature_type_t;

Temperature Metrics:

/*
 * Selects which value amdsmi_get_temp_metric() reports for the chosen sensor:
 * the live reading, a recorded extreme, or a threshold and its hysteresis.
 * No explicit values are assigned, so enumerators number sequentially from 0
 * in declaration order.
 */
typedef enum {
    AMDSMI_TEMP_CURRENT,               // Current temperature
    AMDSMI_TEMP_MAX,                   // Maximum recorded temperature
    AMDSMI_TEMP_MIN,                   // Minimum recorded temperature
    AMDSMI_TEMP_MAX_HYST,              // Maximum temperature hysteresis
    AMDSMI_TEMP_MIN_HYST,              // Minimum temperature hysteresis
    AMDSMI_TEMP_CRITICAL,              // Critical temperature threshold
    AMDSMI_TEMP_CRITICAL_HYST,         // Critical temperature hysteresis
    AMDSMI_TEMP_EMERGENCY,             // Emergency temperature threshold
    AMDSMI_TEMP_EMERGENCY_HYST         // Emergency temperature hysteresis
} amdsmi_temperature_metric_t;

Usage Example:

// Get current edge temperature
int64_t edge_temp;
amdsmi_status_t status = amdsmi_get_temp_metric(gpu_handle, 
                                               AMDSMI_TEMP_TYPE_EDGE,
                                               AMDSMI_TEMP_CURRENT, 
                                               &edge_temp);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("GPU Edge Temperature: %ld°C\n", edge_temp / 1000); // Convert from millicelsius
}

// Get critical temperature threshold
int64_t critical_temp;
status = amdsmi_get_temp_metric(gpu_handle,
                               AMDSMI_TEMP_TYPE_EDGE,
                               AMDSMI_TEMP_CRITICAL,
                               &critical_temp);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Critical Temperature: %ld°C\n", critical_temp / 1000);
}

Fan Speed Monitoring

Monitor fan speeds and RPM values for cooling system management.

/**
 * Read the current speed of one fan.
 *
 * @param processor_handle  GPU handle.
 * @param sensor_idx        Fan sensor index (typically 0 for the primary fan).
 * @param speed             Output: receives the fan speed, documented here as RPM.
 *                          NOTE(review): confirm the unit against the installed
 *                          AMD SMI release.
 * @return An amdsmi_status_t; AMDSMI_STATUS_SUCCESS indicates *speed is valid.
 */
amdsmi_status_t amdsmi_get_fan_speed(amdsmi_processor_handle processor_handle,
                                    uint32_t sensor_idx,
                                    int64_t* speed);

Parameters:

  • processor_handle: GPU handle
  • sensor_idx: Fan sensor index (typically 0 for primary fan)
  • speed: Output fan speed in RPM

The maximum speed supported by a fan sensor is queried with a separate call:

/**
 * Read the maximum speed of one fan.
 *
 * @param processor_handle  GPU handle.
 * @param sensor_idx        Fan sensor index (the usage example passes 0).
 * @param max_speed         Output: receives the maximum fan speed. Note that it
 *                          is unsigned (uint64_t), whereas amdsmi_get_fan_speed()
 *                          writes a signed int64_t.
 * @return An amdsmi_status_t; AMDSMI_STATUS_SUCCESS indicates *max_speed is valid.
 */
amdsmi_status_t amdsmi_get_fan_speed_max(amdsmi_processor_handle processor_handle,
                                        uint32_t sensor_idx,
                                        uint64_t* max_speed);

Usage Example:

// Get current fan speed
int64_t fan_speed;
amdsmi_status_t status = amdsmi_get_fan_speed(gpu_handle, 0, &fan_speed);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Fan Speed: %ld RPM\n", fan_speed);
}

// Get maximum fan speed
uint64_t max_fan_speed;
status = amdsmi_get_fan_speed_max(gpu_handle, 0, &max_fan_speed);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Max Fan Speed: %lu RPM\n", max_fan_speed);
    double fan_percent = (double)fan_speed / max_fan_speed * 100.0;
    printf("Fan Usage: %.1f%%\n", fan_percent);
}

Power Limit Control

Set and get power consumption limits for power management.

/**
 * Set the power cap of a processor.
 *
 * @param processor_handle  Handle of the processor to configure.
 * @param sensor_ind        Power sensor index (the usage example passes 0).
 * @param cap               Requested power cap. This document treats the value
 *                          as watts.
 *                          NOTE(review): confirm the unit against amdsmi.h for
 *                          the installed release before relying on it.
 * @return An amdsmi_status_t; the usage example handles AMDSMI_STATUS_SUCCESS
 *         and AMDSMI_STATUS_PERMISSION (insufficient privileges).
 */
amdsmi_status_t amdsmi_set_power_cap(amdsmi_processor_handle processor_handle,
                                    uint32_t sensor_ind,
                                    uint64_t cap);
/**
 * Read the current power cap and its allowed range.
 *
 * @param processor_handle  Handle of the processor to query.
 * @param sensor_ind        Power sensor index (the usage example passes 0).
 * @param info              Output: receives the cap information
 *                          (see amdsmi_power_cap_info_t).
 * @return An amdsmi_status_t; AMDSMI_STATUS_SUCCESS indicates *info is valid.
 */
amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle,
                                         uint32_t sensor_ind,
                                         amdsmi_power_cap_info_t* info);

Power Cap Information Structure:

/*
 * Power cap state filled in by amdsmi_get_power_cap_info(): the active cap,
 * the default, the DPM cap, and the minimum/maximum caps.
 * NOTE(review): fields are documented here in watts; confirm the unit against
 * amdsmi.h for the installed AMD SMI release.
 */
typedef struct {
    uint64_t power_cap;                // Current power cap (W)
    uint64_t default_power_cap;        // Default power cap (W)
    uint64_t dpm_cap;                  // DPM power cap (W)
    uint64_t min_power_cap;            // Minimum power cap (W)
    uint64_t max_power_cap;            // Maximum power cap (W)
} amdsmi_power_cap_info_t;

Usage Example:

// Get current power cap info
amdsmi_power_cap_info_t cap_info;
amdsmi_status_t status = amdsmi_get_power_cap_info(gpu_handle, 0, &cap_info);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Power Cap Info:\n");
    printf("  Current: %lu W\n", cap_info.power_cap);
    printf("  Default: %lu W\n", cap_info.default_power_cap);
    printf("  Range: %lu - %lu W\n", cap_info.min_power_cap, cap_info.max_power_cap);
}

// Set new power limit (requires appropriate permissions)
uint64_t new_cap = 200; // 200W
status = amdsmi_set_power_cap(gpu_handle, 0, new_cap);

if (status == AMDSMI_STATUS_SUCCESS) {
    printf("Power cap set to %lu W\n", new_cap);
} else if (status == AMDSMI_STATUS_PERMISSION) {
    printf("Insufficient permissions to set power cap\n");
}

Voltage Information

Monitor GPU voltage levels and voltage curves.

/**
 * Read one voltage metric from one voltage sensor of a GPU.
 *
 * @param processor_handle  Handle of the GPU to query.
 * @param sensor_type       Which voltage rail to read (amdsmi_voltage_type_t).
 * @param metric            Which value to read (amdsmi_voltage_metric_t).
 * @param voltage           Output: receives the reading.
 *                          NOTE(review): the unit is not stated in this
 *                          document — confirm against the AMD SMI reference.
 * @return An amdsmi_status_t.
 */
amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle,
                                          amdsmi_voltage_type_t sensor_type,
                                          amdsmi_voltage_metric_t metric,
                                          int64_t* voltage);

Voltage Types:

/*
 * Selects which voltage rail amdsmi_get_gpu_volt_metric() reads.
 * No explicit values are assigned, so enumerators number sequentially from 0.
 */
typedef enum {
    AMDSMI_VOLT_TYPE_VDDGFX,           // Graphics voltage
    AMDSMI_VOLT_TYPE_VDDNB,            // Northbridge voltage
    AMDSMI_VOLT_TYPE_VDDMEM            // Memory voltage
} amdsmi_voltage_type_t;

Voltage Metrics:

/*
 * Selects which value amdsmi_get_gpu_volt_metric() reports for the chosen rail.
 * No explicit values are assigned, so enumerators number sequentially from 0.
 */
typedef enum {
    AMDSMI_VOLT_CURRENT,              // Current voltage
    AMDSMI_VOLT_MAX,                  // Maximum voltage
    AMDSMI_VOLT_MIN                   // Minimum voltage
} amdsmi_voltage_metric_t;

Language Interface Examples

Python

import amdsmi

# Enumerate GPU processors; every query below targets the first one found.
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
if gpu_handles:
    gpu_handle = gpu_handles[0]

    # Power: current socket draw, then the upper power limit
    power_info = amdsmi.amdsmi_get_power_info(gpu_handle)
    current_power = power_info['current_socket_power']
    print(f"Current Power: {current_power}W")
    max_power_limit = power_info['max_socket_power_limit']
    print(f"Max Power Limit: {max_power_limit}W")

    # Temperature: current reading of the edge sensor, scaled down by 1000
    edge_sensor = amdsmi.AmdSmiTemperatureType.EDGE
    current_metric = amdsmi.AmdSmiTemperatureMetric.CURRENT
    temp = amdsmi.amdsmi_get_temp_metric(gpu_handle, edge_sensor, current_metric)
    whole_degrees = temp // 1000
    print(f"GPU Temperature: {whole_degrees}°C")

    # Fan: speed of fan sensor 0
    fan_speed = amdsmi.amdsmi_get_fan_speed(gpu_handle, 0)
    print(f"Fan Speed: {fan_speed} RPM")

    # Power cap: currently active cap for power sensor 0
    power_cap_info = amdsmi.amdsmi_get_power_cap_info(gpu_handle, 0)
    active_cap = power_cap_info['power_cap']
    print(f"Current Power Cap: {active_cap}W")

Go

import "github.com/ROCm/amdsmi"

// Get power and thermal data for each GPU
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
    // Get power consumption
    power := goamdsmi.GO_gpu_dev_power_ave_get(i)
    fmt.Printf("GPU %d Average Power: %d W\n", i, power)
    
    // Get temperature
    temp := goamdsmi.GO_gpu_dev_temp_get(i, goamdsmi.TEMPERATURE_TYPE_EDGE)
    fmt.Printf("GPU %d Temperature: %d°C\n", i, temp/1000)
    
    // Get fan speed
    fanSpeed := goamdsmi.GO_gpu_dev_fan_speed_get(i, 0)
    fmt.Printf("GPU %d Fan Speed: %d RPM\n", i, fanSpeed)
}

Rust

use amdsmi::{
    get_fan_speed, get_power_info, get_temp_metric, TemperatureMetric, TemperatureType,
};

// Power: report the current socket power draw.
let power = get_power_info(gpu_handle)?;
println!("Current Power: {}W", power.current_socket_power);

// Temperature: current edge-sensor reading, divided by 1000 for display.
let edge_reading = get_temp_metric(gpu_handle, TemperatureType::Edge, TemperatureMetric::Current)?;
println!("GPU Temperature: {}°C", edge_reading / 1000);

// Fan: speed of fan sensor 0.
let rpm = get_fan_speed(gpu_handle, 0)?;
println!("Fan Speed: {} RPM", rpm);

Thermal Management Best Practices

  1. Temperature Monitoring: Monitor edge and junction temperatures regularly
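  2. Thermal Throttling: Be aware that high temperatures (for example, above roughly 90°C) may trigger automatic throttling; the exact threshold is device-specific, so query AMDSMI_TEMP_CRITICAL for the device's own limit rather than assuming a fixed value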
  3. Fan Curves: Understand that fan speeds are typically controlled automatically by the driver
  4. Power Limits: Setting power caps too low may significantly impact performance
  5. Cooling Solutions: Consider ambient temperature and case airflow when interpreting thermal data
  6. Memory Temperatures: HBM temperatures are critical for memory-intensive workloads

Install with Tessl CLI

npx tessl i tessl/go-amdsmi

docs

cpu-management.md

device-info.md

events.md

gpu-performance.md

index.md

initialization.md

memory.md

performance-control.md

power-thermal.md

topology-ras.md

tile.json