AMD System Management Interface Library for monitoring and controlling AMD GPU devices on Linux systems
pkg:github/radeonopencompute/amdsmi@5.7.x
npx @tessl/cli install tessl/github-amd-smi@5.7.0

The AMD System Management Interface (AMD SMI) Library is a comprehensive C/C++ library with Python bindings that gives user-space applications the ability to monitor and control AMD GPU devices on Linux systems. It offers socket and device handle abstractions for better hardware representation, supports querying device information such as temperature, power consumption, and performance metrics, and includes comprehensive device management capabilities.
#include "amd_smi/amdsmi.h"

import amdsmi

For specific functionality imports:
from amdsmi import (
amdsmi_init, amdsmi_shut_down,
amdsmi_get_socket_handles, amdsmi_get_processor_handles,
amdsmi_get_gpu_activity, amdsmi_get_power_info
)

#include <iostream>
#include <vector>
#include "amd_smi/amdsmi.h"
int main() {
amdsmi_status_t ret;
uint32_t socket_count = 0;
// Initialize AMD SMI for GPUs only
ret = amdsmi_init(AMDSMI_INIT_AMD_GPUS);
if (ret != AMDSMI_STATUS_SUCCESS) {
return 1;
}
// Get socket count
ret = amdsmi_get_socket_handles(&socket_count, nullptr);
std::vector<amdsmi_socket_handle> sockets(socket_count);
ret = amdsmi_get_socket_handles(&socket_count, &sockets[0]);
// Get devices for first socket
uint32_t device_count = 0;
ret = amdsmi_get_processor_handles(sockets[0], &device_count, nullptr);
std::vector<amdsmi_processor_handle> devices(device_count);
ret = amdsmi_get_processor_handles(sockets[0], &device_count, &devices[0]);
// Get temperature for first device
int64_t temperature = 0;
ret = amdsmi_get_temp_metric(devices[0], TEMPERATURE_TYPE_EDGE,
AMDSMI_TEMP_CURRENT, &temperature);
std::cout << "GPU Temperature: " << temperature << "C" << std::endl;
// Cleanup
amdsmi_shut_down();
return 0;
}import amdsmi
# Initialize the library
amdsmi.amdsmi_init()
try:
# Get socket handles
sockets = amdsmi.amdsmi_get_socket_handles()
if sockets:
# Get processor handles for first socket
processors = amdsmi.amdsmi_get_processor_handles(sockets[0])
if processors:
# Get GPU activity information
activity = amdsmi.amdsmi_get_gpu_activity(processors[0])
print(f"GFX Activity: {activity.gfx_activity}%")
# Get power information
power_info = amdsmi.amdsmi_get_power_info(processors[0])
print(f"Socket Power: {power_info.average_socket_power}W")
finally:
# Always shut down the library
amdsmi.amdsmi_shut_down()The AMD SMI Library uses a hierarchical device representation:
This design enables the library to provide a unified interface for mixed-processor systems while maintaining efficient resource management and clear hardware topology representation.
Core library initialization, shutdown, and version management functions that must be called before using other AMD SMI functionality.
amdsmi_status_t amdsmi_init(uint64_t init_flags);
amdsmi_status_t amdsmi_shut_down(void);
amdsmi_status_t amdsmi_get_lib_version(amdsmi_version_t *version);

Functions for discovering and identifying AMD processors, sockets, and their properties in the system.
amdsmi_status_t amdsmi_get_socket_handles(uint32_t *socket_count, amdsmi_socket_handle *socket_handles);
amdsmi_status_t amdsmi_get_processor_handles(amdsmi_socket_handle socket_handle, uint32_t *processor_count, amdsmi_processor_handle *processor_handles);
amdsmi_status_t amdsmi_get_processor_type(amdsmi_processor_handle processor_handle, processor_type_t *processor_type);
amdsmi_status_t amdsmi_get_gpu_device_uuid(amdsmi_processor_handle processor_handle, unsigned int *uuid_length, char *uuid);

Static hardware information including ASIC details, board information, firmware versions, and driver information.
amdsmi_status_t amdsmi_get_gpu_asic_info(amdsmi_processor_handle processor_handle, amdsmi_asic_info_t *info);
amdsmi_status_t amdsmi_get_gpu_board_info(amdsmi_processor_handle processor_handle, amdsmi_board_info_t *info);
amdsmi_status_t amdsmi_get_fw_info(amdsmi_processor_handle processor_handle, amdsmi_fw_info_t *info);
amdsmi_status_t amdsmi_get_gpu_driver_version(amdsmi_processor_handle processor_handle, int *length, char *version);

Real-time monitoring of GPU performance metrics including activity levels, clock frequencies, power consumption, and temperature measurements.
amdsmi_status_t amdsmi_get_gpu_activity(amdsmi_processor_handle processor_handle, amdsmi_engine_usage_t *info);
amdsmi_status_t amdsmi_get_power_info(amdsmi_processor_handle processor_handle, amdsmi_power_info_t *info);
amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle, amdsmi_temperature_type_t sensor_type, amdsmi_temperature_metric_t metric, int64_t *temperature);
amdsmi_status_t amdsmi_get_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, amdsmi_frequencies_t *f);

Memory information including total memory, usage statistics, VRAM details, and memory error management.
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle, amdsmi_memory_type_t mem_type, uint64_t *total);
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle, amdsmi_memory_type_t mem_type, uint64_t *used);
amdsmi_status_t amdsmi_get_gpu_vram_usage(amdsmi_processor_handle processor_handle, amdsmi_vram_info_t *info);
amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle, uint32_t *num_pages, amdsmi_retired_page_record_t *info);

PCIe interface monitoring, bandwidth management, topology discovery, and multi-GPU connectivity features.
amdsmi_status_t amdsmi_get_gpu_pci_bandwidth(amdsmi_processor_handle processor_handle, amdsmi_pcie_bandwidth_t *bandwidth);
amdsmi_status_t amdsmi_get_gpu_pci_throughput(amdsmi_processor_handle processor_handle, uint64_t *sent, uint64_t *received, uint64_t *max_pkt_sz);
amdsmi_status_t amdsmi_get_gpu_topo_numa_affinity(amdsmi_processor_handle processor_handle, uint32_t *numa_node);
amdsmi_status_t amdsmi_topo_get_link_type(amdsmi_processor_handle processor_handle_src, amdsmi_processor_handle processor_handle_dst, uint64_t *hops, AMDSMI_IO_LINK_TYPE *type);

Advanced performance tuning including clock control, power management, fan control, and overclocking capabilities. Note: Many control functions require root privileges and are not supported in virtual environments.
amdsmi_status_t amdsmi_set_gpu_perf_level(amdsmi_processor_handle processor_handle, amdsmi_dev_perf_level_t perf_lvl);
amdsmi_status_t amdsmi_set_power_cap(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, uint64_t cap);
amdsmi_status_t amdsmi_set_gpu_fan_speed(amdsmi_processor_handle processor_handle, uint32_t sensor_ind, uint64_t speed);
amdsmi_status_t amdsmi_set_clk_freq(amdsmi_processor_handle processor_handle, amdsmi_clk_type_t clk_type, uint64_t freq_bitmask);

Error detection, RAS (Reliability, Availability, Serviceability) features, ECC error monitoring, and comprehensive error reporting.
amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, amdsmi_error_count_t *ec);
amdsmi_status_t amdsmi_get_gpu_ecc_enabled(amdsmi_processor_handle processor_handle, uint64_t *enabled_blocks);
amdsmi_status_t amdsmi_status_code_to_string(amdsmi_status_t status, const char **status_string);
amdsmi_status_t amdsmi_get_gpu_ras_block_features_enabled(amdsmi_processor_handle processor_handle, amdsmi_gpu_block_t block, amdsmi_ras_err_state_t *state);

Process monitoring, system-level GPU usage information, and multi-process GPU utilization tracking.
amdsmi_status_t amdsmi_get_gpu_process_list(amdsmi_processor_handle processor_handle, uint32_t *max_processes, amdsmi_process_handle_t *list);
amdsmi_status_t amdsmi_get_gpu_process_info(amdsmi_processor_handle processor_handle, amdsmi_process_handle_t process, amdsmi_proc_info_t *info);
amdsmi_status_t amdsmi_get_gpu_compute_process_info(amdsmi_process_info_t *procs, uint32_t *num_items);

Process and System Information
Asynchronous event notification system for GPU state changes, thermal events, and error conditions.
amdsmi_status_t amdsmi_init_gpu_event_notification(amdsmi_processor_handle processor_handle);
amdsmi_status_t amdsmi_set_gpu_event_notification_mask(amdsmi_processor_handle processor_handle, uint64_t mask);
amdsmi_status_t amdsmi_get_gpu_event_notification(int timeout_ms, uint32_t *num_elem, amdsmi_evt_notification_data_t *data);
amdsmi_status_t amdsmi_stop_gpu_event_notification(amdsmi_processor_handle processor_handle);

Low-level performance counter management for detailed GPU profiling and performance analysis.
amdsmi_status_t amdsmi_gpu_counter_group_supported(amdsmi_processor_handle processor_handle, amdsmi_event_group_t group);
amdsmi_status_t amdsmi_gpu_create_counter(amdsmi_processor_handle processor_handle, amdsmi_event_type_t type, amdsmi_event_handle_t *evnt_handle);
amdsmi_status_t amdsmi_gpu_destroy_counter(amdsmi_event_handle_t evnt_handle);
amdsmi_status_t amdsmi_gpu_control_counter(amdsmi_event_handle_t evt_handle, amdsmi_counter_command_t cmd, void *cmd_args);
amdsmi_status_t amdsmi_gpu_read_counter(amdsmi_event_handle_t evt_handle, amdsmi_counter_value_t *value);

// Handle types used throughout the API listed in this document.
// Socket handle: an untyped pointer, written by amdsmi_get_socket_handles().
typedef void *amdsmi_socket_handle;
// Processor handle: an untyped pointer, written by
// amdsmi_get_processor_handles() and passed to the per-device queries.
typedef void *amdsmi_processor_handle;
// Process handle: a 32-bit unsigned value, written by
// amdsmi_get_gpu_process_list().
typedef uint32_t amdsmi_process_handle_t;
// Performance-counter handle: a pointer-sized integer, written by
// amdsmi_gpu_create_counter().
typedef uintptr_t amdsmi_event_handle_t;

// Status code returned by the amdsmi_* functions listed in this document.
// The quick-start example treats any value other than AMDSMI_STATUS_SUCCESS
// as a failure. This listing is abridged (see the elision marker below).
typedef enum {
AMDSMI_STATUS_SUCCESS = 0,
AMDSMI_STATUS_INVAL = 1,
AMDSMI_STATUS_NOT_SUPPORTED = 2,
AMDSMI_STATUS_NOT_YET_IMPLEMENTED = 3,
AMDSMI_STATUS_FAIL_LOAD_MODULE = 4,
AMDSMI_STATUS_FAIL_LOAD_SYMBOL = 5,
// ... additional error codes
AMDSMI_STATUS_UNKNOWN_ERROR = 0xFFFFFFFF
} amdsmi_status_t;

// Flag values for the init_flags argument of amdsmi_init(). Each non-zero
// enumerator occupies its own bit; AMDSMI_INIT_ALL_PROCESSORS is 0.
// NOTE(review): presumably the non-zero flags may be OR-ed together and 0
// selects every processor kind -- confirm against the library header.
typedef enum {
AMDSMI_INIT_ALL_PROCESSORS = 0x0,
AMDSMI_INIT_AMD_CPUS = (1 << 0),
AMDSMI_INIT_AMD_GPUS = (1 << 1),
AMDSMI_INIT_NON_AMD_CPUS = (1 << 2),
AMDSMI_INIT_NON_AMD_GPUS = (1 << 3)
} amdsmi_init_flags_t;

// Processor kind written by amdsmi_get_processor_type(). Enumerators are
// numbered consecutively from UNKNOWN = 0.
typedef enum {
UNKNOWN = 0,
AMD_GPU,
AMD_CPU,
NON_AMD_GPU,
NON_AMD_CPU
} processor_type_t;

// Library version record written by amdsmi_get_lib_version(): four numeric
// components plus a build string.
// NOTE(review): ownership/lifetime of `build` is not shown here -- confirm
// before freeing or storing the pointer.
typedef struct {
uint32_t year;
uint32_t major;
uint32_t minor;
uint32_t release;
const char *build;
} amdsmi_version_t;