0
# Power and Thermal Management
1
2
This section covers power consumption monitoring, temperature sensors, fan speed monitoring, voltage queries, and power limit management for AMD GPUs and CPUs. These APIs are essential for system thermal management and power optimization.
3
4
## Capabilities
5
6
### Power Information
7
8
Get power consumption data for a processor, including the current and average socket power and the minimum and maximum socket power limits.
9
10
```cpp { .api }
11
amdsmi_status_t amdsmi_get_power_info(amdsmi_processor_handle processor_handle,
12
amdsmi_power_info_t* power_info);
13
```
14
15
**Power Information Structure:**
16
```cpp { .api }
17
typedef struct {
18
uint64_t current_socket_power; // Current socket power (W)
19
uint64_t average_socket_power; // Average socket power (W)
20
uint64_t max_socket_power_limit; // Maximum power limit (W)
21
uint64_t min_socket_power_limit; // Minimum power limit (W)
22
} amdsmi_power_info_t;
23
```
24
25
**Usage Example:**
26
```cpp
27
amdsmi_power_info_t power_info;
28
amdsmi_status_t status = amdsmi_get_power_info(gpu_handle, &power_info);
29
30
if (status == AMDSMI_STATUS_SUCCESS) {
31
printf("Power Status:\n");
32
printf(" Current: %lu W\n", power_info.current_socket_power);
33
printf(" Average: %lu W\n", power_info.average_socket_power);
34
printf(" Max Limit: %lu W\n", power_info.max_socket_power_limit);
35
printf(" Min Limit: %lu W\n", power_info.min_socket_power_limit);
36
}
37
```
38
39
### Temperature Monitoring
40
41
Monitor various temperature sensors across the GPU die and components.
42
43
```cpp { .api }
44
amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,
45
amdsmi_temperature_type_t sensor_type,
46
amdsmi_temperature_metric_t metric,
47
int64_t* temperature);
48
```
49
50
**Temperature Sensor Types:**
51
```cpp { .api }
52
typedef enum {
53
AMDSMI_TEMP_TYPE_EDGE, // Edge temperature sensor
54
AMDSMI_TEMP_TYPE_JUNCTION, // Junction temperature sensor
55
AMDSMI_TEMP_TYPE_MEMORY, // Memory temperature sensor
56
AMDSMI_TEMP_TYPE_HBM_0, // HBM instance 0
57
AMDSMI_TEMP_TYPE_HBM_1, // HBM instance 1
58
AMDSMI_TEMP_TYPE_HBM_2, // HBM instance 2
59
AMDSMI_TEMP_TYPE_HBM_3, // HBM instance 3
60
AMDSMI_TEMP_TYPE_PLX // PLX sensor
61
} amdsmi_temperature_type_t;
62
```
63
64
**Temperature Metrics:**
65
```cpp { .api }
66
typedef enum {
67
AMDSMI_TEMP_CURRENT, // Current temperature
68
AMDSMI_TEMP_MAX, // Maximum recorded temperature
69
AMDSMI_TEMP_MIN, // Minimum recorded temperature
70
AMDSMI_TEMP_MAX_HYST, // Maximum temperature hysteresis
71
AMDSMI_TEMP_MIN_HYST, // Minimum temperature hysteresis
72
AMDSMI_TEMP_CRITICAL, // Critical temperature threshold
73
AMDSMI_TEMP_CRITICAL_HYST, // Critical temperature hysteresis
74
AMDSMI_TEMP_EMERGENCY, // Emergency temperature threshold
75
AMDSMI_TEMP_EMERGENCY_HYST // Emergency temperature hysteresis
76
} amdsmi_temperature_metric_t;
77
```
78
79
**Usage Example:**
80
```cpp
81
// Get current edge temperature
82
int64_t edge_temp;
83
amdsmi_status_t status = amdsmi_get_temp_metric(gpu_handle,
84
AMDSMI_TEMP_TYPE_EDGE,
85
AMDSMI_TEMP_CURRENT,
86
&edge_temp);
87
88
if (status == AMDSMI_STATUS_SUCCESS) {
89
printf("GPU Edge Temperature: %ld°C\n", edge_temp / 1000); // Convert from millicelsius
90
}
91
92
// Get critical temperature threshold
93
int64_t critical_temp;
94
status = amdsmi_get_temp_metric(gpu_handle,
95
AMDSMI_TEMP_TYPE_EDGE,
96
AMDSMI_TEMP_CRITICAL,
97
&critical_temp);
98
99
if (status == AMDSMI_STATUS_SUCCESS) {
100
printf("Critical Temperature: %ld°C\n", critical_temp / 1000);
101
}
102
```
103
104
### Fan Speed Monitoring
105
106
Query the current and maximum fan speed (in RPM) of a fan sensor for cooling system management.
107
108
```cpp { .api }
109
amdsmi_status_t amdsmi_get_fan_speed(amdsmi_processor_handle processor_handle,
110
uint32_t sensor_idx,
111
int64_t* speed);
112
```
113
114
**Parameters:**
115
- `processor_handle`: GPU handle
116
- `sensor_idx`: Fan sensor index (typically 0 for primary fan)
117
- `speed`: Output fan speed in RPM
118
119
```cpp { .api }
120
amdsmi_status_t amdsmi_get_fan_speed_max(amdsmi_processor_handle processor_handle,
121
uint32_t sensor_idx,
122
uint64_t* max_speed);
123
```
124
125
**Usage Example:**
126
```cpp
127
// Get current fan speed
128
int64_t fan_speed;
129
amdsmi_status_t status = amdsmi_get_fan_speed(gpu_handle, 0, &fan_speed);
130
131
if (status == AMDSMI_STATUS_SUCCESS) {
132
printf("Fan Speed: %ld RPM\n", fan_speed);
133
}
134
135
// Get maximum fan speed
136
uint64_t max_fan_speed;
137
status = amdsmi_get_fan_speed_max(gpu_handle, 0, &max_fan_speed);
138
139
if (status == AMDSMI_STATUS_SUCCESS) {
140
printf("Max Fan Speed: %lu RPM\n", max_fan_speed);
141
double fan_percent = (double)fan_speed / max_fan_speed * 100.0;
142
printf("Fan Usage: %.1f%%\n", fan_percent);
143
}
144
```
145
146
### Power Limit Control
147
148
Set and get power consumption limits for power management.
149
150
```cpp { .api }
151
amdsmi_status_t amdsmi_set_power_cap(amdsmi_processor_handle processor_handle,
152
uint32_t sensor_ind,
153
uint64_t cap);
154
```
155
156
```cpp { .api }
157
amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle,
158
uint32_t sensor_ind,
159
amdsmi_power_cap_info_t* info);
160
```
161
162
**Power Cap Information Structure:**
163
```cpp { .api }
164
typedef struct {
165
uint64_t power_cap; // Current power cap (W)
166
uint64_t default_power_cap; // Default power cap (W)
167
uint64_t dpm_cap; // DPM power cap (W)
168
uint64_t min_power_cap; // Minimum power cap (W)
169
uint64_t max_power_cap; // Maximum power cap (W)
170
} amdsmi_power_cap_info_t;
171
```
172
173
**Usage Example:**
174
```cpp
175
// Get current power cap info
176
amdsmi_power_cap_info_t cap_info;
177
amdsmi_status_t status = amdsmi_get_power_cap_info(gpu_handle, 0, &cap_info);
178
179
if (status == AMDSMI_STATUS_SUCCESS) {
180
printf("Power Cap Info:\n");
181
printf(" Current: %lu W\n", cap_info.power_cap);
182
printf(" Default: %lu W\n", cap_info.default_power_cap);
183
printf(" Range: %lu - %lu W\n", cap_info.min_power_cap, cap_info.max_power_cap);
184
}
185
186
// Set new power limit (requires appropriate permissions)
187
uint64_t new_cap = 200; // 200W
188
status = amdsmi_set_power_cap(gpu_handle, 0, new_cap);
189
190
if (status == AMDSMI_STATUS_SUCCESS) {
191
printf("Power cap set to %lu W\n", new_cap);
192
} else if (status == AMDSMI_STATUS_PERMISSION) {
193
printf("Insufficient permissions to set power cap\n");
194
}
195
```
196
197
### Voltage Information
198
199
Query the current, minimum, and maximum GPU voltage for a given voltage sensor type (graphics, northbridge, or memory).
200
201
```cpp { .api }
202
amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle,
203
amdsmi_voltage_type_t sensor_type,
204
amdsmi_voltage_metric_t metric,
205
int64_t* voltage);
206
```
207
208
**Voltage Types:**
209
```cpp { .api }
210
typedef enum {
211
AMDSMI_VOLT_TYPE_VDDGFX, // Graphics voltage
212
AMDSMI_VOLT_TYPE_VDDNB, // Northbridge voltage
213
AMDSMI_VOLT_TYPE_VDDMEM // Memory voltage
214
} amdsmi_voltage_type_t;
215
```
216
217
**Voltage Metrics:**
218
```cpp { .api }
219
typedef enum {
220
AMDSMI_VOLT_CURRENT, // Current voltage
221
AMDSMI_VOLT_MAX, // Maximum voltage
222
AMDSMI_VOLT_MIN // Minimum voltage
223
} amdsmi_voltage_metric_t;
224
```
225
226
## Language Interface Examples
227
228
### Python
229
```python
230
import amdsmi
231
232
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
233
if gpu_handles:
234
gpu_handle = gpu_handles[0]
235
236
# Get power information
237
power_info = amdsmi.amdsmi_get_power_info(gpu_handle)
238
print(f"Current Power: {power_info['current_socket_power']}W")
239
print(f"Max Power Limit: {power_info['max_socket_power_limit']}W")
240
241
# Get temperature
242
temp = amdsmi.amdsmi_get_temp_metric(gpu_handle,
243
amdsmi.AmdSmiTemperatureType.EDGE,
244
amdsmi.AmdSmiTemperatureMetric.CURRENT)
245
print(f"GPU Temperature: {temp // 1000}°C")
246
247
# Get fan speed
248
fan_speed = amdsmi.amdsmi_get_fan_speed(gpu_handle, 0)
249
print(f"Fan Speed: {fan_speed} RPM")
250
251
# Get power cap info
252
power_cap_info = amdsmi.amdsmi_get_power_cap_info(gpu_handle, 0)
253
print(f"Current Power Cap: {power_cap_info['power_cap']}W")
254
```
255
256
### Go
257
```go
258
import "github.com/ROCm/amdsmi"
259
260
// Get power and thermal data for each GPU
261
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
262
// Get power consumption
263
power := goamdsmi.GO_gpu_dev_power_ave_get(i)
264
fmt.Printf("GPU %d Average Power: %d W\n", i, power)
265
266
// Get temperature
267
temp := goamdsmi.GO_gpu_dev_temp_get(i, goamdsmi.TEMPERATURE_TYPE_EDGE)
268
fmt.Printf("GPU %d Temperature: %d°C\n", i, temp/1000)
269
270
// Get fan speed
271
fanSpeed := goamdsmi.GO_gpu_dev_fan_speed_get(i, 0)
272
fmt.Printf("GPU %d Fan Speed: %d RPM\n", i, fanSpeed)
273
}
274
```
275
276
### Rust
277
```rust
278
use amdsmi::{get_power_info, get_temp_metric, get_fan_speed};
279
use amdsmi::{TemperatureType, TemperatureMetric};
280
281
// Get comprehensive thermal and power data
282
let power_info = get_power_info(gpu_handle)?;
283
println!("Current Power: {}W", power_info.current_socket_power);
284
285
let edge_temp = get_temp_metric(gpu_handle,
286
TemperatureType::Edge,
287
TemperatureMetric::Current)?;
288
println!("GPU Temperature: {}°C", edge_temp / 1000);
289
290
let fan_speed = get_fan_speed(gpu_handle, 0)?;
291
println!("Fan Speed: {} RPM", fan_speed);
292
```
293
294
## Thermal Management Best Practices
295
296
1. **Temperature Monitoring**: Monitor edge and junction temperatures regularly
297
2. **Thermal Throttling**: Be aware that high temperatures (>90°C) may trigger automatic throttling
298
3. **Fan Curves**: Understand that fan speeds are typically controlled automatically by the driver
299
4. **Power Limits**: Setting power caps too low may significantly impact performance
300
5. **Cooling Solutions**: Consider ambient temperature and case airflow when interpreting thermal data
301
6. **Memory Temperatures**: HBM temperatures are critical for memory-intensive workloads