or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cpu-management.md, device-info.md, events.md, gpu-performance.md, index.md, initialization.md, memory.md, performance-control.md, power-thermal.md, topology-ras.md

docs/power-thermal.md

0

# Power and Thermal Management

1

2

This section covers power consumption monitoring, thermal sensors, fan control, and power limit management for AMD GPUs and CPUs. These capabilities are essential for system thermal management and power optimization.

3

4

## Capabilities

5

6

### Power Information

7

8

Get comprehensive power data, including the current and average socket power and the minimum and maximum socket power limits.

9

10

```cpp { .api }

11

amdsmi_status_t amdsmi_get_power_info(amdsmi_processor_handle processor_handle,

12

amdsmi_power_info_t* power_info);

13

```

14

15

**Power Information Structure:**

16

```cpp { .api }

17

typedef struct {

18

uint64_t current_socket_power; // Current socket power (W)

19

uint64_t average_socket_power; // Average socket power (W)

20

uint64_t max_socket_power_limit; // Maximum power limit (W)

21

uint64_t min_socket_power_limit; // Minimum power limit (W)

22

} amdsmi_power_info_t;

23

```

24

25

**Usage Example:**

26

```cpp

27

amdsmi_power_info_t power_info;

28

amdsmi_status_t status = amdsmi_get_power_info(gpu_handle, &power_info);

29

30

if (status == AMDSMI_STATUS_SUCCESS) {

31

printf("Power Status:\n");

32

printf(" Current: %lu W\n", power_info.current_socket_power);

33

printf(" Average: %lu W\n", power_info.average_socket_power);

34

printf(" Max Limit: %lu W\n", power_info.max_socket_power_limit);

35

printf(" Min Limit: %lu W\n", power_info.min_socket_power_limit);

36

}

37

```

38

39

### Temperature Monitoring

40

41

Monitor various temperature sensors across the GPU die and components.

42

43

```cpp { .api }

44

amdsmi_status_t amdsmi_get_temp_metric(amdsmi_processor_handle processor_handle,

45

amdsmi_temperature_type_t sensor_type,

46

amdsmi_temperature_metric_t metric,

47

int64_t* temperature);

48

```

49

50

**Temperature Sensor Types:**

51

```cpp { .api }

52

typedef enum {

53

AMDSMI_TEMP_TYPE_EDGE, // Edge temperature sensor

54

AMDSMI_TEMP_TYPE_JUNCTION, // Junction temperature sensor

55

AMDSMI_TEMP_TYPE_MEMORY, // Memory temperature sensor

56

AMDSMI_TEMP_TYPE_HBM_0, // HBM instance 0

57

AMDSMI_TEMP_TYPE_HBM_1, // HBM instance 1

58

AMDSMI_TEMP_TYPE_HBM_2, // HBM instance 2

59

AMDSMI_TEMP_TYPE_HBM_3, // HBM instance 3

60

AMDSMI_TEMP_TYPE_PLX // PLX sensor

61

} amdsmi_temperature_type_t;

62

```

63

64

**Temperature Metrics:**

65

```cpp { .api }

66

typedef enum {

67

AMDSMI_TEMP_CURRENT, // Current temperature

68

AMDSMI_TEMP_MAX, // Maximum recorded temperature

69

AMDSMI_TEMP_MIN, // Minimum recorded temperature

70

AMDSMI_TEMP_MAX_HYST, // Maximum temperature hysteresis

71

AMDSMI_TEMP_MIN_HYST, // Minimum temperature hysteresis

72

AMDSMI_TEMP_CRITICAL, // Critical temperature threshold

73

AMDSMI_TEMP_CRITICAL_HYST, // Critical temperature hysteresis

74

AMDSMI_TEMP_EMERGENCY, // Emergency temperature threshold

75

AMDSMI_TEMP_EMERGENCY_HYST // Emergency temperature hysteresis

76

} amdsmi_temperature_metric_t;

77

```

78

79

**Usage Example:**

80

```cpp

81

// Get current edge temperature

82

int64_t edge_temp;

83

amdsmi_status_t status = amdsmi_get_temp_metric(gpu_handle,

84

AMDSMI_TEMP_TYPE_EDGE,

85

AMDSMI_TEMP_CURRENT,

86

&edge_temp);

87

88

if (status == AMDSMI_STATUS_SUCCESS) {

89

printf("GPU Edge Temperature: %ld°C\n", edge_temp / 1000); // Convert from millicelsius

90

}

91

92

// Get critical temperature threshold

93

int64_t critical_temp;

94

status = amdsmi_get_temp_metric(gpu_handle,

95

AMDSMI_TEMP_TYPE_EDGE,

96

AMDSMI_TEMP_CRITICAL,

97

&critical_temp);

98

99

if (status == AMDSMI_STATUS_SUCCESS) {

100

printf("Critical Temperature: %ld°C\n", critical_temp / 1000);

101

}

102

```

103

104

### Fan Speed Monitoring

105

106

Monitor fan speeds and RPM values for cooling system management.

107

108

```cpp { .api }

109

amdsmi_status_t amdsmi_get_fan_speed(amdsmi_processor_handle processor_handle,

110

uint32_t sensor_idx,

111

int64_t* speed);

112

```

113

114

**Parameters:**

115

- `processor_handle`: GPU handle

116

- `sensor_idx`: Fan sensor index (typically 0 for primary fan)

117

- `speed`: Output fan speed in RPM

118

119

```cpp { .api }

120

amdsmi_status_t amdsmi_get_fan_speed_max(amdsmi_processor_handle processor_handle,

121

uint32_t sensor_idx,

122

uint64_t* max_speed);

123

```

124

125

**Usage Example:**

126

```cpp

127

// Get current fan speed

128

int64_t fan_speed;

129

amdsmi_status_t status = amdsmi_get_fan_speed(gpu_handle, 0, &fan_speed);

130

131

if (status == AMDSMI_STATUS_SUCCESS) {

132

printf("Fan Speed: %ld RPM\n", fan_speed);

133

}

134

135

// Get maximum fan speed

136

uint64_t max_fan_speed;

137

status = amdsmi_get_fan_speed_max(gpu_handle, 0, &max_fan_speed);

138

139

if (status == AMDSMI_STATUS_SUCCESS) {

140

printf("Max Fan Speed: %lu RPM\n", max_fan_speed);

141

double fan_percent = (double)fan_speed / max_fan_speed * 100.0;

142

printf("Fan Usage: %.1f%%\n", fan_percent);

143

}

144

```

145

146

### Power Limit Control

147

148

Set and get power consumption limits for power management.

149

150

```cpp { .api }

151

amdsmi_status_t amdsmi_set_power_cap(amdsmi_processor_handle processor_handle,

152

uint32_t sensor_ind,

153

uint64_t cap);

154

```

155

156

```cpp { .api }

157

amdsmi_status_t amdsmi_get_power_cap_info(amdsmi_processor_handle processor_handle,

158

uint32_t sensor_ind,

159

amdsmi_power_cap_info_t* info);

160

```

161

162

**Power Cap Information Structure:**

163

```cpp { .api }

164

typedef struct {

165

uint64_t power_cap; // Current power cap (W)

166

uint64_t default_power_cap; // Default power cap (W)

167

uint64_t dpm_cap; // DPM power cap (W)

168

uint64_t min_power_cap; // Minimum power cap (W)

169

uint64_t max_power_cap; // Maximum power cap (W)

170

} amdsmi_power_cap_info_t;

171

```

172

173

**Usage Example:**

174

```cpp

175

// Get current power cap info

176

amdsmi_power_cap_info_t cap_info;

177

amdsmi_status_t status = amdsmi_get_power_cap_info(gpu_handle, 0, &cap_info);

178

179

if (status == AMDSMI_STATUS_SUCCESS) {

180

printf("Power Cap Info:\n");

181

printf(" Current: %lu W\n", cap_info.power_cap);

182

printf(" Default: %lu W\n", cap_info.default_power_cap);

183

printf(" Range: %lu - %lu W\n", cap_info.min_power_cap, cap_info.max_power_cap);

184

}

185

186

// Set new power limit (requires appropriate permissions)

187

uint64_t new_cap = 200; // 200W

188

status = amdsmi_set_power_cap(gpu_handle, 0, new_cap);

189

190

if (status == AMDSMI_STATUS_SUCCESS) {

191

printf("Power cap set to %lu W\n", new_cap);

192

} else if (status == AMDSMI_STATUS_PERMISSION) {

193

printf("Insufficient permissions to set power cap\n");

194

}

195

```

196

197

### Voltage Information

198

199

Monitor GPU voltage levels and voltage curves.

200

201

```cpp { .api }

202

amdsmi_status_t amdsmi_get_gpu_volt_metric(amdsmi_processor_handle processor_handle,

203

amdsmi_voltage_type_t sensor_type,

204

amdsmi_voltage_metric_t metric,

205

int64_t* voltage);

206

```

207

208

**Voltage Types:**

209

```cpp { .api }

210

typedef enum {

211

AMDSMI_VOLT_TYPE_VDDGFX, // Graphics voltage

212

AMDSMI_VOLT_TYPE_VDDNB, // Northbridge voltage

213

AMDSMI_VOLT_TYPE_VDDMEM // Memory voltage

214

} amdsmi_voltage_type_t;

215

```

216

217

**Voltage Metrics:**

218

```cpp { .api }

219

typedef enum {

220

AMDSMI_VOLT_CURRENT, // Current voltage

221

AMDSMI_VOLT_MAX, // Maximum voltage

222

AMDSMI_VOLT_MIN // Minimum voltage

223

} amdsmi_voltage_metric_t;

224

```

225

226

## Language Interface Examples

227

228

### Python

229

```python

230

import amdsmi

231

232

gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)

233

if gpu_handles:

234

gpu_handle = gpu_handles[0]

235

236

# Get power information

237

power_info = amdsmi.amdsmi_get_power_info(gpu_handle)

238

print(f"Current Power: {power_info['current_socket_power']}W")

239

print(f"Max Power Limit: {power_info['max_socket_power_limit']}W")

240

241

# Get temperature

242

temp = amdsmi.amdsmi_get_temp_metric(gpu_handle,

243

amdsmi.AmdSmiTemperatureType.EDGE,

244

amdsmi.AmdSmiTemperatureMetric.CURRENT)

245

print(f"GPU Temperature: {temp // 1000}°C")

246

247

# Get fan speed

248

fan_speed = amdsmi.amdsmi_get_fan_speed(gpu_handle, 0)

249

print(f"Fan Speed: {fan_speed} RPM")

250

251

# Get power cap info

252

power_cap_info = amdsmi.amdsmi_get_power_cap_info(gpu_handle, 0)

253

print(f"Current Power Cap: {power_cap_info['power_cap']}W")

254

```

255

256

### Go

257

```go

258

import "github.com/ROCm/amdsmi"

259

260

// Get power and thermal data for each GPU

261

for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {

262

// Get power consumption

263

power := goamdsmi.GO_gpu_dev_power_ave_get(i)

264

fmt.Printf("GPU %d Average Power: %d W\n", i, power)

265

266

// Get temperature

267

temp := goamdsmi.GO_gpu_dev_temp_get(i, goamdsmi.TEMPERATURE_TYPE_EDGE)

268

fmt.Printf("GPU %d Temperature: %d°C\n", i, temp/1000)

269

270

// Get fan speed

271

fanSpeed := goamdsmi.GO_gpu_dev_fan_speed_get(i, 0)

272

fmt.Printf("GPU %d Fan Speed: %d RPM\n", i, fanSpeed)

273

}

274

```

275

276

### Rust

277

```rust

278

use amdsmi::{get_power_info, get_temp_metric, get_fan_speed};

279

use amdsmi::{TemperatureType, TemperatureMetric};

280

281

// Get comprehensive thermal and power data

282

let power_info = get_power_info(gpu_handle)?;

283

println!("Current Power: {}W", power_info.current_socket_power);

284

285

let edge_temp = get_temp_metric(gpu_handle,

286

TemperatureType::Edge,

287

TemperatureMetric::Current)?;

288

println!("GPU Temperature: {}°C", edge_temp / 1000);

289

290

let fan_speed = get_fan_speed(gpu_handle, 0)?;

291

println!("Fan Speed: {} RPM", fan_speed);

292

```

293

294

## Thermal Management Best Practices

295

296

1. **Temperature Monitoring**: Monitor edge and junction temperatures regularly

297

2. **Thermal Throttling**: Be aware that high temperatures (>90°C) may trigger automatic throttling

298

3. **Fan Curves**: Understand that fan speeds are typically controlled automatically by the driver

299

4. **Power Limits**: Setting power caps too low may significantly impact performance

300

5. **Cooling Solutions**: Consider ambient temperature and case airflow when interpreting thermal data

301

6. **Memory Temperatures**: HBM temperatures are critical for memory-intensive workloads