or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cpu-management.md device-info.md events.md gpu-performance.md index.md initialization.md memory.md performance-control.md power-thermal.md topology-ras.md

docs/memory.md

0

# Memory Management

1

2

This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling. These capabilities are critical for understanding memory health and optimizing memory-intensive workloads.

3

4

## Capabilities

5

6

### Memory Usage Information

7

8

Monitor GPU memory usage across different memory types and regions.

9

10

```cpp { .api }

11

amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,

12

amdsmi_memory_type_t mem_type,

13

amdsmi_memory_usage_t* usage);

14

```

15

16

**Memory Types:**

17

```cpp { .api }

18

typedef enum {

19

AMDSMI_MEMORY_TYPE_VRAM, // Video RAM (on-device memory)

20

AMDSMI_MEMORY_TYPE_VIS_VRAM, // Visible VRAM (CPU accessible)

21

AMDSMI_MEMORY_TYPE_GTT // Graphics Translation Table

22

} amdsmi_memory_type_t;

23

```

24

25

**Memory Usage Structure:**

26

```cpp { .api }

27

typedef struct {

28

uint64_t used; // Used memory in bytes

29

uint64_t total; // Total memory in bytes

30

} amdsmi_memory_usage_t;

31

```

32

33

### Total Memory Capacity

34

35

Get total memory capacity for different memory types.

36

37

```cpp { .api }

38

amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,

39

amdsmi_memory_type_t mem_type,

40

uint64_t* total);

41

```

42

43

### Bad Page Information

44

45

Monitor memory reliability by checking for bad memory pages.

46

47

```cpp { .api }

48

amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,

49

uint32_t* num_pages,

50

amdsmi_retired_page_record_t* info);

51

```

52

53

**Retired Page Record Structure:**

54

```cpp { .api }

55

typedef struct {

56

uint64_t page_address; // Physical address of retired page

57

uint64_t page_size; // Size of retired page in bytes

58

amdsmi_memory_page_status_t status; // Page retirement status

59

} amdsmi_retired_page_record_t;

60

```

61

62

**Page Status Types:**

63

```cpp { .api }

64

typedef enum {

65

AMDSMI_MEM_PAGE_STATUS_RESERVED, // Page is reserved (retired) and not available for use

66

AMDSMI_MEM_PAGE_STATUS_PENDING, // Page pending retirement

67

AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Page could not be reserved

68

} amdsmi_memory_page_status_t;

69

```

70

71

### Memory Partitioning

72

73

Get and manage GPU memory partitioning information.

74

75

```cpp { .api }

76

amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,

77

char* memory_partition,

78

uint32_t len);

79

```

80

81

```cpp { .api }

82

amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,

83

amdsmi_memory_partition_type_t partition_type);

84

```

85

86

**Memory Partition Types:**

87

```cpp { .api }

88

typedef enum {

89

AMDSMI_MEMORY_PARTITION_UNKNOWN, // Unknown partition

90

AMDSMI_MEMORY_PARTITION_NPS1, // 1 NUMA node per socket

91

AMDSMI_MEMORY_PARTITION_NPS2, // 2 NUMA nodes per socket

92

AMDSMI_MEMORY_PARTITION_NPS4, // 4 NUMA nodes per socket

93

AMDSMI_MEMORY_PARTITION_NPS8 // 8 NUMA nodes per socket

94

} amdsmi_memory_partition_type_t;

95

```

96

97

## Language Interface Examples

98

99

### Python

100

```python

101

import amdsmi

102

103

gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)

104

if gpu_handles:

105

gpu_handle = gpu_handles[0]

106

107

# Get VRAM usage

108

vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,

109

amdsmi.AmdSmiMemoryType.VRAM)

110

vram_used_gb = vram_usage['used'] / (1024**3)

111

vram_total_gb = vram_usage['total'] / (1024**3)

112

vram_percent = (vram_usage['used'] / vram_usage['total']) * 100

113

114

print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")

115

116

# Get visible VRAM usage

117

vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,

118

amdsmi.AmdSmiMemoryType.VIS_VRAM)

119

print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")

120

121

# Check for bad pages

122

try:

123

bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)

124

if bad_pages:

125

print(f"Found {len(bad_pages)} bad memory pages")

126

for page in bad_pages:

127

print(f" Address: 0x{page['page_address']:X}, Status: {page['status']}")

128

else:

129

print("No bad memory pages detected")

130

except amdsmi.AmdSmiException:

131

print("Bad page information not available")

132

```

133

134

### Go

135

```go

136

import "github.com/ROCm/amdsmi"

137

138

// Get memory information for each GPU

139

for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {

140

// Get memory usage

141

memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)

142

memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)

143

memPercent := float64(memUsed) / float64(memTotal) * 100

144

145

fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n",

146

i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)

147

148

// Get busy memory percentage

149

memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)

150

fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)

151

}

152

```

153

154

### Rust

155

```rust

156

use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};

157

use amdsmi::{MemoryType, MemoryUsage};

158

159

// Get comprehensive memory information

160

let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;

161

let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));

162

let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));

163

let vram_percent = (vram_usage.used as f64 / vram_usage.total as f64) * 100.0;

164

165

println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)",

166

vram_used_gb, vram_total_gb, vram_percent);

167

168

// Check memory health

169

match get_gpu_bad_page_info(gpu_handle) {

170

Ok(bad_pages) => {

171

if bad_pages.is_empty() {

172

println!("No bad memory pages detected");

173

} else {

174

println!("Found {} bad memory pages", bad_pages.len());

175

for page in bad_pages {

176

println!(" Address: 0x{:X}, Status: {:?}",

177

page.page_address, page.status);

178

}

179

}

180

},

181

Err(e) => println!("Could not get bad page info: {:?}", e),

182

}

183

```

184

185

## Memory Management Best Practices

186

187

1. **Regular Monitoring**: Monitor VRAM usage to prevent out-of-memory conditions

188

2. **Memory Health**: Check for bad pages regularly, especially in data center environments

189

3. **Visible VRAM**: Monitor visible VRAM for CPU-GPU data transfer bottlenecks

190

4. **Memory Partitioning**: Understand NUMA topology impacts on memory access patterns

191

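5. **Error Handling**: Bad page queries may fail on older hardware or drivers, so handle the error and treat the information as unavailable rather than assuming there are no bad pages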