0
# Memory Management
1
2
This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling. These APIs are critical for understanding memory health and optimizing memory-intensive workloads.
3
4
## Capabilities
5
6
### Memory Usage Information
7
8
Monitor GPU memory usage across different memory types and regions.
9
10
```cpp { .api }
11
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
12
amdsmi_memory_type_t mem_type,
13
amdsmi_memory_usage_t* usage);
14
```
15
16
**Memory Types:**
17
```cpp { .api }
18
typedef enum {
19
AMDSMI_MEMORY_TYPE_VRAM, // Video RAM (on-device memory)
20
AMDSMI_MEMORY_TYPE_VIS_VRAM, // Visible VRAM (CPU accessible)
21
AMDSMI_MEMORY_TYPE_GTT // Graphics Translation Table (system memory mapped for GPU access)
22
} amdsmi_memory_type_t;
23
```
24
25
**Memory Usage Structure:**
26
```cpp { .api }
27
typedef struct {
28
uint64_t used; // Used memory in bytes
29
uint64_t total; // Total memory in bytes
30
} amdsmi_memory_usage_t;
31
```
32
33
### Total Memory Capacity
34
35
Get total memory capacity for different memory types.
36
37
```cpp { .api }
38
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
39
amdsmi_memory_type_t mem_type,
40
uint64_t* total);
41
```
42
43
### Bad Page Information
44
45
Monitor memory reliability by checking for bad memory pages.
46
47
```cpp { .api }
48
amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,
49
uint32_t* num_pages,
50
amdsmi_retired_page_record_t* info);
51
```
52
53
**Retired Page Record Structure:**
54
```cpp { .api }
55
typedef struct {
56
uint64_t page_address; // Start address of the retired page
57
uint64_t page_size; // Size of retired page in bytes
58
amdsmi_memory_page_status_t status; // Page retirement status
59
} amdsmi_retired_page_record_t;
60
```
61
62
**Page Status Types:**
63
```cpp { .api }
64
typedef enum {
65
AMDSMI_MEM_PAGE_STATUS_RESERVED, // Page is reserved and not available for use
66
AMDSMI_MEM_PAGE_STATUS_PENDING, // Page is marked bad and will be reserved at the next window
67
AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Page could not be reserved
68
} amdsmi_memory_page_status_t;
69
```
70
71
### Memory Partitioning
72
73
Query the current GPU memory partition mode and change it.
74
75
```cpp { .api }
76
amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,
77
char* memory_partition,
78
uint32_t len);
79
```
80
81
```cpp { .api }
82
amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
83
amdsmi_memory_partition_type_t partition_type);
84
```
85
86
**Memory Partition Types:**
87
```cpp { .api }
88
typedef enum {
89
AMDSMI_MEMORY_PARTITION_UNKNOWN, // Unknown partition
90
AMDSMI_MEMORY_PARTITION_NPS1, // 1 NUMA node per socket
91
AMDSMI_MEMORY_PARTITION_NPS2, // 2 NUMA nodes per socket
92
AMDSMI_MEMORY_PARTITION_NPS4, // 4 NUMA nodes per socket
93
AMDSMI_MEMORY_PARTITION_NPS8 // 8 NUMA nodes per socket
94
} amdsmi_memory_partition_type_t;
95
```
96
97
## Language Interface Examples
98
99
### Python
100
```python
101
import amdsmi
102
103
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
104
if gpu_handles:
105
gpu_handle = gpu_handles[0]
106
107
# Get VRAM usage
108
vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
109
amdsmi.AmdSmiMemoryType.VRAM)
110
vram_used_gb = vram_usage['used'] / (1024**3)
111
vram_total_gb = vram_usage['total'] / (1024**3)
112
vram_percent = (vram_usage['used'] / vram_usage['total']) * 100
113
114
print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")
115
116
# Get visible VRAM usage
117
vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
118
amdsmi.AmdSmiMemoryType.VIS_VRAM)
119
print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")
120
121
# Check for bad pages
122
try:
123
bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)
124
if bad_pages:
125
print(f"Found {len(bad_pages)} bad memory pages")
126
for page in bad_pages:
127
print(f" Address: 0x{page['page_address']:X}, Status: {page['status']}")
128
else:
129
print("No bad memory pages detected")
130
except amdsmi.AmdSmiException:
131
print("Bad page information not available")
132
```
133
134
### Go
135
```go
136
import "github.com/ROCm/amdsmi"
137
138
// Get memory information for each GPU
139
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
140
// Get memory usage
141
memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)
142
memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)
143
memPercent := float64(memUsed) / float64(memTotal) * 100
144
145
fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n",
146
i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)
147
148
// Get busy memory percentage
149
memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)
150
fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)
151
}
152
```
153
154
### Rust
155
```rust
156
use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};
157
use amdsmi::{MemoryType, MemoryUsage};
158
159
// Get comprehensive memory information
160
let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;
161
let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));
162
let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));
163
let vram_percent = (vram_usage.used as f64 / vram_usage.total as f64) * 100.0;
164
165
println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)",
166
vram_used_gb, vram_total_gb, vram_percent);
167
168
// Check memory health
169
match get_gpu_bad_page_info(gpu_handle) {
170
Ok(bad_pages) => {
171
if bad_pages.is_empty() {
172
println!("No bad memory pages detected");
173
} else {
174
println!("Found {} bad memory pages", bad_pages.len());
175
for page in bad_pages {
176
println!(" Address: 0x{:X}, Status: {:?}",
177
page.page_address, page.status);
178
}
179
}
180
},
181
Err(e) => println!("Could not get bad page info: {:?}", e),
182
}
183
```
184
185
## Memory Management Best Practices
186
187
1. **Regular Monitoring**: Monitor VRAM usage to prevent out-of-memory conditions
188
2. **Memory Health**: Check for bad pages regularly, especially in data center environments
189
3. **Visible VRAM**: Monitor visible VRAM for CPU-GPU data transfer bottlenecks
190
4. **Memory Partitioning**: Understand how NUMA topology affects memory access patterns
191
5. **Error Handling**: Bad page queries may fail on older hardware or drivers, so check the returned status (or catch the exception) instead of assuming the data is available