Tessl Tile for github/radeonopencompute/amdsmi@6.4.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cpu-management.md device-info.md events.md gpu-performance.md index.md initialization.md memory.md performance-control.md power-thermal.md topology-ras.md

docs/memory.md

0
# Memory Management
1

2
This section covers VRAM usage monitoring, memory partitioning, bad page reporting, and memory error handling. These capabilities are critical for understanding memory health and for optimizing memory-intensive workloads.
3

4
## Capabilities
5

6
### Memory Usage Information
7

8
Monitor GPU memory usage across different memory types and regions.
9

10
```cpp { .api }
11
amdsmi_status_t amdsmi_get_gpu_memory_usage(amdsmi_processor_handle processor_handle,
12
                                           amdsmi_memory_type_t mem_type,
13
                                           amdsmi_memory_usage_t* usage);
14
```
15

16
**Memory Types:**
17
```cpp { .api }
18
typedef enum {
19
    AMDSMI_MEMORY_TYPE_VRAM,        // Video RAM (on-device memory)
20
    AMDSMI_MEMORY_TYPE_VIS_VRAM,    // Visible VRAM (CPU accessible)
21
    AMDSMI_MEMORY_TYPE_GTT          // Graphics Translation Table
22
} amdsmi_memory_type_t;
23
```
24

25
**Memory Usage Structure:**
26
```cpp { .api }
27
typedef struct {
28
    uint64_t used;          // Used memory in bytes
29
    uint64_t total;         // Total memory in bytes
30
} amdsmi_memory_usage_t;
31
```
32

33
### Total Memory Capacity
34

35
Get total memory capacity for different memory types.
36

37
```cpp { .api }
38
amdsmi_status_t amdsmi_get_gpu_memory_total(amdsmi_processor_handle processor_handle,
39
                                           amdsmi_memory_type_t mem_type,
40
                                           uint64_t* total);
41
```
42

43
### Bad Page Information
44

45
Monitor memory reliability by checking for bad memory pages.
46

47
```cpp { .api }
48
amdsmi_status_t amdsmi_get_gpu_bad_page_info(amdsmi_processor_handle processor_handle,
49
                                            uint32_t* num_pages,
50
                                            amdsmi_retired_page_record_t* info);
51
```
52

53
**Retired Page Record Structure:**
54
```cpp { .api }
55
typedef struct {
56
    uint64_t page_address;          // Physical address of retired page
57
    uint64_t page_size;             // Size of retired page in bytes
58
    amdsmi_memory_page_status_t status; // Page retirement status
59
} amdsmi_retired_page_record_t;
60
```
61

62
**Page Status Types:**
63
```cpp { .api }
64
typedef enum {
65
    AMDSMI_MEM_PAGE_STATUS_RESERVED,    // Page is reserved (retired) and not available for use
66
    AMDSMI_MEM_PAGE_STATUS_PENDING,     // Page pending retirement
67
    AMDSMI_MEM_PAGE_STATUS_UNRESERVABLE // Driver was unable to reserve this bad page
68
} amdsmi_memory_page_status_t;
69
```
70

71
### Memory Partitioning
72

73
Get and manage GPU memory partitioning information.
74

75
```cpp { .api }
76
amdsmi_status_t amdsmi_get_gpu_memory_partition_info(amdsmi_processor_handle processor_handle,
77
                                                    char* memory_partition,
78
                                                    uint32_t len);
79
```
80

81
```cpp { .api }
82
amdsmi_status_t amdsmi_set_gpu_memory_partition(amdsmi_processor_handle processor_handle,
83
                                               amdsmi_memory_partition_type_t partition_type);
84
```
85

86
**Memory Partition Types:**
87
```cpp { .api }
88
typedef enum {
89
    AMDSMI_MEMORY_PARTITION_UNKNOWN,    // Unknown partition
90
    AMDSMI_MEMORY_PARTITION_NPS1,       // 1 NUMA node per socket
91
    AMDSMI_MEMORY_PARTITION_NPS2,       // 2 NUMA nodes per socket
92
    AMDSMI_MEMORY_PARTITION_NPS4,       // 4 NUMA nodes per socket
93
    AMDSMI_MEMORY_PARTITION_NPS8        // 8 NUMA nodes per socket
94
} amdsmi_memory_partition_type_t;
95
```
96

97
## Language Interface Examples
98

99
### Python
100
```python
101
import amdsmi
102

103
gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)
104
if gpu_handles:
105
    gpu_handle = gpu_handles[0]
106
    
107
    # Get VRAM usage
108
    vram_usage = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle, 
109
                                                   amdsmi.AmdSmiMemoryType.VRAM)
110
    vram_used_gb = vram_usage['used'] / (1024**3)
111
    vram_total_gb = vram_usage['total'] / (1024**3)
112
    vram_percent = (vram_usage['used'] / vram_usage['total']) * 100
113
    
114
    print(f"VRAM Usage: {vram_used_gb:.2f} / {vram_total_gb:.2f} GB ({vram_percent:.1f}%)")
115
    
116
    # Get visible VRAM usage
117
    vis_vram = amdsmi.amdsmi_get_gpu_memory_usage(gpu_handle,
118
                                                 amdsmi.AmdSmiMemoryType.VIS_VRAM)
119
    print(f"Visible VRAM: {vis_vram['used'] / (1024**3):.2f} GB")
120
    
121
    # Check for bad pages
122
    try:
123
        bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(gpu_handle)
124
        if bad_pages:
125
            print(f"Found {len(bad_pages)} bad memory pages")
126
            for page in bad_pages:
127
                print(f"  Address: 0x{page['page_address']:X}, Status: {page['status']}")
128
        else:
129
            print("No bad memory pages detected")
130
    except amdsmi.AmdSmiException:
131
        print("Bad page information not available")
132
```
133

134
### Go
135
```go
136
import "github.com/ROCm/amdsmi"
137

138
// Get memory information for each GPU
139
for i := 0; i < int(goamdsmi.GO_gpu_num_monitor_devices()); i++ {
140
    // Get memory usage
141
    memUsed := goamdsmi.GO_gpu_dev_memory_used_get(i)
142
    memTotal := goamdsmi.GO_gpu_dev_memory_total_get(i)
143
    memPercent := float64(memUsed) / float64(memTotal) * 100
144
    
145
    fmt.Printf("GPU %d Memory: %d MB / %d MB (%.1f%%)\n", 
146
               i, memUsed/(1024*1024), memTotal/(1024*1024), memPercent)
147
    
148
    // Get busy memory percentage
149
    memBusy := goamdsmi.GO_gpu_dev_memory_busy_percent_get(i)
150
    fmt.Printf("GPU %d Memory Activity: %d%%\n", i, memBusy)
151
}
152
```
153

154
### Rust
155
```rust
156
use amdsmi::{get_gpu_memory_usage, get_gpu_memory_total, get_gpu_bad_page_info};
157
use amdsmi::{MemoryType, MemoryUsage};
158

159
// Get comprehensive memory information
160
let vram_usage: MemoryUsage = get_gpu_memory_usage(gpu_handle, MemoryType::Vram)?;
161
let vram_used_gb = vram_usage.used as f64 / (1024.0_f64.powi(3));
162
let vram_total_gb = vram_usage.total as f64 / (1024.0_f64.powi(3));
163
let vram_percent = (vram_usage.used as f64 / vram_usage.total as f64) * 100.0;
164

165
println!("VRAM Usage: {:.2} / {:.2} GB ({:.1}%)", 
166
         vram_used_gb, vram_total_gb, vram_percent);
167

168
// Check memory health
169
match get_gpu_bad_page_info(gpu_handle) {
170
    Ok(bad_pages) => {
171
        if bad_pages.is_empty() {
172
            println!("No bad memory pages detected");
173
        } else {
174
            println!("Found {} bad memory pages", bad_pages.len());
175
            for page in bad_pages {
176
                println!("  Address: 0x{:X}, Status: {:?}", 
177
                         page.page_address, page.status);
178
            }
179
        }
180
    },
181
    Err(e) => println!("Could not get bad page info: {:?}", e),
182
}
183
```
184

185
## Memory Management Best Practices
186

187
1. **Regular Monitoring**: Monitor VRAM usage to prevent out-of-memory conditions
188
2. **Memory Health**: Check for bad pages regularly, especially in data center environments
189
3. **Visible VRAM**: Monitor visible VRAM for CPU-GPU data transfer bottlenecks
190
4. **Memory Partitioning**: Understand NUMA topology impacts on memory access patterns
191
5. **Error Handling**: Handle failures from bad page queries gracefully, since they may fail on older hardware or drivers

Version

Tile

Files

docs/memory.md

docs/memory.md