or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cpu-management.md device-info.md events.md gpu-performance.md index.md initialization.md memory.md performance-control.md power-thermal.md topology-ras.md

docs/topology-ras.md

0

# System Topology and RAS

1

2

This section covers NUMA affinity, PCIe topology, XGMI links, reliability features, and error monitoring. These APIs are essential for multi-GPU systems and high-reliability computing environments.

3

4

## Capabilities

5

6

### Link Topology

7

8

Get information about the GPU interconnect topology and nearest-neighbor relationships.

9

10

```cpp { .api }

11

amdsmi_status_t amdsmi_get_link_topology_nearest(amdsmi_processor_handle processor_handle,

12

amdsmi_link_type_t link_type,

13

amdsmi_topology_t* topology);

14

```

15

16

**Link Types:**

17

```cpp { .api }

18

typedef enum {

19

AMDSMI_LINK_TYPE_PCIE, // PCIe links

20

AMDSMI_LINK_TYPE_XGMI, // XGMI (high-speed interconnect) links

21

AMDSMI_LINK_TYPE_UNKNOWN // Unknown link type

22

} amdsmi_link_type_t;

23

```

24

25

### RAS Feature Information

26

27

Get the Reliability, Availability, and Serviceability (RAS) feature status.

28

29

```cpp { .api }

30

amdsmi_status_t amdsmi_get_gpu_ras_feature_info(amdsmi_processor_handle processor_handle,

31

amdsmi_gpu_block_t block,

32

amdsmi_ras_feature_t* ras_feature);

33

```

34

35

**GPU Blocks:**

36

```cpp { .api }

37

typedef enum {

38

AMDSMI_GPU_BLOCK_UMC, // Unified Memory Controller

39

AMDSMI_GPU_BLOCK_SDMA, // System DMA

40

AMDSMI_GPU_BLOCK_GFX, // Graphics block

41

AMDSMI_GPU_BLOCK_MMHUB, // Memory Management Hub

42

AMDSMI_GPU_BLOCK_ATHUB, // Address Translation Hub

43

AMDSMI_GPU_BLOCK_PCIE_BIF, // PCIe Bus Interface

44

AMDSMI_GPU_BLOCK_HDP, // Host Data Path

45

AMDSMI_GPU_BLOCK_XGMI_WAFL, // XGMI Wide Area Fabric Link

46

AMDSMI_GPU_BLOCK_DF, // Data Fabric

47

AMDSMI_GPU_BLOCK_SMN, // System Management Network

48

AMDSMI_GPU_BLOCK_SEM, // SEM block

49

AMDSMI_GPU_BLOCK_MP0, // MP0 block

50

AMDSMI_GPU_BLOCK_MP1, // MP1 block

51

AMDSMI_GPU_BLOCK_FUSE // Fuse block

52

} amdsmi_gpu_block_t;

53

```

54

55

### Error Count Monitoring

56

57

Monitor error counts for different GPU blocks to assess system health.

58

59

```cpp { .api }

60

amdsmi_status_t amdsmi_get_gpu_ecc_count(amdsmi_processor_handle processor_handle,

61

amdsmi_gpu_block_t block,

62

amdsmi_error_count_t* ec);

63

```

64

65

**Error Count Structure:**

66

```cpp { .api }

67

typedef struct {

68

uint64_t correctable_count; // Correctable error count

69

uint64_t uncorrectable_count; // Uncorrectable error count

70

} amdsmi_error_count_t;

71

```

72

73

### NUMA Affinity

74

75

Get NUMA node affinity information for optimal memory allocation.

76

77

```cpp { .api }

78

amdsmi_status_t amdsmi_get_gpu_numa_affinity(amdsmi_processor_handle processor_handle,

79

uint32_t* numa_node);

80

```

81

82

## Language Interface Examples

83

84

### Python

85

```python

86

import amdsmi

87

88

gpu_handles = amdsmi.amdsmi_get_processor_handles(amdsmi.AmdSmiProcessorType.AMD_GPU)

89

for i, gpu_handle in enumerate(gpu_handles):

90

print(f"GPU {i}:")

91

92

# Get NUMA affinity

93

try:

94

numa_node = amdsmi.amdsmi_get_gpu_numa_affinity(gpu_handle)

95

print(f" NUMA Node: {numa_node}")

96

except amdsmi.AmdSmiException:

97

print(" NUMA affinity not available")

98

99

# Check RAS features for UMC block

100

try:

101

ras_info = amdsmi.amdsmi_get_gpu_ras_feature_info(gpu_handle,

102

amdsmi.AmdSmiGpuBlock.UMC)

103

print(f" UMC RAS enabled: {ras_info['ras_ecc_enabled']}")

104

except amdsmi.AmdSmiException:

105

print(" RAS information not available")

106

107

# Get error counts

108

try:

109

ecc_count = amdsmi.amdsmi_get_gpu_ecc_count(gpu_handle,

110

amdsmi.AmdSmiGpuBlock.UMC)

111

print(f" ECC Errors - Correctable: {ecc_count['correctable_count']}, "

112

f"Uncorrectable: {ecc_count['uncorrectable_count']}")

113

except amdsmi.AmdSmiException:

114

print(" ECC error counts not available")

115

```

116

117

### Rust

118

```rust

119

use amdsmi::{get_gpu_numa_affinity, get_gpu_ras_feature_info, get_gpu_ecc_count};

120

use amdsmi::{GpuBlock, ErrorCount};

121

122

// Get topology and reliability information

123

match get_gpu_numa_affinity(gpu_handle) {

124

Ok(numa_node) => println!("NUMA Node: {}", numa_node),

125

Err(_) => println!("NUMA affinity not available"),

126

}

127

128

// Check RAS features

129

match get_gpu_ras_feature_info(gpu_handle, GpuBlock::UMC) {

130

Ok(ras_info) => println!("UMC RAS enabled: {}", ras_info.ras_ecc_enabled),

131

Err(_) => println!("RAS information not available"),

132

}

133

134

// Monitor error counts

135

match get_gpu_ecc_count(gpu_handle, GpuBlock::UMC) {

136

Ok(ErrorCount { correctable_count, uncorrectable_count }) => {

137

println!("ECC Errors - Correctable: {}, Uncorrectable: {}",

138

correctable_count, uncorrectable_count);

139

},

140

Err(_) => println!("ECC error counts not available"),

141

}

142

```

143

144

## Topology and RAS Best Practices

145

146

1. **Multi-GPU Systems**: Use topology information for optimal task placement

147

2. **Error Monitoring**: Regularly check error counts in production environments

148

3. **NUMA Awareness**: Use NUMA affinity for memory allocation optimization

149

4. **High Reliability**: Monitor RAS features in mission-critical applications

150

5. **Platform Dependencies**: Some features may not be available on all hardware