or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

docs

examples

edge-cases.md
real-world-scenarios.md
index.md
tile.json

docs/examples/edge-cases.md

Edge Cases and Advanced Scenarios

This document covers advanced usage patterns, edge cases, and complex scenarios for the AMDSMI library.

Handling Missing or Unsupported Features

Many AMDSMI features are hardware-dependent. Always handle cases where features may not be available:

import amdsmi
from amdsmi import AmdSmiLibraryException

def safe_get_feature(device, feature_func, *args, **kwargs):
    """Call *feature_func* on *device*, returning None when unsupported.

    Library errors that indicate a missing hardware feature map to None;
    every other AmdSmiLibraryException is re-raised unchanged.
    """
    try:
        return feature_func(device, *args, **kwargs)
    except AmdSmiLibraryException as exc:
        # NOT_SUPPORTED may surface in the message or as this error code.
        unsupported = "NOT_SUPPORTED" in str(exc) or exc.get_error_code() == 0x10000000
        if unsupported:
            return None
        raise

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        first_gpu = handles[0]

        # Partitioning is hardware-dependent, so probe it safely.
        partition = safe_get_feature(
            first_gpu,
            amdsmi.amdsmi_get_gpu_compute_partition,
        )

        if partition is not None:
            print(f"Compute partition: {partition}")
        else:
            print("GPU partitioning not supported on this device")

finally:
    amdsmi.amdsmi_shut_down()

Multi-Device Coordination

Coordinate operations across multiple GPUs:

import amdsmi
import threading

class MultiGPUCoordinator:
    """Run AMDSMI calls across every detected GPU, one lock per device."""

    def __init__(self):
        # Initialize the library and enumerate all processor handles.
        amdsmi.amdsmi_init()
        self.devices = amdsmi.amdsmi_get_processor_handles()
        # One lock per device index so per-device calls can be serialized.
        self.locks = {idx: threading.Lock() for idx in range(len(self.devices))}

    def execute_on_all(self, func, *args, **kwargs):
        """Invoke *func* on every device.

        Returns a list of (index, result, error) tuples; *error* is None
        when the call succeeded, otherwise the raised exception.
        """
        outcomes = []
        for idx, handle in enumerate(self.devices):
            with self.locks[idx]:
                try:
                    outcomes.append((idx, func(handle, *args, **kwargs), None))
                except Exception as exc:
                    outcomes.append((idx, None, exc))
        return outcomes

    def execute_on_device(self, device_index, func, *args, **kwargs):
        """Invoke *func* on the device at *device_index* under its lock."""
        if device_index >= len(self.devices):
            raise IndexError(f"Device index {device_index} out of range")

        with self.locks[device_index]:
            return func(self.devices[device_index], *args, **kwargs)

    def shutdown(self):
        """Release the AMDSMI library."""
        amdsmi.amdsmi_shut_down()

# Usage
coordinator = MultiGPUCoordinator()

try:
    # Query utilization on every managed GPU in one call.
    activities = coordinator.execute_on_all(amdsmi.amdsmi_get_gpu_activity)

    for device_idx, activity, error in activities:
        if error is None:
            print(f"GPU {device_idx}: {activity['gfx_activity']}%")
        else:
            print(f"GPU {device_idx}: Error - {error}")

finally:
    coordinator.shutdown()

Context Manager Pattern

Create a context manager for automatic initialization and cleanup:

import amdsmi
from contextlib import contextmanager

@contextmanager
def amdsmi_context(flags=amdsmi.AmdSmiInitFlags.INIT_AMD_GPUS):
    """Yield an initialized AMDSMI session, guaranteeing shutdown on exit.

    Args:
        flags: Init flags forwarded to amdsmi_init(); defaults to
            AMD-GPU-only initialization.
    """
    amdsmi.amdsmi_init(flags)
    try:
        yield None
    finally:
        amdsmi.amdsmi_shut_down()

# Usage
with amdsmi_context():
    for gpu in amdsmi.amdsmi_get_processor_handles():
        gpu_activity = amdsmi.amdsmi_get_gpu_activity(gpu)
        print(f"Activity: {gpu_activity}")

Error Recovery and Retry Logic

Implement retry logic for transient errors:

import amdsmi
import time
from amdsmi import AmdSmiRetryException, AmdSmiLibraryException

def retry_on_error(func, max_retries=3, delay=1.0, *args, **kwargs):
    """Call *func*, retrying when it raises AmdSmiRetryException.

    Args:
        func: Callable to invoke.
        max_retries: Maximum number of attempts; must be at least 1.
        delay: Base wait in seconds; the wait grows linearly with the
            attempt number (delay, 2*delay, 3*delay, ...).
        *args, **kwargs: Forwarded to *func* on every attempt.

    Returns:
        Whatever *func* returns.

    Raises:
        AmdSmiRetryException: If every attempt raised it.
        AmdSmiLibraryException: Immediately — non-retry errors are not
            retried and propagate unchanged.
        ValueError: If max_retries < 1 (the original code raised a bare
            Exception only after silently skipping the loop).
    """
    if max_retries < 1:
        raise ValueError("max_retries must be at least 1")

    for attempt in range(max_retries):
        try:
            return func(*args, **kwargs)
        except AmdSmiRetryException:
            # Out of attempts: let the retry exception propagate.
            if attempt == max_retries - 1:
                raise
            # NOTE: this is linear backoff, not exponential as the old
            # comment claimed.
            time.sleep(delay * (attempt + 1))
        # Any other AmdSmiLibraryException propagates naturally; the old
        # explicit catch-and-reraise handler was redundant.

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        # Transient failures are retried up to three times.
        activity = retry_on_error(
            amdsmi.amdsmi_get_gpu_activity,
            max_retries=3,
            device=handles[0],
        )
        print(f"Activity: {activity}")

finally:
    amdsmi.amdsmi_shut_down()

Handling N/A Values

Many AMDSMI functions return "N/A" for unavailable metrics:

import amdsmi

def safe_get_numeric_metric(metric_value, default=0):
    """Coerce a metric reading to a number, falling back to *default*.

    AMDSMI reports unavailable metrics as the string "N/A" (or None);
    both map to *default*, as does any value float() cannot parse.
    Values that are already numeric pass through untouched.
    """
    if metric_value is None or metric_value == "N/A":
        return default

    if isinstance(metric_value, (int, float)):
        return metric_value

    try:
        return float(metric_value)
    except (TypeError, ValueError):
        return default

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        activity = amdsmi.amdsmi_get_gpu_activity(handles[0])

        # Coerce possible "N/A" readings to numbers before formatting.
        gfx_activity = safe_get_numeric_metric(activity.get('gfx_activity'), 0)
        umc_activity = safe_get_numeric_metric(activity.get('umc_activity'), 0)

        print(f"GFX: {gfx_activity}%, UMC: {umc_activity}%")

finally:
    amdsmi.amdsmi_shut_down()

Temperature Unit Conversion

Handle temperature conversions (millidegrees to Celsius):

import amdsmi

def get_temperature_celsius(device, sensor_type, metric):
    """Read one temperature metric and convert it to degrees Celsius.

    amdsmi_get_temp_metric reports millidegrees Celsius, so the raw
    reading is scaled down by 1000.
    """
    raw_millidegrees = amdsmi.amdsmi_get_temp_metric(device, sensor_type, metric)
    return raw_millidegrees / 1000.0

def get_all_temperatures(device):
    """Collect current temperatures (in Celsius) from all known sensors.

    Sensors that fail to read (e.g. not present on this hardware) are
    skipped, so the result may be empty.

    Returns:
        Dict mapping sensor name (e.g. 'EDGE') to temperature in Celsius.
    """
    temperatures = {}
    sensor_types = [
        amdsmi.AmdSmiTemperatureType.EDGE,
        amdsmi.AmdSmiTemperatureType.HOTSPOT,
        amdsmi.AmdSmiTemperatureType.JUNCTION,
        amdsmi.AmdSmiTemperatureType.VRAM,
    ]

    for sensor_type in sensor_types:
        try:
            temperatures[sensor_type.name] = get_temperature_celsius(
                device,
                sensor_type,
                amdsmi.AmdSmiTemperatureMetric.CURRENT
            )
        # A bare `except:` here would also swallow KeyboardInterrupt and
        # SystemExit; Exception is broad enough for "sensor unavailable".
        except Exception:
            pass

    return temperatures

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        readings = get_all_temperatures(handles[0])
        for sensor, temp in readings.items():
            print(f"{sensor}: {temp:.1f}°C")

finally:
    amdsmi.amdsmi_shut_down()

Memory Unit Conversion

Handle memory size conversions:

import amdsmi

def format_bytes(bytes_value):
    """Format a byte count as a human-readable string, e.g. '1.50 GB'.

    Scales by powers of 1024 up through TB; anything larger is shown
    in PB.
    """
    size = bytes_value
    for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
        if size < 1024.0:
            return f"{size:.2f} {unit}"
        size /= 1024.0
    return f"{size:.2f} PB"

def get_memory_info(device):
    """Return a summary dict of VRAM usage for *device*.

    Keys: 'total', 'used', 'free' (bytes), 'usage_percent', and
    '*_formatted' human-readable variants of the byte counts.
    """
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)

    total = vram['vram_total']
    used = vram['vram_used']
    free = total - used
    # Guard against a zero total (metric unavailable or bogus reading)
    # so the percentage never raises ZeroDivisionError.
    usage_percent = (used / total) * 100 if total else 0.0

    return {
        'total': total,
        'used': used,
        'free': free,
        'usage_percent': usage_percent,
        'total_formatted': format_bytes(total),
        'used_formatted': format_bytes(used),
        'free_formatted': format_bytes(free),
    }

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        mem_info = get_memory_info(handles[0])
        for label in ('Total', 'Used', 'Free'):
            print(f"{label}: {mem_info[label.lower() + '_formatted']}")
        print(f"Usage: {mem_info['usage_percent']:.1f}%")

finally:
    amdsmi.amdsmi_shut_down()

Partitioning with Error Handling

Safely configure GPU partitioning with proper error handling:

import amdsmi
from amdsmi import (
    AmdSmiComputePartitionType,
    AmdSmiMemoryPartitionType,
    AmdSmiLibraryException
)

def configure_partitioning_safe(device, compute_part, memory_part):
    """Configure GPU compute/memory partitioning with error handling.

    Args:
        device: Processor handle.
        compute_part: AmdSmiComputePartitionType to apply.
        memory_part: AmdSmiMemoryPartitionType to apply.

    Returns:
        (success, message) tuple; *success* is False with an explanatory
        message when partitioning is unsupported, the requested memory
        mode is unavailable, or the set calls fail.
    """
    # Probe support: the getters raise NOT_SUPPORTED on hardware without
    # partitioning. The returned values themselves are not needed, so the
    # old unused locals are gone.
    try:
        amdsmi.amdsmi_get_gpu_compute_partition(device)
        amdsmi.amdsmi_get_gpu_memory_partition(device)
    except AmdSmiLibraryException as e:
        if "NOT_SUPPORTED" in str(e):
            return False, "Partitioning not supported on this device"
        raise

    # Best-effort validation of the requested memory mode against the
    # device capabilities; older stacks may not expose this query.
    try:
        mem_config = amdsmi.amdsmi_get_gpu_memory_partition_config(device)
        available_modes = mem_config.get('partition_caps', [])

        if memory_part.name not in available_modes:
            return False, f"Memory partition mode {memory_part.name} not available"
    # A bare `except:` here would also swallow KeyboardInterrupt and
    # SystemExit; Exception keeps the original best-effort behavior.
    except Exception:
        pass

    # Attempt to set partitions
    try:
        amdsmi.amdsmi_set_gpu_compute_partition(device, compute_part)
        amdsmi.amdsmi_set_gpu_memory_partition(device, memory_part)
        return True, "Partitioning configured (GPU reset may be required)"
    except AmdSmiLibraryException as e:
        return False, f"Failed to configure: {e}"

# Usage
amdsmi.amdsmi_init()

try:
    handles = amdsmi.amdsmi_get_processor_handles()
    if handles:
        # DPX compute + NPS2 memory as an example target configuration.
        ok, message = configure_partitioning_safe(
            handles[0],
            AmdSmiComputePartitionType.DPX,
            AmdSmiMemoryPartitionType.NPS2,
        )
        print(f"Result: {message}")

finally:
    amdsmi.amdsmi_shut_down()