
Real-World Scenarios

This document collects real-world usage examples for the AMDSMI Python library, from one-shot reports to continuous monitoring loops.
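
All of the examples share the same lifecycle: initialize the library, fetch one or more processor handles, query metrics, and shut down in a finally block so the library is released even if a call fails. The skeleton below is the pattern every example expands on:

import amdsmi

amdsmi.amdsmi_init()
try:
    devices = amdsmi.amdsmi_get_processor_handles()
    # ... query metrics for each device handle here ...
finally:
    amdsmi.amdsmi_shut_down()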

GPU Monitoring Dashboard

Poll utilization, temperature, power, memory, clocks, and process information for every GPU, and print a dashboard that refreshes every two seconds:

import amdsmi
import time

def monitor_gpu_dashboard():
    """Continuously monitor GPU metrics and display dashboard."""
    
    amdsmi.amdsmi_init()
    
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        
        while True:
            print("\n" + "="*80)
            print("GPU Monitoring Dashboard")
            print("="*80)
            
            for i, device in enumerate(devices):
                # Get device info
                asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
                
                print(f"\nGPU {i}: {asic_info['market_name']}")
                print("-" * 80)
                
                # Utilization
                activity = amdsmi.amdsmi_get_gpu_activity(device)
                print(f"  Utilization:")
                print(f"    GFX: {activity['gfx_activity']}%")
                print(f"    Memory: {activity['umc_activity']}%")
                print(f"    Multimedia: {activity['mm_activity']}%")
                
                # Temperature
                temp = amdsmi.amdsmi_get_temp_metric(
                    device,
                    amdsmi.AmdSmiTemperatureType.EDGE,
                    amdsmi.AmdSmiTemperatureMetric.CURRENT
                )
                print(f"  Temperature: {temp / 1000.0:.1f}°C")
                
                # Power
                power = amdsmi.amdsmi_get_power_info(device)
                print(f"  Power: {power['current_socket_power'] / 1000000.0:.2f}W")
                print(f"  Average Power: {power['average_socket_power'] / 1000000.0:.2f}W")
                
                # Memory
                vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
                used_gb = vram['vram_used'] / (1024**3)
                total_gb = vram['vram_total'] / (1024**3)
                usage_pct = (vram['vram_used'] / vram['vram_total']) * 100
                print(f"  VRAM: {used_gb:.2f} GB / {total_gb:.2f} GB ({usage_pct:.1f}%)")
                
                # Clocks
                gfx_clock = amdsmi.amdsmi_get_clock_info(
                    device, amdsmi.AmdSmiClkType.GFX
                )
                mem_clock = amdsmi.amdsmi_get_clock_info(
                    device, amdsmi.AmdSmiClkType.MEM
                )
                print(f"  Clocks:")
                print(f"    GFX: {gfx_clock.get('current_clk', 'N/A')} MHz")
                print(f"    Memory: {mem_clock.get('current_clk', 'N/A')} MHz")
                
                # Processes
                processes = amdsmi.amdsmi_get_gpu_process_list(device)
                if processes:
                    print(f"  Processes: {len(processes)}")
                    for proc in processes[:5]:  # Show first 5
                        print(f"    PID {proc.get('pid', 'N/A')}: "
                              f"{proc.get('mem_usage', 0) / (1024**2):.2f} MB")
            
            time.sleep(2)  # Update every 2 seconds
            
    except KeyboardInterrupt:
        print("\nMonitoring stopped.")
    finally:
        amdsmi.amdsmi_shut_down()

if __name__ == "__main__":
    monitor_gpu_dashboard()
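
Not every metric is exposed on every ASIC or driver version, and an unsupported query raises an exception rather than returning a sentinel. A small wrapper (the safe_metric name is our own, not part of AMDSMI) keeps the dashboard loop alive when a single metric fails:

def safe_metric(fn, *args, default=None):
    """Return fn(*args), or default if the call raises an AMDSMI error."""
    try:
        return fn(*args)
    except amdsmi.AmdSmiException:
        return default

# Example inside the dashboard loop, falling back when the EDGE sensor is absent:
# temp = safe_metric(amdsmi.amdsmi_get_temp_metric, device,
#                    amdsmi.AmdSmiTemperatureType.EDGE,
#                    amdsmi.AmdSmiTemperatureMetric.CURRENT, default=0)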

Multi-GPU Workload Distribution

Monitor and balance workloads across multiple GPUs:

import amdsmi

def find_least_utilized_gpu(devices):
    """Return the handle of the GPU with the lowest GFX utilization, or None."""
    
    best_device = None
    lowest_util = float('inf')
    
    for device in devices:
        activity = amdsmi.amdsmi_get_gpu_activity(device)
        gfx_util = activity.get('gfx_activity', 0)
        
        if isinstance(gfx_util, (int, float)) and gfx_util < lowest_util:
            lowest_util = gfx_util
            best_device = device
    
    return best_device

def get_gpu_memory_availability(device):
    """Get available GPU memory in GB."""
    
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    return (vram['vram_total'] - vram['vram_used']) / (1024**3)

# Usage: keep the library initialized while the handle is in use; device
# handles are no longer valid after amdsmi_shut_down() is called.
amdsmi.amdsmi_init()
try:
    devices = amdsmi.amdsmi_get_processor_handles()
    device = find_least_utilized_gpu(devices)
    if device:
        available_mem = get_gpu_memory_availability(device)
        print(f"Available memory: {available_mem:.2f} GB")
finally:
    amdsmi.amdsmi_shut_down()

Thermal Monitoring and Alerts

Monitor GPU temperatures and alert on high temperatures:

import amdsmi

def check_thermal_status(device, warning_temp=80, critical_temp=90):
    """Return the GPU's edge temperature and its status against the thresholds."""
    
    temp = amdsmi.amdsmi_get_temp_metric(
        device,
        amdsmi.AmdSmiTemperatureType.EDGE,
        amdsmi.AmdSmiTemperatureMetric.CURRENT
    )
    temp_c = temp / 1000.0
    
    # Get critical threshold
    try:
        critical = amdsmi.amdsmi_get_temp_metric(
            device,
            amdsmi.AmdSmiTemperatureType.EDGE,
            amdsmi.AmdSmiTemperatureMetric.CRITICAL
        )
        critical_c = critical / 1000.0
    except amdsmi.AmdSmiException:
        # Fall back to the caller-supplied threshold if the metric is unavailable.
        critical_c = critical_temp
    
    status = "OK"
    if temp_c >= critical_c:
        status = "CRITICAL"
    elif temp_c >= warning_temp:
        status = "WARNING"
    
    return {
        'temperature': temp_c,
        'critical_threshold': critical_c,
        'status': status
    }

def monitor_thermal_all_gpus():
    """Monitor thermal status for all GPUs."""
    
    amdsmi.amdsmi_init()
    
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        
        for i, device in enumerate(devices):
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            thermal = check_thermal_status(device)
            
            print(f"GPU {i} ({asic_info['market_name']}):")
            print(f"  Temperature: {thermal['temperature']:.1f}°C")
            print(f"  Status: {thermal['status']}")
            
            if thermal['status'] != "OK":
                print(f"  ⚠️  Alert: Temperature is {thermal['status']}")
                
    finally:
        amdsmi.amdsmi_shut_down()
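
monitor_thermal_all_gpus prints a one-shot report. For continuous alerting, a minimal polling loop can reuse check_thermal_status; the five-second interval and print-based alerts are placeholder choices, and a real deployment would likely log or page instead:

import time

def thermal_alert_loop(poll_seconds=5):
    """Poll all GPUs and report any that leave the OK state."""
    amdsmi.amdsmi_init()
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        while True:
            for i, device in enumerate(devices):
                thermal = check_thermal_status(device)
                if thermal['status'] != "OK":
                    print(f"GPU {i}: {thermal['status']} at "
                          f"{thermal['temperature']:.1f}°C")
            time.sleep(poll_seconds)
    except KeyboardInterrupt:
        pass
    finally:
        amdsmi.amdsmi_shut_down()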

Power Consumption Tracking

Track and analyze GPU power consumption:

import amdsmi
import time
from collections import deque

class PowerTracker:
    """Track GPU power consumption over time."""
    
    def __init__(self, device, window_size=60):
        self.device = device
        # Rolling window of the most recent power samples, in watts.
        self.power_history = deque(maxlen=window_size)
    
    def sample(self):
        """Take a power sample."""
        power = amdsmi.amdsmi_get_power_info(self.device)
        power_w = power['current_socket_power'] / 1000000.0
        self.power_history.append(power_w)
        return power_w
    
    def get_stats(self):
        """Get power statistics."""
        if not self.power_history:
            return None
        
        return {
            'current': self.power_history[-1],
            'average': sum(self.power_history) / len(self.power_history),
            'min': min(self.power_history),
            'max': max(self.power_history),
            'samples': len(self.power_history)
        }
    
    def estimate_energy(self, duration_seconds):
        """Estimate energy consumption in kWh."""
        if not self.power_history:
            return None
        
        avg_power_w = sum(self.power_history) / len(self.power_history)
        energy_wh = (avg_power_w * duration_seconds) / 3600
        return energy_wh / 1000  # Convert to kWh

# Usage
amdsmi.amdsmi_init()

try:
    devices = amdsmi.amdsmi_get_processor_handles()
    if devices:
        tracker = PowerTracker(devices[0])
        
        # Sample for 60 seconds
        for _ in range(60):
            tracker.sample()
            time.sleep(1)
        
        stats = tracker.get_stats()
        print(f"Power Statistics:")
        print(f"  Current: {stats['current']:.2f}W")
        print(f"  Average: {stats['average']:.2f}W")
        print(f"  Min: {stats['min']:.2f}W")
        print(f"  Max: {stats['max']:.2f}W")
        print(f"  Estimated Energy (1 min): {tracker.estimate_energy(60):.4f} kWh")
        
finally:
    amdsmi.amdsmi_shut_down()
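
To analyze consumption offline, the same tracker can feed a log file. A short sketch that appends timestamped samples to a CSV (the file name and one-second interval are arbitrary choices, and amdsmi_init() is assumed to have been called already):

import csv
import time

def log_power_to_csv(device, path="power_log.csv", samples=60, interval=1.0):
    """Append timestamped power samples for one device to a CSV file."""
    tracker = PowerTracker(device, window_size=samples)
    with open(path, "a", newline="") as f:
        writer = csv.writer(f)
        for _ in range(samples):
            power_w = tracker.sample()
            writer.writerow([time.time(), f"{power_w:.2f}"])
            time.sleep(interval)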

Process Monitoring

Monitor which processes are using GPU resources:

import amdsmi

def monitor_gpu_processes():
    """Monitor processes using GPU resources."""
    
    amdsmi.amdsmi_init()
    
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        
        for i, device in enumerate(devices):
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            print(f"\nGPU {i}: {asic_info['market_name']}")
            print("-" * 60)
            
            # Get process list
            processes = amdsmi.amdsmi_get_gpu_process_list(device)
            
            if not processes:
                print("  No processes using GPU")
                continue
            
            print(f"  Processes: {len(processes)}")
            print(f"  {'PID':<10} {'Memory (MB)':<15} {'Type':<10}")
            print("-" * 60)
            
            total_memory = 0
            for proc in processes:
                pid = proc.get('pid', 'N/A')
                mem_mb = proc.get('mem_usage', 0) / (1024**2)
                proc_type = proc.get('type', 'N/A')
                total_memory += mem_mb
                
                print(f"  {pid:<10} {mem_mb:<15.2f} {proc_type:<10}")
            
            print(f"\n  Total Memory Used: {total_memory:.2f} MB")
            
    finally:
        amdsmi.amdsmi_shut_down()
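
The process list above reports PIDs; to show process names alongside them, one option is to resolve PIDs through psutil. This assumes psutil is installed and that the process still exists when the lookup runs:

import psutil

def pid_to_name(pid):
    """Best-effort resolution of a PID to a process name."""
    try:
        return psutil.Process(pid).name()
    except (psutil.Error, TypeError, ValueError):
        return "unknown"

if __name__ == "__main__":
    monitor_gpu_processes()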

Device Information Report

Collect ASIC, driver, VBIOS, memory, and identifier information into a per-device report:

import amdsmi

def generate_device_report():
    """Generate comprehensive device information report."""
    
    amdsmi.amdsmi_init()
    
    try:
        devices = amdsmi.amdsmi_get_processor_handles()
        
        for i, device in enumerate(devices):
            print(f"\n{'='*80}")
            print(f"Device {i} Report")
            print(f"{'='*80}\n")
            
            # ASIC Information
            asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
            print("ASIC Information:")
            print(f"  Market Name: {asic_info.get('market_name', 'N/A')}")
            print(f"  Vendor: {asic_info.get('vendor_name', 'N/A')}")
            print(f"  Device ID: {asic_info.get('device_id', 'N/A')}")
            
            # Driver Information
            try:
                driver_info = amdsmi.amdsmi_get_gpu_driver_info(device)
                print("\nDriver Information:")
                for key, value in driver_info.items():
                    print(f"  {key}: {value}")
            except amdsmi.AmdSmiException:
                print("\nDriver Information: Not available")
            
            # VBIOS Information
            try:
                vbios_info = amdsmi.amdsmi_get_gpu_vbios_info(device)
                print("\nVBIOS Information:")
                for key, value in vbios_info.items():
                    print(f"  {key}: {value}")
            except amdsmi.AmdSmiException:
                print("\nVBIOS Information: Not available")
            
            # Memory Information
            vram_info = amdsmi.amdsmi_get_gpu_vram_info(device)
            print(f"\nMemory Information:")
            print(f"  VRAM Type: {vram_info.get('vram_type', 'N/A')}")
            print(f"  VRAM Vendor: {vram_info.get('vram_vendor', 'N/A')}")
            print(f"  Bit Width: {vram_info.get('bit_width', 'N/A')} bits")
            
            # UUID and BDF
            try:
                uuid = amdsmi.amdsmi_get_gpu_device_uuid(device)
                bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
                print("\nDevice Identifiers:")
                print(f"  UUID: {uuid}")
                print(f"  BDF: {bdf}")
            except amdsmi.AmdSmiException:
                print("\nDevice Identifiers: Not available")
            
    finally:
        amdsmi.amdsmi_shut_down()
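
As with the other examples, run the report from a simple entry point:

if __name__ == "__main__":
    generate_device_report()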