Process monitoring and tracking functions for AMD GPUs, enabling discovery and analysis of GPU compute workloads. These functions provide visibility into active processes using GPU resources, including memory consumption, compute engine usage, and device allocation across single or multiple GPUs.
Get detailed information about all processes currently using GPU resources.
def amdsmi_get_gpu_process_list(processor_handle: processor_handle) -> List[Dict[str, Any]]:
"""
Get list of processes currently using the specified GPU.
Returns detailed information about all processes that are actively using the GPU,
including process identification, memory usage, engine utilization, and compute unit
occupancy. This function provides a comprehensive snapshot of GPU resource allocation
across all running processes on a single device.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- List[dict]: List of dictionaries, one per process, each containing:
- name (str): Process name or "N/A" if unavailable
- pid (int): Process ID
- mem (int): Memory usage in bytes
- engine_usage (dict):
- gfx (int): Graphics engine usage percentage
- enc (int): Encoder engine usage percentage
- memory_usage (dict):
- gtt_mem (int): GTT (Graphics Translation Table) memory in bytes
- cpu_mem (int): CPU visible memory in bytes
- vram_mem (int): VRAM memory in bytes
- cu_occupancy (int): Compute unit occupancy percentage (0-100) or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Note:
- Maximum number of processes tracked: 1024 (MAX_NUM_PROCESSES)
- Empty process names are returned as "N/A"
- CU occupancy may be "N/A" if not supported by the hardware
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
devices = amdsmi.amdsmi_get_processor_handles()
for device in devices:
processes = amdsmi.amdsmi_get_gpu_process_list(device)
print(f"Found {len(processes)} processes on device")
for proc in processes:
print(f" PID {proc['pid']}: {proc['name']}")
print(f" VRAM: {proc['memory_usage']['vram_mem'] / (1024**2):.1f} MB")
print(f" GFX Usage: {proc['engine_usage']['gfx']}%")
print(f" CU Occupancy: {proc['cu_occupancy']}%")
amdsmi.amdsmi_shut_down()
```
"""Query compute processes across all GPUs in the system.
Query compute processes across all GPUs in the system.
def amdsmi_get_gpu_compute_process_info() -> List[Dict[str, int]]:
"""
Get compute process information for all processes across all GPUs.
Returns a system-wide view of all compute processes using any GPU in the system.
This function provides a consolidated snapshot of GPU compute activity without
requiring individual device handles.
Parameters:
- None
Returns:
- List[dict]: List of dictionaries containing information for each compute process:
- process_id (int): Process ID
- vram_usage (int): VRAM usage in bytes
- sdma_usage (int): SDMA (System DMA) engine usage
- cu_occupancy (int): Compute unit occupancy
Raises:
- AmdSmiLibraryException: On query failure
Note:
- This function operates at the system level and does not require a processor handle
- First call determines the number of processes, second call retrieves the data
- Useful for system-wide monitoring dashboards and resource managers
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
# Get all compute processes across all GPUs
processes = amdsmi.amdsmi_get_gpu_compute_process_info()
print(f"Total compute processes: {len(processes)}")
for proc in processes:
vram_mb = proc['vram_usage'] / (1024**2)
print(f"PID {proc['process_id']}: {vram_mb:.1f} MB VRAM")
print(f" SDMA Usage: {proc['sdma_usage']}")
print(f" CU Occupancy: {proc['cu_occupancy']}")
amdsmi.amdsmi_shut_down()
```
"""Get information about a specific process by its PID.
Get information about a specific process by its PID.
def amdsmi_get_gpu_compute_process_info_by_pid(pid: int) -> Dict[str, int]:
"""
Get compute process information for a specific process ID.
Retrieves GPU usage details for a single process identified by its PID. This function
is useful for monitoring specific applications or tracking resource consumption of
known processes.
Parameters:
- pid (int): Process ID to query
Returns:
- dict: Dictionary containing:
- process_id (int): Process ID (same as input)
- vram_usage (int): VRAM usage in bytes
- sdma_usage (int): SDMA (System DMA) engine usage
- cu_occupancy (int): Compute unit occupancy
Raises:
- AmdSmiParameterException: If pid is not an integer
- AmdSmiLibraryException: On query failure (including if PID not found)
Note:
- Process must be actively using GPU compute resources
- Will fail if the specified PID is not using any GPU
Example:
```python
import amdsmi
import os
amdsmi.amdsmi_init()
# Get info for current process
current_pid = os.getpid()
try:
proc_info = amdsmi.amdsmi_get_gpu_compute_process_info_by_pid(current_pid)
print(f"Current process GPU usage:")
print(f" VRAM: {proc_info['vram_usage'] / (1024**2):.1f} MB")
print(f" SDMA: {proc_info['sdma_usage']}")
print(f" CU Occupancy: {proc_info['cu_occupancy']}")
except amdsmi.AmdSmiLibraryException:
print("Current process is not using GPU")
amdsmi.amdsmi_shut_down()
```
"""def amdsmi_get_gpu_compute_process_gpus(pid: int) -> List[int]:
"""
Get list of GPU device indices being used by a specific process.
Returns the indices of all GPU devices that the specified process is actively using.
This function is essential for understanding multi-GPU workload distribution and
process-to-device mapping.
Parameters:
- pid (int): Process ID to query
Returns:
- List[int]: List of GPU device indices (0-based) that the process is using
Raises:
- AmdSmiParameterException: If pid is not an integer
- AmdSmiLibraryException: On query failure
Note:
- Device indices correspond to the order returned by amdsmi_get_processor_handles()
- Empty list indicates process is not using any GPU
- First call determines device count, second call retrieves device indices
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
# Check which GPUs a specific process is using
pid_to_check = 12345
try:
gpu_indices = amdsmi.amdsmi_get_gpu_compute_process_gpus(pid_to_check)
if gpu_indices:
print(f"Process {pid_to_check} is using {len(gpu_indices)} GPU(s):")
for idx in gpu_indices:
print(f" - GPU {idx}")
else:
print(f"Process {pid_to_check} is not using any GPUs")
except amdsmi.AmdSmiLibraryException as e:
print(f"Error querying process: {e}")
amdsmi.amdsmi_shut_down()
```
"""Monitor all processes on all GPUs:
Monitor all processes on all GPUs:

```python
import amdsmi
amdsmi.amdsmi_init()
# Get all GPU devices
devices = amdsmi.amdsmi_get_processor_handles()
print(f"Monitoring {len(devices)} GPU device(s)\n")
for idx, device in enumerate(devices):
bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
print(f"GPU {idx} ({bdf}):")
processes = amdsmi.amdsmi_get_gpu_process_list(device)
if not processes:
print(" No active processes\n")
continue
print(f" {len(processes)} active process(es):")
for proc in processes:
vram_mb = proc['memory_usage']['vram_mem'] / (1024**2)
print(f" PID {proc['pid']}: {proc['name']}")
print(f" VRAM: {vram_mb:.1f} MB")
print(f" GFX Engine: {proc['engine_usage']['gfx']}%")
print(f" CU Occupancy: {proc['cu_occupancy']}%")
print()
amdsmi.amdsmi_shut_down()
```

Track specific process resource usage over time:

```python
import amdsmi
import time
def monitor_process(pid, duration=60, interval=5):
"""Monitor a specific process for a duration with periodic sampling."""
amdsmi.amdsmi_init()
print(f"Monitoring process {pid} for {duration} seconds...")
print(f"Sample interval: {interval} seconds\n")
start_time = time.time()
samples = []
while time.time() - start_time < duration:
try:
# Get process info
proc_info = amdsmi.amdsmi_get_gpu_compute_process_info_by_pid(pid)
# Get which GPUs the process is using
gpu_indices = amdsmi.amdsmi_get_gpu_compute_process_gpus(pid)
sample = {
'timestamp': time.time() - start_time,
'vram_mb': proc_info['vram_usage'] / (1024**2),
'sdma_usage': proc_info['sdma_usage'],
'cu_occupancy': proc_info['cu_occupancy'],
'gpu_count': len(gpu_indices),
'gpu_indices': gpu_indices
}
samples.append(sample)
print(f"[{sample['timestamp']:.1f}s] VRAM: {sample['vram_mb']:.1f} MB, "
f"CU: {sample['cu_occupancy']}%, GPUs: {sample['gpu_count']}")
except amdsmi.AmdSmiLibraryException:
print(f"[{time.time() - start_time:.1f}s] Process not found or not using GPU")
time.sleep(interval)
amdsmi.amdsmi_shut_down()
# Calculate statistics
if samples:
avg_vram = sum(s['vram_mb'] for s in samples) / len(samples)
max_vram = max(s['vram_mb'] for s in samples)
print(f"\nStatistics:")
print(f" Average VRAM: {avg_vram:.1f} MB")
print(f" Peak VRAM: {max_vram:.1f} MB")
print(f" Samples collected: {len(samples)}")
return samples
# Example usage
if __name__ == "__main__":
target_pid = 12345
    monitor_process(target_pid, duration=30, interval=2)
```

Analyze process distribution across multiple GPUs:

```python
import amdsmi
def analyze_multi_gpu_usage():
"""Analyze how processes are distributed across multiple GPUs."""
amdsmi.amdsmi_init()
devices = amdsmi.amdsmi_get_processor_handles()
if len(devices) < 2:
print("This system has less than 2 GPUs")
amdsmi.amdsmi_shut_down()
return
print(f"Analyzing process distribution across {len(devices)} GPUs\n")
# Collect all processes from all GPUs
gpu_processes = {}
all_pids = set()
for idx, device in enumerate(devices):
bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
processes = amdsmi.amdsmi_get_gpu_process_list(device)
gpu_processes[idx] = {
'bdf': bdf,
'processes': processes
}
all_pids.update(p['pid'] for p in processes)
print(f"Total unique processes: {len(all_pids)}\n")
# Analyze each process
multi_gpu_procs = []
single_gpu_procs = []
for pid in all_pids:
gpu_indices = amdsmi.amdsmi_get_gpu_compute_process_gpus(pid)
# Find process name from any GPU
proc_name = "Unknown"
for gpu_data in gpu_processes.values():
for proc in gpu_data['processes']:
if proc['pid'] == pid:
proc_name = proc['name']
break
if len(gpu_indices) > 1:
multi_gpu_procs.append((pid, proc_name, gpu_indices))
else:
single_gpu_procs.append((pid, proc_name, gpu_indices))
# Report findings
print(f"Multi-GPU Processes: {len(multi_gpu_procs)}")
for pid, name, indices in multi_gpu_procs:
print(f" PID {pid} ({name}): Using GPUs {indices}")
print(f"\nSingle-GPU Processes: {len(single_gpu_procs)}")
for pid, name, indices in single_gpu_procs:
print(f" PID {pid} ({name}): Using GPU {indices[0] if indices else 'N/A'}")
# Show per-GPU load
print(f"\nPer-GPU Process Count:")
for idx, data in gpu_processes.items():
print(f" GPU {idx} ({data['bdf']}): {len(data['processes'])} process(es)")
amdsmi.amdsmi_shut_down()
# Run analysis
analyze_multi_gpu_usage()
```

Find processes consuming the most GPU memory:

```python
import amdsmi
def get_memory_leaderboard(top_n=10):
"""Display top N processes by GPU memory usage."""
amdsmi.amdsmi_init()
# Get all compute processes
processes = amdsmi.amdsmi_get_gpu_compute_process_info()
if not processes:
print("No GPU compute processes found")
amdsmi.amdsmi_shut_down()
return
# Sort by VRAM usage (descending)
sorted_procs = sorted(processes,
key=lambda p: p['vram_usage'],
reverse=True)
# Display top N
print(f"Top {min(top_n, len(sorted_procs))} GPU Memory Consumers:\n")
print(f"{'Rank':<6} {'PID':<10} {'VRAM (MB)':<12} {'CU Occ %':<10} {'SDMA':<8}")
print("-" * 56)
total_vram = 0
for rank, proc in enumerate(sorted_procs[:top_n], 1):
vram_mb = proc['vram_usage'] / (1024**2)
total_vram += vram_mb
# Get GPU indices for this process
gpu_indices = amdsmi.amdsmi_get_gpu_compute_process_gpus(proc['process_id'])
gpu_str = f"GPU {','.join(map(str, gpu_indices))}" if gpu_indices else "N/A"
print(f"{rank:<6} {proc['process_id']:<10} {vram_mb:<12.1f} "
f"{proc['cu_occupancy']:<10} {proc['sdma_usage']:<8} [{gpu_str}]")
print(f"\nTotal VRAM by top {min(top_n, len(sorted_procs))}: {total_vram:.1f} MB")
print(f"Total processes tracked: {len(processes)}")
amdsmi.amdsmi_shut_down()
# Show top 10 memory consumers
get_memory_leaderboard(top_n=10)
```

Create a continuously updating process monitor:

```python
import amdsmi
import time
import os
def clear_screen():
"""Clear terminal screen."""
os.system('clear' if os.name == 'posix' else 'cls')
def monitor_dashboard(refresh_interval=2):
"""Real-time dashboard showing GPU process activity."""
amdsmi.amdsmi_init()
try:
while True:
clear_screen()
print("=" * 80)
print("GPU PROCESS MONITOR DASHBOARD".center(80))
print("=" * 80)
print(f"Refresh: {refresh_interval}s | Press Ctrl+C to exit\n")
devices = amdsmi.amdsmi_get_processor_handles()
for idx, device in enumerate(devices):
try:
bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
processes = amdsmi.amdsmi_get_gpu_process_list(device)
print(f"\nGPU {idx} ({bdf})")
print("-" * 80)
if not processes:
print(" No active processes")
continue
print(f"{'PID':<8} {'Name':<20} {'VRAM (MB)':<12} "
f"{'GFX %':<8} {'ENC %':<8} {'CU %':<8}")
print("-" * 80)
for proc in processes:
vram_mb = proc['memory_usage']['vram_mem'] / (1024**2)
name = proc['name'][:18] # Truncate long names
print(f"{proc['pid']:<8} {name:<20} {vram_mb:<12.1f} "
f"{proc['engine_usage']['gfx']:<8} "
f"{proc['engine_usage']['enc']:<8} "
f"{proc['cu_occupancy']:<8}")
# Summary statistics
total_vram = sum(p['memory_usage']['vram_mem']
for p in processes) / (1024**2)
avg_cu = sum(p['cu_occupancy'] if isinstance(p['cu_occupancy'], int)
else 0 for p in processes) / len(processes)
print("-" * 80)
print(f"Processes: {len(processes)} | "
f"Total VRAM: {total_vram:.1f} MB | "
f"Avg CU: {avg_cu:.1f}%")
except amdsmi.AmdSmiLibraryException as e:
print(f" Error querying GPU {idx}: {e}")
time.sleep(refresh_interval)
except KeyboardInterrupt:
print("\n\nMonitoring stopped by user")
finally:
amdsmi.amdsmi_shut_down()
# Run dashboard
monitor_dashboard(refresh_interval=2)
```

Monitor for processes exceeding resource thresholds:

```python
import amdsmi
def check_process_thresholds(vram_threshold_gb=8.0, cu_threshold=80):
"""Alert on processes exceeding resource thresholds."""
amdsmi.amdsmi_init()
devices = amdsmi.amdsmi_get_processor_handles()
alerts = []
for idx, device in enumerate(devices):
processes = amdsmi.amdsmi_get_gpu_process_list(device)
for proc in processes:
vram_gb = proc['memory_usage']['vram_mem'] / (1024**3)
cu_occ = proc['cu_occupancy']
if vram_gb > vram_threshold_gb:
alerts.append({
'gpu': idx,
'pid': proc['pid'],
'name': proc['name'],
'type': 'HIGH_VRAM',
'value': vram_gb,
'threshold': vram_threshold_gb
})
if isinstance(cu_occ, int) and cu_occ > cu_threshold:
alerts.append({
'gpu': idx,
'pid': proc['pid'],
'name': proc['name'],
'type': 'HIGH_CU',
'value': cu_occ,
'threshold': cu_threshold
})
amdsmi.amdsmi_shut_down()
if alerts:
print(f"ALERT: {len(alerts)} threshold violation(s) detected!")
for alert in alerts:
print(f" GPU {alert['gpu']}: PID {alert['pid']} ({alert['name']})")
print(f" {alert['type']}: {alert['value']:.1f} "
f"(threshold: {alert['threshold']})")
else:
print("All processes within normal thresholds")
return alerts
# Check thresholds
check_process_thresholds(vram_threshold_gb=4.0, cu_threshold=75)
```

Combine process monitoring with GPU health checks:

```python
import amdsmi
def comprehensive_health_check():
"""Comprehensive system health check including process monitoring."""
amdsmi.amdsmi_init()
devices = amdsmi.amdsmi_get_processor_handles()
print("SYSTEM HEALTH CHECK")
print("=" * 80)
for idx, device in enumerate(devices):
bdf = amdsmi.amdsmi_get_gpu_device_bdf(device)
print(f"\nGPU {idx} ({bdf})")
print("-" * 80)
# Get GPU info
asic_info = amdsmi.amdsmi_get_gpu_asic_info(device)
print(f"Device: {asic_info['market_name']}")
# Get temperature
try:
temp = amdsmi.amdsmi_get_temp_metric(
device,
amdsmi.AmdSmiTemperatureType.EDGE,
amdsmi.AmdSmiTemperatureMetric.CURRENT
)
print(f"Temperature: {temp/1000:.1f}°C")
        except amdsmi.AmdSmiLibraryException:
print("Temperature: N/A")
# Get VRAM usage
try:
vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
vram_pct = (vram['vram_used'] / vram['vram_total']) * 100
vram_used_gb = vram['vram_used'] / (1024**3)
vram_total_gb = vram['vram_total'] / (1024**3)
print(f"VRAM: {vram_used_gb:.1f}/{vram_total_gb:.1f} GB ({vram_pct:.1f}%)")
        except amdsmi.AmdSmiLibraryException:
print("VRAM: N/A")
# Get activity
try:
activity = amdsmi.amdsmi_get_gpu_activity(device)
print(f"GFX Activity: {activity['gfx_activity']}%")
        except amdsmi.AmdSmiLibraryException:
print("Activity: N/A")
# Get processes
processes = amdsmi.amdsmi_get_gpu_process_list(device)
print(f"Active Processes: {len(processes)}")
if processes:
print("\nTop 5 processes by VRAM:")
sorted_procs = sorted(processes,
key=lambda p: p['memory_usage']['vram_mem'],
reverse=True)[:5]
for proc in sorted_procs:
vram_mb = proc['memory_usage']['vram_mem'] / (1024**2)
print(f" PID {proc['pid']} ({proc['name']}): "
f"{vram_mb:.1f} MB, CU: {proc['cu_occupancy']}%")
print()
amdsmi.amdsmi_shut_down()
# Run health check
comprehensive_health_check()
```

amdsmi_get_gpu_process_list tracks up to 1024 processes (MAX_NUM_PROCESSES).
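
A practical consequence of that cap: if a device reports exactly 1024 entries, the list may have been truncated. A small sketch of a guard for monitoring code; the literal 1024 mirrors MAX_NUM_PROCESSES rather than assuming the constant is exported by the Python module:

```python
import amdsmi

MAX_NUM_PROCESSES = 1024  # documented cap, written as a literal here

amdsmi.amdsmi_init()
try:
    for idx, device in enumerate(amdsmi.amdsmi_get_processor_handles()):
        processes = amdsmi.amdsmi_get_gpu_process_list(device)
        if len(processes) >= MAX_NUM_PROCESSES:
            print(f"GPU {idx}: process list reached the {MAX_NUM_PROCESSES}-entry cap; "
                  "results may be truncated")
        else:
            print(f"GPU {idx}: {len(processes)} process(es)")
finally:
    amdsmi.amdsmi_shut_down()
```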