Real-time monitoring functions for AMD GPU hardware metrics including utilization, power consumption, clock frequencies, memory usage, thermal status, and violation tracking. These functions provide comprehensive visibility into GPU performance and health status.
Monitor GPU engine utilization across different subsystems.
def amdsmi_get_gpu_activity(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get GPU engine activity/utilization percentages.
Returns the current activity level for the GFX (graphics), UMC (memory controller),
and MM (multimedia) engines on the GPU.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing activity metrics:
- gfx_activity (int): Graphics engine utilization percentage (0-100) or "N/A"
- umc_activity (int): Memory controller utilization percentage (0-100) or "N/A"
- mm_activity (int): Multimedia engine utilization percentage (0-100) or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
devices = amdsmi.amdsmi_get_processor_handles()
for device in devices:
activity = amdsmi.amdsmi_get_gpu_activity(device)
print(f"GFX Activity: {activity['gfx_activity']}%")
print(f"UMC Activity: {activity['umc_activity']}%")
print(f"MM Activity: {activity['mm_activity']}%")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_utilization_count(
processor_handle: processor_handle,
counter_types: List[AmdSmiUtilizationCounterType]
) -> List[Dict[str, Any]]:
"""
Get fine-grained utilization counters for specific GPU activities.
Provides detailed utilization metrics for various GPU engines and activities with
timestamp information for tracking changes over time.
Parameters:
- processor_handle: Handle for the target GPU device
- counter_types (List[AmdSmiUtilizationCounterType]): List of counter types to query.
Can include:
- COARSE_GRAIN_GFX_ACTIVITY
- COARSE_GRAIN_MEM_ACTIVITY
- COARSE_DECODER_ACTIVITY
- FINE_GRAIN_GFX_ACTIVITY
- FINE_GRAIN_MEM_ACTIVITY
- FINE_DECODER_ACTIVITY
Returns:
- List[dict]: List of dictionaries containing:
- First element: {"timestamp": int} - System timestamp
- Subsequent elements: {"type": str, "value": int} - Counter type and value
Raises:
- AmdSmiParameterException: If processor_handle or counter_types are invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
from amdsmi import AmdSmiUtilizationCounterType
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
counters = [
AmdSmiUtilizationCounterType.COARSE_GRAIN_GFX_ACTIVITY,
AmdSmiUtilizationCounterType.FINE_DECODER_ACTIVITY
]
results = amdsmi.amdsmi_get_utilization_count(device, counters)
print(f"Timestamp: {results[0]['timestamp']}")
for counter in results[1:]:
print(f"{counter['type']}: {counter['value']}")
amdsmi.amdsmi_shut_down()
```
"""

Track GPU memory usage and health.
def amdsmi_get_gpu_vram_usage(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get VRAM usage statistics.
Returns the total and currently used VRAM on the GPU in bytes.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing:
- vram_total (int): Total VRAM capacity in bytes
- vram_used (int): Currently used VRAM in bytes
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
total_gb = vram['vram_total'] / (1024**3)
used_gb = vram['vram_used'] / (1024**3)
usage_percent = (vram['vram_used'] / vram['vram_total']) * 100
print(f"VRAM: {used_gb:.2f} GB / {total_gb:.2f} GB ({usage_percent:.1f}%)")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_bad_page_info(processor_handle: processor_handle) -> List[Dict[str, Any]]:
"""
Get information about bad/retired memory pages.
Returns a list of memory pages that have been retired due to errors.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- List[dict]: List of bad page records, each containing:
- page_address (int): Physical address of the bad page
- page_size (int): Size of the page in bytes
- status (AmdSmiMemoryPageStatus): Page status (RESERVED, PENDING, UNRESERVABLE)
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
bad_pages = amdsmi.amdsmi_get_gpu_bad_page_info(device)
if bad_pages:
print(f"Found {len(bad_pages)} bad pages:")
for page in bad_pages:
print(f" Address: 0x{page['page_address']:x}, Status: {page['status']}")
else:
print("No bad pages detected")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_bad_page_threshold(processor_handle: processor_handle) -> int:
"""
Get the bad page threshold count.
Returns the number of bad pages that triggers a critical threshold event.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- int: Bad page threshold count
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
"""

Monitor power consumption and management status.
def amdsmi_get_power_info(processor_handle: processor_handle) -> Dict[str, int]:
"""
Get comprehensive power consumption metrics.
Returns power and voltage measurements for the GPU including socket power,
current power, average power, and voltage rails.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing power metrics (all in milliwatts or millivolts):
- socket_power (int): Socket power limit in mW or "N/A"
- current_socket_power (int): Current socket power consumption in mW or "N/A"
- average_socket_power (int): Average socket power consumption in mW or "N/A"
- gfx_voltage (int): Graphics voltage in mV or "N/A"
- soc_voltage (int): SoC voltage in mV or "N/A"
- mem_voltage (int): Memory voltage in mV or "N/A"
- power_limit (int): Power limit in mW or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
power = amdsmi.amdsmi_get_power_info(device)
if power['current_socket_power'] != "N/A":
watts = power['current_socket_power'] / 1000
print(f"Current Power: {watts:.2f} W")
if power['average_socket_power'] != "N/A":
avg_watts = power['average_socket_power'] / 1000
print(f"Average Power: {avg_watts:.2f} W")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_energy_count(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get accumulated energy consumption.
Returns the energy accumulator counter value with resolution and timestamp.
Useful for calculating energy consumption over time intervals.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing:
- energy_accumulator (int): Accumulated energy counter value
- counter_resolution (float): Energy counter resolution (microjoules per increment)
- timestamp (int): Timestamp of the reading
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
import time
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
# Take initial reading
start = amdsmi.amdsmi_get_energy_count(device)
start_energy = start['energy_accumulator'] * start['counter_resolution']
# Wait and take another reading
time.sleep(1)
end = amdsmi.amdsmi_get_energy_count(device)
end_energy = end['energy_accumulator'] * end['counter_resolution']
# Calculate energy consumed (in joules)
energy_consumed = (end_energy - start_energy) / 1_000_000
print(f"Energy consumed: {energy_consumed:.4f} J")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_is_gpu_power_management_enabled(processor_handle: processor_handle) -> bool:
"""
Check if GPU power management is enabled.
Returns whether dynamic power management features are active on the GPU.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- bool: True if power management is enabled, False otherwise
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
if amdsmi.amdsmi_is_gpu_power_management_enabled(device):
print("Power management is active")
else:
print("Power management is disabled")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_power_profile_presets(
processor_handle: processor_handle,
sensor_ind: int
) -> Dict[str, Any]:
"""
Get available power profile presets.
Returns information about available power profiles and which are currently active.
Parameters:
- processor_handle: Handle for the target GPU device
- sensor_ind (int): Sensor index (typically 0)
Returns:
- dict: Dictionary containing power profile information including available
profiles and currently active profile
Raises:
- AmdSmiParameterException: If processor_handle or sensor_ind is invalid
- AmdSmiLibraryException: On query failure
"""

Monitor GPU clock frequencies across different domains.
def amdsmi_get_clock_info(
processor_handle: processor_handle,
clock_type: AmdSmiClkType
) -> Dict[str, Any]:
"""
Get clock frequency information for a specific clock domain.
Returns current, minimum, and maximum frequencies along with lock status for
the specified clock type.
Parameters:
- processor_handle: Handle for the target GPU device
- clock_type (AmdSmiClkType): Type of clock to query:
- SYS: System clock
- GFX: Graphics clock
- DF: Data Fabric clock
- DCEF: Display Controller Engine clock
- SOC: SoC clock
- MEM: Memory clock
- PCIE: PCIe clock
- VCLK0, VCLK1: Video clocks
- DCLK0, DCLK1: Video decode clocks
- FCLK: Fabric clock
- LCLK: Link clock
Returns:
- dict: Dictionary containing:
- clk (int): Current clock frequency in MHz or "N/A"
- min_clk (int): Minimum clock frequency in MHz or "N/A"
- max_clk (int): Maximum clock frequency in MHz or "N/A"
- clk_locked (bool): Whether clock is locked or "N/A"
- clk_deep_sleep (int): Deep sleep frequency in MHz or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle or clock_type is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
from amdsmi import AmdSmiClkType
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
# Query graphics clock
gfx_clk = amdsmi.amdsmi_get_clock_info(device, AmdSmiClkType.GFX)
print(f"GFX Clock: {gfx_clk['clk']} MHz")
print(f" Range: {gfx_clk['min_clk']} - {gfx_clk['max_clk']} MHz")
# Query memory clock
mem_clk = amdsmi.amdsmi_get_clock_info(device, AmdSmiClkType.MEM)
print(f"Memory Clock: {mem_clk['clk']} MHz")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_clk_freq(
processor_handle: processor_handle,
clk_type: AmdSmiClkType
) -> Dict[str, Any]:
"""
Get detailed clock frequency levels.
Returns all supported frequency levels for the specified clock domain along
with the currently active level.
Parameters:
- processor_handle: Handle for the target GPU device
- clk_type (AmdSmiClkType): Type of clock to query (see AmdSmiClkType enum)
Returns:
- dict: Dictionary containing:
- num_supported (int): Number of supported frequency levels
- current (int): Index of currently active frequency level
- frequency (List[int]): List of supported frequencies in MHz
Raises:
- AmdSmiParameterException: If processor_handle or clk_type is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
from amdsmi import AmdSmiClkType
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
freq_info = amdsmi.amdsmi_get_clk_freq(device, AmdSmiClkType.GFX)
print(f"Supported GFX frequencies ({freq_info['num_supported']} levels):")
for i, freq in enumerate(freq_info['frequency']):
marker = " <-- Current" if i == freq_info['current'] else ""
print(f" Level {i}: {freq} MHz{marker}")
amdsmi.amdsmi_shut_down()
```
"""

Monitor performance states and overdrive settings.
def amdsmi_get_gpu_perf_level(processor_handle: processor_handle) -> str:
"""
Get current GPU performance level.
Returns the current performance level setting which controls how aggressively
the GPU manages power and clocks.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- str: Performance level name:
- "AMDSMI_DEV_PERF_LEVEL_AUTO": Automatic performance management
- "AMDSMI_DEV_PERF_LEVEL_LOW": Low performance/power mode
- "AMDSMI_DEV_PERF_LEVEL_HIGH": High performance mode
- "AMDSMI_DEV_PERF_LEVEL_MANUAL": Manual performance control
- "AMDSMI_DEV_PERF_LEVEL_STABLE_STD": Stable standard clocks
- "AMDSMI_DEV_PERF_LEVEL_STABLE_PEAK": Stable peak clocks
- "AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_MCLK": Stable minimum memory clock
- "AMDSMI_DEV_PERF_LEVEL_STABLE_MIN_SCLK": Stable minimum system clock
- "AMDSMI_DEV_PERF_LEVEL_DETERMINISM": Deterministic performance mode
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
perf_level = amdsmi.amdsmi_get_gpu_perf_level(device)
print(f"Performance Level: {perf_level}")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_overdrive_level(processor_handle: processor_handle) -> int:
"""
Get GPU core overdrive level.
Returns the overdrive percentage for GPU core clocks. Overdrive allows running
the GPU beyond its default specifications.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- int: Overdrive level as a percentage (0 = disabled, >0 = overclock percentage)
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
od_level = amdsmi.amdsmi_get_gpu_overdrive_level(device)
if od_level > 0:
print(f"GPU overdrive enabled: {od_level}%")
else:
print("GPU overdrive disabled")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_mem_overdrive_level(processor_handle: processor_handle) -> int:
"""
Get GPU memory overdrive level.
Returns the overdrive percentage for GPU memory clocks.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- int: Memory overdrive level as a percentage (0 = disabled, >0 = overclock percentage)
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
"""

def amdsmi_get_gpu_od_volt_info(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get overdrive voltage/frequency curve information.
Returns detailed information about voltage and frequency ranges for overdrive
control, including curve points and limits.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing:
- curr_sclk_range (dict): Current system clock range
- lower_bound (int): Lower bound in MHz or "N/A"
- upper_bound (int): Upper bound in MHz or "N/A"
- curr_mclk_range (dict): Current memory clock range
- lower_bound (int): Lower bound in MHz or "N/A"
- upper_bound (int): Upper bound in MHz or "N/A"
- sclk_freq_limits (dict): System clock frequency limits
- lower_bound (int): Lower limit in MHz
- upper_bound (int): Upper limit in MHz
- mclk_freq_limits (dict): Memory clock frequency limits
- lower_bound (int): Lower limit in MHz
- upper_bound (int): Upper limit in MHz
- curve.vc_points (List): Voltage curve points
- num_regions (int): Number of voltage curve regions
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
od_info = amdsmi.amdsmi_get_gpu_od_volt_info(device)
print("SCLK Range:", od_info['curr_sclk_range'])
print("MCLK Range:", od_info['curr_mclk_range'])
print("Voltage Curve Points:", len(od_info['curve.vc_points']))
amdsmi.amdsmi_shut_down()
```
"""

Monitor PCIe connection status and performance.
def amdsmi_get_pcie_info(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get PCIe interface information and metrics.
Returns comprehensive PCIe link information including static capabilities and
dynamic metrics like current speed, width, bandwidth, and error counters.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary with two sections:
- pcie_static (dict): Static PCIe capabilities
- max_pcie_width (int): Maximum lanes supported or "N/A"
- max_pcie_speed (int): Maximum speed in MT/s or "N/A"
- pcie_interface_version (int): PCIe generation or "N/A"
- slot_type (int): Physical slot type
- pcie_metric (dict): Current PCIe metrics
- pcie_width (int): Current active lanes or "N/A"
- pcie_speed (int): Current speed in MT/s or "N/A"
- pcie_bandwidth (int): Current bandwidth utilization or "N/A"
- pcie_replay_count (int): Replay counter or "N/A"
- pcie_l0_to_recovery_count (int): L0 to recovery transitions or "N/A"
- pcie_replay_roll_over_count (int): Replay rollover count or "N/A"
- pcie_nak_sent_count (int): NAK sent count or "N/A"
- pcie_nak_received_count (int): NAK received count or "N/A"
- pcie_lc_perf_other_end_recovery_count (int): Link recovery count or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
pcie = amdsmi.amdsmi_get_pcie_info(device)
# Static info
static = pcie['pcie_static']
print(f"PCIe Gen{static['pcie_interface_version']}")
print(f"Max: x{static['max_pcie_width']} @ {static['max_pcie_speed']} MT/s")
# Current metrics
metric = pcie['pcie_metric']
print(f"Current: x{metric['pcie_width']} @ {metric['pcie_speed']} MT/s")
print(f"Bandwidth: {metric['pcie_bandwidth']}")
print(f"Replays: {metric['pcie_replay_count']}")
amdsmi.amdsmi_shut_down()
```
"""

Monitor GPU fan speed and status.
def amdsmi_get_gpu_fan_rpms(processor_handle: processor_handle, sensor_idx: int) -> int:
"""
Get GPU fan speed in RPMs.
Returns the current fan rotation speed in revolutions per minute.
Parameters:
- processor_handle: Handle for the target GPU device
- sensor_idx (int): Fan sensor index (typically 0 for single-fan GPUs)
Returns:
- int: Fan speed in RPMs
Raises:
- AmdSmiParameterException: If processor_handle or sensor_idx is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
fan_rpms = amdsmi.amdsmi_get_gpu_fan_rpms(device, 0)
print(f"Fan Speed: {fan_rpms} RPM")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_fan_speed(processor_handle: processor_handle, sensor_idx: int) -> int:
"""
Get GPU fan speed as a percentage.
Returns the current fan speed as a percentage of maximum speed.
Parameters:
- processor_handle: Handle for the target GPU device
- sensor_idx (int): Fan sensor index (typically 0 for single-fan GPUs)
Returns:
- int: Fan speed percentage (0-100)
Raises:
- AmdSmiParameterException: If processor_handle or sensor_idx is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
fan_percent = amdsmi.amdsmi_get_gpu_fan_speed(device, 0)
print(f"Fan Speed: {fan_percent}%")
amdsmi.amdsmi_shut_down()
```
"""

def amdsmi_get_gpu_fan_speed_max(processor_handle: processor_handle, sensor_idx: int) -> int:
"""
Get maximum GPU fan speed.
Returns the maximum achievable fan speed in RPMs.
Parameters:
- processor_handle: Handle for the target GPU device
- sensor_idx (int): Fan sensor index (typically 0 for single-fan GPUs)
Returns:
- int: Maximum fan speed in RPMs
Raises:
- AmdSmiParameterException: If processor_handle or sensor_idx is invalid
- AmdSmiLibraryException: On query failure
"""

Monitor thermal and power violations.
def amdsmi_get_violation_status(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get violation status for thermal and power limits.
Returns comprehensive information about throttling events and violations including
accumulated counters, percentages, and active status for various limit types.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing extensive violation metrics:
- reference_timestamp (int): Reference timestamp or "N/A"
- violation_timestamp (int): Last violation timestamp or "N/A"
- acc_counter (int): Accumulated violation counter or "N/A"
- acc_prochot_thrm (int): Accumulated PROCHOT thermal violations or "N/A"
- acc_ppt_pwr (int): Accumulated power (PVIOL) violations or "N/A"
- acc_socket_thrm (int): Accumulated socket thermal (TVIOL) violations or "N/A"
- acc_vr_thrm (int): Accumulated VR thermal violations or "N/A"
- acc_hbm_thrm (int): Accumulated HBM thermal violations or "N/A"
- acc_gfx_clk_below_host_limit (int): GFX clock below host limit counter or "N/A"
- acc_gfx_clk_below_host_limit_pwr (List[int]): Per-engine power limits
- acc_gfx_clk_below_host_limit_thm (List[int]): Per-engine thermal limits
- acc_gfx_clk_below_host_limit_total (List[int]): Per-engine total limits
- acc_low_utilization (List[int]): Low utilization counters
- per_prochot_thrm (float): PROCHOT thermal percentage or "N/A"
- per_ppt_pwr (float): Power violation percentage or "N/A"
- per_socket_thrm (float): Socket thermal percentage or "N/A"
- per_vr_thrm (float): VR thermal percentage or "N/A"
- per_hbm_thrm (float): HBM thermal percentage or "N/A"
- per_gfx_clk_below_host_limit (float): GFX clock limit percentage or "N/A"
- per_gfx_clk_below_host_limit_pwr (List): Per-engine power percentages
- per_gfx_clk_below_host_limit_thm (List): Per-engine thermal percentages
- per_gfx_clk_below_host_limit_total (List): Per-engine total percentages
- per_low_utilization (List): Low utilization percentages
- active_prochot_thrm (bool): Active PROCHOT thermal throttling or "N/A"
- active_ppt_pwr (bool): Active power throttling or "N/A"
- active_socket_thrm (bool): Active socket thermal throttling or "N/A"
- active_vr_thrm (bool): Active VR thermal throttling or "N/A"
- active_hbm_thrm (bool): Active HBM thermal throttling or "N/A"
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
violations = amdsmi.amdsmi_get_violation_status(device)
# Check for active throttling
if violations['active_ppt_pwr'] not in (False, "N/A"):
print("WARNING: GPU is power throttling!")
print(f" Power violation %: {violations['per_ppt_pwr']}")
if violations['active_socket_thrm'] not in (False, "N/A"):
print("WARNING: GPU is thermally throttling!")
print(f" Thermal violation %: {violations['per_socket_thrm']}")
# Show accumulated violations
if violations['acc_ppt_pwr'] != "N/A":
print(f"Total power violations: {violations['acc_ppt_pwr']}")
amdsmi.amdsmi_shut_down()
```
"""

Monitor XGMI (high-speed inter-GPU) link status.
def amdsmi_get_gpu_xgmi_link_status(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get XGMI link status.
Returns the status of XGMI links used for high-speed GPU-to-GPU communication
in multi-GPU systems.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary containing:
- status (List[str]): List of link status strings:
- "U": Link is up
- "D": Link is down
- "X": Link is disabled
- "N/A": Status not available
- total_links (int): Total number of XGMI links
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
xgmi = amdsmi.amdsmi_get_gpu_xgmi_link_status(device)
print(f"XGMI Links ({xgmi['total_links']} total):")
for i, status in enumerate(xgmi['status']):
status_str = {
"U": "UP",
"D": "DOWN",
"X": "DISABLED",
"N/A": "N/A"
}.get(status, status)
print(f" Link {i}: {status_str}")
amdsmi.amdsmi_shut_down()
```
"""

Get comprehensive GPU metrics in a single call.
def amdsmi_get_gpu_metrics_info(processor_handle: processor_handle) -> Dict[str, Any]:
"""
Get comprehensive GPU metrics information.
Returns a large dictionary containing comprehensive GPU metrics including
temperatures, activity, power, clocks, and link information in a single query.
This is more efficient than querying individual metrics separately.
Parameters:
- processor_handle: Handle for the target GPU device
Returns:
- dict: Dictionary with extensive metrics including:
- Header information (common_header.*)
- Temperature metrics (temperature_edge, temperature_hotspot, temperature_mem, etc.)
- Activity metrics (average_gfx_activity, average_umc_activity, average_mm_activity)
- Power metrics (average_socket_power, energy_accumulator, system_clock_counter)
- Clock frequencies (average_gfxclk_frequency, average_socclk_frequency, etc.)
- Current clock values (current_gfxclk, current_socclk, current_uclk, etc.)
- Throttle status (throttle_status)
- Fan speed (current_fan_speed)
- PCIe metrics (pcie_link_width, pcie_link_speed)
- Activity accumulators (gfx_activity_acc, mem_activity_acc)
- Additional voltage, current, throttle, and firmware metrics
All values are returned as integers or "N/A" if not available.
Raises:
- AmdSmiParameterException: If processor_handle is invalid
- AmdSmiLibraryException: On query failure
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
device = amdsmi.amdsmi_get_processor_handles()[0]
metrics = amdsmi.amdsmi_get_gpu_metrics_info(device)
# Temperature info
if metrics['temperature_edge'] != "N/A":
print(f"Edge Temperature: {metrics['temperature_edge']}°C")
if metrics['temperature_hotspot'] != "N/A":
print(f"Hotspot Temperature: {metrics['temperature_hotspot']}°C")
# Activity
if metrics['average_gfx_activity'] != "N/A":
print(f"GFX Activity: {metrics['average_gfx_activity']}%")
# Power
if metrics['average_socket_power'] != "N/A":
print(f"Average Power: {metrics['average_socket_power']} W")
# Clocks
if metrics['current_gfxclk'] != "N/A":
print(f"GFX Clock: {metrics['current_gfxclk']} MHz")
# Throttling
if metrics['throttle_status'] not in (0, "N/A"):
print("WARNING: GPU is throttling!")
amdsmi.amdsmi_shut_down()
```
"""

Get header information for the GPU metrics table structure.
def amdsmi_get_gpu_metrics_header_info(processor_handle: processor_handle) -> Dict[str, int]:
"""
Get GPU metrics header information.
Returns header information about the GPU metrics table structure, including
size and versioning information. This is useful for understanding the format
and compatibility of the metrics data structure.
Parameters:
- processor_handle (processor_handle): Handle for the GPU processor
Returns:
- Dict[str, int]: Dictionary containing:
- "structure_size" (int): Size of the metrics table structure in bytes
- "format_revision" (int): Format revision number
- "content_revision" (int): Content revision number
Raises:
- AmdSmiParameterException: If processor_handle is not valid
- AmdSmiLibraryException: If unable to retrieve metrics header info
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
for device in devices:
header = amdsmi.amdsmi_get_gpu_metrics_header_info(device)
print(f"Metrics table size: {header['structure_size']} bytes")
print(f"Format revision: {header['format_revision']}")
print(f"Content revision: {header['content_revision']}")
finally:
amdsmi.amdsmi_shut_down()
```
"""

Get detailed power management metrics for the GPU.
def amdsmi_get_gpu_pm_metrics_info(processor_handle: processor_handle) -> List[Dict[str, Any]]:
"""
Get GPU power management (PM) metrics information.
Returns a list of power management metrics as name-value pairs. This provides
detailed power management telemetry data for the GPU, useful for advanced
power analysis and optimization.
Parameters:
- processor_handle (processor_handle): Handle for the GPU processor
Returns:
- List[Dict[str, Any]]: List of dictionaries, each containing:
- "name" (str): Metric name
- "value" (int or float): Metric value
Raises:
- AmdSmiParameterException: If processor_handle is not valid
- AmdSmiLibraryException: If unable to retrieve PM metrics
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
for device in devices:
pm_metrics = amdsmi.amdsmi_get_gpu_pm_metrics_info(device)
print("Power Management Metrics:")
for metric in pm_metrics:
print(f" {metric['name']}: {metric['value']}")
finally:
amdsmi.amdsmi_shut_down()
```
"""

Get register table information for a specific register type.
def amdsmi_get_gpu_reg_table_info(
processor_handle: processor_handle,
reg_type: AmdSmiRegType
) -> List[Dict[str, Any]]:
"""
Get GPU register table information.
Returns register values for a specific register type (XGMI, WAFL, PCIE, USR, USR1).
This provides low-level register access for advanced GPU monitoring and debugging.
Parameters:
- processor_handle (processor_handle): Handle for the GPU processor
- reg_type (AmdSmiRegType): Register type to query
- AmdSmiRegType.XGMI (0): XGMI registers
- AmdSmiRegType.WAFL (1): WAFL registers
- AmdSmiRegType.PCIE (2): PCIe registers
- AmdSmiRegType.USR (3): User registers
- AmdSmiRegType.USR1 (4): User registers 1
Returns:
- List[Dict[str, Any]]: List of dictionaries, each containing:
- "name" (str): Register name
- "value" (int): Register value
Raises:
- AmdSmiParameterException: If processor_handle or reg_type is not valid
- AmdSmiLibraryException: If unable to retrieve register table info
Example:
```python
import amdsmi
amdsmi.amdsmi_init()
try:
devices = amdsmi.amdsmi_get_processor_handles()
device = devices[0]
# Get PCIE registers
pcie_regs = amdsmi.amdsmi_get_gpu_reg_table_info(
device,
amdsmi.AmdSmiRegType.PCIE
)
print("PCIe Registers:")
for reg in pcie_regs:
print(f" {reg['name']}: 0x{reg['value']:X}")
finally:
amdsmi.amdsmi_shut_down()
```
"""

Monitor core GPU metrics:
import amdsmi
# Initialize
amdsmi.amdsmi_init()
try:
# Get all GPU devices
devices = amdsmi.amdsmi_get_processor_handles()
# Print a one-shot snapshot of activity, power, VRAM and fan speed for each GPU.
for i, device in enumerate(devices):
    print(f"\n=== GPU {i} ===")

    # Activity
    activity = amdsmi.amdsmi_get_gpu_activity(device)
    print(f"GFX Activity: {activity['gfx_activity']}%")

    # Power: the field is the string "N/A" when the reading is unavailable,
    # so it must be checked before doing arithmetic on it.
    power = amdsmi.amdsmi_get_power_info(device)
    if power['current_socket_power'] != "N/A":
        watts = power['current_socket_power'] / 1000
        print(f"Power: {watts:.2f} W")

    # Memory
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    used_gb = vram['vram_used'] / (1024**3)
    total_gb = vram['vram_total'] / (1024**3)
    print(f"VRAM: {used_gb:.2f} / {total_gb:.2f} GB")

    # Temperature (requires separate function not covered here)

    # Fan: best-effort. Catch only the amdsmi errors documented for this call;
    # a bare `except:` would also swallow KeyboardInterrupt and SystemExit.
    try:
        fan_speed = amdsmi.amdsmi_get_gpu_fan_speed(device, 0)
    except (amdsmi.AmdSmiLibraryException, amdsmi.AmdSmiParameterException):
        print("Fan: N/A")
    else:
        print(f"Fan: {fan_speed}%")
finally:
amdsmi.amdsmi_shut_down()

Monitor all clock domains:
import amdsmi
from amdsmi import AmdSmiClkType
amdsmi.amdsmi_init()
try:
device = amdsmi.amdsmi_get_processor_handles()[0]
clock_types = [
(AmdSmiClkType.GFX, "Graphics"),
(AmdSmiClkType.MEM, "Memory"),
(AmdSmiClkType.SOC, "SoC"),
]
print("Clock Frequencies:")
# Report the current frequency and range of each clock domain in clock_types.
for clk_type, name in clock_types:
    # Keep the try body to the query itself so that formatting mistakes are
    # not mistaken for "clock not available"; catch only amdsmi errors so
    # Ctrl+C still interrupts the script.
    try:
        info = amdsmi.amdsmi_get_clock_info(device, clk_type)
    except (amdsmi.AmdSmiLibraryException, amdsmi.AmdSmiParameterException):
        print(f"{name:10s}: Not available")
        continue

    if info['clk'] != "N/A":
        print(f"{name:10s}: {info['clk']:5d} MHz "
              f"(range: {info['min_clk']}-{info['max_clk']} MHz)")
    else:
        # The query succeeded but returned no reading; say so instead of
        # silently dropping the row.
        print(f"{name:10s}: Not available")
finally:
amdsmi.amdsmi_shut_down()

Monitor for performance throttling:
import amdsmi
import time
# (key in the violation-status dict, message printed while that flag is set)
throttle_checks = (
    ('active_ppt_pwr', "[THROTTLE] Power limit reached!"),
    ('active_socket_thrm', "[THROTTLE] Thermal limit reached!"),
    ('active_prochot_thrm', "[THROTTLE] PROCHOT thermal protection active!"),
)

amdsmi.amdsmi_init()
try:
    # This example watches only the first GPU.
    device = amdsmi.amdsmi_get_processor_handles()[0]
    print("Monitoring for throttling events (Ctrl+C to stop)...")
    while True:
        violations = amdsmi.amdsmi_get_violation_status(device)

        # Check active throttling
        throttle_active = False
        for key, message in throttle_checks:
            flag = violations[key]
            # A non-empty string such as "N/A" is truthy, so exclude it
            # explicitly rather than relying on bare truthiness.
            # NOTE(review): assumes these fields use the same "N/A"
            # placeholder as the power/clock queries -- confirm.
            if flag != "N/A" and flag:
                print(message)
                throttle_active = True

        # Heartbeat dot so the user can see the loop is alive.
        if not throttle_active:
            print(".", end="", flush=True)
        time.sleep(1)
except KeyboardInterrupt:
    print("\nMonitoring stopped")
finally:
    amdsmi.amdsmi_shut_down()

# Calculate energy consumption over time:
import amdsmi
import time
def calculate_energy_consumption(device, duration_seconds=10):
    """
    Calculate energy consumed over a time period.

    Reads the energy counter, sleeps for ``duration_seconds``, reads the
    counter again and derives the energy delta and the average power over
    the interval that actually elapsed.

    Parameters:
    - device: Processor handle passed to ``amdsmi_get_energy_count``
    - duration_seconds: How long to sleep between the two readings
      (default 10)

    Returns:
    - dict: Dictionary containing:
    - energy_joules (float): Energy delta between the two readings
    - duration_seconds (float): Measured time between the two readings
    - average_power_watts (float): energy_joules / duration_seconds, or
      0.0 when no measurable time elapsed
    """
    # Get initial reading
    start = amdsmi.amdsmi_get_energy_count(device)
    start_energy = start['energy_accumulator'] * start['counter_resolution']
    # A monotonic clock cannot jump backwards/forwards with wall-clock
    # adjustments, so the measured interval stays trustworthy.
    start_time = time.monotonic()

    # Wait for specified duration
    time.sleep(duration_seconds)

    # Get final reading
    end = amdsmi.amdsmi_get_energy_count(device)
    end_energy = end['energy_accumulator'] * end['counter_resolution']
    end_time = time.monotonic()

    # Calculate consumption. The accumulator * resolution product is
    # treated as microjoules here, hence the 1_000_000 divisor.
    energy_microjoules = end_energy - start_energy
    energy_joules = energy_microjoules / 1_000_000
    actual_duration = end_time - start_time

    # Avoid ZeroDivisionError when duration_seconds is 0 and the clock did
    # not advance between the two readings.
    average_power = energy_joules / actual_duration if actual_duration > 0 else 0.0

    return {
        'energy_joules': energy_joules,
        'duration_seconds': actual_duration,
        'average_power_watts': average_power,
    }
# Single source of truth for the measurement window, so the printed
# message and the actual measurement cannot drift apart.
measure_seconds = 10

amdsmi.amdsmi_init()
try:
    # This example measures only the first GPU.
    device = amdsmi.amdsmi_get_processor_handles()[0]
    print(f"Measuring energy consumption for {measure_seconds} seconds...")
    result = calculate_energy_consumption(device, measure_seconds)
    print(f"Energy consumed: {result['energy_joules']:.2f} J")
    print(f"Average power: {result['average_power_watts']:.2f} W")
finally:
    amdsmi.amdsmi_shut_down()

# Create a complete monitoring dashboard:
import amdsmi
from amdsmi import AmdSmiClkType
import time
def print_gpu_status(device, device_num):
    """
    Print comprehensive GPU status.

    Queries activity, power, VRAM usage, GFX/MEM clocks, fan, PCIe link
    and throttling state for one device and prints a formatted report.

    Parameters:
    - device: Processor handle passed to every amdsmi query below
    - device_num: Number shown in the report header

    Returns:
    - None: The report is written to standard output
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"GPU {device_num} Status")
    print(banner)

    # GPU Activity
    activity = amdsmi.amdsmi_get_gpu_activity(device)
    print("\nActivity:")
    print(f" GFX: {activity['gfx_activity']:>3}%")
    print(f" UMC: {activity['umc_activity']:>3}%")
    print(f" MM: {activity['mm_activity']:>3}%")

    # Power: each field can hold the "N/A" placeholder, so it is checked
    # before dividing.
    # NOTE(review): the /1000 divisor assumes milliwatts -- confirm the
    # unit for your amdsmi release.
    power = amdsmi.amdsmi_get_power_info(device)
    print("\nPower:")
    if power['current_socket_power'] != "N/A":
        print(f" Current: {power['current_socket_power']/1000:.2f} W")
    if power['average_socket_power'] != "N/A":
        print(f" Average: {power['average_socket_power']/1000:.2f} W")

    # Memory
    # NOTE(review): the 1024**3 divisor assumes byte counts -- confirm the
    # unit of vram_used / vram_total for your amdsmi release.
    vram = amdsmi.amdsmi_get_gpu_vram_usage(device)
    used_gb = vram['vram_used'] / (1024**3)
    total_gb = vram['vram_total'] / (1024**3)
    # Guard the percentage so a reported total of 0 cannot raise
    # ZeroDivisionError and abort the whole report.
    if vram['vram_total']:
        usage_pct = (vram['vram_used'] / vram['vram_total']) * 100
    else:
        usage_pct = 0.0
    print("\nVRAM:")
    print(f" Used: {used_gb:.2f} GB / {total_gb:.2f} GB ({usage_pct:.1f}%)")

    # Clocks
    print("\nClocks:")
    gfx = amdsmi.amdsmi_get_clock_info(device, AmdSmiClkType.GFX)
    if gfx['clk'] != "N/A":
        print(f" GFX: {gfx['clk']} MHz")
    mem = amdsmi.amdsmi_get_clock_info(device, AmdSmiClkType.MEM)
    if mem['clk'] != "N/A":
        print(f" MEM: {mem['clk']} MHz")

    # Fan: best effort. Only library errors skip the section, so Ctrl+C
    # and programming errors are no longer swallowed.
    try:
        fan_pct = amdsmi.amdsmi_get_gpu_fan_speed(device, 0)
        fan_rpm = amdsmi.amdsmi_get_gpu_fan_rpms(device, 0)
    except amdsmi.AmdSmiException:
        pass
    else:
        print("\nFan:")
        print(f" Speed: {fan_pct}% ({fan_rpm} RPM)")

    # PCIe
    pcie = amdsmi.amdsmi_get_pcie_info(device)
    print("\nPCIe:")
    static = pcie['pcie_static']
    metric = pcie['pcie_metric']
    if static['pcie_interface_version'] != "N/A":
        print(f" Gen{static['pcie_interface_version']} x{metric['pcie_width']}")

    # Throttling: a non-empty string such as "N/A" is truthy, so exclude it
    # explicitly rather than relying on bare truthiness.
    # NOTE(review): assumes these fields use the same "N/A" placeholder as
    # the power/clock queries -- confirm.
    violations = amdsmi.amdsmi_get_violation_status(device)
    throttle_labels = (
        ('active_ppt_pwr', "Power"),
        ('active_socket_thrm', "Thermal"),
    )
    throttling = [
        label
        for key, label in throttle_labels
        if violations[key] != "N/A" and violations[key]
    ]
    if throttling:
        print(f"\nWARNING: Throttling active: {', '.join(throttling)}")
amdsmi.amdsmi_init()
try:
    devices = amdsmi.amdsmi_get_processor_handles()
    # Refresh the report for every GPU until the user interrupts.
    while True:
        for i, device in enumerate(devices):
            print_gpu_status(device, i)
        # Printed once per refresh, after all GPUs have been reported.
        print("\n[Press Ctrl+C to exit]")
        time.sleep(2)
except KeyboardInterrupt:
    print("\n\nMonitoring stopped")
finally:
    # Runs on Ctrl+C and on errors alike, so the library is always released.
    amdsmi.amdsmi_shut_down()

# Clock type enumeration for querying different clock domains:
- SYS - System clock
- GFX - Graphics engine clock
- DF - Data Fabric clock
- DCEF - Display Controller Engine Fabric clock
- SOC - SoC clock
- MEM - Memory clock
- PCIE - PCIe interface clock
- VCLK0, VCLK1 - Video clock 0/1
- DCLK0, DCLK1 - Display clock 0/1
- FCLK - Fabric clock
- LCLK - Link clock

Utilization counter types for detailed activity monitoring:
class AmdSmiUtilizationCounterType(IntEnum):
    """Utilization counter types for GPU activity monitoring.

    Reference listing only: the member values are elided (``...``) in this
    documentation, so refer to members by name rather than by number.
    """

    # Coarse-grained counters
    COARSE_GRAIN_GFX_ACTIVITY = ...  # Coarse-grained graphics activity
    COARSE_GRAIN_MEM_ACTIVITY = ...  # Coarse-grained memory activity
    COARSE_DECODER_ACTIVITY = ...  # Coarse-grained decoder activity

    # Fine-grained counters
    FINE_GRAIN_GFX_ACTIVITY = ...  # Fine-grained graphics activity
    FINE_GRAIN_MEM_ACTIVITY = ...  # Fine-grained memory activity
    FINE_DECODER_ACTIVITY = ...  # Fine-grained decoder activity

    # Range sentinels
    UTILIZATION_COUNTER_FIRST = ...  # First counter type (sentinel value)
    UTILIZATION_COUNTER_LAST = ...  # Last counter type (sentinel value)


# amdsmi_get_gpu_metrics_info function is the most efficient way to get multiple metrics at once