Microsoft Azure IoT Hub Management Client Library for programmatic management of Azure IoT Hub resources through the Azure Resource Manager API
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Manual failover capabilities for IoT Hub disaster recovery enable controlled failover to paired Azure regions during planned maintenance or disaster recovery scenarios, ensuring business continuity and minimal service disruption.
Initiate manual failover of IoT Hub to its paired region for disaster recovery scenarios, planned maintenance, or testing business continuity procedures.
def begin_manual_failover(
    iot_hub_name: str,
    resource_group_name: str,
    failover_input: FailoverInput,
    **kwargs
) -> LROPoller[None]:
    """
    Initiate manual failover of IoT hub to its paired region.

    This is an API signature reference; the usage examples in this document
    call it as ``client.iot_hub.begin_manual_failover(...)``.

    Args:
        iot_hub_name: Name of the IoT hub resource
        resource_group_name: Name of the resource group
        failover_input: Failover configuration including target region
        **kwargs: Additional keyword arguments; their meaning is not
            described in this reference (NOTE(review): confirm against the
            SDK documentation).

    Returns:
        LROPoller[None]: Long-running operation for failover process monitoring
    """


from azure.identity import DefaultAzureCredential
from azure.mgmt.iothub import IotHubClient
from azure.mgmt.iothub.models import FailoverInput
import time
# Initialize client
credential = DefaultAzureCredential()
client = IotHubClient(credential, "subscription-id")
resource_group = "myResourceGroup"
hub_name = "myIoTHub"

# Get current IoT Hub information before failover
print("Pre-Failover IoT Hub Status:")
print("=" * 40)
hub_info = client.iot_hub_resource.get(resource_group, hub_name)
print(f"Hub Name: {hub_info.name}")
print(f"Current Location: {hub_info.location}")
print(f"Provisioning State: {hub_info.properties.provisioning_state}")
print(f"State: {hub_info.properties.state}")

# The attribute may be present but unset (None); `hasattr` alone would let a
# None value through and the loop below would raise TypeError.
locations = getattr(hub_info.properties, "locations", None) or []
if locations:
    print("Available Locations:")
    for location in locations:
        print(f" - {location.location} (Role: {location.role})")
print()

# Prepare failover configuration
failover_config = FailoverInput(
    failover_region="paired-region-name"  # Specify target region
)

# Initiate manual failover
print("Initiating Manual Failover...")
print("=" * 35)
try:
    # Start failover operation
    failover_operation = client.iot_hub.begin_manual_failover(
        hub_name, resource_group, failover_config
    )
    print(f"✓ Failover initiated for {hub_name}")
    print("⚠️ This is a long-running operation that may take several minutes")
    print("⚠️ IoT Hub will be temporarily unavailable during failover")

    # Monitor failover progress
    print("\nMonitoring Failover Progress...")
    start_time = time.time()
    while not failover_operation.done():
        elapsed_time = int(time.time() - start_time)
        print(f" Failover in progress... ({elapsed_time}s elapsed)")
        time.sleep(30)  # Check every 30 seconds

    # Wait for completion; result() also surfaces any error raised by the
    # long-running operation.
    failover_operation.result()
    elapsed_time = int(time.time() - start_time)
    print(f"✓ Failover completed successfully in {elapsed_time} seconds")
except Exception as e:
    # Report the failure, then re-raise so the script does not continue
    # as if the failover had succeeded.
    print(f"✗ Failover failed: {e}")
    raise


def verify_failover_completion(resource_group: str, hub_name: str):
    """Verify failover completion and new hub status.

    Prints the hub's location and state, then makes best-effort checks of
    endpoint health and device registry statistics. A failure in either
    best-effort check is reported but does not abort verification.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to verify.

    Returns:
        bool: True when the hub information could be retrieved and the hub
        state is "Active"; False when the hub is in any other state or the
        hub information could not be retrieved.
    """
    print("Post-Failover Verification:")
    print("=" * 35)
    try:
        # Get updated hub information
        hub_info = client.iot_hub_resource.get(resource_group, hub_name)
        print(f"Hub Name: {hub_info.name}")
        print(f"New Location: {hub_info.location}")
        print(f"Provisioning State: {hub_info.properties.provisioning_state}")
        print(f"State: {hub_info.properties.state}")

        # Check if hub is operational
        hub_is_active = hub_info.properties.state == "Active"
        if hub_is_active:
            print("✓ IoT Hub is active and operational")
        else:
            print(f"⚠️ IoT Hub state: {hub_info.properties.state}")

        # Verify endpoint health after failover (best effort)
        print("\nChecking Endpoint Health:")
        try:
            endpoint_health = list(
                client.iot_hub_resource.get_endpoint_health(resource_group, hub_name)
            )
            healthy_count = sum(
                1 for ep in endpoint_health if ep.health_status == "Healthy"
            )
            total_count = len(endpoint_health)
            print(f" Healthy Endpoints: {healthy_count}/{total_count}")
            for endpoint in endpoint_health:
                status_icon = "✓" if endpoint.health_status == "Healthy" else "✗"
                print(f" {status_icon} {endpoint.endpoint_id}: {endpoint.health_status}")
        except Exception as e:
            print(f" Could not retrieve endpoint health: {e}")

        # Check device registry statistics (best effort)
        print("\nDevice Registry Status:")
        try:
            stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
            print(f" Total Devices: {stats.total_device_count}")
            print(f" Enabled Devices: {stats.enabled_device_count}")
            print(f" Disabled Devices: {stats.disabled_device_count}")
        except Exception as e:
            print(f" Could not retrieve device statistics: {e}")

        # A hub that is not Active has not been verified as operational.
        return hub_is_active
    except Exception as e:
        print(f"✗ Verification failed: {e}")
        return False
# Verify failover completion and report the outcome
verification_success = verify_failover_completion(resource_group, hub_name)
if verification_success:
    print("\n✓ Failover verification completed successfully")
else:
    print("\n✗ Failover verification encountered issues")


import json
from datetime import datetime
def execute_disaster_recovery_failover(resource_group: str, hub_name: str, target_region: str):
    """Execute comprehensive disaster recovery failover with full monitoring.

    Runs these steps in order, recording each in a structured log: pre-failover
    validation, metric collection, failover initiation, progress monitoring,
    post-failover validation, device-count comparison, and an endpoint
    health check. The log is written to a JSON file in the current working
    directory whether the failover succeeds or fails.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to fail over.
        target_region: Region passed to ``FailoverInput`` as the failover target.

    Returns:
        dict: The disaster recovery log, including the ``steps`` list, the
        pre/post failover fields and a ``success`` flag.

    Raises:
        Exception: Any error raised by a step other than the endpoint health
            check is recorded in the log and re-raised.
    """
    # Local import: only `datetime` itself is imported at module level.
    from datetime import timezone

    def utc_timestamp() -> str:
        """Return the current time as a timezone-aware UTC ISO-8601 string."""
        return datetime.now(timezone.utc).isoformat()

    dr_log = {
        "operation": "disaster_recovery_failover",
        "hub_name": hub_name,
        "start_time": utc_timestamp(),
        "target_region": target_region,
        "steps": [],
        "success": False,
    }

    def log_step(step_name: str, status: str, details: str = ""):
        """Append a step entry to the log and echo it to stdout."""
        step_entry = {
            "step": step_name,
            "timestamp": utc_timestamp(),
            "status": status,
            "details": details,
        }
        dr_log["steps"].append(step_entry)
        print(f"[{status.upper()}] {step_name}: {details}")

    try:
        # Step 1: Pre-failover validation
        log_step("pre_failover_validation", "started", "Collecting pre-failover hub state")
        pre_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["pre_failover_location"] = pre_failover_hub.location
        dr_log["pre_failover_state"] = pre_failover_hub.properties.state
        if pre_failover_hub.properties.state != "Active":
            log_step("pre_failover_validation", "warning", f"Hub not in Active state: {pre_failover_hub.properties.state}")
        else:
            log_step("pre_failover_validation", "success", f"Hub is active in {pre_failover_hub.location}")

        # Step 2: Collect pre-failover metrics
        log_step("collect_metrics", "started", "Collecting pre-failover device and quota metrics")
        pre_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["pre_failover_device_count"] = pre_stats.total_device_count
        pre_quotas = list(client.iot_hub_resource.get_quota_metrics(resource_group, hub_name))
        dr_log["pre_failover_quotas"] = {
            q.name: {"current": q.current_value, "max": q.max_value} for q in pre_quotas
        }
        log_step("collect_metrics", "success", f"Collected metrics for {pre_stats.total_device_count} devices")

        # Step 3: Initiate failover
        log_step("initiate_failover", "started", f"Starting failover to {target_region}")
        failover_config = FailoverInput(failover_region=target_region)
        failover_operation = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)
        dr_log["failover_started"] = utc_timestamp()

        # Step 4: Monitor failover progress
        log_step("monitor_failover", "started", "Monitoring failover operation progress")
        start_time = time.time()
        while not failover_operation.done():
            elapsed = int(time.time() - start_time)
            log_step("monitor_failover", "in_progress", f"Failover running for {elapsed} seconds")
            time.sleep(60)  # Check every minute for DR scenario

        # Wait for completion; result() also surfaces any operation error.
        failover_operation.result()
        elapsed_total = int(time.time() - start_time)
        dr_log["failover_duration_seconds"] = elapsed_total
        dr_log["failover_completed"] = utc_timestamp()
        log_step("failover_execution", "success", f"Failover completed in {elapsed_total} seconds")

        # Step 5: Post-failover validation
        log_step("post_failover_validation", "started", "Validating hub state after failover")
        # Allow some time for system stabilization
        time.sleep(30)
        post_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["post_failover_location"] = post_failover_hub.location
        dr_log["post_failover_state"] = post_failover_hub.properties.state
        if post_failover_hub.properties.state == "Active":
            log_step("post_failover_validation", "success", f"Hub active in new location: {post_failover_hub.location}")
        else:
            log_step("post_failover_validation", "warning", f"Hub state: {post_failover_hub.properties.state}")

        # Step 6: Verify data integrity
        log_step("data_integrity_check", "started", "Verifying device count and data integrity")
        post_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["post_failover_device_count"] = post_stats.total_device_count
        if post_stats.total_device_count == pre_stats.total_device_count:
            log_step("data_integrity_check", "success", f"Device count verified: {post_stats.total_device_count}")
        else:
            log_step("data_integrity_check", "error", f"Device count mismatch: {pre_stats.total_device_count} -> {post_stats.total_device_count}")

        # Step 7: Check endpoint health (best effort: a failure here is
        # logged but does not fail the overall operation)
        log_step("endpoint_health_check", "started", "Checking routing endpoint health")
        try:
            endpoint_health = list(client.iot_hub_resource.get_endpoint_health(resource_group, hub_name))
            healthy_endpoints = [ep for ep in endpoint_health if ep.health_status == "Healthy"]
            dr_log["post_failover_endpoint_health"] = {
                "total": len(endpoint_health),
                "healthy": len(healthy_endpoints),
                "unhealthy": len(endpoint_health) - len(healthy_endpoints),
            }
            if len(healthy_endpoints) == len(endpoint_health):
                log_step("endpoint_health_check", "success", f"All {len(endpoint_health)} endpoints healthy")
            else:
                log_step("endpoint_health_check", "warning", f"{len(healthy_endpoints)}/{len(endpoint_health)} endpoints healthy")
        except Exception as e:
            log_step("endpoint_health_check", "error", f"Could not check endpoint health: {e}")

        dr_log["success"] = True
        dr_log["end_time"] = utc_timestamp()
        log_step("disaster_recovery", "completed", "Disaster recovery failover completed successfully")
    except Exception as e:
        dr_log["success"] = False
        dr_log["error"] = str(e)
        dr_log["end_time"] = utc_timestamp()
        log_step("disaster_recovery", "failed", f"Disaster recovery failed: {e}")
        raise
    finally:
        # Save disaster recovery log. A failure to write the file is reported
        # rather than raised so it cannot mask a failover error that is
        # already propagating.
        log_filename = f"dr_failover_{hub_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        try:
            with open(log_filename, "w", encoding="utf-8") as f:
                json.dump(dr_log, f, indent=2, default=str)
            print(f"\nDisaster recovery log saved to: {log_filename}")
        except OSError as log_error:
            print(f"\nCould not save disaster recovery log to {log_filename}: {log_error}")

    # Deliberately outside `finally`: a return inside it would swallow the
    # exception re-raised above.
    return dr_log
# Execute disaster recovery failover
try:
    dr_result = execute_disaster_recovery_failover(
        resource_group,
        hub_name,
        "East US 2"  # Target failover region
    )
    if dr_result["success"]:
        print("\n✅ DISASTER RECOVERY COMPLETED SUCCESSFULLY")
        print(f" Hub failed over from {dr_result['pre_failover_location']} to {dr_result['post_failover_location']}")
        print(f" Total time: {dr_result['failover_duration_seconds']} seconds")
    else:
        print("\n❌ DISASTER RECOVERY FAILED")
        print(f" Error: {dr_result.get('error', 'Unknown error')}")
except Exception as e:
    print(f"\n💥 CRITICAL FAILURE: {e}")


def test_failover_procedure(resource_group: str, hub_name: str):
    """Test failover procedure in a controlled manner for DR testing.

    Validates that the hub is Active, records the baseline device count,
    asks for interactive confirmation, runs a failover, and then compares
    the device count after failover with the baseline.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to test.

    Returns:
        bool: True when the failover completed and the device count is
        unchanged. False when the hub was not Active beforehand, the user
        declined, the failover or verification raised, or the device count
        changed.

    Raises:
        Exception: Errors from the pre-test hub lookup and baseline
            statistics calls are not caught and propagate to the caller.
    """
    print("Failover Test Procedure:")
    print("=" * 30)

    # Pre-test validation
    print("1. Pre-test validation...")
    original_hub = client.iot_hub_resource.get(resource_group, hub_name)
    original_location = original_hub.location
    print(f" Original location: {original_location}")
    print(f" Hub state: {original_hub.properties.state}")
    if original_hub.properties.state != "Active":
        print(" ⚠️ Hub not in Active state - test may not be reliable")
        return False

    # Collect baseline metrics
    print("2. Collecting baseline metrics...")
    baseline_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
    print(f" Device count: {baseline_stats.total_device_count}")

    try:
        # Test failover
        print("3. Testing failover...")
        failover_config = FailoverInput(failover_region="test-region")  # Use appropriate test region
        print(" ⚠️ This will temporarily disrupt hub operations")
        confirm = input(" Continue with test failover? (yes/no): ")
        # Tolerate surrounding whitespace in the typed answer.
        if confirm.strip().lower() != 'yes':
            print(" Test cancelled by user")
            return False

        # Execute test failover; start the timer before issuing the request
        # so the reported duration covers the whole operation.
        start_time = time.time()
        failover_op = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)
        print(" Waiting for failover completion...")
        failover_op.result()  # Wait for completion
        test_duration = int(time.time() - start_time)
        print(f" ✓ Test failover completed in {test_duration} seconds")

        # Verify test results
        print("4. Verifying test results...")
        test_hub = client.iot_hub_resource.get(resource_group, hub_name)
        print(f" New location: {test_hub.location}")
        print(f" Hub state: {test_hub.properties.state}")

        # Verify data consistency; a changed device count fails the test
        # instead of being reported as a success.
        test_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        if test_stats.total_device_count != baseline_stats.total_device_count:
            print(f" ✗ Device count changed: {baseline_stats.total_device_count} -> {test_stats.total_device_count}")
            return False
        print(f" ✓ Device count consistent: {test_stats.total_device_count}")

        # Test successful
        print("5. Test completed successfully")
        print(f" Failover test took {test_duration} seconds")
        print(" Hub operational in new region")
        return True
    except Exception as e:
        print(f" ✗ Test failed: {e}")
        return False
# Run failover test
test_success = test_failover_procedure(resource_group, hub_name)
if test_success:
print("\n✅ Failover test passed - DR procedures verified")
else:
print("\n❌ Failover test failed - review DR procedures")Configuration for manual failover operations including target region specification for disaster recovery scenarios.
class FailoverInput:
    """Manual failover configuration.

    The usage examples in this document construct it as
    ``FailoverInput(failover_region=...)`` and pass it as the
    ``failover_input`` argument of ``begin_manual_failover``.
    """

    failover_region: str  # Target Azure region for failover operation

Install with Tessl CLI
npx tessl i tessl/pypi-azure-mgmt-iothub