# Failover Operations

Manual failover capabilities for IoT Hub disaster recovery, enabling controlled failover to paired Azure regions during planned maintenance or disaster recovery scenarios to ensure business continuity and minimal service disruption.

## Capabilities

### Manual Failover Initiation

Initiate manual failover of IoT Hub to its paired region for disaster recovery scenarios, planned maintenance, or testing business continuity procedures.

```python { .api }
def begin_manual_failover(
    iot_hub_name: str,
    resource_group_name: str,
    failover_input: FailoverInput,
    **kwargs
) -> LROPoller[None]:
    """
    Initiate manual failover of IoT hub to its paired region.

    Note the argument order: the hub name comes first, then the resource group.

    Args:
        iot_hub_name: Name of the IoT hub resource
        resource_group_name: Name of the resource group
        failover_input: Failover configuration including target region
        **kwargs: Additional keyword arguments accepted by the operation
            (not described in this document)

    Returns:
        LROPoller[None]: Long-running operation for failover process monitoring
    """
```

## Usage Examples

### Initiating planned failover for maintenance

```python
import time

from azure.identity import DefaultAzureCredential
from azure.mgmt.iothub import IotHubClient
from azure.mgmt.iothub.models import FailoverInput

# Initialize client
credential = DefaultAzureCredential()
client = IotHubClient(credential, "subscription-id")

resource_group = "myResourceGroup"
hub_name = "myIoTHub"

# Get current IoT Hub information before failover
print("Pre-Failover IoT Hub Status:")
print("=" * 40)

hub_info = client.iot_hub_resource.get(resource_group, hub_name)
print(f"Hub Name: {hub_info.name}")
print(f"Current Location: {hub_info.location}")
print(f"Provisioning State: {hub_info.properties.provisioning_state}")
print(f"State: {hub_info.properties.state}")

# The attribute can exist but be None, so test the value rather than
# relying on hasattr() alone before iterating.
locations = getattr(hub_info.properties, "locations", None)
if locations:
    print("Available Locations:")
    for location in locations:
        print(f" - {location.location} (Role: {location.role})")
print()

# Prepare failover configuration
failover_config = FailoverInput(
    failover_region="paired-region-name"  # Specify target region
)

# Initiate manual failover
print("Initiating Manual Failover...")
print("=" * 35)

try:
    # Start failover operation.
    # begin_manual_failover takes the hub name first, then the resource group.
    failover_operation = client.iot_hub.begin_manual_failover(
        hub_name, resource_group, failover_config
    )

    print(f"✓ Failover initiated for {hub_name}")
    print("⚠️ This is a long-running operation that may take several minutes")
    print("⚠️ IoT Hub will be temporarily unavailable during failover")

    # Monitor failover progress
    print("\nMonitoring Failover Progress...")
    # A monotonic clock keeps the elapsed time correct across wall-clock changes.
    start_time = time.monotonic()

    while not failover_operation.done():
        elapsed_time = int(time.monotonic() - start_time)
        print(f" Failover in progress... ({elapsed_time}s elapsed)")
        time.sleep(30)  # Check every 30 seconds

    # Wait for completion; result() also surfaces any error from the operation
    failover_operation.result()

    elapsed_time = int(time.monotonic() - start_time)
    print(f"✓ Failover completed successfully in {elapsed_time} seconds")

except Exception as e:
    # Report the failure for the operator, then let the error propagate.
    print(f"✗ Failover failed: {e}")
    raise
```

### Post-failover verification and monitoring

```python
def verify_failover_completion(resource_group: str, hub_name: str) -> bool:
    """Verify failover completion and new hub status.

    Prints the hub's location and state, routing endpoint health, and
    device registry statistics. Uses the module-level ``client`` created
    in the earlier example.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to inspect.

    Returns:
        True when the hub information could be retrieved (endpoint-health
        and statistics failures are reported but do not change the result);
        False when retrieving the hub itself failed.
    """

    def is_healthy(endpoint) -> bool:
        # NOTE(review): the SDK's EndpointHealthStatus values appear to be
        # lowercase ("healthy"); comparing case-insensitively accepts either
        # spelling and tolerates a missing status.
        return (endpoint.health_status or "").lower() == "healthy"

    print("Post-Failover Verification:")
    print("=" * 35)

    try:
        # Get updated hub information
        hub_info = client.iot_hub_resource.get(resource_group, hub_name)

        print(f"Hub Name: {hub_info.name}")
        print(f"New Location: {hub_info.location}")
        print(f"Provisioning State: {hub_info.properties.provisioning_state}")
        print(f"State: {hub_info.properties.state}")

        # Check if hub is operational
        if hub_info.properties.state == "Active":
            print("✓ IoT Hub is active and operational")
        else:
            print(f"⚠️ IoT Hub state: {hub_info.properties.state}")

        # Verify endpoint health after failover (best effort)
        print("\nChecking Endpoint Health:")
        try:
            endpoint_health = list(
                client.iot_hub_resource.get_endpoint_health(resource_group, hub_name)
            )
            healthy_count = sum(1 for ep in endpoint_health if is_healthy(ep))
            total_count = len(endpoint_health)

            print(f" Healthy Endpoints: {healthy_count}/{total_count}")

            for endpoint in endpoint_health:
                status_icon = "✓" if is_healthy(endpoint) else "✗"
                print(f" {status_icon} {endpoint.endpoint_id}: {endpoint.health_status}")

        except Exception as e:
            print(f" Could not retrieve endpoint health: {e}")

        # Check device registry statistics (best effort)
        print("\nDevice Registry Status:")
        try:
            stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
            print(f" Total Devices: {stats.total_device_count}")
            print(f" Enabled Devices: {stats.enabled_device_count}")
            print(f" Disabled Devices: {stats.disabled_device_count}")
        except Exception as e:
            print(f" Could not retrieve device statistics: {e}")

        return True

    except Exception as e:
        print(f"✗ Verification failed: {e}")
        return False


# Verify failover completion
verification_success = verify_failover_completion(resource_group, hub_name)

if verification_success:
    print("\n✓ Failover verification completed successfully")
else:
    print("\n✗ Failover verification encountered issues")
```

### Disaster recovery failover with comprehensive monitoring

```python
import json
import time
from datetime import datetime, timezone


def execute_disaster_recovery_failover(resource_group: str, hub_name: str, target_region: str) -> dict:
    """Execute comprehensive disaster recovery failover with full monitoring.

    Runs the failover as a sequence of logged steps (pre-failover validation,
    metric collection, failover, progress monitoring, post-failover
    validation, device-count comparison, endpoint health check). The step
    log is always written to a JSON file, whether the run succeeds or fails.
    Uses the module-level ``client`` and ``FailoverInput`` from the earlier
    example.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to fail over.
        target_region: Region passed to ``FailoverInput.failover_region``.

    Returns:
        The log dictionary, including the recorded steps and a ``success`` flag.

    Raises:
        Exception: Any error raised during the run is recorded in the log
            and then re-raised.
    """

    def utc_now() -> str:
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated and
        # returns a naive value.
        return datetime.now(timezone.utc).isoformat()

    def is_healthy(endpoint) -> bool:
        # NOTE(review): the SDK's EndpointHealthStatus values appear to be
        # lowercase ("healthy"); comparing case-insensitively accepts either
        # spelling and tolerates a missing status.
        return (endpoint.health_status or "").lower() == "healthy"

    dr_log = {
        "operation": "disaster_recovery_failover",
        "hub_name": hub_name,
        "start_time": utc_now(),
        "target_region": target_region,
        "steps": [],
        "success": False,
    }

    def log_step(step_name: str, status: str, details: str = ""):
        """Append a step entry to the log and echo it to stdout."""
        step_entry = {
            "step": step_name,
            "timestamp": utc_now(),
            "status": status,
            "details": details,
        }
        dr_log["steps"].append(step_entry)
        print(f"[{status.upper()}] {step_name}: {details}")

    try:
        # Step 1: Pre-failover validation
        log_step("pre_failover_validation", "started", "Collecting pre-failover hub state")

        pre_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["pre_failover_location"] = pre_failover_hub.location
        dr_log["pre_failover_state"] = pre_failover_hub.properties.state

        if pre_failover_hub.properties.state != "Active":
            log_step("pre_failover_validation", "warning", f"Hub not in Active state: {pre_failover_hub.properties.state}")
        else:
            log_step("pre_failover_validation", "success", f"Hub is active in {pre_failover_hub.location}")

        # Step 2: Collect pre-failover metrics
        log_step("collect_metrics", "started", "Collecting pre-failover device and quota metrics")

        pre_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["pre_failover_device_count"] = pre_stats.total_device_count

        pre_quotas = list(client.iot_hub_resource.get_quota_metrics(resource_group, hub_name))
        dr_log["pre_failover_quotas"] = {
            q.name: {"current": q.current_value, "max": q.max_value} for q in pre_quotas
        }

        log_step("collect_metrics", "success", f"Collected metrics for {pre_stats.total_device_count} devices")

        # Step 3: Initiate failover
        log_step("initiate_failover", "started", f"Starting failover to {target_region}")

        failover_config = FailoverInput(failover_region=target_region)
        # begin_manual_failover takes the hub name first, then the resource group.
        failover_operation = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)

        dr_log["failover_started"] = utc_now()

        # Step 4: Monitor failover progress
        log_step("monitor_failover", "started", "Monitoring failover operation progress")

        # A monotonic clock keeps the duration correct across wall-clock changes.
        start_time = time.monotonic()
        while not failover_operation.done():
            elapsed = int(time.monotonic() - start_time)
            log_step("monitor_failover", "in_progress", f"Failover running for {elapsed} seconds")
            time.sleep(60)  # Check every minute for DR scenario

        # Wait for completion; result() also surfaces any error from the operation
        failover_operation.result()

        elapsed_total = int(time.monotonic() - start_time)
        dr_log["failover_duration_seconds"] = elapsed_total
        dr_log["failover_completed"] = utc_now()

        log_step("failover_execution", "success", f"Failover completed in {elapsed_total} seconds")

        # Step 5: Post-failover validation
        log_step("post_failover_validation", "started", "Validating hub state after failover")

        # Allow some time for system stabilization
        time.sleep(30)

        post_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["post_failover_location"] = post_failover_hub.location
        dr_log["post_failover_state"] = post_failover_hub.properties.state

        if post_failover_hub.properties.state == "Active":
            log_step("post_failover_validation", "success", f"Hub active in new location: {post_failover_hub.location}")
        else:
            log_step("post_failover_validation", "warning", f"Hub state: {post_failover_hub.properties.state}")

        # Step 6: Verify data integrity
        log_step("data_integrity_check", "started", "Verifying device count and data integrity")

        post_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["post_failover_device_count"] = post_stats.total_device_count

        if post_stats.total_device_count == pre_stats.total_device_count:
            log_step("data_integrity_check", "success", f"Device count verified: {post_stats.total_device_count}")
        else:
            log_step("data_integrity_check", "error", f"Device count mismatch: {pre_stats.total_device_count} -> {post_stats.total_device_count}")

        # Step 7: Check endpoint health (best effort; failures are logged only)
        log_step("endpoint_health_check", "started", "Checking routing endpoint health")

        try:
            endpoint_health = list(client.iot_hub_resource.get_endpoint_health(resource_group, hub_name))
            healthy_endpoints = [ep for ep in endpoint_health if is_healthy(ep)]

            dr_log["post_failover_endpoint_health"] = {
                "total": len(endpoint_health),
                "healthy": len(healthy_endpoints),
                "unhealthy": len(endpoint_health) - len(healthy_endpoints),
            }

            if len(healthy_endpoints) == len(endpoint_health):
                log_step("endpoint_health_check", "success", f"All {len(endpoint_health)} endpoints healthy")
            else:
                log_step("endpoint_health_check", "warning", f"{len(healthy_endpoints)}/{len(endpoint_health)} endpoints healthy")

        except Exception as e:
            log_step("endpoint_health_check", "error", f"Could not check endpoint health: {e}")

        dr_log["success"] = True
        dr_log["end_time"] = utc_now()

        log_step("disaster_recovery", "completed", "Disaster recovery failover completed successfully")

    except Exception as e:
        dr_log["success"] = False
        dr_log["error"] = str(e)
        dr_log["end_time"] = utc_now()

        log_step("disaster_recovery", "failed", f"Disaster recovery failed: {e}")
        raise

    finally:
        # Save disaster recovery log on success and on failure alike
        log_filename = f"dr_failover_{hub_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(log_filename, "w", encoding="utf-8") as f:
            json.dump(dr_log, f, indent=2, default=str)

        print(f"\nDisaster recovery log saved to: {log_filename}")

    return dr_log


# Execute disaster recovery failover
try:
    dr_result = execute_disaster_recovery_failover(
        resource_group,
        hub_name,
        "East US 2"  # Target failover region
    )

    if dr_result["success"]:
        print("\n✅ DISASTER RECOVERY COMPLETED SUCCESSFULLY")
        print(f" Hub failed over from {dr_result['pre_failover_location']} to {dr_result['post_failover_location']}")
        print(f" Total time: {dr_result['failover_duration_seconds']} seconds")
    else:
        # Defensive branch: the function re-raises on failure, so a returned
        # log is expected to have success=True.
        print("\n❌ DISASTER RECOVERY FAILED")
        print(f" Error: {dr_result.get('error', 'Unknown error')}")

except Exception as e:
    print(f"\n💥 CRITICAL FAILURE: {e}")
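```

### Failover testing and rollback procedures

The example below exercises a test failover. To roll back (fail back) afterwards, initiate another manual failover once the hub is active again.

```python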
def test_failover_procedure(resource_group: str, hub_name: str, failover_region: str = "test-region") -> bool:
    """Test failover procedure in a controlled manner for DR testing.

    Validates that the hub is Active, records the baseline device count,
    asks the operator for confirmation, runs the failover, and compares the
    device count afterwards. Uses the module-level ``client``,
    ``FailoverInput`` and ``time`` from the earlier example.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to test.
        failover_region: Region passed to ``FailoverInput.failover_region``.
            The default is a placeholder; pass an appropriate region.

    Returns:
        True when the test failover ran to completion; False when the hub
        was not Active, the operator declined, or an error occurred.
    """

    print("Failover Test Procedure:")
    print("=" * 30)

    # Pre-test validation
    print("1. Pre-test validation...")
    original_hub = client.iot_hub_resource.get(resource_group, hub_name)
    original_location = original_hub.location

    print(f" Original location: {original_location}")
    print(f" Hub state: {original_hub.properties.state}")

    if original_hub.properties.state != "Active":
        print(" ⚠️ Hub not in Active state - test may not be reliable")
        return False

    # Collect baseline metrics
    print("2. Collecting baseline metrics...")
    baseline_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
    print(f" Device count: {baseline_stats.total_device_count}")

    try:
        # Test failover
        print("3. Testing failover...")
        failover_config = FailoverInput(failover_region=failover_region)  # Use appropriate test region

        print(" ⚠️ This will temporarily disrupt hub operations")
        confirm = input(" Continue with test failover? (yes/no): ")

        if confirm.lower() != 'yes':
            print(" Test cancelled by user")
            return False

        # Execute test failover.
        # begin_manual_failover takes the hub name first, then the resource group.
        failover_op = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)

        print(" Waiting for failover completion...")
        # A monotonic clock keeps the duration correct across wall-clock changes.
        start_time = time.monotonic()
        failover_op.result()  # Wait for completion

        test_duration = int(time.monotonic() - start_time)
        print(f" ✓ Test failover completed in {test_duration} seconds")

        # Verify test results
        print("4. Verifying test results...")
        test_hub = client.iot_hub_resource.get(resource_group, hub_name)

        print(f" New location: {test_hub.location}")
        print(f" Hub state: {test_hub.properties.state}")

        # Verify data consistency
        test_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        if test_stats.total_device_count == baseline_stats.total_device_count:
            print(f" ✓ Device count consistent: {test_stats.total_device_count}")
        else:
            print(f" ✗ Device count changed: {baseline_stats.total_device_count} -> {test_stats.total_device_count}")

        # Test successful
        print("5. Test completed successfully")
        print(f" Failover test took {test_duration} seconds")
        print(" Hub operational in new region")

        return True

    except Exception as e:
        print(f" ✗ Test failed: {e}")
        return False


# Run failover test
test_success = test_failover_procedure(resource_group, hub_name)

if test_success:
    print("\n✅ Failover test passed - DR procedures verified")
else:
    print("\n❌ Failover test failed - review DR procedures")
```

## Types

### FailoverInput

Configuration for manual failover operations including target region specification for disaster recovery scenarios.

```python
class FailoverInput:
    """Manual failover configuration.

    Attributes:
        failover_region: Target Azure region for the failover operation.
    """

    failover_region: str  # Target Azure region for failover operation
```