or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

device-operations.md, event-hub-consumer-groups.md, failover-operations.md, index.md, message-routing.md, monitoring-quotas.md, private-networking.md, resource-management.md, security-management.md, utility-operations.md

docs/failover-operations.md

0

# Failover Operations

1

2

Manual failover capabilities for IoT Hub disaster recovery: fail a hub over to its paired Azure region in a controlled way — for planned maintenance, business-continuity testing, or an actual regional outage — while keeping service disruption to a minimum.

3

4

## Capabilities

5

6

### Manual Failover Initiation

7

8

Initiate manual failover of IoT Hub to its paired region for disaster recovery scenarios, planned maintenance, or testing business continuity procedures.

9

10

```python { .api }

11

def begin_manual_failover(
    iot_hub_name: str,
    resource_group_name: str,
    failover_input: FailoverInput,
    **kwargs
) -> LROPoller[None]:
    """
    Start a manual failover of an IoT hub to its paired region.

    Note the positional order: the hub name comes first, followed by the
    resource group name.

    Args:
        iot_hub_name: Name of the IoT hub resource
        resource_group_name: Name of the resource group
        failover_input: Failover configuration including target region

    Returns:
        LROPoller[None]: Long-running operation for failover process monitoring
    """

28

```

29

30

## Usage Examples

31

32

### Initiating planned failover for maintenance

33

34

```python

35

from azure.identity import DefaultAzureCredential

36

from azure.mgmt.iothub import IotHubClient

37

from azure.mgmt.iothub.models import FailoverInput

38

import time

39

40

# Initialize client
credential = DefaultAzureCredential()
client = IotHubClient(credential, "subscription-id")

resource_group = "myResourceGroup"
hub_name = "myIoTHub"

# Get current IoT Hub information before failover
print("Pre-Failover IoT Hub Status:")
print("=" * 40)

hub_info = client.iot_hub_resource.get(resource_group, hub_name)
print(f"Hub Name: {hub_info.name}")
print(f"Current Location: {hub_info.location}")
print(f"Provisioning State: {hub_info.properties.provisioning_state}")
print(f"State: {hub_info.properties.state}")

# The attribute can exist on the model and still be None when the service
# returns no location list, so normalise to an empty list before iterating
# instead of relying on hasattr().
locations = getattr(hub_info.properties, "locations", None) or []
if locations:
    print("Available Locations:")
    for location in locations:
        print(f" - {location.location} (Role: {location.role})")
print()

# Prepare failover configuration
failover_config = FailoverInput(
    failover_region="paired-region-name"  # Specify target region
)

# Initiate manual failover
print("Initiating Manual Failover...")
print("=" * 35)

try:
    # Start failover operation (hub name first, then resource group)
    failover_operation = client.iot_hub.begin_manual_failover(
        hub_name, resource_group, failover_config
    )

    print(f"✓ Failover initiated for {hub_name}")
    print("⚠️ This is a long-running operation that may take several minutes")
    print("⚠️ IoT Hub will be temporarily unavailable during failover")

    # Monitor failover progress
    print("\nMonitoring Failover Progress...")
    start_time = time.time()

    while not failover_operation.done():
        elapsed_time = int(time.time() - start_time)
        print(f" Failover in progress... ({elapsed_time}s elapsed)")
        time.sleep(30)  # Check every 30 seconds

    # The poller is already done here; result() surfaces any failure of the
    # long-running operation as an exception.
    failover_operation.result()

    elapsed_time = int(time.time() - start_time)
    print(f"✓ Failover completed successfully in {elapsed_time} seconds")

except Exception as e:
    # Report, then re-raise so the caller still sees the original failure.
    print(f"✗ Failover failed: {e}")
    raise

100

```

101

102

### Post-failover verification and monitoring

103

104

```python

105

def verify_failover_completion(resource_group: str, hub_name: str) -> bool:
    """Verify failover completion and new hub status.

    Prints the hub's location and state, then (best effort) the health of its
    routing endpoints and its device registry statistics.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to inspect.

    Returns:
        True when the hub information could be retrieved, False when that
        lookup raised. Failures of the endpoint-health or device-statistics
        lookups are reported but do not change the result.
    """

    def is_healthy(status) -> bool:
        # Compare case-insensitively so both "Healthy" and the lowercase
        # "healthy" reported by the SDK match; a missing status is unhealthy.
        return isinstance(status, str) and status.lower() == "healthy"

    print("Post-Failover Verification:")
    print("=" * 35)

    try:
        # Get updated hub information
        hub_info = client.iot_hub_resource.get(resource_group, hub_name)

        print(f"Hub Name: {hub_info.name}")
        print(f"New Location: {hub_info.location}")
        print(f"Provisioning State: {hub_info.properties.provisioning_state}")
        print(f"State: {hub_info.properties.state}")

        # Check if hub is operational
        if hub_info.properties.state == "Active":
            print("✓ IoT Hub is active and operational")
        else:
            print(f"⚠️ IoT Hub state: {hub_info.properties.state}")

        # Verify endpoint health after failover
        print("\nChecking Endpoint Health:")
        try:
            endpoint_health = list(
                client.iot_hub_resource.get_endpoint_health(resource_group, hub_name)
            )
            healthy_count = sum(1 for ep in endpoint_health if is_healthy(ep.health_status))
            total_count = len(endpoint_health)

            print(f" Healthy Endpoints: {healthy_count}/{total_count}")

            for endpoint in endpoint_health:
                status_icon = "✓" if is_healthy(endpoint.health_status) else "✗"
                print(f" {status_icon} {endpoint.endpoint_id}: {endpoint.health_status}")

        except Exception as e:
            # Best effort: an unavailable health API must not fail verification.
            print(f" Could not retrieve endpoint health: {e}")

        # Check device registry statistics
        print("\nDevice Registry Status:")
        try:
            stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
            print(f" Total Devices: {stats.total_device_count}")
            print(f" Enabled Devices: {stats.enabled_device_count}")
            print(f" Disabled Devices: {stats.disabled_device_count}")
        except Exception as e:
            # Best effort, same as the endpoint health check above.
            print(f" Could not retrieve device statistics: {e}")

        return True

    except Exception as e:
        print(f"✗ Verification failed: {e}")
        return False

157

158

# Run the post-failover checks and report the overall outcome
verification_success = verify_failover_completion(resource_group, hub_name)

print(
    "\n✓ Failover verification completed successfully"
    if verification_success
    else "\n✗ Failover verification encountered issues"
)

165

```

166

167

### Disaster recovery failover with comprehensive monitoring

168

169

```python

170

import json

171

from datetime import datetime

172

173

def execute_disaster_recovery_failover(resource_group: str, hub_name: str, target_region: str):
    """Execute comprehensive disaster recovery failover with full monitoring.

    Runs the failover as a sequence of logged steps: pre-failover validation,
    metric collection, failover initiation, progress monitoring, post-failover
    validation, device-count comparison and endpoint health check. Every step
    is appended to an in-memory log which is always written to a JSON file
    named ``dr_failover_<hub_name>_<UTC timestamp>.json`` in the current
    working directory, whether the failover succeeded or not.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to fail over.
        target_region: Region passed to ``FailoverInput.failover_region``.

    Returns:
        dict: The disaster recovery log, including the ``steps`` list, the
        pre/post-failover observations and the ``success`` flag.

    Raises:
        Exception: Any error raised while executing the steps is recorded in
            the log (``error``, ``success=False``) and then re-raised.
    """
    # Function-scope imports keep this example self-contained: the snippet's
    # own import block only brings in `json` and `datetime`.
    import time
    from datetime import timezone

    def utc_now() -> datetime:
        # One timezone-aware UTC clock for every timestamp, including the log
        # file name, so log entries and file name never disagree.
        return datetime.now(timezone.utc)

    def is_healthy(status) -> bool:
        # Compare case-insensitively so both "Healthy" and the lowercase
        # "healthy" reported by the SDK match; a missing status is unhealthy.
        return isinstance(status, str) and status.lower() == "healthy"

    dr_log = {
        "operation": "disaster_recovery_failover",
        "hub_name": hub_name,
        "start_time": utc_now().isoformat(),
        "target_region": target_region,
        "steps": [],
        "success": False
    }

    def log_step(step_name: str, status: str, details: str = ""):
        # Record the step in the log and echo it to the console.
        step_entry = {
            "step": step_name,
            "timestamp": utc_now().isoformat(),
            "status": status,
            "details": details
        }
        dr_log["steps"].append(step_entry)
        print(f"[{status.upper()}] {step_name}: {details}")

    try:
        # Step 1: Pre-failover validation
        log_step("pre_failover_validation", "started", "Collecting pre-failover hub state")

        pre_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["pre_failover_location"] = pre_failover_hub.location
        dr_log["pre_failover_state"] = pre_failover_hub.properties.state

        if pre_failover_hub.properties.state != "Active":
            log_step("pre_failover_validation", "warning", f"Hub not in Active state: {pre_failover_hub.properties.state}")
        else:
            log_step("pre_failover_validation", "success", f"Hub is active in {pre_failover_hub.location}")

        # Step 2: Collect pre-failover metrics
        log_step("collect_metrics", "started", "Collecting pre-failover device and quota metrics")

        pre_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["pre_failover_device_count"] = pre_stats.total_device_count

        pre_quotas = client.iot_hub_resource.get_quota_metrics(resource_group, hub_name)
        dr_log["pre_failover_quotas"] = {
            q.name: {"current": q.current_value, "max": q.max_value} for q in pre_quotas
        }

        log_step("collect_metrics", "success", f"Collected metrics for {pre_stats.total_device_count} devices")

        # Step 3: Initiate failover
        log_step("initiate_failover", "started", f"Starting failover to {target_region}")

        failover_config = FailoverInput(failover_region=target_region)
        failover_operation = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)

        dr_log["failover_started"] = utc_now().isoformat()

        # Step 4: Monitor failover progress
        log_step("monitor_failover", "started", "Monitoring failover operation progress")

        start_time = time.time()
        while not failover_operation.done():
            elapsed = int(time.time() - start_time)
            log_step("monitor_failover", "in_progress", f"Failover running for {elapsed} seconds")
            time.sleep(60)  # Check every minute for DR scenario

        # The poller is done; result() raises if the operation failed.
        failover_operation.result()

        elapsed_total = int(time.time() - start_time)
        dr_log["failover_duration_seconds"] = elapsed_total
        dr_log["failover_completed"] = utc_now().isoformat()

        log_step("failover_execution", "success", f"Failover completed in {elapsed_total} seconds")

        # Step 5: Post-failover validation
        log_step("post_failover_validation", "started", "Validating hub state after failover")

        # Allow some time for system stabilization
        time.sleep(30)

        post_failover_hub = client.iot_hub_resource.get(resource_group, hub_name)
        dr_log["post_failover_location"] = post_failover_hub.location
        dr_log["post_failover_state"] = post_failover_hub.properties.state

        if post_failover_hub.properties.state == "Active":
            log_step("post_failover_validation", "success", f"Hub active in new location: {post_failover_hub.location}")
        else:
            log_step("post_failover_validation", "warning", f"Hub state: {post_failover_hub.properties.state}")

        # Step 6: Verify data integrity
        log_step("data_integrity_check", "started", "Verifying device count and data integrity")

        post_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        dr_log["post_failover_device_count"] = post_stats.total_device_count

        if post_stats.total_device_count == pre_stats.total_device_count:
            log_step("data_integrity_check", "success", f"Device count verified: {post_stats.total_device_count}")
        else:
            log_step("data_integrity_check", "error", f"Device count mismatch: {pre_stats.total_device_count} -> {post_stats.total_device_count}")

        # Step 7: Check endpoint health
        log_step("endpoint_health_check", "started", "Checking routing endpoint health")

        try:
            endpoint_health = list(client.iot_hub_resource.get_endpoint_health(resource_group, hub_name))
            healthy_endpoints = [ep for ep in endpoint_health if is_healthy(ep.health_status)]

            dr_log["post_failover_endpoint_health"] = {
                "total": len(endpoint_health),
                "healthy": len(healthy_endpoints),
                "unhealthy": len(endpoint_health) - len(healthy_endpoints)
            }

            if len(healthy_endpoints) == len(endpoint_health):
                log_step("endpoint_health_check", "success", f"All {len(endpoint_health)} endpoints healthy")
            else:
                log_step("endpoint_health_check", "warning", f"{len(healthy_endpoints)}/{len(endpoint_health)} endpoints healthy")

        except Exception as e:
            # Best effort: a failing health lookup is logged, not fatal.
            log_step("endpoint_health_check", "error", f"Could not check endpoint health: {e}")

        dr_log["success"] = True
        dr_log["end_time"] = utc_now().isoformat()

        log_step("disaster_recovery", "completed", "Disaster recovery failover completed successfully")

    except Exception as e:
        dr_log["success"] = False
        dr_log["error"] = str(e)
        dr_log["end_time"] = utc_now().isoformat()

        log_step("disaster_recovery", "failed", f"Disaster recovery failed: {e}")
        raise

    finally:
        # Save disaster recovery log (runs on success and on failure)
        log_filename = f"dr_failover_{hub_name}_{utc_now().strftime('%Y%m%d_%H%M%S')}.json"
        with open(log_filename, "w", encoding="utf-8") as f:
            json.dump(dr_log, f, indent=2, default=str)

        print(f"\nDisaster recovery log saved to: {log_filename}")

    return dr_log

314

315

# Run the disaster recovery failover and summarise the outcome
try:
    # Third argument is the target failover region
    dr_result = execute_disaster_recovery_failover(resource_group, hub_name, "East US 2")

    if not dr_result["success"]:
        print("\n❌ DISASTER RECOVERY FAILED")
        print(f" Error: {dr_result.get('error', 'Unknown error')}")
    else:
        print("\n✅ DISASTER RECOVERY COMPLETED SUCCESSFULLY")
        print(f" Hub failed over from {dr_result['pre_failover_location']} to {dr_result['post_failover_location']}")
        print(f" Total time: {dr_result['failover_duration_seconds']} seconds")

except Exception as e:
    # Anything raised by the failover routine ends up here.
    print(f"\n💥 CRITICAL FAILURE: {e}")

333

```

334

335

### Failover testing and rollback procedures

336

337

```python

338

def test_failover_procedure(resource_group: str, hub_name: str, failover_region: str = "test-region") -> bool:
    """Test failover procedure in a controlled manner for DR testing.

    Validates that the hub is Active, records the baseline device count, asks
    for interactive confirmation, performs the failover and then checks the
    hub state and device count afterwards.

    Args:
        resource_group: Name of the resource group containing the hub.
        hub_name: Name of the IoT hub to test.
        failover_region: Region passed to ``FailoverInput.failover_region``.
            The default ``"test-region"`` is a placeholder; pass the region
            appropriate for the hub under test.

    Returns:
        True only when the failover completed, the hub is Active afterwards
        and the device count is unchanged. False when the hub was not Active
        beforehand, the user declined, a post-failover check failed or the
        failover raised.

    Raises:
        Exception: Errors from the pre-test hub and statistics lookups are not
            caught here and propagate to the caller.
    """
    print("Failover Test Procedure:")
    print("=" * 30)

    # Pre-test validation
    print("1. Pre-test validation...")
    original_hub = client.iot_hub_resource.get(resource_group, hub_name)
    original_location = original_hub.location

    print(f" Original location: {original_location}")
    print(f" Hub state: {original_hub.properties.state}")

    if original_hub.properties.state != "Active":
        print(" ⚠️ Hub not in Active state - test may not be reliable")
        return False

    # Collect baseline metrics
    print("2. Collecting baseline metrics...")
    baseline_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
    print(f" Device count: {baseline_stats.total_device_count}")

    try:
        # Test failover
        print("3. Testing failover...")
        failover_config = FailoverInput(failover_region=failover_region)

        print(" ⚠️ This will temporarily disrupt hub operations")
        confirm = input(" Continue with test failover? (yes/no): ")

        # Tolerate surrounding whitespace in the typed answer.
        if confirm.strip().lower() != 'yes':
            print(" Test cancelled by user")
            return False

        # Start the clock before issuing the request so the measured duration
        # covers the whole failover, not just the wait for the result.
        start_time = time.time()
        failover_op = client.iot_hub.begin_manual_failover(hub_name, resource_group, failover_config)

        print(" Waiting for failover completion...")
        failover_op.result()  # Wait for completion

        test_duration = int(time.time() - start_time)
        print(f" ✓ Test failover completed in {test_duration} seconds")

        # Verify test results
        print("4. Verifying test results...")
        test_hub = client.iot_hub_resource.get(resource_group, hub_name)

        print(f" New location: {test_hub.location}")
        print(f" Hub state: {test_hub.properties.state}")

        checks_passed = True
        if test_hub.properties.state != "Active":
            print(" ✗ Hub is not in Active state after failover")
            checks_passed = False

        # Verify data consistency
        test_stats = client.iot_hub_resource.get_stats(resource_group, hub_name)
        if test_stats.total_device_count == baseline_stats.total_device_count:
            print(f" ✓ Device count consistent: {test_stats.total_device_count}")
        else:
            print(f" ✗ Device count changed: {baseline_stats.total_device_count} -> {test_stats.total_device_count}")
            checks_passed = False

        # A completed failover with failed checks must not be reported as a
        # passed DR test.
        if not checks_passed:
            print("5. Test completed with verification failures")
            print(f" Failover test took {test_duration} seconds")
            return False

        # Test successful
        print("5. Test completed successfully")
        print(f" Failover test took {test_duration} seconds")
        print(" Hub operational in new region")

        return True

    except Exception as e:
        print(f" ✗ Test failed: {e}")
        return False

407

408

# Run the DR failover test and report the verdict
test_success = test_failover_procedure(resource_group, hub_name)

if not test_success:
    print("\n❌ Failover test failed - review DR procedures")
else:
    print("\n✅ Failover test passed - DR procedures verified")

415

```

416

417

## Types

418

419

### FailoverInput

420

Configuration for manual failover operations including target region specification for disaster recovery scenarios.

421

422

```python

423

class FailoverInput:
    """Input describing a manual failover request."""

    # Azure region the failover operation targets
    failover_region: str

426

```