or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

batch-systems.mdcore-workflow.mdfile-management.mdindex.mdjob-stores.mdprovisioning.mdutilities.mdworkflow-languages.md

utilities.mddocs/

0

# Utilities and CLI Tools

## Overview

Toil provides a comprehensive suite of command-line utilities and helper functions for workflow management, debugging, monitoring, and cluster operations. These tools cover the complete lifecycle from workflow development and testing to production deployment and maintenance. The utilities integrate seamlessly with Toil's core functionality and provide both interactive and scriptable interfaces for automation and integration with larger systems.

## Capabilities

### Main CLI Interface { .api }

The primary `toil` command provides the main interface for workflow execution and management.

```python
from toil.utils.toilMain import main as toil_main
import sys

# Command-line workflow execution
def run_toil_workflow():
    """Execute workflow using main CLI interface."""

    # Basic workflow execution
    # toil workflow.py file:jobstore

    # With resource specifications
    # toil --batchSystem=local --maxCores=8 --maxMemory=16G workflow.py file:jobstore

    # Distributed execution
    # toil --batchSystem=slurm --maxNodes=50 --nodeTypes=compute workflow.py file:jobstore

    # Programmatic execution through main interface
    sys.argv = [
        'toil',
        '--batchSystem=local',
        '--jobStore=file:test-jobstore',
        '--maxCores=4',
        '--maxMemory=8G',
        '--logLevel=INFO',
        '--stats',
        'workflow_script.py',
        'input_file.txt'
    ]

    # Execute workflow
    exit_code = toil_main()
    return exit_code

def advanced_toil_options():
    """Demonstrate advanced toil CLI options."""

    # Comprehensive workflow execution with all major options
    advanced_command = [
        'toil',

        # Job store and batch system
        '--jobStore=aws:us-west-2:my-bucket:workflow-run',
        '--batchSystem=kubernetes',

        # Resource limits
        '--maxCores=1000',
        '--maxMemory=2T',
        '--maxDisk=10T',
        '--defaultMemory=4G',
        '--defaultCores=2',
        '--defaultDisk=10G',

        # Scaling and provisioning
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--minNodes=5',
        '--targetTime=1800',  # Target job runtime in seconds

        # Fault tolerance
        '--retryCount=3',
        '--rescueJobsFrequency=3600',
        '--maxJobDuration=86400',  # 24 hours max per job

        # Preemption and spot instances
        '--defaultPreemptible',
        '--preemptibleCompensation=1.5',
        '--preemptibleWorkerTimeout=1800',

        # Cleanup and management
        '--clean=onSuccess',
        '--cleanWorkDir=onSuccess',
        '--clusterStats=/tmp/cluster-stats',

        # Logging and monitoring
        '--logLevel=INFO',
        '--logFile=/tmp/toil-workflow.log',
        '--stats',
        '--metrics',

        # Working directories
        '--workDir=/tmp/toil-work',
        '--coordinationDir=/shared/coordination',

        # Security and encryption
        '--sseKey=alias/toil-kms-key',
        '--encryptedFileStore',

        # Container support
        '--disableAutoDeployment=False',
        '--containerEngine=docker',

        # Workflow script and arguments
        'complex_workflow.py',
        '--input-dir=/data/inputs',
        '--output-dir=/data/outputs',
        '--reference-genome=/data/reference.fa',
        '--threads=16'
    ]

    return advanced_command
```

### Workflow Status and Monitoring { .api }

Tools for monitoring workflow progress and status in real-time.

```python
from toil.utils.toilStatus import main as status_main
from toil.utils.toilStats import main as stats_main

def monitor_workflow_status():
    """Monitor running workflow status."""

    # Check workflow status
    # toil status file:jobstore

    # Programmatic status checking
    sys.argv = ['toil-status', 'file:my-jobstore']
    status_info = status_main()

    # Detailed status with job breakdown
    sys.argv = [
        'toil-status',
        '--verbose',
        '--failIfNotComplete',
        'aws:us-west-2:my-bucket:workflow'
    ]

    detailed_status = status_main()
    return detailed_status

def analyze_workflow_statistics():
    """Analyze workflow execution statistics."""

    # Basic statistics
    # toil stats file:jobstore

    # Comprehensive statistics analysis
    sys.argv = [
        'toil-stats',
        '--raw',     # Raw statistics data
        '--pretty',  # Human-readable format
        '--categories=time,clock,wait,memory,disk',
        '--sortCategory=time',
        '--sortField=total',
        '--sortReverse',
        'file:completed-jobstore'
    ]

    stats_result = stats_main()

    # Export statistics to file
    sys.argv = [
        'toil-stats',
        '--outputFile=/tmp/workflow-stats.json',
        '--format=json',
        'file:jobstore'
    ]

    stats_main()

    return stats_result

def real_time_monitoring():
    """Real-time workflow monitoring implementation."""

    import time
    import json
    from toil.statsAndLogging import StatsAndLogging

    class WorkflowMonitor:
        """Real-time workflow monitoring."""

        def __init__(self, jobstore_locator: str):
            self.jobstore = jobstore_locator
            self.stats_collector = StatsAndLogging()
            self.monitoring = True

        def start_monitoring(self, update_interval: int = 30):
            """Start real-time monitoring loop."""

            while self.monitoring:
                try:
                    # Get current status
                    status = self.get_workflow_status()

                    # Get performance metrics
                    metrics = self.get_performance_metrics()

                    # Display or log status
                    self.display_status(status, metrics)

                    # Check for completion
                    if status.get('completed', False):
                        print("Workflow completed successfully!")
                        break

                    if status.get('failed', False):
                        print("Workflow failed!")
                        self.handle_workflow_failure(status)
                        break

                    time.sleep(update_interval)

                except KeyboardInterrupt:
                    print("Monitoring stopped by user")
                    break
                except Exception as e:
                    print(f"Monitoring error: {e}")
                    time.sleep(5)

        def get_workflow_status(self) -> dict:
            """Get current workflow status."""

            # Implementation would call toil status programmatically
            status = {
                'total_jobs': 0,
                'completed_jobs': 0,
                'running_jobs': 0,
                'failed_jobs': 0,
                'queued_jobs': 0,
                'completion_percentage': 0.0,
                'estimated_time_remaining': None
            }

            # Populate status from job store
            # ... implementation details ...

            return status

        def get_performance_metrics(self) -> dict:
            """Get performance metrics."""

            metrics = {
                'cpu_utilization': 0.0,
                'memory_usage': 0.0,
                'network_io': 0.0,
                'disk_io': 0.0,
                'cost_per_hour': 0.0,
                'jobs_per_minute': 0.0
            }

            # Collect from various sources
            # ... implementation details ...

            return metrics

        def display_status(self, status: dict, metrics: dict):
            """Display formatted status information."""

            print("\n" + "="*60)
            print(f"Workflow Status - {time.strftime('%Y-%m-%d %H:%M:%S')}")
            print("="*60)

            print(f"Jobs: {status['completed_jobs']}/{status['total_jobs']} completed")
            print(f"Running: {status['running_jobs']}, Queued: {status['queued_jobs']}")
            print(f"Failed: {status['failed_jobs']}")
            print(f"Progress: {status['completion_percentage']:.1f}%")

            if status['estimated_time_remaining']:
                print(f"Est. remaining: {status['estimated_time_remaining']}")

            print(f"\nPerformance:")
            print(f"CPU Utilization: {metrics['cpu_utilization']:.1f}%")
            print(f"Memory Usage: {metrics['memory_usage']:.1f}%")
            print(f"Cost/Hour: ${metrics['cost_per_hour']:.2f}")
            print(f"Jobs/Minute: {metrics['jobs_per_minute']:.1f}")

        def stop_monitoring(self):
            """Stop monitoring loop."""
            self.monitoring = False

    # Usage
    monitor = WorkflowMonitor("file:my-jobstore")
    monitor.start_monitoring(update_interval=10)
```

### Debugging and Troubleshooting Tools { .api }

Comprehensive debugging utilities for workflow development and troubleshooting.

```python
from toil.utils.toilDebugJob import main as debug_job_main
from toil.utils.toilDebugFile import main as debug_file_main

def debug_workflow_issues():
    """Debug workflow execution issues."""

    # Debug specific job
    # toil debug-job file:jobstore <job-id>

    sys.argv = [
        'toil-debug-job',
        '--printJobInfo',      # Print job information
        '--printJobChildren',  # Print child jobs
        '--printJobFiles',     # Print associated files
        '--printJobLogging',   # Print job logs
        'file:my-jobstore',
        'job-id-12345'
    ]

    debug_job_main()

    # Debug file issues
    # toil debug-file file:jobstore <file-id>

    sys.argv = [
        'toil-debug-file',
        '--printFileInfo',             # File metadata
        '--printFileContent',          # File contents (if small)
        '--saveFile=/tmp/debug-file',  # Save file locally
        'file:my-jobstore',
        'file-id-67890'
    ]

    debug_file_main()

def advanced_debugging_tools():
    """Advanced debugging and analysis tools."""

    class WorkflowDebugger:
        """Comprehensive workflow debugging toolkit."""

        def __init__(self, jobstore_locator: str):
            self.jobstore = jobstore_locator

        def analyze_failed_jobs(self):
            """Analyze failed jobs and common failure patterns."""

            failed_jobs = self.get_failed_jobs()

            failure_patterns = {
                'out_of_memory': 0,
                'timeout': 0,
                'missing_files': 0,
                'command_not_found': 0,
                'permission_denied': 0,
                'network_error': 0,
                'unknown': 0
            }

            for job in failed_jobs:
                # Analyze failure reason
                exit_code = job.get('exit_code', 0)
                stderr = job.get('stderr', '')

                if exit_code == 137:  # SIGKILL - likely OOM
                    failure_patterns['out_of_memory'] += 1
                elif exit_code == 124:  # timeout
                    failure_patterns['timeout'] += 1
                elif 'No such file' in stderr:
                    failure_patterns['missing_files'] += 1
                elif 'command not found' in stderr:
                    failure_patterns['command_not_found'] += 1
                elif 'Permission denied' in stderr:
                    failure_patterns['permission_denied'] += 1
                elif 'network' in stderr.lower():
                    failure_patterns['network_error'] += 1
                else:
                    failure_patterns['unknown'] += 1

            # Generate debugging report
            print("Failed Job Analysis:")
            print("="*50)
            for pattern, count in failure_patterns.items():
                if count > 0:
                    print(f"{pattern.replace('_', ' ').title()}: {count} jobs")
                    self.suggest_fixes(pattern, count)

            return failure_patterns

        def suggest_fixes(self, pattern: str, count: int):
            """Suggest fixes for common failure patterns."""

            suggestions = {
                'out_of_memory': [
                    "Increase memory requirements for affected jobs",
                    "Use streaming or chunked processing for large data",
                    "Check for memory leaks in job code"
                ],
                'timeout': [
                    "Increase job timeout limits",
                    "Optimize algorithm efficiency",
                    "Split large jobs into smaller chunks"
                ],
                'missing_files': [
                    "Verify input file paths and availability",
                    "Check file transfer and staging",
                    "Ensure proper job dependencies"
                ],
                'command_not_found': [
                    "Install missing software in Docker images",
                    "Check PATH environment variable",
                    "Verify tool versions and compatibility"
                ],
                'permission_denied': [
                    "Fix file and directory permissions",
                    "Check Docker volume mounts",
                    "Verify user/group settings"
                ]
            }

            if pattern in suggestions:
                print(f"  Suggested fixes:")
                for suggestion in suggestions[pattern]:
                    print(f"    • {suggestion}")

        def trace_job_execution(self, job_id: str):
            """Trace job execution history and dependencies."""

            job_info = self.get_job_info(job_id)

            print(f"Job Execution Trace: {job_id}")
            print("="*50)
            print(f"Job Name: {job_info.get('name', 'Unknown')}")
            print(f"Status: {job_info.get('status', 'Unknown')}")
            print(f"Start Time: {job_info.get('start_time', 'Unknown')}")
            print(f"End Time: {job_info.get('end_time', 'Unknown')}")
            print(f"Duration: {job_info.get('duration', 'Unknown')}")
            print(f"Resources Used: CPU={job_info.get('cpu_used', 'N/A')}, "
                  f"Memory={job_info.get('memory_used', 'N/A')}")

            # Show dependencies
            predecessors = job_info.get('predecessors', [])
            if predecessors:
                print(f"\nPredecessor Jobs:")
                for pred_id in predecessors:
                    pred_info = self.get_job_info(pred_id)
                    print(f"  {pred_id}: {pred_info.get('status', 'Unknown')}")

            # Show children
            children = job_info.get('children', [])
            if children:
                print(f"\nChild Jobs:")
                for child_id in children:
                    child_info = self.get_job_info(child_id)
                    print(f"  {child_id}: {child_info.get('status', 'Unknown')}")

            # Show files
            input_files = job_info.get('input_files', [])
            output_files = job_info.get('output_files', [])

            if input_files:
                print(f"\nInput Files:")
                for file_id in input_files:
                    print(f"  {file_id}")

            if output_files:
                print(f"\nOutput Files:")
                for file_id in output_files:
                    print(f"  {file_id}")

        def performance_analysis(self):
            """Analyze workflow performance and bottlenecks."""

            jobs = self.get_all_jobs()

            # Resource utilization analysis
            cpu_utilization = []
            memory_utilization = []
            job_durations = []

            for job in jobs:
                if job.get('completed'):
                    cpu_util = job.get('cpu_utilization', 0)
                    mem_util = job.get('memory_utilization', 0)
                    duration = job.get('duration', 0)

                    cpu_utilization.append(cpu_util)
                    memory_utilization.append(mem_util)
                    job_durations.append(duration)

            # Calculate statistics
            if cpu_utilization:
                avg_cpu = sum(cpu_utilization) / len(cpu_utilization)
                avg_memory = sum(memory_utilization) / len(memory_utilization)
                avg_duration = sum(job_durations) / len(job_durations)

                print("Performance Analysis:")
                print("="*50)
                print(f"Average CPU Utilization: {avg_cpu:.1f}%")
                print(f"Average Memory Utilization: {avg_memory:.1f}%")
                print(f"Average Job Duration: {avg_duration:.2f}s")

                # Identify bottlenecks
                long_jobs = [j for j in jobs if j.get('duration', 0) > avg_duration * 3]
                if long_jobs:
                    print(f"\nLong-running jobs ({len(long_jobs)}):")
                    for job in sorted(long_jobs, key=lambda x: x.get('duration', 0), reverse=True)[:5]:
                        print(f"  {job['id']}: {job.get('duration', 0):.2f}s")

                # Resource inefficiency
                inefficient_jobs = [j for j in jobs if j.get('cpu_utilization', 100) < 50]
                if inefficient_jobs:
                    print(f"\nResource-inefficient jobs ({len(inefficient_jobs)}):")
                    for job in inefficient_jobs[:5]:
                        print(f"  {job['id']}: {job.get('cpu_utilization', 0):.1f}% CPU")
```

### Cleanup and Maintenance Tools { .api }

Tools for cleaning up job stores and maintaining workflow environments.

```python
from toil.utils.toilClean import main as clean_main
from toil.utils.toilKill import main as kill_main

def cleanup_workflows():
    """Clean up workflow artifacts and job stores."""

    # Clean completed workflow
    # toil clean file:jobstore

    sys.argv = ['toil-clean', 'file:completed-jobstore']
    clean_main()

    # Comprehensive cleanup with options
    sys.argv = [
        'toil-clean',
        '--cleanWorkDir',   # Clean working directories
        '--cleanJobStore',  # Clean job store completely
        'aws:us-west-2:my-bucket:old-workflow'
    ]

    clean_main()

    # Kill running workflow
    # toil kill file:jobstore

    sys.argv = ['toil-kill', 'file:running-jobstore']
    kill_main()

def maintenance_utilities():
    """Workflow maintenance and housekeeping utilities."""

    class WorkflowMaintenance:
        """Workflow maintenance toolkit."""

        def cleanup_old_jobstores(self, days_old: int = 30):
            """Clean up job stores older than specified days."""

            import os
            import time

            # Find old job stores (implementation depends on storage type)
            current_time = time.time()
            cutoff_time = current_time - (days_old * 86400)

            old_jobstores = []

            # For file-based job stores
            jobstore_dir = "/tmp/toil-jobstores"
            if os.path.exists(jobstore_dir):
                for item in os.listdir(jobstore_dir):
                    item_path = os.path.join(jobstore_dir, item)
                    if os.path.isdir(item_path):
                        mtime = os.path.getmtime(item_path)
                        if mtime < cutoff_time:
                            old_jobstores.append(f"file:{item_path}")

            # Clean up old job stores
            for jobstore in old_jobstores:
                try:
                    print(f"Cleaning old jobstore: {jobstore}")
                    sys.argv = ['toil-clean', jobstore]
                    clean_main()
                except Exception as e:
                    print(f"Failed to clean {jobstore}: {e}")

            return len(old_jobstores)

        def archive_completed_workflows(self, archive_dir: str):
            """Archive completed workflow results."""

            import shutil
            import json

            completed_workflows = self.find_completed_workflows()

            for workflow in completed_workflows:
                try:
                    # Create archive directory
                    workflow_archive = os.path.join(
                        archive_dir,
                        f"workflow_{workflow['id']}_{workflow['completion_date']}"
                    )
                    os.makedirs(workflow_archive, exist_ok=True)

                    # Archive workflow results
                    if workflow['output_files']:
                        output_archive = os.path.join(workflow_archive, "outputs")
                        os.makedirs(output_archive, exist_ok=True)

                        for output_file in workflow['output_files']:
                            shutil.copy2(output_file, output_archive)

                    # Archive statistics
                    stats_file = os.path.join(workflow_archive, "statistics.json")
                    with open(stats_file, 'w') as f:
                        json.dump(workflow['statistics'], f, indent=2)

                    # Archive logs
                    if workflow.get('log_files'):
                        log_archive = os.path.join(workflow_archive, "logs")
                        os.makedirs(log_archive, exist_ok=True)

                        for log_file in workflow['log_files']:
                            shutil.copy2(log_file, log_archive)

                    print(f"Archived workflow {workflow['id']} to {workflow_archive}")

                except Exception as e:
                    print(f"Failed to archive workflow {workflow['id']}: {e}")

        def monitor_disk_usage(self, threshold_percent: float = 85.0):
            """Monitor and alert on high disk usage."""

            import shutil

            # Check various Toil directories
            directories_to_check = [
                "/tmp/toil-work",
                "/var/tmp/toil",
                "/tmp/toil-jobstores"
            ]

            alerts = []

            for directory in directories_to_check:
                if os.path.exists(directory):
                    total, used, free = shutil.disk_usage(directory)
                    usage_percent = (used / total) * 100

                    if usage_percent > threshold_percent:
                        alerts.append({
                            'directory': directory,
                            'usage_percent': usage_percent,
                            'free_gb': free / (1024**3),
                            'total_gb': total / (1024**3)
                        })

            if alerts:
                print("DISK USAGE ALERTS:")
                print("="*50)
                for alert in alerts:
                    print(f"Directory: {alert['directory']}")
                    print(f"Usage: {alert['usage_percent']:.1f}%")
                    print(f"Free: {alert['free_gb']:.1f} GB")
                    print(f"Total: {alert['total_gb']:.1f} GB")
                    print("-" * 30)

                # Suggest cleanup actions
                print("Suggested Actions:")
                print("• Run 'toil clean' on completed job stores")
                print("• Archive old workflow outputs")
                print("• Clear temporary directories")
                print("• Check for large log files")

            return alerts
```

### Cluster Management Tools { .api }

Command-line tools for managing cloud clusters and distributed environments.

```python
from toil.utils.toilLaunchCluster import main as launch_cluster_main
from toil.utils.toilDestroyCluster import main as destroy_cluster_main
from toil.utils.toilSshCluster import main as ssh_cluster_main
from toil.utils.toilRsyncCluster import main as rsync_cluster_main

def cluster_management():
    """Manage cloud clusters for distributed execution."""

    # Launch cluster
    # toil launch-cluster my-cluster --nodeTypes=m5.large,m5.xlarge --maxNodes=50

    sys.argv = [
        'toil-launch-cluster',
        'research-cluster',
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--zone=us-west-2a',
        '--keyPairName=research-keypair',
        '--leaderNodeType=m5.large',
        '--nodeStorage=100',
        '--preemptibleWorkers',
        '--logLevel=INFO'
    ]

    launch_cluster_main()

    # SSH to cluster leader
    # toil ssh-cluster my-cluster

    sys.argv = ['toil-ssh-cluster', 'research-cluster']
    ssh_cluster_main()

    # Sync files to cluster
    # toil rsync-cluster -r local-dir my-cluster:remote-dir

    sys.argv = [
        'toil-rsync-cluster',
        '--recursive',
        '/local/data/',
        'research-cluster:/shared/data/'
    ]

    rsync_cluster_main()

    # Destroy cluster
    # toil destroy-cluster my-cluster

    sys.argv = ['toil-destroy-cluster', 'research-cluster']
    destroy_cluster_main()

def advanced_cluster_operations():
    """Advanced cluster management operations."""

    class ClusterOperations:
        """Advanced cluster operation utilities."""

        def deploy_workflow_to_cluster(self, cluster_name: str,
                                       workflow_script: str,
                                       input_data: str):
            """Deploy and run workflow on cluster."""

            # Copy workflow and data to cluster
            print(f"Deploying to cluster: {cluster_name}")

            # Sync workflow files
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                os.path.dirname(workflow_script),
                f'{cluster_name}:/toil/workflows/'
            ]
            rsync_cluster_main()

            # Sync input data
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                input_data,
                f'{cluster_name}:/toil/data/'
            ]
            rsync_cluster_main()

            # Execute workflow on cluster
            remote_command = f"""
            cd /toil &&
            toil --batchSystem=mesos \\
                --jobStore=aws:us-west-2:results:workflow-run \\
                --provisioner=aws \\
                --nodeTypes=m5.large,m5.xlarge \\
                --maxNodes=50 \\
                --stats \\
                workflows/{os.path.basename(workflow_script)} \\
                data/
            """

            # SSH and execute
            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                remote_command
            ]

            ssh_cluster_main()

        def monitor_cluster_resources(self, cluster_name: str):
            """Monitor cluster resource usage."""

            monitoring_script = """
            # Get node information
            kubectl get nodes -o wide

            # Check resource usage
            kubectl top nodes

            # Check running pods
            kubectl get pods --all-namespaces

            # System resource usage
            free -h
            df -h

            # Toil-specific monitoring
            ps aux | grep toil

            # Check logs
            tail -n 50 /var/log/toil/*.log
            """

            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                monitoring_script
            ]

            ssh_cluster_main()

        def backup_cluster_data(self, cluster_name: str, backup_location: str):
            """Backup important cluster data."""

            # Sync results back to local
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/results/',
                os.path.join(backup_location, 'results')
            ]
            rsync_cluster_main()

            # Sync logs
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/var/log/toil/',
                os.path.join(backup_location, 'logs')
            ]
            rsync_cluster_main()

            # Sync job store (if file-based)
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/jobstore/',
                os.path.join(backup_location, 'jobstore')
            ]
            rsync_cluster_main()
```

### Configuration Management { .api }

Tools for managing Toil configuration files and settings.

```python
from toil.utils.toilConfig import main as config_main
from toil.common import get_default_config_path, ensure_config, generate_config, update_config

def configuration_management():
    """Manage Toil configuration files and settings."""

    # Generate default configuration
    default_config_path = get_default_config_path()
    ensure_config(default_config_path)

    # Generate custom configuration
    custom_config = "/etc/toil/production.conf"
    generate_config(custom_config)

    # Update configuration values
    update_config(custom_config, "batchSystem", "slurm")
    update_config(custom_config, "maxCores", "64")
    update_config(custom_config, "defaultMemory", "8G")

    # Use config management CLI
    sys.argv = [
        'toil-config',
        '--set', 'jobStore=aws:us-west-2:my-bucket',
        '--set', 'batchSystem=kubernetes',
        '--set', 'maxNodes=100',
        '--config-file', custom_config
    ]

    config_main()

def configuration_templates():
    """Provide configuration templates for different use cases."""

    templates = {
        'local_development': {
            'batchSystem': 'local',
            'maxCores': 4,
            'maxMemory': '8G',
            'jobStore': 'file:/tmp/toil-dev',
            'logLevel': 'DEBUG',
            'stats': True
        },

        'hpc_cluster': {
            'batchSystem': 'slurm',
            'maxCores': 1000,
            'maxMemory': '2T',
            'jobStore': 'file:/shared/toil-jobs',
            'workDir': '/tmp/toil-work',
            'logLevel': 'INFO',
            'retryCount': 3,
            'rescueJobsFrequency': 3600
        },

        'cloud_production': {
            'batchSystem': 'kubernetes',
            'provisioner': 'aws',
            'jobStore': 'aws:us-west-2:production-bucket:workflows',
            'nodeTypes': ['m5.large', 'm5.xlarge', 'c5.2xlarge'],
            'maxNodes': 500,
            'defaultPreemptible': True,
            'preemptibleCompensation': 1.5,
            'sseKey': 'alias/toil-production-key',
            'clean': 'onSuccess',
            'stats': True,
            'logLevel': 'INFO'
        }
    }

    def create_config_from_template(template_name: str, output_file: str):
        """Create configuration file from template."""

        if template_name not in templates:
            raise ValueError(f"Unknown template: {template_name}")

        template = templates[template_name]

        # Generate base config
        generate_config(output_file)

        # Apply template values
        for key, value in template.items():
            update_config(output_file, key, value)

        print(f"Configuration created: {output_file}")
        print(f"Template used: {template_name}")

    return create_config_from_template
```

This comprehensive utilities and CLI tools suite provides complete workflow lifecycle management from development through production deployment with extensive monitoring, debugging, and maintenance capabilities.