0
# Utilities and CLI Tools
1
2
## Overview
3
4
Toil provides a comprehensive suite of command-line utilities and helper functions for workflow management, debugging, monitoring, and cluster operations. These tools cover the complete lifecycle from workflow development and testing to production deployment and maintenance. The utilities integrate seamlessly with Toil's core functionality and provide both interactive and scriptable interfaces for automation and integration with larger systems.
5
6
## Capabilities
7
8
### Main CLI Interface
9
{ .api }
10
11
The primary `toil` command provides the main interface for workflow execution and management.
12
13
```python
14
from toil.utils.toilMain import main as toil_main
15
import sys
16
17
# Command-line workflow execution
18
def run_toil_workflow():
    """Execute a workflow through the main ``toil`` CLI entry point.

    Demonstrates shell invocation (in comments) and programmatic
    invocation by populating ``sys.argv`` before calling the CLI main.

    Returns:
        The exit code returned by the ``toil`` CLI main function.
    """
    # Shell equivalents:
    #   toil workflow.py file:jobstore
    #   toil --batchSystem=local --maxCores=8 --maxMemory=16G workflow.py file:jobstore
    #   toil --batchSystem=slurm --maxNodes=50 --nodeTypes=compute workflow.py file:jobstore

    # Save and restore sys.argv so global interpreter state is not left
    # mutated for any other code running in this process.
    saved_argv = sys.argv
    sys.argv = [
        'toil',
        '--batchSystem=local',
        '--jobStore=file:test-jobstore',
        '--maxCores=4',
        '--maxMemory=8G',
        '--logLevel=INFO',
        '--stats',
        'workflow_script.py',
        'input_file.txt'
    ]
    try:
        # Execute the workflow through the CLI main.
        exit_code = toil_main()
    finally:
        sys.argv = saved_argv
    return exit_code
46
47
def advanced_toil_options():
    """Demonstrate advanced ``toil`` CLI options.

    Builds (but does not execute) a complete command line showing the
    major option groups for a large cloud run.

    Returns:
        list[str]: the full command as an argv-style list, starting with
        the ``toil`` executable name and ending with the workflow script
        and its own arguments.
    """
    advanced_command = [
        'toil',

        # Job store and batch system
        '--jobStore=aws:us-west-2:my-bucket:workflow-run',
        '--batchSystem=kubernetes',

        # Resource limits
        '--maxCores=1000',
        '--maxMemory=2T',
        '--maxDisk=10T',
        '--defaultMemory=4G',
        '--defaultCores=2',
        '--defaultDisk=10G',

        # Scaling and provisioning
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--minNodes=5',
        '--targetTime=1800',  # Target job runtime in seconds

        # Fault tolerance
        '--retryCount=3',
        '--rescueJobsFrequency=3600',
        '--maxJobDuration=86400',  # 24 hours max per job

        # Preemption and spot instances
        '--defaultPreemptible',
        '--preemptibleCompensation=1.5',
        '--preemptibleWorkerTimeout=1800',

        # Cleanup and management
        '--clean=onSuccess',
        '--cleanWorkDir=onSuccess',
        '--clusterStats=/tmp/cluster-stats',

        # Logging and monitoring
        '--logLevel=INFO',
        '--logFile=/tmp/toil-workflow.log',
        '--stats',
        '--metrics',

        # Working directories
        '--workDir=/tmp/toil-work',
        '--coordinationDir=/shared/coordination',

        # Security and encryption
        '--sseKey=alias/toil-kms-key',
        '--encryptedFileStore',

        # Container support
        '--disableAutoDeployment=False',
        '--containerEngine=docker',

        # Workflow script and its own arguments
        'complex_workflow.py',
        '--input-dir=/data/inputs',
        '--output-dir=/data/outputs',
        '--reference-genome=/data/reference.fa',
        '--threads=16'
    ]

    return advanced_command
115
```
116
117
### Workflow Status and Monitoring
118
{ .api }
119
120
Tools for monitoring workflow progress and status in real-time.
121
122
```python
123
from toil.utils.toilStatus import main as status_main
124
from toil.utils.toilStats import main as stats_main
125
126
def monitor_workflow_status():
    """Check workflow status via the ``toil status`` CLI entry point.

    Returns:
        The result of the detailed (verbose) status invocation.
    """
    # Shell equivalent: toil status file:jobstore

    # Programmatic status check against a local file job store.
    sys.argv = ['toil-status', 'file:my-jobstore']
    status_info = status_main()  # noqa: F841 - kept to show the basic call

    # Detailed status with job breakdown; --failIfNotComplete makes the
    # command exit non-zero when the workflow has not finished.
    sys.argv = [
        'toil-status',
        '--verbose',
        '--failIfNotComplete',
        'aws:us-west-2:my-bucket:workflow'
    ]
    detailed_status = status_main()
    return detailed_status
146
147
def analyze_workflow_statistics():
    """Analyze workflow execution statistics via ``toil stats``.

    Returns:
        The result of the comprehensive statistics invocation.
    """
    # Shell equivalent: toil stats file:jobstore

    # Comprehensive statistics analysis.
    sys.argv = [
        'toil-stats',
        '--raw',     # Raw statistics data
        '--pretty',  # Human-readable format
        '--categories=time,clock,wait,memory,disk',
        '--sortCategory=time',
        '--sortField=total',
        '--sortReverse',
        'file:completed-jobstore'
    ]
    stats_result = stats_main()

    # Export statistics to a file.
    sys.argv = [
        'toil-stats',
        '--outputFile=/tmp/workflow-stats.json',
        '--format=json',
        'file:jobstore'
    ]
    stats_main()

    return stats_result
178
179
def real_time_monitoring():
    """Real-time workflow monitoring implementation.

    Defines a ``WorkflowMonitor`` that polls workflow status and
    performance metrics on a fixed interval until the workflow
    completes, fails, or the user interrupts, then starts it against a
    local file job store. Note: ``start_monitoring`` blocks the calling
    thread until the loop exits.
    """
    import time
    from toil.statsAndLogging import StatsAndLogging

    class WorkflowMonitor:
        """Polls and displays workflow status and performance metrics."""

        def __init__(self, jobstore_locator: str):
            # Locator string for the job store being monitored.
            self.jobstore = jobstore_locator
            self.stats_collector = StatsAndLogging()
            # Loop flag; cleared by stop_monitoring().
            self.monitoring = True

        def start_monitoring(self, update_interval: int = 30):
            """Run the monitoring loop, polling every ``update_interval`` seconds."""
            while self.monitoring:
                try:
                    status = self.get_workflow_status()
                    metrics = self.get_performance_metrics()
                    self.display_status(status, metrics)

                    if status.get('completed', False):
                        print("Workflow completed successfully!")
                        break

                    if status.get('failed', False):
                        print("Workflow failed!")
                        self.handle_workflow_failure(status)
                        break

                    time.sleep(update_interval)

                except KeyboardInterrupt:
                    print("Monitoring stopped by user")
                    break
                except Exception as e:
                    # Keep monitoring through transient errors; back off briefly.
                    print(f"Monitoring error: {e}")
                    time.sleep(5)

        def get_workflow_status(self) -> dict:
            """Get current workflow status.

            Placeholder: a real implementation would query the job store
            at ``self.jobstore`` (e.g. via ``toil status``).
            """
            status = {
                'total_jobs': 0,
                'completed_jobs': 0,
                'running_jobs': 0,
                'failed_jobs': 0,
                'queued_jobs': 0,
                'completion_percentage': 0.0,
                'estimated_time_remaining': None
            }
            # TODO: populate status from the job store.
            return status

        def get_performance_metrics(self) -> dict:
            """Get performance metrics.

            Placeholder: a real implementation would collect these from
            the batch system / cloud provider.
            """
            metrics = {
                'cpu_utilization': 0.0,
                'memory_usage': 0.0,
                'network_io': 0.0,
                'disk_io': 0.0,
                'cost_per_hour': 0.0,
                'jobs_per_minute': 0.0
            }
            # TODO: collect metrics from the relevant sources.
            return metrics

        def display_status(self, status: dict, metrics: dict):
            """Display formatted status information."""
            print("\n" + "="*60)
            print(f"Workflow Status - {time.strftime('%Y-%m-%d %H:%M:%S')}")
            print("="*60)

            print(f"Jobs: {status['completed_jobs']}/{status['total_jobs']} completed")
            print(f"Running: {status['running_jobs']}, Queued: {status['queued_jobs']}")
            print(f"Failed: {status['failed_jobs']}")
            print(f"Progress: {status['completion_percentage']:.1f}%")

            if status['estimated_time_remaining']:
                print(f"Est. remaining: {status['estimated_time_remaining']}")

            print(f"\nPerformance:")
            print(f"CPU Utilization: {metrics['cpu_utilization']:.1f}%")
            print(f"Memory Usage: {metrics['memory_usage']:.1f}%")
            print(f"Cost/Hour: ${metrics['cost_per_hour']:.2f}")
            print(f"Jobs/Minute: {metrics['jobs_per_minute']:.1f}")

        def handle_workflow_failure(self, status: dict):
            """React to a failed workflow (placeholder: report failure counts)."""
            print(f"Failed jobs: {status.get('failed_jobs', 'unknown')}")

        def stop_monitoring(self):
            """Stop the monitoring loop after the current iteration."""
            self.monitoring = False

    # Usage: blocks until the workflow finishes or monitoring is stopped.
    monitor = WorkflowMonitor("file:my-jobstore")
    monitor.start_monitoring(update_interval=10)
291
```
292
293
### Debugging and Troubleshooting Tools
294
{ .api }
295
296
Comprehensive debugging utilities for workflow development and troubleshooting.
297
298
```python
299
from toil.utils.toilDebugJob import main as debug_job_main
300
from toil.utils.toilDebugFile import main as debug_file_main
301
302
def debug_workflow_issues():
    """Debug workflow execution issues with the ``toil debug-*`` tools."""
    # Shell equivalent: toil debug-job file:jobstore <job-id>
    sys.argv = [
        'toil-debug-job',
        '--printJobInfo',      # Print job information
        '--printJobChildren',  # Print child jobs
        '--printJobFiles',     # Print associated files
        '--printJobLogging',   # Print job logs
        'file:my-jobstore',
        'job-id-12345'
    ]
    debug_job_main()

    # Shell equivalent: toil debug-file file:jobstore <file-id>
    sys.argv = [
        'toil-debug-file',
        '--printFileInfo',             # File metadata
        '--printFileContent',          # File contents (if small)
        '--saveFile=/tmp/debug-file',  # Save file locally
        'file:my-jobstore',
        'file-id-67890'
    ]
    debug_file_main()
333
334
def advanced_debugging_tools():
    """Advanced debugging and analysis tools.

    Returns:
        The ``WorkflowDebugger`` class, so callers can instantiate it
        against their own job store locator. (Previously the class was
        defined but unreachable from outside this function.)
    """

    class WorkflowDebugger:
        """Comprehensive workflow debugging toolkit."""

        def __init__(self, jobstore_locator: str):
            # Locator string for the job store to inspect.
            self.jobstore = jobstore_locator

        def get_failed_jobs(self) -> list:
            """Return failed-job records.

            Placeholder: a real implementation would query the job store
            at ``self.jobstore``.
            """
            return []

        def get_job_info(self, job_id: str) -> dict:
            """Return metadata for a single job (placeholder)."""
            return {}

        def get_all_jobs(self) -> list:
            """Return records for every job in the workflow (placeholder)."""
            return []

        def analyze_failed_jobs(self):
            """Analyze failed jobs and tally common failure patterns.

            Returns:
                dict: pattern name -> count of failed jobs matching it.
            """
            failed_jobs = self.get_failed_jobs()

            failure_patterns = {
                'out_of_memory': 0,
                'timeout': 0,
                'missing_files': 0,
                'command_not_found': 0,
                'permission_denied': 0,
                'network_error': 0,
                'unknown': 0
            }

            for job in failed_jobs:
                exit_code = job.get('exit_code', 0)
                stderr = job.get('stderr', '')

                # Classify by exit code first, then by stderr content.
                if exit_code == 137:    # SIGKILL - likely OOM
                    failure_patterns['out_of_memory'] += 1
                elif exit_code == 124:  # timeout
                    failure_patterns['timeout'] += 1
                elif 'No such file' in stderr:
                    failure_patterns['missing_files'] += 1
                elif 'command not found' in stderr:
                    failure_patterns['command_not_found'] += 1
                elif 'Permission denied' in stderr:
                    failure_patterns['permission_denied'] += 1
                elif 'network' in stderr.lower():
                    failure_patterns['network_error'] += 1
                else:
                    failure_patterns['unknown'] += 1

            # Generate debugging report.
            print("Failed Job Analysis:")
            print("="*50)
            for pattern, count in failure_patterns.items():
                if count > 0:
                    print(f"{pattern.replace('_', ' ').title()}: {count} jobs")
                    self.suggest_fixes(pattern, count)

            return failure_patterns

        def suggest_fixes(self, pattern: str, count: int):
            """Print suggested fixes for a common failure pattern."""
            suggestions = {
                'out_of_memory': [
                    "Increase memory requirements for affected jobs",
                    "Use streaming or chunked processing for large data",
                    "Check for memory leaks in job code"
                ],
                'timeout': [
                    "Increase job timeout limits",
                    "Optimize algorithm efficiency",
                    "Split large jobs into smaller chunks"
                ],
                'missing_files': [
                    "Verify input file paths and availability",
                    "Check file transfer and staging",
                    "Ensure proper job dependencies"
                ],
                'command_not_found': [
                    "Install missing software in Docker images",
                    "Check PATH environment variable",
                    "Verify tool versions and compatibility"
                ],
                'permission_denied': [
                    "Fix file and directory permissions",
                    "Check Docker volume mounts",
                    "Verify user/group settings"
                ]
            }

            if pattern in suggestions:
                print(f"  Suggested fixes:")
                for suggestion in suggestions[pattern]:
                    print(f"    • {suggestion}")

        def trace_job_execution(self, job_id: str):
            """Print a job's execution history, dependencies, and files."""
            job_info = self.get_job_info(job_id)

            print(f"Job Execution Trace: {job_id}")
            print("="*50)
            print(f"Job Name: {job_info.get('name', 'Unknown')}")
            print(f"Status: {job_info.get('status', 'Unknown')}")
            print(f"Start Time: {job_info.get('start_time', 'Unknown')}")
            print(f"End Time: {job_info.get('end_time', 'Unknown')}")
            print(f"Duration: {job_info.get('duration', 'Unknown')}")
            print(f"Resources Used: CPU={job_info.get('cpu_used', 'N/A')}, "
                  f"Memory={job_info.get('memory_used', 'N/A')}")

            # Show dependencies.
            predecessors = job_info.get('predecessors', [])
            if predecessors:
                print(f"\nPredecessor Jobs:")
                for pred_id in predecessors:
                    pred_info = self.get_job_info(pred_id)
                    print(f"  {pred_id}: {pred_info.get('status', 'Unknown')}")

            # Show children.
            children = job_info.get('children', [])
            if children:
                print(f"\nChild Jobs:")
                for child_id in children:
                    child_info = self.get_job_info(child_id)
                    print(f"  {child_id}: {child_info.get('status', 'Unknown')}")

            # Show files.
            input_files = job_info.get('input_files', [])
            output_files = job_info.get('output_files', [])

            if input_files:
                print(f"\nInput Files:")
                for file_id in input_files:
                    print(f"  {file_id}")

            if output_files:
                print(f"\nOutput Files:")
                for file_id in output_files:
                    print(f"  {file_id}")

        def performance_analysis(self):
            """Analyze workflow performance and report bottlenecks."""
            jobs = self.get_all_jobs()

            # Resource utilization of completed jobs only.
            cpu_utilization = []
            memory_utilization = []
            job_durations = []

            for job in jobs:
                if job.get('completed'):
                    cpu_utilization.append(job.get('cpu_utilization', 0))
                    memory_utilization.append(job.get('memory_utilization', 0))
                    job_durations.append(job.get('duration', 0))

            if cpu_utilization:
                avg_cpu = sum(cpu_utilization) / len(cpu_utilization)
                avg_memory = sum(memory_utilization) / len(memory_utilization)
                avg_duration = sum(job_durations) / len(job_durations)

                print("Performance Analysis:")
                print("="*50)
                print(f"Average CPU Utilization: {avg_cpu:.1f}%")
                print(f"Average Memory Utilization: {avg_memory:.1f}%")
                print(f"Average Job Duration: {avg_duration:.2f}s")

                # Bottlenecks: jobs far above the average duration.
                long_jobs = [j for j in jobs if j.get('duration', 0) > avg_duration * 3]
                if long_jobs:
                    print(f"\nLong-running jobs ({len(long_jobs)}):")
                    for job in sorted(long_jobs, key=lambda x: x.get('duration', 0), reverse=True)[:5]:
                        print(f"  {job['id']}: {job.get('duration', 0):.2f}s")

                # Jobs that used well under their allotted CPU.
                inefficient_jobs = [j for j in jobs if j.get('cpu_utilization', 100) < 50]
                if inefficient_jobs:
                    print(f"\nResource-inefficient jobs ({len(inefficient_jobs)}):")
                    for job in inefficient_jobs[:5]:
                        print(f"  {job['id']}: {job.get('cpu_utilization', 0):.1f}% CPU")

    return WorkflowDebugger
```
515
516
### Cleanup and Maintenance Tools
517
{ .api }
518
519
Tools for cleaning up job stores and maintaining workflow environments.
520
521
```python
522
from toil.utils.toilClean import main as clean_main
523
from toil.utils.toilKill import main as kill_main
524
525
def cleanup_workflows():
    """Clean up workflow artifacts and job stores, and kill a running run."""
    # Shell equivalent: toil clean file:jobstore
    sys.argv = ['toil-clean', 'file:completed-jobstore']
    clean_main()

    # Comprehensive cleanup with options.
    sys.argv = [
        'toil-clean',
        '--cleanWorkDir',  # Clean working directories
        '--cleanJobStore', # Clean job store completely
        'aws:us-west-2:my-bucket:old-workflow'
    ]
    clean_main()

    # Shell equivalent: toil kill file:jobstore
    sys.argv = ['toil-kill', 'file:running-jobstore']
    kill_main()
549
550
def maintenance_utilities():
    """Workflow maintenance and housekeeping utilities.

    Returns:
        The ``WorkflowMaintenance`` class, so callers can instantiate it.
        (Previously the class was defined but unreachable from outside
        this function, and several methods used ``os`` without importing it.)
    """
    import json
    import os
    import shutil
    import time

    class WorkflowMaintenance:
        """Workflow maintenance toolkit."""

        def find_completed_workflows(self) -> list:
            """Return records for completed workflows.

            Placeholder: a real implementation would scan job stores or a
            workflow registry.
            """
            return []

        def cleanup_old_jobstores(self, days_old: int = 30):
            """Clean up file-based job stores older than ``days_old`` days.

            Returns:
                int: the number of old job stores found (and attempted).
            """
            current_time = time.time()
            cutoff_time = current_time - (days_old * 86400)

            old_jobstores = []

            # For file-based job stores: anything under jobstore_dir whose
            # modification time predates the cutoff.
            jobstore_dir = "/tmp/toil-jobstores"
            if os.path.exists(jobstore_dir):
                for item in os.listdir(jobstore_dir):
                    item_path = os.path.join(jobstore_dir, item)
                    if os.path.isdir(item_path):
                        if os.path.getmtime(item_path) < cutoff_time:
                            old_jobstores.append(f"file:{item_path}")

            # Clean up old job stores; failures are reported, not fatal.
            for jobstore in old_jobstores:
                try:
                    print(f"Cleaning old jobstore: {jobstore}")
                    sys.argv = ['toil-clean', jobstore]
                    clean_main()
                except Exception as e:
                    print(f"Failed to clean {jobstore}: {e}")

            return len(old_jobstores)

        def archive_completed_workflows(self, archive_dir: str):
            """Archive completed workflow outputs, statistics, and logs."""
            completed_workflows = self.find_completed_workflows()

            for workflow in completed_workflows:
                try:
                    # One archive directory per workflow run.
                    workflow_archive = os.path.join(
                        archive_dir,
                        f"workflow_{workflow['id']}_{workflow['completion_date']}"
                    )
                    os.makedirs(workflow_archive, exist_ok=True)

                    # Archive workflow results.
                    if workflow['output_files']:
                        output_archive = os.path.join(workflow_archive, "outputs")
                        os.makedirs(output_archive, exist_ok=True)
                        for output_file in workflow['output_files']:
                            shutil.copy2(output_file, output_archive)

                    # Archive statistics.
                    stats_file = os.path.join(workflow_archive, "statistics.json")
                    with open(stats_file, 'w') as f:
                        json.dump(workflow['statistics'], f, indent=2)

                    # Archive logs.
                    if workflow.get('log_files'):
                        log_archive = os.path.join(workflow_archive, "logs")
                        os.makedirs(log_archive, exist_ok=True)
                        for log_file in workflow['log_files']:
                            shutil.copy2(log_file, log_archive)

                    print(f"Archived workflow {workflow['id']} to {workflow_archive}")

                except Exception as e:
                    print(f"Failed to archive workflow {workflow['id']}: {e}")

        def monitor_disk_usage(self, threshold_percent: float = 85.0):
            """Report Toil directories whose disk usage exceeds a threshold.

            Returns:
                list[dict]: one alert per over-threshold directory with
                usage percentage and free/total sizes in GB.
            """
            directories_to_check = [
                "/tmp/toil-work",
                "/var/tmp/toil",
                "/tmp/toil-jobstores"
            ]

            alerts = []

            for directory in directories_to_check:
                if os.path.exists(directory):
                    total, used, free = shutil.disk_usage(directory)
                    usage_percent = (used / total) * 100
                    if usage_percent > threshold_percent:
                        alerts.append({
                            'directory': directory,
                            'usage_percent': usage_percent,
                            'free_gb': free / (1024**3),
                            'total_gb': total / (1024**3)
                        })

            if alerts:
                print("DISK USAGE ALERTS:")
                print("="*50)
                for alert in alerts:
                    print(f"Directory: {alert['directory']}")
                    print(f"Usage: {alert['usage_percent']:.1f}%")
                    print(f"Free: {alert['free_gb']:.1f} GB")
                    print(f"Total: {alert['total_gb']:.1f} GB")
                    print("-" * 30)

                # Suggest cleanup actions.
                print("Suggested Actions:")
                print("• Run 'toil clean' on completed job stores")
                print("• Archive old workflow outputs")
                print("• Clear temporary directories")
                print("• Check for large log files")

            return alerts

    return WorkflowMaintenance
677
```
678
679
### Cluster Management Tools
680
{ .api }
681
682
Command-line tools for managing cloud clusters and distributed environments.
683
684
```python
685
from toil.utils.toilLaunchCluster import main as launch_cluster_main
686
from toil.utils.toilDestroyCluster import main as destroy_cluster_main
687
from toil.utils.toilSshCluster import main as ssh_cluster_main
688
from toil.utils.toilRsyncCluster import main as rsync_cluster_main
689
690
def cluster_management():
    """Manage cloud clusters for distributed execution.

    Walks the cluster lifecycle: launch, SSH, rsync, destroy.
    """
    # Shell equivalent:
    #   toil launch-cluster my-cluster --nodeTypes=m5.large,m5.xlarge --maxNodes=50
    sys.argv = [
        'toil-launch-cluster',
        'research-cluster',
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--zone=us-west-2a',
        '--keyPairName=research-keypair',
        '--leaderNodeType=m5.large',
        '--nodeStorage=100',
        '--preemptibleWorkers',
        '--logLevel=INFO'
    ]
    launch_cluster_main()

    # SSH to cluster leader (shell: toil ssh-cluster my-cluster).
    sys.argv = ['toil-ssh-cluster', 'research-cluster']
    ssh_cluster_main()

    # Sync files to cluster (shell: toil rsync-cluster -r local-dir my-cluster:remote-dir).
    sys.argv = [
        'toil-rsync-cluster',
        '--recursive',
        '/local/data/',
        'research-cluster:/shared/data/'
    ]
    rsync_cluster_main()

    # Destroy cluster (shell: toil destroy-cluster my-cluster).
    sys.argv = ['toil-destroy-cluster', 'research-cluster']
    destroy_cluster_main()
735
736
def advanced_cluster_operations():
    """Advanced cluster management operations.

    Returns:
        The ``ClusterOperations`` class, so callers can instantiate it.
        (Previously the class was defined but unreachable from outside
        this function, and the methods used ``os`` without importing it.)
    """
    import os

    class ClusterOperations:
        """Advanced cluster operation utilities."""

        def deploy_workflow_to_cluster(self, cluster_name: str,
                                       workflow_script: str,
                                       input_data: str):
            """Copy a workflow and its input data to a cluster and run it."""
            print(f"Deploying to cluster: {cluster_name}")

            # Sync workflow files.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                os.path.dirname(workflow_script),
                f'{cluster_name}:/toil/workflows/'
            ]
            rsync_cluster_main()

            # Sync input data.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                input_data,
                f'{cluster_name}:/toil/data/'
            ]
            rsync_cluster_main()

            # Execute workflow on the cluster leader over SSH.
            remote_command = f"""
            cd /toil &&
            toil --batchSystem=mesos \\
                 --jobStore=aws:us-west-2:results:workflow-run \\
                 --provisioner=aws \\
                 --nodeTypes=m5.large,m5.xlarge \\
                 --maxNodes=50 \\
                 --stats \\
                 workflows/{os.path.basename(workflow_script)} \\
                 data/
            """

            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                remote_command
            ]
            ssh_cluster_main()

        def monitor_cluster_resources(self, cluster_name: str):
            """Run a resource-monitoring script on the cluster leader."""
            monitoring_script = """
            # Get node information
            kubectl get nodes -o wide

            # Check resource usage
            kubectl top nodes

            # Check running pods
            kubectl get pods --all-namespaces

            # System resource usage
            free -h
            df -h

            # Toil-specific monitoring
            ps aux | grep toil

            # Check logs
            tail -n 50 /var/log/toil/*.log
            """

            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                monitoring_script
            ]
            ssh_cluster_main()

        def backup_cluster_data(self, cluster_name: str, backup_location: str):
            """Rsync results, logs, and the job store back to local storage."""
            # Sync results back to local.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/results/',
                os.path.join(backup_location, 'results')
            ]
            rsync_cluster_main()

            # Sync logs.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/var/log/toil/',
                os.path.join(backup_location, 'logs')
            ]
            rsync_cluster_main()

            # Sync job store (if file-based).
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/jobstore/',
                os.path.join(backup_location, 'jobstore')
            ]
            rsync_cluster_main()

    return ClusterOperations
```
852
853
### Configuration Management
854
{ .api }
855
856
Tools for managing Toil configuration files and settings.
857
858
```python
859
from toil.utils.toilConfig import main as config_main
860
from toil.common import get_default_config_path, ensure_config, generate_config, update_config
861
862
def configuration_management():
    """Manage Toil configuration files and settings."""
    # Make sure the default configuration exists.
    default_config_path = get_default_config_path()
    ensure_config(default_config_path)

    # Generate a custom configuration file.
    custom_config = "/etc/toil/production.conf"
    generate_config(custom_config)

    # Update individual configuration values.
    update_config(custom_config, "batchSystem", "slurm")
    update_config(custom_config, "maxCores", "64")
    update_config(custom_config, "defaultMemory", "8G")

    # Use the config management CLI.
    sys.argv = [
        'toil-config',
        '--set', 'jobStore=aws:us-west-2:my-bucket',
        '--set', 'batchSystem=kubernetes',
        '--set', 'maxNodes=100',
        '--config-file', custom_config
    ]
    config_main()
888
889
def configuration_templates():
    """Provide configuration templates for different use cases.

    Returns:
        A ``create_config_from_template(template_name, output_file)``
        function that writes a config file from one of the named
        templates; it raises ``ValueError`` for an unknown template name.
    """
    templates = {
        'local_development': {
            'batchSystem': 'local',
            'maxCores': 4,
            'maxMemory': '8G',
            'jobStore': 'file:/tmp/toil-dev',
            'logLevel': 'DEBUG',
            'stats': True
        },

        'hpc_cluster': {
            'batchSystem': 'slurm',
            'maxCores': 1000,
            'maxMemory': '2T',
            'jobStore': 'file:/shared/toil-jobs',
            'workDir': '/tmp/toil-work',
            'logLevel': 'INFO',
            'retryCount': 3,
            'rescueJobsFrequency': 3600
        },

        'cloud_production': {
            'batchSystem': 'kubernetes',
            'provisioner': 'aws',
            'jobStore': 'aws:us-west-2:production-bucket:workflows',
            'nodeTypes': ['m5.large', 'm5.xlarge', 'c5.2xlarge'],
            'maxNodes': 500,
            'defaultPreemptible': True,
            'preemptibleCompensation': 1.5,
            'sseKey': 'alias/toil-production-key',
            'clean': 'onSuccess',
            'stats': True,
            'logLevel': 'INFO'
        }
    }

    def create_config_from_template(template_name: str, output_file: str):
        """Create a configuration file from a named template.

        Raises:
            ValueError: if ``template_name`` is not a known template.
        """
        if template_name not in templates:
            raise ValueError(f"Unknown template: {template_name}")

        template = templates[template_name]

        # Generate the base config, then overlay the template's values.
        generate_config(output_file)
        for key, value in template.items():
            update_config(output_file, key, value)

        print(f"Configuration created: {output_file}")
        print(f"Template used: {template_name}")

    return create_config_from_template
947
```
948
949
This comprehensive utilities and CLI tools suite provides complete workflow lifecycle management from development through production deployment with extensive monitoring, debugging, and maintenance capabilities.