0
# Utilities and CLI Tools
1
2
## Overview
3
4
Toil provides a comprehensive suite of command-line utilities and helper functions for workflow management, debugging, monitoring, and cluster operations. These tools cover the complete lifecycle from workflow development and testing to production deployment and maintenance. The utilities integrate seamlessly with Toil's core functionality and provide both interactive and scriptable interfaces for automation and integration with larger systems.
5
6
## Capabilities
7
8
### Main CLI Interface
9
{ .api }
10
11
The primary `toil` command provides the main interface for workflow execution and management.
12
13
```python
14
from toil.utils.toilMain import main as toil_main
15
import sys
16
17
# Command-line workflow execution
18
def run_toil_workflow():
    """Execute a workflow through the main ``toil`` CLI entry point.

    Demonstrates shell invocation (in comments) and programmatic
    invocation by populating ``sys.argv`` before calling the CLI main.

    Returns:
        The exit code returned by the ``toil`` CLI main function.
    """
    # Shell equivalents:
    #   toil workflow.py file:jobstore
    #   toil --batchSystem=local --maxCores=8 --maxMemory=16G workflow.py file:jobstore
    #   toil --batchSystem=slurm --maxNodes=50 --nodeTypes=compute workflow.py file:jobstore

    # Save and restore sys.argv so global interpreter state is not left
    # mutated for any other code running in this process.
    saved_argv = sys.argv
    sys.argv = [
        'toil',
        '--batchSystem=local',
        '--jobStore=file:test-jobstore',
        '--maxCores=4',
        '--maxMemory=8G',
        '--logLevel=INFO',
        '--stats',
        'workflow_script.py',
        'input_file.txt'
    ]
    try:
        # Execute the workflow through the CLI main.
        exit_code = toil_main()
    finally:
        sys.argv = saved_argv
    return exit_code
46
47
def advanced_toil_options():
    """Demonstrate advanced ``toil`` CLI options.

    Builds (but does not execute) a complete command line showing the
    major option groups for a large cloud run.

    Returns:
        list[str]: the full command as an argv-style list, starting with
        the ``toil`` executable name and ending with the workflow script
        and its own arguments.
    """
    advanced_command = [
        'toil',

        # Job store and batch system
        '--jobStore=aws:us-west-2:my-bucket:workflow-run',
        '--batchSystem=kubernetes',

        # Resource limits
        '--maxCores=1000',
        '--maxMemory=2T',
        '--maxDisk=10T',
        '--defaultMemory=4G',
        '--defaultCores=2',
        '--defaultDisk=10G',

        # Scaling and provisioning
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--minNodes=5',
        '--targetTime=1800',  # Target job runtime in seconds

        # Fault tolerance
        '--retryCount=3',
        '--rescueJobsFrequency=3600',
        '--maxJobDuration=86400',  # 24 hours max per job

        # Preemption and spot instances
        '--defaultPreemptible',
        '--preemptibleCompensation=1.5',
        '--preemptibleWorkerTimeout=1800',

        # Cleanup and management
        '--clean=onSuccess',
        '--cleanWorkDir=onSuccess',
        '--clusterStats=/tmp/cluster-stats',

        # Logging and monitoring
        '--logLevel=INFO',
        '--logFile=/tmp/toil-workflow.log',
        '--stats',
        '--metrics',

        # Working directories
        '--workDir=/tmp/toil-work',
        '--coordinationDir=/shared/coordination',

        # Security and encryption
        '--sseKey=alias/toil-kms-key',
        '--encryptedFileStore',

        # Container support
        '--disableAutoDeployment=False',
        '--containerEngine=docker',

        # Workflow script and its own arguments
        'complex_workflow.py',
        '--input-dir=/data/inputs',
        '--output-dir=/data/outputs',
        '--reference-genome=/data/reference.fa',
        '--threads=16'
    ]

    return advanced_command
115
```
116
117
### Workflow Status and Monitoring
118
{ .api }
119
120
Tools for monitoring workflow progress and status in real-time.
121
122
```python
123
from toil.utils.toilStatus import main as status_main
124
from toil.utils.toilStats import main as stats_main
125
126
def monitor_workflow_status():
    """Check workflow status via the ``toil status`` CLI entry point.

    Returns:
        The result of the detailed (verbose) status invocation.
    """
    # Shell equivalent: toil status file:jobstore

    # Programmatic status check against a local file job store.
    sys.argv = ['toil-status', 'file:my-jobstore']
    status_info = status_main()  # noqa: F841 - kept to show the basic call

    # Detailed status with job breakdown; --failIfNotComplete makes the
    # command exit non-zero when the workflow has not finished.
    sys.argv = [
        'toil-status',
        '--verbose',
        '--failIfNotComplete',
        'aws:us-west-2:my-bucket:workflow'
    ]
    detailed_status = status_main()
    return detailed_status
146
147
def analyze_workflow_statistics():
    """Analyze workflow execution statistics via ``toil stats``.

    Returns:
        The result of the comprehensive statistics invocation.
    """
    # Shell equivalent: toil stats file:jobstore

    # Comprehensive statistics analysis.
    sys.argv = [
        'toil-stats',
        '--raw',     # Raw statistics data
        '--pretty',  # Human-readable format
        '--categories=time,clock,wait,memory,disk',
        '--sortCategory=time',
        '--sortField=total',
        '--sortReverse',
        'file:completed-jobstore'
    ]
    stats_result = stats_main()

    # Export statistics to a file.
    sys.argv = [
        'toil-stats',
        '--outputFile=/tmp/workflow-stats.json',
        '--format=json',
        'file:jobstore'
    ]
    stats_main()

    return stats_result
178
179
def real_time_monitoring():
    """Real-time workflow monitoring implementation.

    Defines a ``WorkflowMonitor`` that polls workflow status and
    performance metrics on a fixed interval until the workflow
    completes, fails, or the user interrupts, then starts it against a
    local file job store. Note: ``start_monitoring`` blocks the calling
    thread until the loop exits.
    """
    import time
    from toil.statsAndLogging import StatsAndLogging

    class WorkflowMonitor:
        """Polls and displays workflow status and performance metrics."""

        def __init__(self, jobstore_locator: str):
            # Locator string for the job store being monitored.
            self.jobstore = jobstore_locator
            self.stats_collector = StatsAndLogging()
            # Loop flag; cleared by stop_monitoring().
            self.monitoring = True

        def start_monitoring(self, update_interval: int = 30):
            """Run the monitoring loop, polling every ``update_interval`` seconds."""
            while self.monitoring:
                try:
                    status = self.get_workflow_status()
                    metrics = self.get_performance_metrics()
                    self.display_status(status, metrics)

                    if status.get('completed', False):
                        print("Workflow completed successfully!")
                        break

                    if status.get('failed', False):
                        print("Workflow failed!")
                        self.handle_workflow_failure(status)
                        break

                    time.sleep(update_interval)

                except KeyboardInterrupt:
                    print("Monitoring stopped by user")
                    break
                except Exception as e:
                    # Keep monitoring through transient errors; back off briefly.
                    print(f"Monitoring error: {e}")
                    time.sleep(5)

        def get_workflow_status(self) -> dict:
            """Get current workflow status.

            Placeholder: a real implementation would query the job store
            at ``self.jobstore`` (e.g. via ``toil status``).
            """
            status = {
                'total_jobs': 0,
                'completed_jobs': 0,
                'running_jobs': 0,
                'failed_jobs': 0,
                'queued_jobs': 0,
                'completion_percentage': 0.0,
                'estimated_time_remaining': None
            }
            # TODO: populate status from the job store.
            return status

        def get_performance_metrics(self) -> dict:
            """Get performance metrics.

            Placeholder: a real implementation would collect these from
            the batch system / cloud provider.
            """
            metrics = {
                'cpu_utilization': 0.0,
                'memory_usage': 0.0,
                'network_io': 0.0,
                'disk_io': 0.0,
                'cost_per_hour': 0.0,
                'jobs_per_minute': 0.0
            }
            # TODO: collect metrics from the relevant sources.
            return metrics

        def display_status(self, status: dict, metrics: dict):
            """Display formatted status information."""
            print("\n" + "="*60)
            print(f"Workflow Status - {time.strftime('%Y-%m-%d %H:%M:%S')}")
            print("="*60)

            print(f"Jobs: {status['completed_jobs']}/{status['total_jobs']} completed")
            print(f"Running: {status['running_jobs']}, Queued: {status['queued_jobs']}")
            print(f"Failed: {status['failed_jobs']}")
            print(f"Progress: {status['completion_percentage']:.1f}%")

            if status['estimated_time_remaining']:
                print(f"Est. remaining: {status['estimated_time_remaining']}")

            print(f"\nPerformance:")
            print(f"CPU Utilization: {metrics['cpu_utilization']:.1f}%")
            print(f"Memory Usage: {metrics['memory_usage']:.1f}%")
            print(f"Cost/Hour: ${metrics['cost_per_hour']:.2f}")
            print(f"Jobs/Minute: {metrics['jobs_per_minute']:.1f}")

        def handle_workflow_failure(self, status: dict):
            """React to a failed workflow (placeholder: report failure counts)."""
            print(f"Failed jobs: {status.get('failed_jobs', 'unknown')}")

        def stop_monitoring(self):
            """Stop the monitoring loop after the current iteration."""
            self.monitoring = False

    # Usage: blocks until the workflow finishes or monitoring is stopped.
    monitor = WorkflowMonitor("file:my-jobstore")
    monitor.start_monitoring(update_interval=10)
291
```
292
293
### Debugging and Troubleshooting Tools
294
{ .api }
295
296
Comprehensive debugging utilities for workflow development and troubleshooting.
297
298
```python
299
from toil.utils.toilDebugJob import main as debug_job_main
300
from toil.utils.toilDebugFile import main as debug_file_main
301
302
def debug_workflow_issues():
    """Debug workflow execution issues with the ``toil debug-*`` tools."""
    # Shell equivalent: toil debug-job file:jobstore <job-id>
    sys.argv = [
        'toil-debug-job',
        '--printJobInfo',      # Print job information
        '--printJobChildren',  # Print child jobs
        '--printJobFiles',     # Print associated files
        '--printJobLogging',   # Print job logs
        'file:my-jobstore',
        'job-id-12345'
    ]
    debug_job_main()

    # Shell equivalent: toil debug-file file:jobstore <file-id>
    sys.argv = [
        'toil-debug-file',
        '--printFileInfo',             # File metadata
        '--printFileContent',          # File contents (if small)
        '--saveFile=/tmp/debug-file',  # Save file locally
        'file:my-jobstore',
        'file-id-67890'
    ]
    debug_file_main()
333
334
def advanced_debugging_tools():
    """Advanced debugging and analysis tools.

    Returns:
        The ``WorkflowDebugger`` class, so callers can instantiate it
        against their own job store locator. (Previously the class was
        defined but unreachable from outside this function.)
    """

    class WorkflowDebugger:
        """Comprehensive workflow debugging toolkit."""

        def __init__(self, jobstore_locator: str):
            # Locator string for the job store to inspect.
            self.jobstore = jobstore_locator

        def get_failed_jobs(self) -> list:
            """Return failed-job records.

            Placeholder: a real implementation would query the job store
            at ``self.jobstore``.
            """
            return []

        def get_job_info(self, job_id: str) -> dict:
            """Return metadata for a single job (placeholder)."""
            return {}

        def get_all_jobs(self) -> list:
            """Return records for every job in the workflow (placeholder)."""
            return []

        def analyze_failed_jobs(self):
            """Analyze failed jobs and tally common failure patterns.

            Returns:
                dict: pattern name -> count of failed jobs matching it.
            """
            failed_jobs = self.get_failed_jobs()

            failure_patterns = {
                'out_of_memory': 0,
                'timeout': 0,
                'missing_files': 0,
                'command_not_found': 0,
                'permission_denied': 0,
                'network_error': 0,
                'unknown': 0
            }

            for job in failed_jobs:
                exit_code = job.get('exit_code', 0)
                stderr = job.get('stderr', '')

                # Classify by exit code first, then by stderr content.
                if exit_code == 137:    # SIGKILL - likely OOM
                    failure_patterns['out_of_memory'] += 1
                elif exit_code == 124:  # timeout
                    failure_patterns['timeout'] += 1
                elif 'No such file' in stderr:
                    failure_patterns['missing_files'] += 1
                elif 'command not found' in stderr:
                    failure_patterns['command_not_found'] += 1
                elif 'Permission denied' in stderr:
                    failure_patterns['permission_denied'] += 1
                elif 'network' in stderr.lower():
                    failure_patterns['network_error'] += 1
                else:
                    failure_patterns['unknown'] += 1

            # Generate debugging report.
            print("Failed Job Analysis:")
            print("="*50)
            for pattern, count in failure_patterns.items():
                if count > 0:
                    print(f"{pattern.replace('_', ' ').title()}: {count} jobs")
                    self.suggest_fixes(pattern, count)

            return failure_patterns

        def suggest_fixes(self, pattern: str, count: int):
            """Print suggested fixes for a common failure pattern."""
            suggestions = {
                'out_of_memory': [
                    "Increase memory requirements for affected jobs",
                    "Use streaming or chunked processing for large data",
                    "Check for memory leaks in job code"
                ],
                'timeout': [
                    "Increase job timeout limits",
                    "Optimize algorithm efficiency",
                    "Split large jobs into smaller chunks"
                ],
                'missing_files': [
                    "Verify input file paths and availability",
                    "Check file transfer and staging",
                    "Ensure proper job dependencies"
                ],
                'command_not_found': [
                    "Install missing software in Docker images",
                    "Check PATH environment variable",
                    "Verify tool versions and compatibility"
                ],
                'permission_denied': [
                    "Fix file and directory permissions",
                    "Check Docker volume mounts",
                    "Verify user/group settings"
                ]
            }

            if pattern in suggestions:
                print(f"  Suggested fixes:")
                for suggestion in suggestions[pattern]:
                    print(f"    • {suggestion}")

        def trace_job_execution(self, job_id: str):
            """Print a job's execution history, dependencies, and files."""
            job_info = self.get_job_info(job_id)

            print(f"Job Execution Trace: {job_id}")
            print("="*50)
            print(f"Job Name: {job_info.get('name', 'Unknown')}")
            print(f"Status: {job_info.get('status', 'Unknown')}")
            print(f"Start Time: {job_info.get('start_time', 'Unknown')}")
            print(f"End Time: {job_info.get('end_time', 'Unknown')}")
            print(f"Duration: {job_info.get('duration', 'Unknown')}")
            print(f"Resources Used: CPU={job_info.get('cpu_used', 'N/A')}, "
                  f"Memory={job_info.get('memory_used', 'N/A')}")

            # Show dependencies.
            predecessors = job_info.get('predecessors', [])
            if predecessors:
                print(f"\nPredecessor Jobs:")
                for pred_id in predecessors:
                    pred_info = self.get_job_info(pred_id)
                    print(f"  {pred_id}: {pred_info.get('status', 'Unknown')}")

            # Show children.
            children = job_info.get('children', [])
            if children:
                print(f"\nChild Jobs:")
                for child_id in children:
                    child_info = self.get_job_info(child_id)
                    print(f"  {child_id}: {child_info.get('status', 'Unknown')}")

            # Show files.
            input_files = job_info.get('input_files', [])
            output_files = job_info.get('output_files', [])

            if input_files:
                print(f"\nInput Files:")
                for file_id in input_files:
                    print(f"  {file_id}")

            if output_files:
                print(f"\nOutput Files:")
                for file_id in output_files:
                    print(f"  {file_id}")

        def performance_analysis(self):
            """Analyze workflow performance and report bottlenecks."""
            jobs = self.get_all_jobs()

            # Resource utilization of completed jobs only.
            cpu_utilization = []
            memory_utilization = []
            job_durations = []

            for job in jobs:
                if job.get('completed'):
                    cpu_utilization.append(job.get('cpu_utilization', 0))
                    memory_utilization.append(job.get('memory_utilization', 0))
                    job_durations.append(job.get('duration', 0))

            if cpu_utilization:
                avg_cpu = sum(cpu_utilization) / len(cpu_utilization)
                avg_memory = sum(memory_utilization) / len(memory_utilization)
                avg_duration = sum(job_durations) / len(job_durations)

                print("Performance Analysis:")
                print("="*50)
                print(f"Average CPU Utilization: {avg_cpu:.1f}%")
                print(f"Average Memory Utilization: {avg_memory:.1f}%")
                print(f"Average Job Duration: {avg_duration:.2f}s")

                # Bottlenecks: jobs far above the average duration.
                long_jobs = [j for j in jobs if j.get('duration', 0) > avg_duration * 3]
                if long_jobs:
                    print(f"\nLong-running jobs ({len(long_jobs)}):")
                    for job in sorted(long_jobs, key=lambda x: x.get('duration', 0), reverse=True)[:5]:
                        print(f"  {job['id']}: {job.get('duration', 0):.2f}s")

                # Jobs that used well under their allotted CPU.
                inefficient_jobs = [j for j in jobs if j.get('cpu_utilization', 100) < 50]
                if inefficient_jobs:
                    print(f"\nResource-inefficient jobs ({len(inefficient_jobs)}):")
                    for job in inefficient_jobs[:5]:
                        print(f"  {job['id']}: {job.get('cpu_utilization', 0):.1f}% CPU")

    return WorkflowDebugger
```
515
516
### Cleanup and Maintenance Tools
517
{ .api }
518
519
Tools for cleaning up job stores and maintaining workflow environments.
520
521
```python
522
from toil.utils.toilClean import main as clean_main
523
from toil.utils.toilKill import main as kill_main
524
525
def cleanup_workflows():
    """Clean up workflow artifacts and job stores, and kill a running run."""
    # Shell equivalent: toil clean file:jobstore
    sys.argv = ['toil-clean', 'file:completed-jobstore']
    clean_main()

    # Comprehensive cleanup with options.
    sys.argv = [
        'toil-clean',
        '--cleanWorkDir',  # Clean working directories
        '--cleanJobStore', # Clean job store completely
        'aws:us-west-2:my-bucket:old-workflow'
    ]
    clean_main()

    # Shell equivalent: toil kill file:jobstore
    sys.argv = ['toil-kill', 'file:running-jobstore']
    kill_main()
549
550
def maintenance_utilities():
    """Workflow maintenance and housekeeping utilities.

    Returns:
        The ``WorkflowMaintenance`` class, so callers can instantiate it.
        (Previously the class was defined but unreachable from outside
        this function, and several methods used ``os`` without importing it.)
    """
    import json
    import os
    import shutil
    import time

    class WorkflowMaintenance:
        """Workflow maintenance toolkit."""

        def find_completed_workflows(self) -> list:
            """Return records for completed workflows.

            Placeholder: a real implementation would scan job stores or a
            workflow registry.
            """
            return []

        def cleanup_old_jobstores(self, days_old: int = 30):
            """Clean up file-based job stores older than ``days_old`` days.

            Returns:
                int: the number of old job stores found (and attempted).
            """
            current_time = time.time()
            cutoff_time = current_time - (days_old * 86400)

            old_jobstores = []

            # For file-based job stores: anything under jobstore_dir whose
            # modification time predates the cutoff.
            jobstore_dir = "/tmp/toil-jobstores"
            if os.path.exists(jobstore_dir):
                for item in os.listdir(jobstore_dir):
                    item_path = os.path.join(jobstore_dir, item)
                    if os.path.isdir(item_path):
                        if os.path.getmtime(item_path) < cutoff_time:
                            old_jobstores.append(f"file:{item_path}")

            # Clean up old job stores; failures are reported, not fatal.
            for jobstore in old_jobstores:
                try:
                    print(f"Cleaning old jobstore: {jobstore}")
                    sys.argv = ['toil-clean', jobstore]
                    clean_main()
                except Exception as e:
                    print(f"Failed to clean {jobstore}: {e}")

            return len(old_jobstores)

        def archive_completed_workflows(self, archive_dir: str):
            """Archive completed workflow outputs, statistics, and logs."""
            completed_workflows = self.find_completed_workflows()

            for workflow in completed_workflows:
                try:
                    # One archive directory per workflow run.
                    workflow_archive = os.path.join(
                        archive_dir,
                        f"workflow_{workflow['id']}_{workflow['completion_date']}"
                    )
                    os.makedirs(workflow_archive, exist_ok=True)

                    # Archive workflow results.
                    if workflow['output_files']:
                        output_archive = os.path.join(workflow_archive, "outputs")
                        os.makedirs(output_archive, exist_ok=True)
                        for output_file in workflow['output_files']:
                            shutil.copy2(output_file, output_archive)

                    # Archive statistics.
                    stats_file = os.path.join(workflow_archive, "statistics.json")
                    with open(stats_file, 'w') as f:
                        json.dump(workflow['statistics'], f, indent=2)

                    # Archive logs.
                    if workflow.get('log_files'):
                        log_archive = os.path.join(workflow_archive, "logs")
                        os.makedirs(log_archive, exist_ok=True)
                        for log_file in workflow['log_files']:
                            shutil.copy2(log_file, log_archive)

                    print(f"Archived workflow {workflow['id']} to {workflow_archive}")

                except Exception as e:
                    print(f"Failed to archive workflow {workflow['id']}: {e}")

        def monitor_disk_usage(self, threshold_percent: float = 85.0):
            """Report Toil directories whose disk usage exceeds a threshold.

            Returns:
                list[dict]: one alert per over-threshold directory with
                usage percentage and free/total sizes in GB.
            """
            directories_to_check = [
                "/tmp/toil-work",
                "/var/tmp/toil",
                "/tmp/toil-jobstores"
            ]

            alerts = []

            for directory in directories_to_check:
                if os.path.exists(directory):
                    total, used, free = shutil.disk_usage(directory)
                    usage_percent = (used / total) * 100
                    if usage_percent > threshold_percent:
                        alerts.append({
                            'directory': directory,
                            'usage_percent': usage_percent,
                            'free_gb': free / (1024**3),
                            'total_gb': total / (1024**3)
                        })

            if alerts:
                print("DISK USAGE ALERTS:")
                print("="*50)
                for alert in alerts:
                    print(f"Directory: {alert['directory']}")
                    print(f"Usage: {alert['usage_percent']:.1f}%")
                    print(f"Free: {alert['free_gb']:.1f} GB")
                    print(f"Total: {alert['total_gb']:.1f} GB")
                    print("-" * 30)

                # Suggest cleanup actions.
                print("Suggested Actions:")
                print("• Run 'toil clean' on completed job stores")
                print("• Archive old workflow outputs")
                print("• Clear temporary directories")
                print("• Check for large log files")

            return alerts

    return WorkflowMaintenance
677
```
678
679
### Cluster Management Tools
680
{ .api }
681
682
Command-line tools for managing cloud clusters and distributed environments.
683
684
```python
685
from toil.utils.toilLaunchCluster import main as launch_cluster_main
686
from toil.utils.toilDestroyCluster import main as destroy_cluster_main
687
from toil.utils.toilSshCluster import main as ssh_cluster_main
688
from toil.utils.toilRsyncCluster import main as rsync_cluster_main
689
690
def cluster_management():
    """Manage cloud clusters for distributed execution.

    Walks the cluster lifecycle: launch, SSH, rsync, destroy.
    """
    # Shell equivalent:
    #   toil launch-cluster my-cluster --nodeTypes=m5.large,m5.xlarge --maxNodes=50
    sys.argv = [
        'toil-launch-cluster',
        'research-cluster',
        '--provisioner=aws',
        '--nodeTypes=m5.large,m5.xlarge:0.50,c5.2xlarge',
        '--maxNodes=100',
        '--zone=us-west-2a',
        '--keyPairName=research-keypair',
        '--leaderNodeType=m5.large',
        '--nodeStorage=100',
        '--preemptibleWorkers',
        '--logLevel=INFO'
    ]
    launch_cluster_main()

    # SSH to cluster leader (shell: toil ssh-cluster my-cluster).
    sys.argv = ['toil-ssh-cluster', 'research-cluster']
    ssh_cluster_main()

    # Sync files to cluster (shell: toil rsync-cluster -r local-dir my-cluster:remote-dir).
    sys.argv = [
        'toil-rsync-cluster',
        '--recursive',
        '/local/data/',
        'research-cluster:/shared/data/'
    ]
    rsync_cluster_main()

    # Destroy cluster (shell: toil destroy-cluster my-cluster).
    sys.argv = ['toil-destroy-cluster', 'research-cluster']
    destroy_cluster_main()
735
736
def advanced_cluster_operations():
    """Advanced cluster management operations.

    Returns:
        The ``ClusterOperations`` class, so callers can instantiate it.
        (Previously the class was defined but unreachable from outside
        this function, and the methods used ``os`` without importing it.)
    """
    import os

    class ClusterOperations:
        """Advanced cluster operation utilities."""

        def deploy_workflow_to_cluster(self, cluster_name: str,
                                       workflow_script: str,
                                       input_data: str):
            """Copy a workflow and its input data to a cluster and run it."""
            print(f"Deploying to cluster: {cluster_name}")

            # Sync workflow files.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                os.path.dirname(workflow_script),
                f'{cluster_name}:/toil/workflows/'
            ]
            rsync_cluster_main()

            # Sync input data.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                input_data,
                f'{cluster_name}:/toil/data/'
            ]
            rsync_cluster_main()

            # Execute workflow on the cluster leader over SSH.
            remote_command = f"""
            cd /toil &&
            toil --batchSystem=mesos \\
                 --jobStore=aws:us-west-2:results:workflow-run \\
                 --provisioner=aws \\
                 --nodeTypes=m5.large,m5.xlarge \\
                 --maxNodes=50 \\
                 --stats \\
                 workflows/{os.path.basename(workflow_script)} \\
                 data/
            """

            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                remote_command
            ]
            ssh_cluster_main()

        def monitor_cluster_resources(self, cluster_name: str):
            """Run a resource-monitoring script on the cluster leader."""
            monitoring_script = """
            # Get node information
            kubectl get nodes -o wide

            # Check resource usage
            kubectl top nodes

            # Check running pods
            kubectl get pods --all-namespaces

            # System resource usage
            free -h
            df -h

            # Toil-specific monitoring
            ps aux | grep toil

            # Check logs
            tail -n 50 /var/log/toil/*.log
            """

            sys.argv = [
                'toil-ssh-cluster',
                cluster_name,
                monitoring_script
            ]
            ssh_cluster_main()

        def backup_cluster_data(self, cluster_name: str, backup_location: str):
            """Rsync results, logs, and the job store back to local storage."""
            # Sync results back to local.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/results/',
                os.path.join(backup_location, 'results')
            ]
            rsync_cluster_main()

            # Sync logs.
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/var/log/toil/',
                os.path.join(backup_location, 'logs')
            ]
            rsync_cluster_main()

            # Sync job store (if file-based).
            sys.argv = [
                'toil-rsync-cluster',
                '--recursive',
                f'{cluster_name}:/toil/jobstore/',
                os.path.join(backup_location, 'jobstore')
            ]
            rsync_cluster_main()

    return ClusterOperations
```
852
853
### Configuration Management
854
{ .api }
855
856
Tools for managing Toil configuration files and settings.
857
858
```python
859
from toil.utils.toilConfig import main as config_main
860
from toil.common import get_default_config_path, ensure_config, generate_config, update_config
861
862
def configuration_management():
    """Manage Toil configuration files and settings."""
    # Make sure the default configuration exists.
    default_config_path = get_default_config_path()
    ensure_config(default_config_path)

    # Generate a custom configuration file.
    custom_config = "/etc/toil/production.conf"
    generate_config(custom_config)

    # Update individual configuration values.
    update_config(custom_config, "batchSystem", "slurm")
    update_config(custom_config, "maxCores", "64")
    update_config(custom_config, "defaultMemory", "8G")

    # Use the config management CLI.
    sys.argv = [
        'toil-config',
        '--set', 'jobStore=aws:us-west-2:my-bucket',
        '--set', 'batchSystem=kubernetes',
        '--set', 'maxNodes=100',
        '--config-file', custom_config
    ]
    config_main()
888
889
def configuration_templates():
    """Provide configuration templates for different use cases.

    Returns:
        A ``create_config_from_template(template_name, output_file)``
        function that writes a config file from one of the named
        templates; it raises ``ValueError`` for an unknown template name.
    """
    templates = {
        'local_development': {
            'batchSystem': 'local',
            'maxCores': 4,
            'maxMemory': '8G',
            'jobStore': 'file:/tmp/toil-dev',
            'logLevel': 'DEBUG',
            'stats': True
        },

        'hpc_cluster': {
            'batchSystem': 'slurm',
            'maxCores': 1000,
            'maxMemory': '2T',
            'jobStore': 'file:/shared/toil-jobs',
            'workDir': '/tmp/toil-work',
            'logLevel': 'INFO',
            'retryCount': 3,
            'rescueJobsFrequency': 3600
        },

        'cloud_production': {
            'batchSystem': 'kubernetes',
            'provisioner': 'aws',
            'jobStore': 'aws:us-west-2:production-bucket:workflows',
            'nodeTypes': ['m5.large', 'm5.xlarge', 'c5.2xlarge'],
            'maxNodes': 500,
            'defaultPreemptible': True,
            'preemptibleCompensation': 1.5,
            'sseKey': 'alias/toil-production-key',
            'clean': 'onSuccess',
            'stats': True,
            'logLevel': 'INFO'
        }
    }

    def create_config_from_template(template_name: str, output_file: str):
        """Create a configuration file from a named template.

        Raises:
            ValueError: if ``template_name`` is not a known template.
        """
        if template_name not in templates:
            raise ValueError(f"Unknown template: {template_name}")

        template = templates[template_name]

        # Generate the base config, then overlay the template's values.
        generate_config(output_file)
        for key, value in template.items():
            update_config(output_file, key, value)

        print(f"Configuration created: {output_file}")
        print(f"Template used: {template_name}")

    return create_config_from_template
947
```
948
949
This comprehensive utilities and CLI tools suite provides complete workflow lifecycle management from development through production deployment with extensive monitoring, debugging, and maintenance capabilities.