0
# Command-Line Tools Integration
1
2
Direct access to samtools and bcftools command-line functionality from Python. All subcommands are available as Python functions with the same arguments and behavior as the command-line tools.
3
4
## Capabilities
5
6
### Samtools Functions
7
8
Complete samtools toolkit exposed as Python functions for alignment file processing.
9
10
```python { .api }
11
def view(*args, **kwargs):
12
"""
13
View and convert alignment files.
14
15
Parameters:
16
*args: command-line arguments as strings
17
**kwargs: keyword arguments (catch_stdout, save_stdout, etc.)
18
19
Returns:
20
int, exit code (0 for success)
21
"""
22
23
def sort(*args, **kwargs):
24
"""
25
Sort alignment files.
26
27
Parameters:
28
*args: command-line arguments as strings
29
30
Returns:
31
int, exit code
32
"""
33
34
def index(*args, **kwargs):
35
"""
36
Index alignment files.
37
38
Parameters:
39
*args: command-line arguments as strings
40
41
Returns:
42
int, exit code
43
"""
44
45
def merge(*args, **kwargs):
46
"""
47
Merge multiple alignment files.
48
49
Parameters:
50
*args: command-line arguments as strings
51
52
Returns:
53
int, exit code
54
"""
55
56
def cat(*args, **kwargs):
57
"""
58
Concatenate alignment files.
59
60
Parameters:
61
*args: command-line arguments as strings
62
63
Returns:
64
int, exit code
65
"""
66
67
def stats(*args, **kwargs):
68
"""
69
Generate alignment statistics.
70
71
Parameters:
72
*args: command-line arguments as strings
73
74
Returns:
75
int, exit code
76
"""
77
78
def flagstat(*args, **kwargs):
79
"""
80
Generate flag statistics.
81
82
Parameters:
83
*args: command-line arguments as strings
84
85
Returns:
86
int, exit code
87
"""
88
89
def idxstats(*args, **kwargs):
90
"""
91
Generate index statistics.
92
93
Parameters:
94
*args: command-line arguments as strings
95
96
Returns:
97
int, exit code
98
"""
99
100
def depth(*args, **kwargs):
101
"""
102
Calculate per-position depth.
103
104
Parameters:
105
*args: command-line arguments as strings
106
107
Returns:
108
int, exit code
109
"""
110
111
def coverage(*args, **kwargs):
112
"""
113
Calculate coverage statistics.
114
115
Parameters:
116
*args: command-line arguments as strings
117
118
Returns:
119
int, exit code
120
"""
121
122
def mpileup(*args, **kwargs):
123
"""
124
Generate pileup format.
125
126
Parameters:
127
*args: command-line arguments as strings
128
129
Returns:
130
int, exit code
131
"""
132
133
def fixmate(*args, **kwargs):
134
"""
135
Fix mate pair information.
136
137
Parameters:
138
*args: command-line arguments as strings
139
140
Returns:
141
int, exit code
142
"""
143
144
def markdup(*args, **kwargs):
145
"""
146
Mark or remove duplicates.
147
148
Parameters:
149
*args: command-line arguments as strings
150
151
Returns:
152
int, exit code
153
"""
154
155
def rmdup(*args, **kwargs):
156
"""
157
Remove duplicates (legacy).
158
159
Parameters:
160
*args: command-line arguments as strings
161
162
Returns:
163
int, exit code
164
"""
165
166
def quickcheck(*args, **kwargs):
167
"""
168
Quick integrity check.
169
170
Parameters:
171
*args: command-line arguments as strings
172
173
Returns:
174
int, exit code
175
"""
176
177
def calmd(*args, **kwargs):
178
"""
179
Calculate MD tag.
180
181
Parameters:
182
*args: command-line arguments as strings
183
184
Returns:
185
int, exit code
186
"""
187
188
def fasta(*args, **kwargs):
189
"""
190
Convert to FASTA format.
191
192
Parameters:
193
*args: command-line arguments as strings
194
195
Returns:
196
int, exit code
197
"""
198
199
def fastq(*args, **kwargs):
200
"""
201
Convert to FASTQ format.
202
203
Parameters:
204
*args: command-line arguments as strings
205
206
Returns:
207
int, exit code
208
"""
209
210
def bam2fq(*args, **kwargs):
211
"""
212
Convert BAM to FASTQ.
213
214
Parameters:
215
*args: command-line arguments as strings
216
217
Returns:
218
int, exit code
219
"""
220
221
def split(*args, **kwargs):
222
"""
223
Split alignment files.
224
225
Parameters:
226
*args: command-line arguments as strings
227
228
Returns:
229
int, exit code
230
"""
231
232
def collate(*args, **kwargs):
233
"""
234
Collate alignment files.
235
236
Parameters:
237
*args: command-line arguments as strings
238
239
Returns:
240
int, exit code
241
"""
242
243
def faidx(*args, **kwargs):
244
"""
245
Index FASTA file.
246
247
Parameters:
248
*args: command-line arguments as strings
249
250
Returns:
251
int, exit code
252
"""
253
254
def dict(*args, **kwargs):
255
"""
256
Create sequence dictionary.
257
258
Parameters:
259
*args: command-line arguments as strings
260
261
Returns:
262
int, exit code
263
"""
264
265
def bedcov(*args, **kwargs):
266
"""
267
Calculate BED coverage.
268
269
Parameters:
270
*args: command-line arguments as strings
271
272
Returns:
273
int, exit code
274
"""
275
276
def phase(*args, **kwargs):
277
"""
278
Phase variants.
279
280
Parameters:
281
*args: command-line arguments as strings
282
283
Returns:
284
int, exit code
285
"""
286
287
def consensus(*args, **kwargs):
288
"""
289
Generate consensus sequence.
290
291
Parameters:
292
*args: command-line arguments as strings
293
294
Returns:
295
int, exit code
296
"""
297
298
def reheader(*args, **kwargs):
299
"""
300
Replace header.
301
302
Parameters:
303
*args: command-line arguments as strings
304
305
Returns:
306
int, exit code
307
"""
308
309
def depad(*args, **kwargs):
310
"""
311
Remove padding from alignment.
312
313
Parameters:
314
*args: command-line arguments as strings
315
316
Returns:
317
int, exit code
318
"""
319
320
def pad2unpad(*args, **kwargs):
321
"""
322
Convert padded to unpadded alignment.
323
324
Parameters:
325
*args: command-line arguments as strings
326
327
Returns:
328
int, exit code
329
"""
330
331
def addreplacerg(*args, **kwargs):
332
"""
333
Add or replace read groups.
334
335
Parameters:
336
*args: command-line arguments as strings
337
338
Returns:
339
int, exit code
340
"""
341
342
def ampliconstats(*args, **kwargs):
343
"""
344
Generate amplicon statistics.
345
346
Parameters:
347
*args: command-line arguments as strings
348
349
Returns:
350
int, exit code
351
"""
352
353
def ampliconclip(*args, **kwargs):
354
"""
355
Clip amplicon primers.
356
357
Parameters:
358
*args: command-line arguments as strings
359
360
Returns:
361
int, exit code
362
"""
363
364
def tview(*args, **kwargs):
365
"""
366
Text alignment viewer.
367
368
Parameters:
369
*args: command-line arguments as strings
370
371
Returns:
372
int, exit code
373
"""
374
375
def flags(*args, **kwargs):
376
"""
377
Explain SAM flags.
378
379
Parameters:
380
*args: command-line arguments as strings
381
382
Returns:
383
int, exit code
384
"""
385
386
def version(*args, **kwargs):
387
"""
388
Show samtools version.
389
390
Parameters:
391
*args: command-line arguments as strings
392
393
Returns:
394
int, exit code
395
"""
396
397
def bamshuf(*args, **kwargs):
398
"""
399
Shuffle and group alignments by name.
400
401
Parameters:
402
*args: command-line arguments as strings
403
404
Returns:
405
int, exit code
406
"""
407
408
def cram_size(*args, **kwargs):
409
"""
410
List the contents of a CRAM file.
411
412
Parameters:
413
*args: command-line arguments as strings
414
415
Returns:
416
int, exit code
417
"""
418
419
def fqidx(*args, **kwargs):
420
"""
421
Index/extract FASTQ.
422
423
Parameters:
424
*args: command-line arguments as strings
425
426
Returns:
427
int, exit code
428
"""
429
430
def fqimport(*args, **kwargs):
431
"""
432
Import FASTQ to SAM.
433
434
Parameters:
435
*args: command-line arguments as strings
436
437
Returns:
438
int, exit code
439
"""
440
441
def reference(*args, **kwargs):
442
"""
443
Generate reference sequence from MD tags.
444
445
Parameters:
446
*args: command-line arguments as strings
447
448
Returns:
449
int, exit code
450
"""
451
452
def reset(*args, **kwargs):
453
"""
454
Remove auxiliary fields.
455
456
Parameters:
457
*args: command-line arguments as strings
458
459
Returns:
460
int, exit code
461
"""
462
463
def samples(*args, **kwargs):
464
"""
465
List the samples in a set of SAM/BAM/CRAM files.
466
467
Parameters:
468
*args: command-line arguments as strings
469
470
Returns:
471
int, exit code
472
"""
473
474
def targetcut(*args, **kwargs):
475
"""
476
Cut out regions with excessive mismatches.
477
478
Parameters:
479
*args: command-line arguments as strings
480
481
Returns:
482
int, exit code
483
"""
484
```
485
486
### BCFtools Functions
487
488
Complete bcftools toolkit exposed as Python functions for variant file processing.
489
490
```python { .api }
491
def call(*args, **kwargs):
492
"""
493
SNP/indel calling.
494
495
Parameters:
496
*args: command-line arguments as strings
497
498
Returns:
499
int, exit code
500
"""
501
502
def filter(*args, **kwargs):
503
"""
504
Filter VCF/BCF records.
505
506
Parameters:
507
*args: command-line arguments as strings
508
509
Returns:
510
int, exit code
511
"""
512
513
def norm(*args, **kwargs):
514
"""
515
Normalize variants.
516
517
Parameters:
518
*args: command-line arguments as strings
519
520
Returns:
521
int, exit code
522
"""
523
524
def annotate(*args, **kwargs):
525
"""
526
Annotate variants.
527
528
Parameters:
529
*args: command-line arguments as strings
530
531
Returns:
532
int, exit code
533
"""
534
535
def query(*args, **kwargs):
536
"""
537
Extract/query VCF/BCF data.
538
539
Parameters:
540
*args: command-line arguments as strings
541
542
Returns:
543
int, exit code
544
"""
545
546
def convert(*args, **kwargs):
547
"""
548
Convert between formats.
549
550
Parameters:
551
*args: command-line arguments as strings
552
553
Returns:
554
int, exit code
555
"""
556
557
def concat(*args, **kwargs):
558
"""
559
Concatenate VCF files.
560
561
Parameters:
562
*args: command-line arguments as strings
563
564
Returns:
565
int, exit code
566
"""
567
568
def isec(*args, **kwargs):
569
"""
570
Create intersections of VCF files.
571
572
Parameters:
573
*args: command-line arguments as strings
574
575
Returns:
576
int, exit code
577
"""
578
579
def gtcheck(*args, **kwargs):
580
"""
581
Check sample identity.
582
583
Parameters:
584
*args: command-line arguments as strings
585
586
Returns:
587
int, exit code
588
"""
589
590
def roh(*args, **kwargs):
591
"""
592
Runs of homozygosity.
593
594
Parameters:
595
*args: command-line arguments as strings
596
597
Returns:
598
int, exit code
599
"""
600
601
def plugin(*args, **kwargs):
602
"""
603
Run bcftools plugin.
604
605
Parameters:
606
*args: command-line arguments as strings
607
608
Returns:
609
int, exit code
610
"""
611
612
def cnv(*args, **kwargs):
613
"""
614
Copy number variation analysis.
615
616
Parameters:
617
*args: command-line arguments as strings
618
619
Returns:
620
int, exit code
621
"""
622
623
def csq(*args, **kwargs):
624
"""
625
Consequence analysis.
626
627
Parameters:
628
*args: command-line arguments as strings
629
630
Returns:
631
int, exit code
632
"""
633
634
def head(*args, **kwargs):
635
"""
636
View header.
637
638
Parameters:
639
*args: command-line arguments as strings
640
641
Returns:
642
int, exit code
643
"""
644
645
def consensus(*args, **kwargs):
646
"""
647
Create consensus sequence by applying VCF variants.
648
649
Parameters:
650
*args: command-line arguments as strings
651
652
Returns:
653
int, exit code
654
"""
655
656
def index(*args, **kwargs):
657
"""
658
Index VCF/BCF files.
659
660
Parameters:
661
*args: command-line arguments as strings
662
663
Returns:
664
int, exit code
665
"""
666
667
def merge(*args, **kwargs):
668
"""
669
Merge VCF files.
670
671
Parameters:
672
*args: command-line arguments as strings
673
674
Returns:
675
int, exit code
676
"""
677
678
def mpileup(*args, **kwargs):
679
"""
680
Multi-way pileup producing genotype likelihoods.
681
682
Parameters:
683
*args: command-line arguments as strings
684
685
Returns:
686
int, exit code
687
"""
688
689
def reheader(*args, **kwargs):
690
"""
691
Modify header of VCF/BCF files.
692
693
Parameters:
694
*args: command-line arguments as strings
695
696
Returns:
697
int, exit code
698
"""
699
700
def sort(*args, **kwargs):
701
"""
702
Sort VCF/BCF files.
703
704
Parameters:
705
*args: command-line arguments as strings
706
707
Returns:
708
int, exit code
709
"""
710
711
def stats(*args, **kwargs):
712
"""
713
Produce VCF/BCF stats.
714
715
Parameters:
716
*args: command-line arguments as strings
717
718
Returns:
719
int, exit code
720
"""
721
722
def view(*args, **kwargs):
723
"""
724
View, subset and filter VCF or BCF files.
725
726
Parameters:
727
*args: command-line arguments as strings
728
729
Returns:
730
int, exit code
731
"""
732
```
733
734
### Error Handling
735
736
```python { .api }
737
class SamtoolsError(Exception):
738
"""
739
Exception raised when samtools/bcftools commands fail.
740
741
Attributes:
742
- cmd: str, command that failed
743
- returncode: int, exit code
744
- stderr: str, error output
745
"""
746
747
def __init__(self, cmd, returncode, stderr=None):
748
"""
749
Initialize SamtoolsError.
750
751
Parameters:
752
- cmd: str, command that failed
753
- returncode: int, exit code
754
- stderr: str, error output
755
"""
756
```
757
758
## Usage Examples
759
760
### Basic Command Execution
761
762
```python
763
import pysam
764
765
# Sort BAM file
766
pysam.sort("-o", "sorted.bam", "input.bam")
767
768
# Index sorted BAM
769
pysam.index("sorted.bam")
770
771
# Generate statistics
772
pysam.stats("sorted.bam")
773
774
# View specific region
775
pysam.view("-b", "-o", "region.bam", "sorted.bam", "chr1:1000-2000")
776
777
# Convert SAM to BAM
778
pysam.view("-b", "-o", "output.bam", "input.sam")
779
```
780
781
### Capturing Output
782
783
```python
784
import pysam
785
786
# Capture command output
787
result = pysam.flagstat("input.bam", catch_stdout=True)
788
print("Flagstat output:", result)
789
790
# Save output to file
791
pysam.stats("input.bam", save_stdout="stats.txt")
792
793
# Capture both stdout and stderr
794
try:
795
result = pysam.view("-h", "nonexistent.bam", catch_stdout=True)
796
except pysam.SamtoolsError as e:
797
print(f"Command failed: {e.cmd}")
798
print(f"Exit code: {e.returncode}")
799
print(f"Error: {e.stderr}")
800
```
801
802
### Complex Workflows
803
804
```python
805
import pysam
806
import os
807
808
def process_alignment_workflow(input_sam, output_prefix):
809
"""Complete alignment processing workflow."""
810
811
# Step 1: Convert SAM to BAM
812
temp_bam = f"{output_prefix}_temp.bam"
813
pysam.view("-b", "-o", temp_bam, input_sam)
814
815
# Step 2: Sort BAM
816
sorted_bam = f"{output_prefix}_sorted.bam"
817
pysam.sort("-o", sorted_bam, temp_bam)
818
819
# Step 3: Index sorted BAM
820
pysam.index(sorted_bam)
821
822
# Step 4: Mark duplicates
823
markdup_bam = f"{output_prefix}_markdup.bam"
824
pysam.markdup("-o", markdup_bam, sorted_bam)
825
826
# Step 5: Index marked BAM
827
pysam.index(markdup_bam)
828
829
# Step 6: Generate statistics
830
stats_file = f"{output_prefix}_stats.txt"
831
pysam.stats(markdup_bam, save_stdout=stats_file)
832
833
# Step 7: Generate flagstat
834
flagstat_file = f"{output_prefix}_flagstat.txt"
835
pysam.flagstat(markdup_bam, save_stdout=flagstat_file)
836
837
# Cleanup intermediate files
838
os.remove(temp_bam)
839
840
return {
841
'final_bam': markdup_bam,
842
'stats_file': stats_file,
843
'flagstat_file': flagstat_file
844
}
845
846
# Usage
847
results = process_alignment_workflow("input.sam", "processed")
848
print(f"Final BAM: {results['final_bam']}")
849
```
850
851
### Variant Calling Pipeline
852
853
```python
854
import pysam
855
856
def variant_calling_pipeline(bam_files, reference_fasta, output_vcf):
857
"""Complete variant calling pipeline using bcftools."""
858
859
# Step 1: Generate pileup
860
pileup_bcf = "pileup.bcf"
861
cmd_args = ["-f", reference_fasta, "-o", pileup_bcf, "-O", "b"] + bam_files
862
pysam.mpileup(*cmd_args)
863
864
# Step 2: Call variants
865
calls_vcf = "calls.vcf"
866
pysam.call("-mv", "-o", calls_vcf, "-O", "v", pileup_bcf)
867
868
# Step 3: Filter variants
869
pysam.filter("-i", "QUAL>=20 && DP>=10", "-o", output_vcf, calls_vcf)
870
871
# Step 4: Index final VCF
872
pysam.tabix_compress(output_vcf, f"{output_vcf}.gz")
873
pysam.tabix_index(f"{output_vcf}.gz", preset="vcf")
874
875
# Cleanup intermediate files
876
os.remove(pileup_bcf)
877
os.remove(calls_vcf)
878
879
return f"{output_vcf}.gz"
880
881
# Usage
882
bam_files = ["sample1.bam", "sample2.bam", "sample3.bam"]
883
final_vcf = variant_calling_pipeline(bam_files, "reference.fa", "variants.vcf")
884
print(f"Final VCF: {final_vcf}")
885
```
886
887
### Batch Processing
888
889
```python
890
import pysam
891
import glob
892
import os
893
from multiprocessing import Pool
894
895
def process_single_bam(bam_file):
896
"""Process single BAM file."""
897
base_name = os.path.splitext(bam_file)[0]
898
899
try:
900
# Sort if not already sorted
901
if not bam_file.endswith('_sorted.bam'):
902
sorted_bam = f"{base_name}_sorted.bam"
903
pysam.sort("-o", sorted_bam, bam_file)
904
bam_file = sorted_bam
905
906
# Index
907
pysam.index(bam_file)
908
909
# Generate statistics
910
stats_file = f"{base_name}_stats.txt"
911
pysam.stats(bam_file, save_stdout=stats_file)
912
913
return f"Processed: {bam_file}"
914
915
except pysam.SamtoolsError as e:
916
return f"Failed: {bam_file} - {e}"
917
918
def batch_process_bams(bam_pattern, num_processes=4):
919
"""Process multiple BAM files in parallel."""
920
bam_files = glob.glob(bam_pattern)
921
922
with Pool(processes=num_processes) as pool:
923
results = pool.map(process_single_bam, bam_files)
924
925
for result in results:
926
print(result)
927
928
# Usage
929
batch_process_bams("*.bam", num_processes=8)
930
```
931
932
### Quality Control Pipeline
933
934
```python
935
import pysam
936
import json
937
938
def quality_control_analysis(bam_file):
939
"""Comprehensive quality control analysis."""
940
base_name = os.path.splitext(bam_file)[0]
941
942
# Collect various statistics
943
qc_results = {}
944
945
# Basic statistics
946
stats_output = pysam.stats(bam_file, catch_stdout=True)
947
qc_results['basic_stats'] = stats_output
948
949
# Flag statistics
950
flagstat_output = pysam.flagstat(bam_file, catch_stdout=True)
951
qc_results['flag_stats'] = flagstat_output
952
953
# Index statistics (if indexed)
954
try:
955
idxstats_output = pysam.idxstats(bam_file, catch_stdout=True)
956
qc_results['index_stats'] = idxstats_output
957
except pysam.SamtoolsError:
958
qc_results['index_stats'] = "No index available"
959
960
# Quick integrity check
961
try:
962
pysam.quickcheck(bam_file)
963
qc_results['integrity_check'] = "PASS"
964
except pysam.SamtoolsError:
965
qc_results['integrity_check'] = "FAIL"
966
967
# Save results
968
qc_file = f"{base_name}_qc.json"
969
with open(qc_file, 'w') as f:
970
json.dump(qc_results, f, indent=2)
971
972
return qc_file
973
974
# Usage
975
qc_report = quality_control_analysis("sample.bam")
976
print(f"QC report saved to: {qc_report}")
977
```
978
979
## Performance Tips
980
981
- Commands run in separate processes, so there's some overhead
982
- For simple operations, using the API classes directly may be faster
983
- Use `catch_stdout=True` to capture output instead of printing to console
984
- Commands support all the same arguments as their command-line equivalents
985
- Error handling uses `SamtoolsError` exception for failed commands
986
- Multi-threading in commands can improve performance for compression/decompression