or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

constants.mdgenotype-analysis.mdindex.mdsample-filtering.mdutils.mdvariant-records.mdvcf-filtering.mdvcf-parsing.mdvcf-writing.md

utils.mddocs/

0

# VCF Utilities

1

2

Utility functions for advanced VCF file operations including multi-file synchronization and sequence manipulation for comparative genomics workflows.

3

4

## Capabilities

5

6

### Multi-File VCF Processing

7

8

Process multiple VCF files simultaneously with coordinated iteration through variant records.

9

10

```python { .api }

11

def walk_together(*readers, **kwargs):

12

"""

13

Iterate over multiple VCF files simultaneously, yielding coordinated variant records.

14

15

Parameters:

16

- *readers: VCF Reader objects to iterate over together

17

- **kwargs: vcf_record_sort_key - Optional function for custom record sorting/comparison

18

19

Yields:

20

List of _Record objects, one from each reader at synchronized positions.

21

Records are None for readers that don't have variants at the current position.

22

"""

23

```

24

25

### Sequence Manipulation

26

27

Utilities for working with reference and alternate allele sequences.

28

29

```python { .api }

30

def trim_common_suffix(*sequences):

31

"""

32

Remove common suffix from sequences while keeping at least 1 character.

33

34

Parameters:

35

- *sequences: str, input sequences to trim

36

37

Returns:

38

List[str]: Sequences with common suffix removed, maintaining minimum length

39

"""

40

```

41

42

### Usage Examples

43

44

```python

45

import vcf

46

from vcf.utils import walk_together, trim_common_suffix

47

48

# Synchronize multiple VCF files

49

reader1 = vcf.Reader(filename='population1.vcf')

50

reader2 = vcf.Reader(filename='population2.vcf')

51

reader3 = vcf.Reader(filename='population3.vcf')

52

53

# Process variants across all files simultaneously

54

for records in walk_together(reader1, reader2, reader3):

55

record1, record2, record3 = records

56

57

# Check which files have variants at this position

58

present_in = []

59

if record1: present_in.append('pop1')

60

if record2: present_in.append('pop2')

61

if record3: present_in.append('pop3')

62

63

if len(present_in) > 1:

64

print(f"Shared variant at {record1.CHROM if record1 else record2.CHROM if record2 else record3.CHROM}:")

65

print(f" Present in: {', '.join(present_in)}")

66

67

# Compare allele frequencies across populations

68

for i, record in enumerate(records):

69

if record and record.aaf:

70

pop_name = ['pop1', 'pop2', 'pop3'][i]

71

print(f" {pop_name} AAF: {record.aaf}")

72

73

# Sequence manipulation examples

74

sequences = ['ATCGATCG', 'TTCGATCG', 'CTCGATCG']

75

trimmed = trim_common_suffix(*sequences)

76

print(f"Original: {sequences}")

77

print(f"Trimmed: {trimmed}") # ['ATC', 'TTC', 'CTC'] - removed common 'GATCG'

78

79

# Example with variant alleles

80

ref = 'ATCGATCG'

81

alt = 'ATCGATCC'

82

trimmed_alleles = trim_common_suffix(ref, alt)

83

print(f"REF: {ref} -> {trimmed_alleles[0]}") # ATCGATCG -> ATCGATCG

84

print(f"ALT: {alt} -> {trimmed_alleles[1]}") # ATCGATCC -> ATCGATCC

85

```

86

87

### Advanced Multi-File Analysis

88

89

```python

90

import vcf

91

from vcf.utils import walk_together

92

93

# Compare variant calling across different methods

94

caller1_reader = vcf.Reader(filename='gatk_variants.vcf')

95

caller2_reader = vcf.Reader(filename='freebayes_variants.vcf')

96

caller3_reader = vcf.Reader(filename='varscan_variants.vcf')

97

98

concordant_variants = []

99

discordant_variants = []

100

101

for records in walk_together(caller1_reader, caller2_reader, caller3_reader):

102

gatk_record, freebayes_record, varscan_record = records

103

104

# Count how many callers found this variant

105

called_by = sum(1 for record in records if record is not None)

106

107

if called_by >= 2: # Concordant if found by 2+ callers

108

# Use the record with highest quality

109

best_record = max([r for r in records if r], key=lambda x: x.QUAL or 0)

110

concordant_variants.append(best_record)

111

elif called_by == 1: # Discordant if found by only 1 caller

112

solo_record = next(r for r in records if r)

113

discordant_variants.append(solo_record)

114

115

print(f"Concordant variants: {len(concordant_variants)}")

116

print(f"Discordant variants: {len(discordant_variants)}")

117

118

# Write high-confidence variants (found by multiple callers)

119

template_reader = vcf.Reader(filename='gatk_variants.vcf')

120

with open('high_confidence.vcf', 'w') as output:

121

writer = vcf.Writer(output, template_reader)

122

for variant in concordant_variants:

123

variant.add_info('CALLERS', called_by) # Add metadata

124

writer.write_record(variant)

125

writer.close()

126

```

127

128

### Custom Record Sorting

129

130

```python

131

import vcf

132

from vcf.utils import walk_together

133

134

def custom_sort_key(record):

135

"""Custom sorting function prioritizing by chromosome, then position."""

136

# Convert chromosome names to sortable format

137

chrom = record.CHROM

138

if chrom.startswith('chr'):

139

chrom = chrom[3:]

140

141

# Handle sex chromosomes and others

142

if chrom == 'X':

143

chrom_num = 23

144

elif chrom == 'Y':

145

chrom_num = 24

146

elif chrom == 'MT' or chrom == 'M':

147

chrom_num = 25

148

else:

149

try:

150

chrom_num = int(chrom)

151

except ValueError:

152

chrom_num = 26 # Unknown chromosomes last

153

154

return (chrom_num, record.POS)

155

156

# Use custom sorting for non-standard VCF files

157

reader1 = vcf.Reader(filename='unsorted1.vcf')

158

reader2 = vcf.Reader(filename='unsorted2.vcf')

159

160

for records in walk_together(reader1, reader2, vcf_record_sort_key=custom_sort_key):

161

record1, record2 = records

162

# Process synchronized records with custom sort order

163

pass

164

```