or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

constants.md genotype-analysis.md index.md sample-filtering.md utils.md variant-records.md vcf-filtering.md vcf-parsing.md vcf-writing.md

docs/constants.md

0

# Constants and Reserved Fields

1

2

PyVCF constants for VCF specification compliance, field validation, and reserved field definitions for INFO and FORMAT fields.

3

4

## Capabilities

5

6

### Package Version

7

8

```python { .api }

9

VERSION: str # PyVCF version string (e.g., '0.6.8')

10

```

11

12

### Reserved INFO Fields

13

14

Pre-defined INFO field specifications from the VCF 4.0/4.1 specification.

15

16

```python { .api }

17

RESERVED_INFO: dict = {

18

'AA': 'String', # Ancestral Allele

19

'AC': 'Integer', # Allele count in genotypes

20

'AF': 'Float', # Allele Frequency

21

'AN': 'Integer', # Total number of alleles in called genotypes

22

'BQ': 'Float', # Base Quality

23

'CIGAR': 'String', # CIGAR string describing alignment

24

'DB': 'Flag', # dbSNP membership

25

'DP': 'Integer', # Combined depth across samples

26

'END': 'Integer', # End position of variant

27

'H2': 'Flag', # HapMap2 membership

28

'H3': 'Flag', # HapMap3 membership

29

'MQ': 'Float', # Mapping Quality

30

'MQ0': 'Integer', # Number of MAPQ == 0 reads

31

'NS': 'Integer', # Number of samples with data

32

'SB': 'String', # Strand bias

33

'SOMATIC': 'Flag', # Somatic mutation

34

'VALIDATED': 'Flag', # Validated by follow-up experiment

35

'1000G': 'Flag', # 1000 Genomes membership

36

37

# Structural variant INFO fields

38

'IMPRECISE': 'Flag', # Imprecise structural variation

39

'NOVEL': 'Flag', # Novel structural variation

40

'SVEND': 'Integer', # End position of SV

41

'SVLEN': 'Integer', # Length of SV

42

'SVTYPE': 'String', # Type of structural variant

43

'MATEID': 'String', # ID of mate breakend

44

'EVENT': 'String', # ID of associated event

45

'HOMLEN': 'Integer', # Length of base pair homology

46

'DGVID': 'String', # ID from Database of Genomic Variants

47

'DBVARID': 'String', # ID from NCBI dbVar

48

}

49

```

50

51

### Reserved FORMAT Fields

52

53

Pre-defined FORMAT field specifications from the VCF 4.0/4.1 specification.

54

55

```python { .api }

56

RESERVED_FORMAT: dict = {

57

'GT': 'String', # Genotype

58

'DP': 'Integer', # Read depth at this position for this sample

59

'FT': 'String', # Sample genotype filter

60

'GL': 'Float', # Genotype likelihoods

61

'GLE': 'String', # Genotype likelihoods of heterogeneous ploidy

62

'PL': 'Integer', # Phred-scaled genotype likelihoods

63

'GP': 'Float', # Genotype posterior probabilities

64

'GQ': 'Integer', # Conditional genotype quality

65

'HQ': 'Integer', # Haplotype qualities

66

'PS': 'Integer', # Phase set identifier

67

'PQ': 'Integer', # Phasing quality

68

'EC': 'Integer', # Expected alternate allele counts

69

'MQ': 'Integer', # Mapping quality

70

'AD': 'Integer', # Allelic depths

71

'ADF': 'Integer', # Forward strand allelic depths

72

'ADR': 'Integer', # Reverse strand allelic depths

73

74

# Copy number fields

75

'CN': 'Integer', # Copy number genotype

76

'CNQ': 'Float', # Copy number genotype quality

77

'CNL': 'Float', # Copy number genotype likelihood

78

'NQ': 'Integer', # Phred style probability that the variant is novel

79

'HAP': 'Integer', # Unique haplotype identifier

80

'AHAP': 'Integer', # Alternative haplotype identifier

81

}

82

```

83

84

### Field Number Constants

85

86

Constants for interpreting field number specifications in VCF headers.

87

88

```python { .api }

89

field_counts: dict = {

90

'.': None, # Unknown number of values

91

'A': -1, # Number of alternate alleles

92

'G': -2, # Number of genotypes (including reference)

93

'R': -3, # Number of alleles (reference + alternates)

94

}

95

```

96

97

### Usage Examples

98

99

```python

100

import vcf

101

102

# Access version information

103

print(f"PyVCF version: {vcf.VERSION}")

104

105

# Check if INFO field is reserved

106

if 'DP' in vcf.RESERVED_INFO:

107

print(f"DP is a reserved INFO field, type: {vcf.RESERVED_INFO['DP']}")

108

109

# Check if FORMAT field is reserved

110

if 'GT' in vcf.RESERVED_FORMAT:

111

print(f"GT is a reserved FORMAT field, type: {vcf.RESERVED_FORMAT['GT']}")

112

113

# Validate custom field names against reserved fields

114

def validate_custom_info_field(field_name):

115

if field_name in vcf.RESERVED_INFO:

116

print(f"Warning: {field_name} is a reserved INFO field")

117

return False

118

return True

119

120

def validate_custom_format_field(field_name):

121

if field_name in vcf.RESERVED_FORMAT:

122

print(f"Warning: {field_name} is a reserved FORMAT field")

123

return False

124

return True

125

126

# Example validation

127

validate_custom_info_field('CUSTOM_INFO') # OK

128

validate_custom_info_field('DP') # Warning: reserved

129

validate_custom_format_field('CUSTOM_GT') # OK

130

validate_custom_format_field('GT') # Warning: reserved

131

```

132

133

### Working with Reserved Fields

134

135

```python

136

import vcf

137

138

reader = vcf.Reader(filename='variants.vcf')

139

140

for record in reader:

141

# Access standard INFO fields

142

if 'DP' in record.INFO:

143

depth = record.INFO['DP']

144

print(f"Total depth: {depth}")

145

146

if 'AF' in record.INFO:

147

allele_freqs = record.INFO['AF']

148

print(f"Allele frequencies: {allele_freqs}")

149

150

# Check for structural variant INFO fields

151

if record.is_sv:

152

if 'SVTYPE' in record.INFO:

153

sv_type = record.INFO['SVTYPE']

154

print(f"SV type: {sv_type}")

155

156

if 'SVLEN' in record.INFO:

157

sv_length = record.INFO['SVLEN']

158

print(f"SV length: {sv_length}")

159

160

# Access standard FORMAT fields for samples

161

for call in record.samples:

162

if hasattr(call.data, 'GT'):

163

print(f"Sample {call.sample} genotype: {call.data.GT}")

164

165

if hasattr(call.data, 'DP'):

166

print(f"Sample {call.sample} depth: {call.data.DP}")

167

168

if hasattr(call.data, 'GQ'):

169

print(f"Sample {call.sample} quality: {call.data.GQ}")

170

```

171

172

### Header Field Type Validation

173

174

```python

175

import vcf

176

177

def validate_field_definition(field_dict, reserved_dict, field_type):

178

"""Validate field definitions against reserved specifications."""

179

for field_id, field_info in field_dict.items():

180

if field_id in reserved_dict:

181

expected_type = reserved_dict[field_id]

182

actual_type = field_info.type

183

184

if actual_type != expected_type:

185

print(f"Warning: {field_type} field {field_id} "

186

f"type mismatch: expected {expected_type}, "

187

f"got {actual_type}")

188

189

# Validate a VCF file's field definitions

190

reader = vcf.Reader(filename='variants.vcf')

191

192

# Validate INFO field definitions

193

validate_field_definition(reader.infos, vcf.RESERVED_INFO, 'INFO')

194

195

# Validate FORMAT field definitions

196

validate_field_definition(reader.formats, vcf.RESERVED_FORMAT, 'FORMAT')

197

```

198

199

### Custom Field Number Handling

200

201

```python

202

import vcf

203

204

def interpret_field_number(num_str, num_alts=None, num_samples=None):

205

"""Interpret VCF field number specifications."""

206

if num_str == '.':

207

return None # Variable number

208

elif num_str == 'A':

209

return num_alts if num_alts else -1

210

elif num_str == 'G':

211

# Number of genotypes = (n+1)(n+2)/2 where n = number of alternate alleles

212

if num_alts:

213

n_alleles = num_alts + 1 # Include reference

214

return (n_alleles * (n_alleles + 1)) // 2

215

return -2

216

elif num_str == 'R':

217

return (num_alts + 1) if num_alts else -3

218

else:

219

try:

220

return int(num_str)

221

except ValueError:

222

return None

223

224

# Example usage with a record

225

reader = vcf.Reader(filename='variants.vcf')

226

record = next(reader)

227

228

# Interpret field numbers for this record's context

229

num_alts = len(record.ALT)

230

print(f"Number of alternates: {num_alts}")

231

232

for field_id, field_info in reader.infos.items():

233

expected_count = interpret_field_number(field_info.num, num_alts)

234

print(f"INFO {field_id}: expects {expected_count} values")

235

```