or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

constants.md genotype-analysis.md index.md sample-filtering.md utils.md variant-records.md vcf-filtering.md vcf-parsing.md vcf-writing.md

docs/sample-filtering.md

0

# Sample-Based Filtering

1

2

Filter a VCF file by sample while it is being parsed, producing subset files that contain only the selected samples, for use in population studies and cohort analysis.

3

4

## Capabilities

5

6

### Sample Filter

7

8

Filter VCF files by sample names to create subset files containing only specified samples.

9

10

```python { .api }

11

class SampleFilter:

12

def __init__(self, infile, outfile=None, filters=None, invert=False):

13

"""

14

Initialize sample filter for VCF files.

15

16

Parameters:

17

- infile: str or file-like, input VCF file

18

- outfile: str or file-like, optional output file

19

- filters: list of str, sample names to include/exclude

20

- invert: bool, if True exclude listed samples, if False include only listed samples

21

"""

22

23

def set_filters(self, filters=None, invert=False):

24

"""

25

Set sample filters after initialization.

26

27

Parameters:

28

- filters: list of str, sample names to include/exclude

29

- invert: bool, if True exclude listed samples, if False include only listed samples

30

"""

31

32

def write(self, outfile=None):

33

"""

34

Write filtered VCF to output file.

35

36

Parameters:

37

- outfile: str or file-like, output file (uses initialization value if None)

38

"""

39

40

# Properties

41

samples: list # Original sample list

42

parser: 'Reader' # Modified Reader instance with filtered samples

43

```

44

45

### Usage Examples

46

47

```python

48

import vcf

49

from vcf.sample_filter import SampleFilter

50

51

# Include specific samples

52

filter_obj = SampleFilter(

53

infile='input.vcf',

54

outfile='subset.vcf',

55

filters=['SAMPLE1', 'SAMPLE2', 'SAMPLE3'],

56

invert=False

57

)

58

filter_obj.write()

59

60

# Exclude specific samples

61

filter_obj = SampleFilter(

62

infile='input.vcf',

63

outfile='filtered.vcf',

64

filters=['BAD_SAMPLE1', 'BAD_SAMPLE2'],

65

invert=True

66

)

67

filter_obj.write()

68

69

# Use with file handles

70

with open('input.vcf', 'r') as infile, open('output.vcf', 'w') as outfile:

71

filter_obj = SampleFilter(

72

infile=infile,

73

outfile=outfile,

74

filters=['KEEP1', 'KEEP2']

75

)

76

filter_obj.write()

77

78

# Dynamic filtering

79

filter_obj = SampleFilter('large_cohort.vcf')

80

81

# Filter to population 1

82

filter_obj.set_filters(['POP1_001', 'POP1_002', 'POP1_003'], invert=False)

83

filter_obj.write('population1.vcf')

84

85

# Filter to population 2

86

filter_obj.set_filters(['POP2_001', 'POP2_002', 'POP2_003'], invert=False)

87

filter_obj.write('population2.vcf')

88

89

# Access filtered parser

90

filter_obj = SampleFilter('input.vcf', filters=['SAMPLE1', 'SAMPLE2'])

91

reader = filter_obj.parser

92

93

print("Filtered samples:", reader.samples)

94

for record in reader:

95

print(f"Variant {record.CHROM}:{record.POS} has {len(record.samples)} sample calls")

96

```

97

98

### Population Analysis Example

99

100

```python

101

import vcf

102

from vcf.sample_filter import SampleFilter

103

104

# Define population groups

105

populations = {

106

'EUR': ['EUR001', 'EUR002', 'EUR003', 'EUR004', 'EUR005'],

107

'ASN': ['ASN001', 'ASN002', 'ASN003', 'ASN004', 'ASN005'],

108

'AFR': ['AFR001', 'AFR002', 'AFR003', 'AFR004', 'AFR005']

109

}

110

111

input_file = 'multi_population.vcf'

112

113

# Create population-specific VCF files

114

for pop_name, sample_list in populations.items():

115

output_file = f'{pop_name.lower()}_variants.vcf'

116

117

filter_obj = SampleFilter(

118

infile=input_file,

119

outfile=output_file,

120

filters=sample_list,

121

invert=False

122

)

123

filter_obj.write()

124

print(f"Created {output_file} with {len(sample_list)} samples")

125

126

# Analyze each population

127

for pop_name, sample_list in populations.items():

128

filter_obj = SampleFilter(input_file, filters=sample_list)

129

reader = filter_obj.parser

130

131

variant_count = 0

132

high_freq_variants = 0

133

134

for record in reader:

135

variant_count += 1

136

137

# Count the variant as common if any alternate allele frequency in this population exceeds the threshold

138

if record.aaf and max(record.aaf) > 0.1: # >10% frequency

139

high_freq_variants += 1

140

141

print(f"{pop_name}: {variant_count} variants, {high_freq_variants} common variants")

142

```

143

144

### Command Line Usage

145

146

```bash

147

# Command line sample filtering

148

vcf_sample_filter.py --include SAMPLE1,SAMPLE2,SAMPLE3 input.vcf output.vcf

149

vcf_sample_filter.py --exclude BAD1,BAD2 input.vcf filtered.vcf

150

```

151

152

### Advanced Sample Selection

153

154

```python

155

import vcf

156

from vcf.sample_filter import SampleFilter

157

158

# Read original file to inspect samples

159

reader = vcf.Reader(filename='input.vcf')

160

all_samples = reader.samples

161

print(f"Total samples: {len(all_samples)}")

162

163

# Filter samples by naming pattern

164

case_samples = [s for s in all_samples if s.startswith('CASE_')]

165

control_samples = [s for s in all_samples if s.startswith('CTRL_')]

166

167

print(f"Cases: {len(case_samples)}, Controls: {len(control_samples)}")

168

169

# Create case-only VCF

170

case_filter = SampleFilter(

171

infile='input.vcf',

172

outfile='cases_only.vcf',

173

filters=case_samples

174

)

175

case_filter.write()

176

177

# Create control-only VCF

178

control_filter = SampleFilter(

179

infile='input.vcf',

180

outfile='controls_only.vcf',

181

filters=control_samples

182

)

183

control_filter.write()

184

185

# Random sample subset

186

import random

187

random_samples = random.sample(all_samples, min(100, len(all_samples)))

188

189

random_filter = SampleFilter(

190

infile='input.vcf',

191

outfile='random_subset.vcf',

192

filters=random_samples

193

)

194

random_filter.write()

195

```