or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-structures.mdindex.mdio.mdstatistics.mdutilities.md

io.mddocs/

0

# I/O Operations

1

2

Scikit-allel provides comprehensive file I/O capabilities for working with standard genetic data formats including VCF, GFF, and FASTA files, with support for large datasets and various output formats.

3

4

## VCF File Operations

5

6

### Reading VCF Files

7

8

Read VCF (Variant Call Format) files into structured arrays.

9

10

```python

11

import allel

12

13

# Read entire VCF file

14

variants, samples, genotypes = allel.read_vcf('data.vcf')

15

16

# Read specific fields

17

variants, samples, genotypes = allel.read_vcf(

18

'data.vcf',

19

fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']

20

)

21

22

# Read with region filtering

23

data = allel.read_vcf('data.vcf', region='chr1:1000000-2000000')

24

25

# Read with sample selection

26

data = allel.read_vcf('data.vcf', samples=['sample1', 'sample2', 'sample3'])

27

```

28

{ .api }

29

30

**Key Functions:**

31

32

```python

33

def read_vcf(input, fields=None, exclude_fields=None, rename_fields=None,

34

types=None, numbers=None, alt_number=DEFAULT_ALT_NUMBER, fills=None,

35

region=None, tabix='tabix', samples=None, transformers=None,

36

buffer_size=DEFAULT_BUFFER_SIZE, chunk_length=DEFAULT_CHUNK_LENGTH, log=None):

37

"""

38

Read a VCF file into structured arrays.

39

40

Args:

41

input (str): Path to VCF file or file-like object

42

fields (list, optional): Specific fields to read (e.g., ['variants/CHROM', 'calldata/GT'])

43

exclude_fields (list, optional): Fields to exclude from reading

44

rename_fields (dict, optional): Mapping to rename fields

45

types (dict, optional): Data types for specific fields

46

numbers (dict, optional): Number specifications for INFO fields

47

alt_number (int): Number of alternate alleles to read (default: DEFAULT_ALT_NUMBER)

48

fills (dict, optional): Fill values for missing data

49

region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')

50

tabix (str): Path to tabix executable for indexed files

51

samples (list, optional): Specific samples to include

52

transformers (dict, optional): Custom transformation functions

53

buffer_size (int): Buffer size for file reading

54

chunk_length (int): Number of variants to read per chunk

55

log (file, optional): Log file for messages

56

57

Returns:

58

tuple: (variants, samples, calldata) structured arrays

59

"""

60

61

def iter_vcf_chunks(input, fields=None, types=None, numbers=None, alt_number=None,

62

chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,

63

samples=None, transformers=None, log=None):

64

"""

65

Iterate over VCF file in chunks for memory-efficient processing.

66

67

Args:

68

input (str): Path to VCF file

69

fields (list, optional): Fields to read

70

chunk_length (int): Variants per chunk

71

buffer_size (int): File buffer size

72

region (str, optional): Genomic region

73

samples (list, optional): Sample subset

74

transformers (dict, optional): Field transformers

75

log (file, optional): Log file

76

77

Yields:

78

tuple: (variants, samples, calldata) for each chunk

79

"""

80

81

def read_vcf_headers(input):

82

"""

83

Read only the header information from a VCF file.

84

85

Args:

86

input (str): Path to VCF file or file-like object

87

88

Returns:

89

dict: Header information including metadata and column names

90

"""

91

```

92

{ .api }

93

94

### VCF Format Conversion

95

96

Convert VCF files to other formats for analysis or storage.

97

98

```python

99

# Convert to NumPy compressed format

100

allel.vcf_to_npz('input.vcf', 'output.npz', compression='gzip')

101

102

# Convert to HDF5

103

allel.vcf_to_hdf5('input.vcf', 'output.h5', group='/variants',

104

compression='gzip', chunks=True)

105

106

# Convert to Zarr format

107

allel.vcf_to_zarr('input.vcf', 'output.zarr', compressor='blosc')

108

109

# Convert to pandas DataFrame

110

df = allel.vcf_to_dataframe('input.vcf', alt_number=1)

111

112

# Convert to CSV

113

allel.vcf_to_csv('input.vcf', 'output.csv', write_header=True)

114

```

115

{ .api }

116

117

```python

118

def vcf_to_npz(input, output, fields=None, types=None, numbers=None, alt_number=None,

119

chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE, region=None,

120

samples=None, transformers=None, log=None, compression='gzip'):

121

"""

122

Convert VCF file to NumPy compressed format.

123

124

Args:

125

input (str): Input VCF file path

126

output (str): Output NPZ file path

127

fields (list, optional): Fields to include

128

compression (str): Compression method ('gzip', 'bz2', 'lzma')

129

130

Returns:

131

None

132

"""

133

134

def vcf_to_hdf5(input, output, group='/', fields=None, types=None, numbers=None,

135

alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,

136

region=None, samples=None, transformers=None, log=None,

137

compression='gzip', compression_opts=1, chunks=True, fletcher32=False, shuffle=False):

138

"""

139

Convert VCF file to HDF5 format.

140

141

Args:

142

input (str): Input VCF file path

143

output (str): Output HDF5 file path

144

group (str): HDF5 group to write to

145

compression (str): Compression method

146

compression_opts (int): Compression level

147

chunks (bool or tuple): Chunking strategy

148

fletcher32 (bool): Enable Fletcher32 checksum

149

shuffle (bool): Enable shuffle filter

150

151

Returns:

152

None

153

"""

154

155

def vcf_to_zarr(input, output, group='', fields=None, types=None, numbers=None,

156

alt_number=None, chunk_length=1000, buffer_size=DEFAULT_BUFFER_SIZE,

157

region=None, samples=None, transformers=None, log=None,

158

compressor=None, chunks=True):

159

"""

160

Convert VCF file to Zarr format.

161

162

Args:

163

input (str): Input VCF file path

164

output (str): Output Zarr path

165

group (str): Zarr group name

166

compressor (object, optional): Compression algorithm (e.g., blosc.Blosc())

167

chunks (bool or tuple): Chunking strategy

168

169

Returns:

170

None

171

"""

172

173

def vcf_to_dataframe(input, fields=None, alt_number=None, chunk_length=1000,

174

buffer_size=DEFAULT_BUFFER_SIZE, region=None, samples=None,

175

transformers=None, log=None):

176

"""

177

Convert VCF file to pandas DataFrame.

178

179

Args:

180

input (str): Input VCF file path

181

fields (list, optional): Fields to include

182

alt_number (int, optional): Number of alternate alleles

183

184

Returns:

185

pandas.DataFrame: VCF data as DataFrame

186

"""

187

```

188

{ .api }

189

190

### Writing VCF Files

191

192

Write genetic data back to VCF format.

193

194

```python

195

# Write arrays to VCF file

196

allel.write_vcf('output.vcf', variants, calldata, samples=sample_names,

197

meta={'fileformat': 'VCFv4.2'})

198

199

# Customize field descriptions

200

descriptions = {

201

'DP': 'Total read depth',

202

'GQ': 'Genotype quality'

203

}

204

allel.write_vcf('output.vcf', variants, calldata, description=descriptions)

205

```

206

{ .api }

207

208

```python

209

def write_vcf(path, variants, calldata, samples=None, meta=None, rename=None,

210

number=None, description=None, fill=None, write_header=True):

211

"""

212

Write variant data to VCF format.

213

214

Args:

215

path (str): Output VCF file path

216

variants (dict): Variant information arrays

217

calldata (dict): Call data arrays (per-sample information)

218

samples (list, optional): Sample names

219

meta (dict, optional): Metadata for VCF header

220

rename (dict, optional): Field name mappings

221

number (dict, optional): Number specifications for INFO fields

222

description (dict, optional): Field descriptions for header

223

fill (dict, optional): Fill values for missing data

224

write_header (bool): Whether to write VCF header

225

226

Returns:

227

None

228

"""

229

```

230

{ .api }

231

232

## GFF File Operations

233

234

### Reading GFF Files

235

236

Parse GFF3 (General Feature Format) files for genomic annotations.

237

238

```python

239

# Read GFF3 file into record array

240

features = allel.gff3_to_recarray('annotations.gff3')

241

242

# Read specific genomic region

243

features = allel.gff3_to_recarray('annotations.gff3', region='chr1:1000000-2000000')

244

245

# Convert to pandas DataFrame for easier manipulation

246

df = allel.gff3_to_dataframe('annotations.gff3')

247

248

# Iterate over GFF records for memory efficiency

249

for record in allel.iter_gff3('annotations.gff3'):

250

print(record['type'], record['start'], record['end'])

251

```

252

{ .api }

253

254

```python

255

def gff3_to_recarray(path, attributes=None, region=None, score_fill=-1):

256

"""

257

Read GFF3 file into structured record array.

258

259

Args:

260

path (str): Path to GFF3 file

261

attributes (list, optional): Attribute names to parse from INFO field

262

region (str, optional): Genomic region to read (e.g., 'chr1:1000-2000')

263

score_fill (float, optional): Fill value for missing scores

264

265

Returns:

266

numpy.recarray: Structured array with GFF3 fields

267

"""

268

269

def gff3_to_dataframe(input, region=None):

270

"""

271

Read GFF3 file into pandas DataFrame.

272

273

Args:

274

input (str): Path to GFF3 file

275

region (str, optional): Genomic region to read

276

277

Returns:

278

pandas.DataFrame: GFF3 data as DataFrame

279

"""

280

281

def iter_gff3(path, attributes=None, region=None, score_fill=-1):

282

"""

283

Iterate over GFF3 records from file.

284

285

Args:

286

path (str): Path to GFF3 file

287

attributes (list, optional): Attribute names to parse

288

region (str, optional): Genomic region filter

289

score_fill (float, optional): Fill value for missing scores

290

291

Yields:

292

dict: GFF3 record as dictionary with standard fields

293

"""

294

```

295

{ .api }

296

297

## FASTA File Operations

298

299

### Reading FASTA Files

300

301

Read FASTA format sequences for reference genomes and sequence analysis.

302

303

```python

304

# Write sequences to FASTA file

305

sequences = {'chr1': 'ATCGATCG', 'chr2': 'GCTAGCTA'}

306

allel.write_fasta('output.fasta', sequences, ['chr1', 'chr2'])

307

```

308

{ .api }

309

310

```python

311

def write_fasta(path, sequences, names, mode='w', width=80):

312

"""

313

Write sequences to FASTA format file.

314

315

Args:

316

path (str): Output FASTA file path

317

sequences (dict or list): Sequences to write

318

names (list): Sequence names/identifiers

319

mode (str): File open mode ('w' for write, 'a' for append)

320

width (int): Line width for sequence wrapping

321

322

Returns:

323

None

324

"""

325

326

327

```

328

{ .api }

329

330

## File Format Utilities

331

332

### Format Detection and Handling

333

334

Utilities for working with different file formats and sources.

335

336

```python

337

# Handle compressed files automatically

338

data = allel.read_vcf('data.vcf.gz') # Automatically handles gzip

339

data = allel.read_vcf('data.vcf.bz2') # Automatically handles bzip2

340

341

# Read from URLs

342

data = allel.read_vcf('https://example.com/data.vcf.gz')

343

344

# Custom buffer sizes for large files

345

data = allel.read_vcf('large_file.vcf', buffer_size=1048576) # 1MB buffer

346

```

347

{ .api }

348

349

## Data Import Examples

350

351

### Complete Workflow Examples

352

353

```python

354

# Example 1: Basic VCF to analysis workflow

355

import allel

356

import numpy as np

357

358

# Read VCF file

359

variants, samples, genotypes = allel.read_vcf(

360

'my_data.vcf',

361

fields=['variants/CHROM', 'variants/POS', 'variants/REF', 'variants/ALT', 'calldata/GT']

362

)

363

364

# Create genotype array

365

g = allel.GenotypeArray(genotypes['GT'])

366

367

# Basic quality control

368

ac = g.count_alleles()

369

segregating = ac.is_segregating()

370

biallelic = ac.allelism() == 2

371

372

# Filter to high-quality, biallelic, segregating variants

373

mask = segregating & biallelic

374

g_filtered = g[mask]

375

variants_filtered = {k: v[mask] for k, v in variants.items()}

376

377

# Example 2: Large file processing with chunking

378

def process_large_vcf(filename):

379

results = []

380

for variants, samples, calldata in allel.iter_vcf_chunks(filename, chunk_length=10000):

381

g = allel.GenotypeArray(calldata['GT'])

382

ac = g.count_alleles()

383

diversity = allel.mean_pairwise_difference(ac)

384

results.extend(diversity)

385

return np.array(results)

386

387

# Example 3: Multi-format workflow

388

# Read VCF and annotations

389

variants, samples, genotypes = allel.read_vcf('variants.vcf')

390

annotations = allel.read_gff3('genes.gff3')

391

reference = allel.read_fasta('reference.fasta')

392

393

# Convert to efficient storage format

394

allel.vcf_to_zarr('variants.vcf', 'variants.zarr', compressor='blosc')

395

```

396

{ .api }

397

398

## Memory Management

399

400

### Efficient Data Loading

401

402

Handle large datasets efficiently with selective loading and chunking.

403

404

```python

405

# Read only essential fields to save memory

406

minimal_data = allel.read_vcf(

407

'large_file.vcf',

408

fields=['variants/CHROM', 'variants/POS', 'calldata/GT']

409

)

410

411

# Process in chunks for memory efficiency

412

def analyze_in_chunks(vcf_file, chunk_size=5000):

413

all_stats = []

414

for chunk in allel.iter_vcf_chunks(vcf_file, chunk_length=chunk_size):

415

variants, samples, calldata = chunk

416

g = allel.GenotypeArray(calldata['GT'])

417

stats = g.count_alleles()

418

all_stats.append(stats)

419

return all_stats

420

421

# Use region-based loading for targeted analysis

422

region_data = allel.read_vcf('genome.vcf', region='chr22:20000000-25000000')

423

```

424

{ .api }

425

426

## Error Handling

427

428

### Common I/O Issues

429

430

Handle common file format and data issues gracefully.

431

432

```python

433

try:

434

# Attempt to read VCF with error handling

435

data = allel.read_vcf('data.vcf')

436

except Exception as e:

437

print(f"Error reading VCF: {e}")

438

# Try with different parameters

439

data = allel.read_vcf('data.vcf', alt_number=1)

440

441

# Validate data after reading

442

def validate_genetic_data(variants, genotypes):

443

"""Validate genetic data integrity."""

444

if len(variants['POS']) != genotypes['GT'].shape[0]:

445

raise ValueError("Mismatch between variants and genotypes")

446

447

# Check for reasonable chromosome names

448

valid_chroms = set([str(i) for i in range(1, 23)] + ['X', 'Y', 'MT'])

449

invalid_chroms = set(variants['CHROM']) - valid_chroms

450

if invalid_chroms:

451

print(f"Warning: Unusual chromosome names found: {invalid_chroms}")

452

453

return True

454

```

455

{ .api }