or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

alignment-files.mdbgzf-files.mdcommand-tools.mdindex.mdsequence-files.mdtabix-files.mdutilities.mdvariant-files.md

sequence-files.mddocs/

0

# FASTA/FASTQ Sequence Files

1

2

Support for reading FASTA and FASTQ sequence files with both random access (indexed FASTA) and streaming capabilities. Handles sequence extraction, quality scores, and efficient iteration over large files.

3

4

## Capabilities

5

6

### FastaFile

7

8

Random access to indexed FASTA files using faidx index for efficient sequence retrieval.

9

10

```python { .api }

11

class FastaFile:

12

def __init__(self, filename, filepath_index=None, one_based_attributes=True, as_raw=False, reference_filename=None, duplicate_filehandle=True):

13

"""

14

Open indexed FASTA file for random access.

15

16

Parameters:

17

- filename: str, path to FASTA file

18

- filepath_index: str, path to .fai index file

19

- one_based_attributes: bool, use 1-based coordinates for attributes

20

- as_raw: bool, return sequences as bytes

21

- reference_filename: str, alternative name for file

22

- duplicate_filehandle: bool, allow multiple handles

23

24

Returns:

25

FastaFile object

26

"""

27

28

def fetch(self, reference, start=None, end=None, region=None):

29

"""

30

Fetch sequence from region.

31

32

Parameters:

33

- reference: str, sequence name

34

- start: int, start position (0-based or 1-based)

35

- end: int, end position (0-based or 1-based)

36

- region: str, region string (chr:start-end)

37

38

Returns:

39

str, sequence string

40

"""

41

42

def get_reference_length(self, reference):

43

"""

44

Get length of reference sequence.

45

46

Parameters:

47

- reference: str, sequence name

48

49

Returns:

50

int, sequence length

51

"""

52

53

def close(self):

54

"""Close the file."""

55

56

# Properties

57

@property

58

def filename(self) -> str:

59

"""FASTA filename."""

60

61

@property

62

def references(self) -> tuple:

63

"""Reference sequence names."""

64

65

@property

66

def nreferences(self) -> int:

67

"""Number of reference sequences."""

68

69

@property

70

def lengths(self) -> tuple:

71

"""Reference sequence lengths."""

72

73

@property

74

def is_open(self) -> bool:

75

"""True if file is open."""

76

77

def __contains__(self, reference):

78

"""Check if reference exists."""

79

80

def __len__(self):

81

"""Number of references."""

82

83

def keys(self):

84

"""

85

Get reference names.

86

87

Returns:

88

Iterator of reference names

89

"""

90

```

91

92

### FastxFile

93

94

Streaming access to FASTA and FASTQ files for sequential reading without indexing.

95

96

```python { .api }

97

class FastxFile:

98

def __init__(self, filename, mode="r", persist=True):

99

"""

100

Open FASTA/FASTQ file for streaming.

101

102

Parameters:

103

- filename: str, path to FASTA/FASTQ file

104

- mode: str, file mode ('r' for reading)

105

- persist: bool, keep file handle open between reads

106

107

Returns:

108

FastxFile object

109

"""

110

111

def __iter__(self):

112

"""

113

Iterate over sequences.

114

115

Returns:

116

Iterator of FastxRecord objects

117

"""

118

119

def __enter__(self):

120

"""Context manager entry."""

121

122

def __exit__(self, exc_type, exc_val, exc_tb):

123

"""Context manager exit."""

124

125

def close(self):

126

"""Close the file."""

127

128

@property

129

def filename(self) -> str:

130

"""File name."""

131

132

@property

133

def is_open(self) -> bool:

134

"""True if file is open."""

135

```

136

137

### FastxRecord

138

139

Individual sequence record from FASTA or FASTQ file with sequence and optional quality data.

140

141

```python { .api }

142

class FastxRecord:

143

def __init__(self, name=None, sequence=None, comment=None, quality=None):

144

"""

145

Create sequence record.

146

147

Parameters:

148

- name: str, sequence name/identifier

149

- sequence: str, sequence data

150

- comment: str, comment line

151

- quality: str, quality scores (FASTQ only)

152

"""

153

154

@property

155

def name(self) -> str:

156

"""Sequence name/identifier."""

157

158

@name.setter

159

def name(self, value: str):

160

"""Set sequence name."""

161

162

@property

163

def sequence(self) -> str:

164

"""Sequence data."""

165

166

@sequence.setter

167

def sequence(self, value: str):

168

"""Set sequence data."""

169

170

@property

171

def comment(self) -> str:

172

"""Comment/description."""

173

174

@comment.setter

175

def comment(self, value: str):

176

"""Set comment."""

177

178

@property

179

def quality(self) -> str:

180

"""Quality string (FASTQ only)."""

181

182

@quality.setter

183

def quality(self, value: str):

184

"""Set quality string."""

185

186

def __len__(self):

187

"""Sequence length."""

188

189

def __str__(self):

190

"""String representation."""

191

192

def get_quality_array(self, offset=33):

193

"""

194

Get quality scores as array.

195

196

Parameters:

197

- offset: int, quality score offset (33 for Sanger, 64 for Illumina 1.3+)

198

199

Returns:

200

list, quality scores as integers

201

"""

202

```

203

204

### FastqProxy (Legacy)

205

206

Fast read-only access to FASTQ entries for compatibility with older versions.

207

208

```python { .api }

209

class FastqProxy:

210

@property

211

def name(self) -> str:

212

"""Sequence name."""

213

214

@property

215

def sequence(self) -> str:

216

"""Sequence data."""

217

218

@property

219

def comment(self) -> str:

220

"""Comment line."""

221

222

@property

223

def quality(self) -> str:

224

"""Quality string."""

225

226

def get_quality_array(self, offset=33):

227

"""

228

Get quality scores as array.

229

230

Returns:

231

list, quality scores

232

"""

233

```

234

235

### Legacy Classes

236

237

Compatibility classes for older pysam versions.

238

239

```python { .api }

240

class Fastafile:

241

"""Legacy alias for FastaFile."""

242

def __init__(self, *args, **kwargs):

243

"""Same as FastaFile.__init__."""

244

245

class FastqFile:

246

"""Legacy alias for FastxFile."""

247

def __init__(self, *args, **kwargs):

248

"""Same as FastxFile.__init__."""

249

```

250

251

## Usage Examples

252

253

### Reading Indexed FASTA Files

254

255

```python

256

import pysam

257

258

# Random access to indexed FASTA

259

with pysam.FastaFile("reference.fa") as fastafile:

260

# Get full sequence

261

sequence = fastafile.fetch("chr1")

262

print(f"chr1 length: {len(sequence)}")

263

264

# Get subsequence

265

region_seq = fastafile.fetch("chr1", 1000, 2000)

266

print(f"Region sequence: {region_seq}")

267

268

# Check available references

269

for ref in fastafile.references:

270

length = fastafile.get_reference_length(ref)

271

print(f"{ref}: {length} bp")

272

273

# Use region string

274

seq = fastafile.fetch(region="chr1:1000-2000")

275

print(f"Fetched: {seq}")

276

```

277

278

### Streaming FASTA/FASTQ Files

279

280

```python

281

import pysam

282

283

# Read FASTA file sequentially

284

with pysam.FastxFile("sequences.fa") as fastafile:

285

for record in fastafile:

286

print(f">{record.name}")

287

print(f"Length: {len(record.sequence)}")

288

print(f"Sequence: {record.sequence[:50]}...")

289

if record.comment:

290

print(f"Comment: {record.comment}")

291

292

# Read FASTQ file with quality scores

293

with pysam.FastxFile("reads.fastq") as fastqfile:

294

for record in fastqfile:

295

print(f"@{record.name}")

296

print(f"Sequence: {record.sequence}")

297

print(f"Quality: {record.quality}")

298

299

# Convert quality to numeric scores

300

qual_scores = record.get_quality_array()

301

avg_qual = sum(qual_scores) / len(qual_scores)

302

print(f"Average quality: {avg_qual:.2f}")

303

```

304

305

### Processing and Filtering

306

307

```python

308

import pysam

309

310

# Filter sequences by length and quality

311

def filter_reads(input_file, output_file, min_length=50, min_avg_qual=20):

312

with pysam.FastxFile(input_file) as infile:

313

with open(output_file, 'w') as outfile:

314

for record in infile:

315

# Check length

316

if len(record.sequence) < min_length:

317

continue

318

319

# Check average quality (FASTQ only)

320

if record.quality:

321

qual_scores = record.get_quality_array()

322

avg_qual = sum(qual_scores) / len(qual_scores)

323

if avg_qual < min_avg_qual:

324

continue

325

326

# Write filtered record

327

if record.quality: # FASTQ format

328

outfile.write(f"@{record.name}\n")

329

outfile.write(f"{record.sequence}\n")

330

outfile.write("+\n")

331

outfile.write(f"{record.quality}\n")

332

else: # FASTA format

333

outfile.write(f">{record.name}\n")

334

outfile.write(f"{record.sequence}\n")

335

336

# Usage

337

filter_reads("input.fastq", "filtered.fastq", min_length=100, min_avg_qual=25)

338

```

339

340

### Sequence Analysis

341

342

```python

343

import pysam

344

from collections import Counter

345

346

# Analyze sequence composition

347

def analyze_composition(fasta_file):

348

composition = Counter()

349

total_length = 0

350

351

with pysam.FastxFile(fasta_file) as fastafile:

352

for record in fastafile:

353

sequence = record.sequence.upper()

354

composition.update(sequence)

355

total_length += len(sequence)

356

357

print(f"Total length: {total_length:,} bp")

358

print("Base composition:")

359

for base in ['A', 'T', 'G', 'C', 'N']:

360

count = composition[base]

361

percent = (count / total_length) * 100

362

print(f" {base}: {count:,} ({percent:.2f}%)")

363

364

gc_content = (composition['G'] + composition['C']) / total_length * 100

365

print(f"GC content: {gc_content:.2f}%")

366

367

# Extract sequences by region file

368

def extract_regions(fasta_file, regions_file, output_file):

369

"""Extract sequences from regions specified in BED-like format."""

370

with pysam.FastaFile(fasta_file) as fastafile:

371

with open(output_file, 'w') as outfile:

372

with open(regions_file) as regions:

373

for line in regions:

374

if line.startswith('#'):

375

continue

376

377

fields = line.strip().split('\t')

378

chrom, start, end = fields[0], int(fields[1]), int(fields[2])

379

name = f"{chrom}:{start}-{end}"

380

381

if len(fields) > 3:

382

name = fields[3]

383

384

# Extract sequence

385

sequence = fastafile.fetch(chrom, start, end)

386

387

# Write FASTA record

388

outfile.write(f">{name}\n")

389

outfile.write(f"{sequence}\n")

390

391

# Usage

392

analyze_composition("genome.fa")

393

extract_regions("genome.fa", "regions.bed", "extracted_sequences.fa")

394

```

395

396

### Quality Score Conversion

397

398

```python

399

import pysam

400

401

def convert_quality_encoding(input_fastq, output_fastq, input_offset=64, output_offset=33):

402

"""Convert FASTQ quality encoding (e.g., Illumina 1.3+ to Sanger)."""

403

with pysam.FastxFile(input_fastq) as infile:

404

with open(output_fastq, 'w') as outfile:

405

for record in infile:

406

# Convert quality scores

407

qual_scores = record.get_quality_array(offset=input_offset)

408

409

# Convert to new encoding

410

new_quality = ''.join(chr(q + output_offset) for q in qual_scores)

411

412

# Write record with new quality

413

outfile.write(f"@{record.name}\n")

414

outfile.write(f"{record.sequence}\n")

415

outfile.write("+\n")

416

outfile.write(f"{new_quality}\n")

417

418

# Convert Illumina 1.3+ to Sanger encoding

419

convert_quality_encoding("illumina_reads.fastq", "sanger_reads.fastq",

420

input_offset=64, output_offset=33)

421

```