0
# FASTA/FASTQ Sequence Files
1
2
Support for reading FASTA and FASTQ sequence files with both random access (indexed FASTA) and streaming capabilities. Handles sequence extraction, quality scores, and efficient iteration over large files.
3
4
## Capabilities
5
6
### FastaFile
7
8
Random access to indexed FASTA files using faidx index for efficient sequence retrieval.
9
10
```python { .api }
11
class FastaFile:
12
def __init__(self, filename, filepath_index=None, one_based_attributes=True, as_raw=False, reference_filename=None, duplicate_filehandle=True):
13
"""
14
Open indexed FASTA file for random access.
15
16
Parameters:
17
- filename: str, path to FASTA file
18
- filepath_index: str, path to .fai index file
19
- one_based_attributes: bool, use 1-based coordinates for attributes
20
- as_raw: bool, return sequences as bytes
21
- reference_filename: str, alternative name for file
22
- duplicate_filehandle: bool, allow multiple handles
23
24
Returns:
25
FastaFile object
26
"""
27
28
def fetch(self, reference, start=None, end=None, region=None):
29
"""
30
Fetch sequence from region.
31
32
Parameters:
33
- reference: str, sequence name
34
- start: int, start position (0-based or 1-based)
35
- end: int, end position (0-based or 1-based)
36
- region: str, region string (chr:start-end)
37
38
Returns:
39
str, sequence string
40
"""
41
42
def get_reference_length(self, reference):
43
"""
44
Get length of reference sequence.
45
46
Parameters:
47
- reference: str, sequence name
48
49
Returns:
50
int, sequence length
51
"""
52
53
def close(self):
54
"""Close the file."""
55
56
# Properties
57
@property
58
def filename(self) -> str:
59
"""FASTA filename."""
60
61
@property
62
def references(self) -> tuple:
63
"""Reference sequence names."""
64
65
@property
66
def nreferences(self) -> int:
67
"""Number of reference sequences."""
68
69
@property
70
def lengths(self) -> tuple:
71
"""Reference sequence lengths."""
72
73
@property
74
def is_open(self) -> bool:
75
"""True if file is open."""
76
77
def __contains__(self, reference):
78
"""Check if reference exists."""
79
80
def __len__(self):
81
"""Number of references."""
82
83
def keys(self):
84
"""
85
Get reference names.
86
87
Returns:
88
Iterator of reference names
89
"""
90
```
91
92
### FastxFile
93
94
Streaming access to FASTA and FASTQ files for sequential reading without indexing.
95
96
```python { .api }
97
class FastxFile:
98
def __init__(self, filename, mode="r", persist=True):
99
"""
100
Open FASTA/FASTQ file for streaming.
101
102
Parameters:
103
- filename: str, path to FASTA/FASTQ file
104
- mode: str, file mode ('r' for reading)
105
- persist: bool, keep file handle open between reads
106
107
Returns:
108
FastxFile object
109
"""
110
111
def __iter__(self):
112
"""
113
Iterate over sequences.
114
115
Returns:
116
Iterator of FastxRecord objects
117
"""
118
119
def __enter__(self):
120
"""Context manager entry."""
121
122
def __exit__(self, exc_type, exc_val, exc_tb):
123
"""Context manager exit."""
124
125
def close(self):
126
"""Close the file."""
127
128
@property
129
def filename(self) -> str:
130
"""File name."""
131
132
@property
133
def is_open(self) -> bool:
134
"""True if file is open."""
135
```
136
137
### FastxRecord
138
139
Individual sequence record from FASTA or FASTQ file with sequence and optional quality data.
140
141
```python { .api }
142
class FastxRecord:
143
def __init__(self, name=None, sequence=None, comment=None, quality=None):
144
"""
145
Create sequence record.
146
147
Parameters:
148
- name: str, sequence name/identifier
149
- sequence: str, sequence data
150
- comment: str, comment line
151
- quality: str, quality scores (FASTQ only)
152
"""
153
154
@property
155
def name(self) -> str:
156
"""Sequence name/identifier."""
157
158
@name.setter
159
def name(self, value: str):
160
"""Set sequence name."""
161
162
@property
163
def sequence(self) -> str:
164
"""Sequence data."""
165
166
@sequence.setter
167
def sequence(self, value: str):
168
"""Set sequence data."""
169
170
@property
171
def comment(self) -> str:
172
"""Comment/description."""
173
174
@comment.setter
175
def comment(self, value: str):
176
"""Set comment."""
177
178
@property
179
def quality(self) -> str:
180
"""Quality string (FASTQ only)."""
181
182
@quality.setter
183
def quality(self, value: str):
184
"""Set quality string."""
185
186
def __len__(self):
187
"""Sequence length."""
188
189
def __str__(self):
190
"""String representation."""
191
192
def get_quality_array(self, offset=33):
193
"""
194
Get quality scores as array.
195
196
Parameters:
197
- offset: int, quality score offset (33 for Sanger, 64 for Illumina 1.3+)
198
199
Returns:
200
list, quality scores as integers
201
"""
202
```
203
204
### FastqProxy (Legacy)
205
206
Fast read-only access to FASTQ entries for compatibility with older versions.
207
208
```python { .api }
209
class FastqProxy:
210
@property
211
def name(self) -> str:
212
"""Sequence name."""
213
214
@property
215
def sequence(self) -> str:
216
"""Sequence data."""
217
218
@property
219
def comment(self) -> str:
220
"""Comment line."""
221
222
@property
223
def quality(self) -> str:
224
"""Quality string."""
225
226
def get_quality_array(self, offset=33):
227
"""
228
Get quality scores as array.
229
230
Returns:
231
list, quality scores
232
"""
233
```
234
235
### Legacy Classes
236
237
Compatibility classes for older pysam versions.
238
239
```python { .api }
240
class Fastafile:
241
"""Legacy alias for FastaFile."""
242
def __init__(self, *args, **kwargs):
243
"""Same as FastaFile.__init__."""
244
245
class FastqFile:
246
"""Legacy alias for FastxFile."""
247
def __init__(self, *args, **kwargs):
248
"""Same as FastxFile.__init__."""
249
```
250
251
## Usage Examples
252
253
### Reading Indexed FASTA Files
254
255
```python
256
import pysam
257
258
# Random access to indexed FASTA
259
with pysam.FastaFile("reference.fa") as fastafile:
260
# Get full sequence
261
sequence = fastafile.fetch("chr1")
262
print(f"chr1 length: {len(sequence)}")
263
264
# Get subsequence
265
region_seq = fastafile.fetch("chr1", 1000, 2000)
266
print(f"Region sequence: {region_seq}")
267
268
# Check available references
269
for ref in fastafile.references:
270
length = fastafile.get_reference_length(ref)
271
print(f"{ref}: {length} bp")
272
273
# Use region string
274
seq = fastafile.fetch(region="chr1:1000-2000")
275
print(f"Fetched: {seq}")
276
```
277
278
### Streaming FASTA/FASTQ Files
279
280
```python
281
import pysam
282
283
# Read FASTA file sequentially
284
with pysam.FastxFile("sequences.fa") as fastafile:
285
for record in fastafile:
286
print(f">{record.name}")
287
print(f"Length: {len(record.sequence)}")
288
print(f"Sequence: {record.sequence[:50]}...")
289
if record.comment:
290
print(f"Comment: {record.comment}")
291
292
# Read FASTQ file with quality scores
293
with pysam.FastxFile("reads.fastq") as fastqfile:
294
for record in fastqfile:
295
print(f"@{record.name}")
296
print(f"Sequence: {record.sequence}")
297
print(f"Quality: {record.quality}")
298
299
# Convert quality to numeric scores
300
qual_scores = record.get_quality_array()
301
avg_qual = sum(qual_scores) / len(qual_scores)
302
print(f"Average quality: {avg_qual:.2f}")
303
```
304
305
### Processing and Filtering
306
307
```python
308
import pysam
309
310
# Filter sequences by length and quality
311
def filter_reads(input_file, output_file, min_length=50, min_avg_qual=20):
312
with pysam.FastxFile(input_file) as infile:
313
with open(output_file, 'w') as outfile:
314
for record in infile:
315
# Check length
316
if len(record.sequence) < min_length:
317
continue
318
319
# Check average quality (FASTQ only)
320
if record.quality:
321
qual_scores = record.get_quality_array()
322
avg_qual = sum(qual_scores) / len(qual_scores)
323
if avg_qual < min_avg_qual:
324
continue
325
326
# Write filtered record
327
if record.quality: # FASTQ format
328
outfile.write(f"@{record.name}\n")
329
outfile.write(f"{record.sequence}\n")
330
outfile.write("+\n")
331
outfile.write(f"{record.quality}\n")
332
else: # FASTA format
333
outfile.write(f">{record.name}\n")
334
outfile.write(f"{record.sequence}\n")
335
336
# Usage
337
filter_reads("input.fastq", "filtered.fastq", min_length=100, min_avg_qual=25)
338
```
339
340
### Sequence Analysis
341
342
```python
343
import pysam
344
from collections import Counter
345
346
# Analyze sequence composition
347
def analyze_composition(fasta_file):
348
composition = Counter()
349
total_length = 0
350
351
with pysam.FastxFile(fasta_file) as fastafile:
352
for record in fastafile:
353
sequence = record.sequence.upper()
354
composition.update(sequence)
355
total_length += len(sequence)
356
357
print(f"Total length: {total_length:,} bp")
358
print("Base composition:")
359
for base in ['A', 'T', 'G', 'C', 'N']:
360
count = composition[base]
361
percent = (count / total_length) * 100
362
print(f" {base}: {count:,} ({percent:.2f}%)")
363
364
gc_content = (composition['G'] + composition['C']) / total_length * 100
365
print(f"GC content: {gc_content:.2f}%")
366
367
# Extract sequences by region file
368
def extract_regions(fasta_file, regions_file, output_file):
369
"""Extract sequences from regions specified in BED-like format."""
370
with pysam.FastaFile(fasta_file) as fastafile:
371
with open(output_file, 'w') as outfile:
372
with open(regions_file) as regions:
373
for line in regions:
374
if line.startswith('#'):
375
continue
376
377
fields = line.strip().split('\t')
378
chrom, start, end = fields[0], int(fields[1]), int(fields[2])
379
name = f"{chrom}:{start}-{end}"
380
381
if len(fields) > 3:
382
name = fields[3]
383
384
# Extract sequence
385
sequence = fastafile.fetch(chrom, start, end)
386
387
# Write FASTA record
388
outfile.write(f">{name}\n")
389
outfile.write(f"{sequence}\n")
390
391
# Usage
392
analyze_composition("genome.fa")
393
extract_regions("genome.fa", "regions.bed", "extracted_sequences.fa")
394
```
395
396
### Quality Score Conversion
397
398
```python
399
import pysam
400
401
def convert_quality_encoding(input_fastq, output_fastq, input_offset=64, output_offset=33):
402
"""Convert FASTQ quality encoding (e.g., Illumina 1.3+ to Sanger)."""
403
with pysam.FastxFile(input_fastq) as infile:
404
with open(output_fastq, 'w') as outfile:
405
for record in infile:
406
# Convert quality scores
407
qual_scores = record.get_quality_array(offset=input_offset)
408
409
# Convert to new encoding
410
new_quality = ''.join(chr(q + output_offset) for q in qual_scores)
411
412
# Write record with new quality
413
outfile.write(f"@{record.name}\n")
414
outfile.write(f"{record.sequence}\n")
415
outfile.write("+\n")
416
outfile.write(f"{new_quality}\n")
417
418
# Convert Illumina 1.3+ to Sanger encoding
419
convert_quality_encoding("illumina_reads.fastq", "sanger_reads.fastq",
420
input_offset=64, output_offset=33)
421
```