Tessl Tile for pypi/pysam@0.23.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

alignment-files.md bgzf-files.md command-tools.md index.md sequence-files.md tabix-files.md utilities.md variant-files.md

sequence-files.mddocs/

0
# FASTA/FASTQ Sequence Files
1

2
Support for reading FASTA and FASTQ sequence files with both random access (indexed FASTA) and streaming capabilities. Handles sequence extraction, quality scores, and efficient iteration over large files.
3

4
## Capabilities
5

6
### FastaFile
7

8
Random access to indexed FASTA files using faidx index for efficient sequence retrieval.
9

10
```python { .api }
11
class FastaFile:
12
    def __init__(self, filename, filepath_index=None, one_based_attributes=True, as_raw=False, reference_filename=None, duplicate_filehandle=True):
13
        """
14
        Open indexed FASTA file for random access.
15
        
16
        Parameters:
17
        - filename: str, path to FASTA file
18
        - filepath_index: str, path to .fai index file
19
        - one_based_attributes: bool, use 1-based coordinates for attributes
20
        - as_raw: bool, return sequences as bytes
21
        - reference_filename: str, alternative name for file
22
        - duplicate_filehandle: bool, allow multiple handles
23
        
24
        Returns:
25
        FastaFile object
26
        """
27
    
28
    def fetch(self, reference, start=None, end=None, region=None):
29
        """
30
        Fetch sequence from region.
31
        
32
        Parameters:
33
        - reference: str, sequence name
34
        - start: int, start position (0-based or 1-based)
35
        - end: int, end position (0-based or 1-based)
36
        - region: str, region string (chr:start-end)
37
        
38
        Returns:
39
        str, sequence string
40
        """
41
    
42
    def get_reference_length(self, reference):
43
        """
44
        Get length of reference sequence.
45
        
46
        Parameters:
47
        - reference: str, sequence name
48
        
49
        Returns:
50
        int, sequence length
51
        """
52
    
53
    def close(self):
54
        """Close the file."""
55
    
56
    # Properties
57
    @property
58
    def filename(self) -> str:
59
        """FASTA filename."""
60
    
61
    @property
62
    def references(self) -> tuple:
63
        """Reference sequence names."""
64
    
65
    @property
66
    def nreferences(self) -> int:
67
        """Number of reference sequences."""
68
    
69
    @property
70
    def lengths(self) -> tuple:
71
        """Reference sequence lengths."""
72
    
73
    @property
74
    def is_open(self) -> bool:
75
        """True if file is open."""
76
    
77
    def __contains__(self, reference):
78
        """Check if reference exists."""
79
    
80
    def __len__(self):
81
        """Number of references."""
82
    
83
    def keys(self):
84
        """
85
        Get reference names.
86
        
87
        Returns:
88
        Iterator of reference names
89
        """
90
```
91

92
### FastxFile
93

94
Streaming access to FASTA and FASTQ files for sequential reading without indexing.
95

96
```python { .api }
97
class FastxFile:
98
    def __init__(self, filename, mode="r", persist=True):
99
        """
100
        Open FASTA/FASTQ file for streaming.
101
        
102
        Parameters:
103
        - filename: str, path to FASTA/FASTQ file
104
        - mode: str, file mode ('r' for reading)
105
        - persist: bool, keep file handle open between reads
106
        
107
        Returns:
108
        FastxFile object
109
        """
110
    
111
    def __iter__(self):
112
        """
113
        Iterate over sequences.
114
        
115
        Returns:
116
        Iterator of FastxRecord objects
117
        """
118
    
119
    def __enter__(self):
120
        """Context manager entry."""
121
    
122
    def __exit__(self, exc_type, exc_val, exc_tb):
123
        """Context manager exit."""
124
    
125
    def close(self):
126
        """Close the file."""
127
    
128
    @property
129
    def filename(self) -> str:
130
        """File name."""
131
    
132
    @property
133
    def is_open(self) -> bool:
134
        """True if file is open."""
135
```
136

137
### FastxRecord
138

139
Individual sequence record from FASTA or FASTQ file with sequence and optional quality data.
140

141
```python { .api }
142
class FastxRecord:
143
    def __init__(self, name=None, sequence=None, comment=None, quality=None):
144
        """
145
        Create sequence record.
146
        
147
        Parameters:
148
        - name: str, sequence name/identifier
149
        - sequence: str, sequence data
150
        - comment: str, comment line
151
        - quality: str, quality scores (FASTQ only)
152
        """
153
    
154
    @property
155
    def name(self) -> str:
156
        """Sequence name/identifier."""
157
    
158
    @name.setter
159
    def name(self, value: str):
160
        """Set sequence name."""
161
    
162
    @property
163
    def sequence(self) -> str:
164
        """Sequence data."""
165
    
166
    @sequence.setter
167
    def sequence(self, value: str):
168
        """Set sequence data."""
169
    
170
    @property
171
    def comment(self) -> str:
172
        """Comment/description."""
173
    
174
    @comment.setter
175
    def comment(self, value: str):
176
        """Set comment."""
177
    
178
    @property
179
    def quality(self) -> str:
180
        """Quality string (FASTQ only)."""
181
    
182
    @quality.setter
183
    def quality(self, value: str):
184
        """Set quality string."""
185
    
186
    def __len__(self):
187
        """Sequence length."""
188
    
189
    def __str__(self):
190
        """String representation."""
191
    
192
    def get_quality_array(self, offset=33):
193
        """
194
        Get quality scores as array.
195
        
196
        Parameters:
197
        - offset: int, quality score offset (33 for Sanger, 64 for Illumina 1.3+)
198
        
199
        Returns:
200
        list, quality scores as integers
201
        """
202
```
203

204
### FastqProxy (Legacy)
205

206
Fast read-only access to FASTQ entries for compatibility with older versions.
207

208
```python { .api }
209
class FastqProxy:
210
    @property
211
    def name(self) -> str:
212
        """Sequence name."""
213
    
214
    @property
215
    def sequence(self) -> str:
216
        """Sequence data."""
217
    
218
    @property
219
    def comment(self) -> str:
220
        """Comment line."""
221
    
222
    @property
223
    def quality(self) -> str:
224
        """Quality string."""
225
    
226
    def get_quality_array(self, offset=33):
227
        """
228
        Get quality scores as array.
229
        
230
        Returns:
231
        list, quality scores
232
        """
233
```
234

235
### Legacy Classes
236

237
Compatibility classes for older pysam versions.
238

239
```python { .api }
240
class Fastafile:
241
    """Legacy alias for FastaFile."""
242
    def __init__(self, *args, **kwargs):
243
        """Same as FastaFile.__init__."""
244

245
class FastqFile:
246
    """Legacy alias for FastxFile.""" 
247
    def __init__(self, *args, **kwargs):
248
        """Same as FastxFile.__init__."""
249
```
250

251
## Usage Examples
252

253
### Reading Indexed FASTA Files
254

255
```python
256
import pysam
257

258
# Random access to indexed FASTA
259
with pysam.FastaFile("reference.fa") as fastafile:
260
    # Get full sequence
261
    sequence = fastafile.fetch("chr1")
262
    print(f"chr1 length: {len(sequence)}")
263
    
264
    # Get subsequence
265
    region_seq = fastafile.fetch("chr1", 1000, 2000)
266
    print(f"Region sequence: {region_seq}")
267
    
268
    # Check available references
269
    for ref in fastafile.references:
270
        length = fastafile.get_reference_length(ref)
271
        print(f"{ref}: {length} bp")
272
    
273
    # Use region string
274
    seq = fastafile.fetch(region="chr1:1000-2000")
275
    print(f"Fetched: {seq}")
276
```
277

278
### Streaming FASTA/FASTQ Files
279

280
```python
281
import pysam
282

283
# Read FASTA file sequentially
284
with pysam.FastxFile("sequences.fa") as fastafile:
285
    for record in fastafile:
286
        print(f">{record.name}")
287
        print(f"Length: {len(record.sequence)}")
288
        print(f"Sequence: {record.sequence[:50]}...")
289
        if record.comment:
290
            print(f"Comment: {record.comment}")
291

292
# Read FASTQ file with quality scores
293
with pysam.FastxFile("reads.fastq") as fastqfile:
294
    for record in fastqfile:
295
        print(f"@{record.name}")
296
        print(f"Sequence: {record.sequence}")
297
        print(f"Quality: {record.quality}")
298
        
299
        # Convert quality to numeric scores
300
        qual_scores = record.get_quality_array()
301
        avg_qual = sum(qual_scores) / len(qual_scores)
302
        print(f"Average quality: {avg_qual:.2f}")
303
```
304

305
### Processing and Filtering
306

307
```python
308
import pysam
309

310
# Filter sequences by length and quality
311
def filter_reads(input_file, output_file, min_length=50, min_avg_qual=20):
312
    with pysam.FastxFile(input_file) as infile:
313
        with open(output_file, 'w') as outfile:
314
            for record in infile:
315
                # Check length
316
                if len(record.sequence) < min_length:
317
                    continue
318
                
319
                # Check average quality (FASTQ only)
320
                if record.quality:
321
                    qual_scores = record.get_quality_array()
322
                    avg_qual = sum(qual_scores) / len(qual_scores)
323
                    if avg_qual < min_avg_qual:
324
                        continue
325
                
326
                # Write filtered record
327
                if record.quality:  # FASTQ format
328
                    outfile.write(f"@{record.name}\n")
329
                    outfile.write(f"{record.sequence}\n")
330
                    outfile.write("+\n")
331
                    outfile.write(f"{record.quality}\n")
332
                else:  # FASTA format
333
                    outfile.write(f">{record.name}\n")
334
                    outfile.write(f"{record.sequence}\n")
335

336
# Usage
337
filter_reads("input.fastq", "filtered.fastq", min_length=100, min_avg_qual=25)
338
```
339

340
### Sequence Analysis
341

342
```python
343
import pysam
344
from collections import Counter
345

346
# Analyze sequence composition
347
def analyze_composition(fasta_file):
348
    composition = Counter()
349
    total_length = 0
350
    
351
    with pysam.FastxFile(fasta_file) as fastafile:
352
        for record in fastafile:
353
            sequence = record.sequence.upper()
354
            composition.update(sequence)
355
            total_length += len(sequence)
356
    
357
    print(f"Total length: {total_length:,} bp")
358
    print("Base composition:")
359
    for base in ['A', 'T', 'G', 'C', 'N']:
360
        count = composition[base]
361
        percent = (count / total_length) * 100
362
        print(f"  {base}: {count:,} ({percent:.2f}%)")
363
    
364
    gc_content = (composition['G'] + composition['C']) / total_length * 100
365
    print(f"GC content: {gc_content:.2f}%")
366

367
# Extract sequences by region file
368
def extract_regions(fasta_file, regions_file, output_file):
369
    """Extract sequences from regions specified in BED-like format."""
370
    with pysam.FastaFile(fasta_file) as fastafile:
371
        with open(output_file, 'w') as outfile:
372
            with open(regions_file) as regions:
373
                for line in regions:
374
                    if line.startswith('#'):
375
                        continue
376
                    
377
                    fields = line.strip().split('\t')
378
                    chrom, start, end = fields[0], int(fields[1]), int(fields[2])
379
                    name = f"{chrom}:{start}-{end}"
380
                    
381
                    if len(fields) > 3:
382
                        name = fields[3]
383
                    
384
                    # Extract sequence
385
                    sequence = fastafile.fetch(chrom, start, end)
386
                    
387
                    # Write FASTA record
388
                    outfile.write(f">{name}\n")
389
                    outfile.write(f"{sequence}\n")
390

391
# Usage
392
analyze_composition("genome.fa")
393
extract_regions("genome.fa", "regions.bed", "extracted_sequences.fa")
394
```
395

396
### Quality Score Conversion
397

398
```python
399
import pysam
400

401
def convert_quality_encoding(input_fastq, output_fastq, input_offset=64, output_offset=33):
402
    """Convert FASTQ quality encoding (e.g., Illumina 1.3+ to Sanger)."""
403
    with pysam.FastxFile(input_fastq) as infile:
404
        with open(output_fastq, 'w') as outfile:
405
            for record in infile:
406
                # Convert quality scores
407
                qual_scores = record.get_quality_array(offset=input_offset)
408
                
409
                # Convert to new encoding
410
                new_quality = ''.join(chr(q + output_offset) for q in qual_scores)
411
                
412
                # Write record with new quality
413
                outfile.write(f"@{record.name}\n")
414
                outfile.write(f"{record.sequence}\n")
415
                outfile.write("+\n")
416
                outfile.write(f"{new_quality}\n")
417

418
# Convert Illumina 1.3+ to Sanger encoding
419
convert_quality_encoding("illumina_reads.fastq", "sanger_reads.fastq", 
420
                        input_offset=64, output_offset=33)
421
```

Version

Tile

Files

sequence-files.md.css-3qkkll{font-size:var(--chakra-font-sizes-sm);font-weight:var(--chakra-font-weights-normal);color:var(--chakra-colors-gray-300);}docs/

sequence-files.mddocs/