Tessl Tile for pypi/gfftk@25.6.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cli-commands.md comparison.md consensus.md format-conversion.md genbank-tbl.md gff-processing.md index.md sequence-operations.md utilities.md

docs/sequence-operations.md

0
# Sequence Operations
1

2
Comprehensive FASTA file parsing and genomic sequence manipulation capabilities, including coordinate-based sequence extraction, translation using multiple genetic codes, reverse complement operations, and efficient sequence access.
3

4
## Capabilities
5

6
### FASTA File Handling
7

8
Object-oriented and functional interfaces for working with FASTA files and sequence data.
9

10
```python { .api }
11
class FASTA:
12
    """FASTA file handler with efficient sequence access."""
13

14
    def __init__(self, fasta_file):
15
        """
16
        Initialize FASTA handler.
17

18
        Parameters:
19
        - fasta_file (str): Path to FASTA file
20
        """
21

22
    def get_seq(self, contig):
23
        """
24
        Get sequence for specified contig.
25

26
        Parameters:
27
        - contig (str): Contig/chromosome name
28

29
        Returns:
30
        str: DNA sequence for the contig
31
        """
32

33
def fastaparser(handle):
34
    """
35
    Parse FASTA file as generator yielding (header, sequence) tuples.
36

37
    Parameters:
38
    - handle (file-like): Open file handle to FASTA file
39

40
    Yields:
41
    tuple: (header, sequence) pairs
42
    """
43

44
def fasta2dict(fasta, full_header=False):
45
    """
46
    Convert FASTA file to dictionary.
47

48
    Parameters:
49
    - fasta (str): Path to FASTA file
50
    - full_header (bool): If True, use the full header as the key; if False, use only the first word
51

52
    Returns:
53
    dict: {header: sequence} mapping
54
    """
55

56
def fasta2headers(fasta, full_header=False):
57
    """
58
    Get FASTA headers as set.
59

60
    Parameters:
61
    - fasta (str): Path to FASTA file
62
    - full_header (bool): If True, return full headers; if False, return only the first word of each header
63

64
    Returns:
65
    set: Set of sequence headers
66
    """
67

68
def fasta2lengths(fasta, full_header=False):
69
    """
70
    Get sequence lengths as dictionary.
71

72
    Parameters:
73
    - fasta (str): Path to FASTA file
74
    - full_header (bool): If True, use the full header as the key; if False, use only the first word
75

76
    Returns:
77
    dict: {header: length} mapping
78
    """
79
```
80

81
### Sequence Extraction
82

83
Extract specific regions from genomic sequences based on coordinates.
84

85
```python { .api }
86
def getSeqRegions(seqs, header, coordinates, coords=False):
87
    """
88
    Extract sequence regions from coordinates.
89

90
    Parameters:
91
    - seqs (dict): Dictionary of sequences
92
    - header (str): Sequence header/contig name
93
    - coordinates (list): List of (start, end) coordinate tuples
94
    - coords (bool): Whether to include coordinate information
95

96
    Returns:
97
    str: Extracted sequence regions concatenated
98
    """
99
```
100

101
### DNA Translation and Manipulation
102

103
Translate DNA sequences to proteins using standard genetic codes and perform sequence manipulations.
104

105
```python { .api }
106
def translate(dna, strand, phase, table=1):
107
    """
108
    Translate DNA sequence to protein using genetic code.
109

110
    Parameters:
111
    - dna (str): DNA sequence to translate
112
    - strand (str): Strand orientation ("+" or "-")
113
    - phase (int): Reading frame phase (0, 1, or 2)
114
    - table (int): Genetic code table (1=standard, 11=bacterial, archaeal and plant plastid)
115

116
    Returns:
117
    str: Translated protein sequence
118
    """
119

120
def RevComp(s):
121
    """
122
    Generate reverse complement of DNA sequence.
123

124
    Parameters:
125
    - s (str): Input DNA sequence
126

127
    Returns:
128
    str: Reverse complement sequence
129
    """
130
```
131

132
### Text Formatting
133

134
Format sequences and text for output with proper line wrapping.
135

136
```python { .api }
137
def softwrap(string, every=80):
138
    """
139
    Soft wrap text to specified width.
140

141
    Parameters:
142
    - string (str): Input string to wrap
143
    - every (int): Line width for wrapping
144

145
    Returns:
146
    str: Wrapped text with newlines
147
    """
148
```
149

150
### Genetic Code Tables
151

152
Access to standard genetic code tables for translation.
153

154
```python { .api }
155
codon_table = {
156
    "1": {
157
        # Standard genetic code table
158
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
159
        # ... (complete codon to amino acid mapping)
160
    },
161
    "11": {
162
        # Bacterial, archaeal and plant plastid genetic code
163
        "TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",
164
        # ... (complete codon to amino acid mapping)
165
    }
166
}
167
```
168

169
## Usage Examples
170

171
### Basic FASTA Operations
172

173
```python
174
from gfftk.fasta import FASTA, fasta2dict
175

176
# Object-oriented approach
177
fasta = FASTA("genome.fasta")
178
chr1_seq = fasta.get_seq("chr1")
179

180
# Functional approach
181
genome = fasta2dict("genome.fasta")
182
chr1_seq = genome["chr1"]
183

184
# Get sequence information
185
from gfftk.fasta import fasta2headers, fasta2lengths
186

187
headers = fasta2headers("genome.fasta")
188
lengths = fasta2lengths("genome.fasta")
189

190
print(f"Number of sequences: {len(headers)}")
191
print(f"Sequence lengths: {lengths}")
192
```
193

194
### Sequence Extraction
195

196
```python
197
from gfftk.fasta import fasta2dict, getSeqRegions
198

199
# Load genome
200
genome = fasta2dict("genome.fasta")
201

202
# Extract specific regions
203
coordinates = [(1000, 2000), (3000, 4000), (5000, 6000)]
204
extracted = getSeqRegions(genome, "chr1", coordinates)
205

206
print(f"Extracted sequence: {extracted}")
207
```
208

209
### DNA Translation
210

211
```python
212
from gfftk.fasta import translate, RevComp
213

214
# Example DNA sequence
215
dna_sequence = "ATGAAGTTTGCCTAG"
216

217
# Translate forward strand
218
protein_forward = translate(dna_sequence, "+", 0, table=1)
219
print(f"Forward translation: {protein_forward}")
220

221
# Translate reverse strand
222
dna_reverse = RevComp(dna_sequence)
223
protein_reverse = translate(dna_reverse, "-", 0, table=1)
224
print(f"Reverse translation: {protein_reverse}")
225

226
# Translate with different genetic code (bacterial)
227
protein_bacterial = translate(dna_sequence, "+", 0, table=11)
228
print(f"Bacterial code translation: {protein_bacterial}")
229

230
# Translate in different reading frames
231
for phase in [0, 1, 2]:
232
    protein = translate(dna_sequence, "+", phase, table=1)
233
    print(f"Phase {phase}: {protein}")
234
```
235

236
### Sequence Processing Pipeline
237

238
```python
239
from gfftk.fasta import FASTA, translate, softwrap
240

241
# Initialize genome access
242
genome = FASTA("genome.fasta")
243

244
# Define gene coordinates (from GFF3 parsing)
245
gene_coords = {
246
    "gene1": {
247
        "contig": "chr1",
248
        "strand": "+",
249
        "cds": [(1000, 1200), (1500, 1700), (2000, 2300)]
250
    }
251
}
252

253
# Extract and translate CDS sequences
254
for gene_id, gene_info in gene_coords.items():
255
    # Get contig sequence
256
    contig_seq = genome.get_seq(gene_info["contig"])
257

258
    # Extract CDS regions
259
    cds_sequence = ""
260
    for start, end in gene_info["cds"]:
261
        cds_sequence += contig_seq[start-1:end]  # Convert to 0-based
262

263
    # Handle reverse strand
264
    if gene_info["strand"] == "-":
265
        from gfftk.fasta import RevComp
266
        cds_sequence = RevComp(cds_sequence)
267

268
    # Translate to protein
269
    protein = translate(cds_sequence, gene_info["strand"], 0, table=1)
270

271
    # Format output
272
    wrapped_protein = softwrap(protein, every=60)
273
    print(f">{gene_id}\n{wrapped_protein}")
274
```
275

276
### Working with Compressed Files
277

278
```python
279
from gfftk.fasta import fasta2dict
280

281
# Works with compressed FASTA files automatically
282
genome = fasta2dict("genome.fasta.gz")
283
genome2 = fasta2dict("genome.fasta.bz2")
284

285
print(f"Loaded {len(genome)} sequences from compressed file")
286
```
287

288
## Types
289

290
```python { .api }
291
# Sequence dictionary format
292
SequenceDict = dict[str, str]  # {header: sequence}
293

294
# Sequence header set
295
HeaderSet = set[str]
296

297
# Sequence length dictionary
298
LengthDict = dict[str, int]  # {header: length}
299

300
# Coordinate tuple format
301
CoordinateTuple = tuple[int, int]  # (start, end) in 1-based coordinates
302

303
# Coordinate list
304
CoordinateList = list[CoordinateTuple]
305

306
# Strand orientation
307
Strand = str  # "+" or "-"
308

309
# Reading frame phase
310
Phase = int  # 0, 1, or 2
311

312
# Genetic code table identifier
313
GeneticCodeTable = int  # 1 (standard) or 11 (bacterial/archaeal/plant plastid)
314

315
# DNA sequence
316
DNASequence = str  # String containing A, T, G, C, N characters
317

318
# Protein sequence
319
ProteinSequence = str  # String containing single-letter amino acid codes
320

321
# Codon table structure
322
CodonTable = dict[str, str]  # {codon: amino_acid}
323

324
# Complete genetic code tables
325
GeneticCodeTables = dict[str, CodonTable]
326
```

Version

Tile

Files

docs/sequence-operations.md

docs/sequence-operations.md