or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cli-commands.md comparison.md consensus.md format-conversion.md genbank-tbl.md gff-processing.md index.md sequence-operations.md utilities.md

docs/sequence-operations.md

0

# Sequence Operations

1

2

Comprehensive FASTA file parsing and genomic sequence manipulation capabilities, including coordinate-based sequence extraction, translation using multiple genetic codes, reverse complement operations, and efficient sequence access.

3

4

## Capabilities

5

6

### FASTA File Handling

7

8

Object-oriented and functional interfaces for working with FASTA files and sequence data.

9

10

```python { .api }

11

class FASTA:

12

"""FASTA file handler with efficient sequence access."""

13

14

def __init__(self, fasta_file):

15

"""

16

Initialize FASTA handler.

17

18

Parameters:

19

- fasta_file (str): Path to FASTA file

20

"""

21

22

def get_seq(self, contig):

23

"""

24

Get sequence for specified contig.

25

26

Parameters:

27

- contig (str): Contig/chromosome name

28

29

Returns:

30

str: DNA sequence for the contig

31

"""

32

33

def fastaparser(handle):

34

"""

35

Parse FASTA file as generator yielding (header, sequence) tuples.

36

37

Parameters:

38

- handle (file-like): Open file handle to FASTA file

39

40

Yields:

41

tuple: (header, sequence) pairs

42

"""

43

44

def fasta2dict(fasta, full_header=False):

45

"""

46

Convert FASTA file to dictionary.

47

48

Parameters:

49

- fasta (str): Path to FASTA file

50

- full_header (bool): Use full header as key vs first word only

51

52

Returns:

53

dict: {header: sequence} mapping

54

"""

55

56

def fasta2headers(fasta, full_header=False):

57

"""

58

Get FASTA headers as set.

59

60

Parameters:

61

- fasta (str): Path to FASTA file

62

- full_header (bool): Use full header vs first word only

63

64

Returns:

65

set: Set of sequence headers

66

"""

67

68

def fasta2lengths(fasta, full_header=False):

69

"""

70

Get sequence lengths as dictionary.

71

72

Parameters:

73

- fasta (str): Path to FASTA file

74

- full_header (bool): Use full header as key vs first word only

75

76

Returns:

77

dict: {header: length} mapping

78

"""

79

```

80

81

### Sequence Extraction

82

83

Extract specific regions from genomic sequences based on coordinates.

84

85

```python { .api }

86

def getSeqRegions(seqs, header, coordinates, coords=False):

87

"""

88

Extract sequence regions from coordinates.

89

90

Parameters:

91

- seqs (dict): Dictionary of sequences

92

- header (str): Sequence header/contig name

93

- coordinates (list): List of (start, end) coordinate tuples

94

- coords (bool): Whether to include coordinate information

95

96

Returns:

97

str: Extracted sequence regions concatenated

98

"""

99

```

100

101

### DNA Translation and Manipulation

102

103

Translate DNA sequences to proteins using standard genetic codes and perform sequence manipulations.

104

105

```python { .api }

106

def translate(dna, strand, phase, table=1):

107

"""

108

Translate DNA sequence to protein using genetic code.

109

110

Parameters:

111

- dna (str): DNA sequence to translate

112

- strand (str): Strand orientation ("+" or "-")

113

- phase (int): Reading frame phase (0, 1, or 2)

114

- table (int): Genetic code table (1=standard, 11=bacterial/archaeal/plant plastid)

115

116

Returns:

117

str: Translated protein sequence

118

"""

119

120

def RevComp(s):

121

"""

122

Generate reverse complement of DNA sequence.

123

124

Parameters:

125

- s (str): Input DNA sequence

126

127

Returns:

128

str: Reverse complement sequence

129

"""

130

```

131

132

### Text Formatting

133

134

Format sequences and text for output with proper line wrapping.

135

136

```python { .api }

137

def softwrap(string, every=80):

138

"""

139

Soft wrap text to specified width.

140

141

Parameters:

142

- string (str): Input string to wrap

143

- every (int): Line width for wrapping

144

145

Returns:

146

str: Wrapped text with newlines

147

"""

148

```

149

150

### Genetic Code Tables

151

152

Access to standard genetic code tables for translation.

153

154

```python { .api }

155

codon_table = {

156

"1": {

157

# Standard genetic code table

158

"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",

159

# ... (complete codon to amino acid mapping)

160

},

161

"11": {

162

# Bacterial, archaeal and plant plastid genetic code

163

"TTT": "F", "TTC": "F", "TTA": "L", "TTG": "L",

164

# ... (complete codon to amino acid mapping)

165

}

166

}

167

```

168

169

## Usage Examples

170

171

### Basic FASTA Operations

172

173

```python

174

from gfftk.fasta import FASTA, fasta2dict

175

176

# Object-oriented approach

177

fasta = FASTA("genome.fasta")

178

chr1_seq = fasta.get_seq("chr1")

179

180

# Functional approach

181

genome = fasta2dict("genome.fasta")

182

chr1_seq = genome["chr1"]

183

184

# Get sequence information

185

from gfftk.fasta import fasta2headers, fasta2lengths

186

187

headers = fasta2headers("genome.fasta")

188

lengths = fasta2lengths("genome.fasta")

189

190

print(f"Number of sequences: {len(headers)}")

191

print(f"Sequence lengths: {lengths}")

192

```

193

194

### Sequence Extraction

195

196

```python

197

from gfftk.fasta import fasta2dict, getSeqRegions

198

199

# Load genome

200

genome = fasta2dict("genome.fasta")

201

202

# Extract specific regions

203

coordinates = [(1000, 2000), (3000, 4000), (5000, 6000)]

204

extracted = getSeqRegions(genome, "chr1", coordinates)

205

206

print(f"Extracted sequence: {extracted}")

207

```

208

209

### DNA Translation

210

211

```python

212

from gfftk.fasta import translate, RevComp

213

214

# Example DNA sequence

215

dna_sequence = "ATGAAGTTTGCCTAG"

216

217

# Translate forward strand

218

protein_forward = translate(dna_sequence, "+", 0, table=1)

219

print(f"Forward translation: {protein_forward}")

220

221

# Translate reverse strand

222

dna_reverse = RevComp(dna_sequence)

223

protein_reverse = translate(dna_reverse, "-", 0, table=1)

224

print(f"Reverse translation: {protein_reverse}")

225

226

# Translate with different genetic code (bacterial)

227

protein_bacterial = translate(dna_sequence, "+", 0, table=11)

228

print(f"Bacterial code translation: {protein_bacterial}")

229

230

# Translate in different reading frames

231

for phase in [0, 1, 2]:

232

protein = translate(dna_sequence, "+", phase, table=1)

233

print(f"Phase {phase}: {protein}")

234

```

235

236

### Sequence Processing Pipeline

237

238

```python

239

from gfftk.fasta import FASTA, translate, softwrap

240

241

# Initialize genome access

242

genome = FASTA("genome.fasta")

243

244

# Define gene coordinates (from GFF3 parsing)

245

gene_coords = {

246

"gene1": {

247

"contig": "chr1",

248

"strand": "+",

249

"cds": [(1000, 1200), (1500, 1700), (2000, 2300)]

250

}

251

}

252

253

# Extract and translate CDS sequences

254

for gene_id, gene_info in gene_coords.items():

255

# Get contig sequence

256

contig_seq = genome.get_seq(gene_info["contig"])

257

258

# Extract CDS regions

259

cds_sequence = ""

260

for start, end in gene_info["cds"]:

261

cds_sequence += contig_seq[start-1:end] # Convert to 0-based

262

263

# Handle reverse strand

264

if gene_info["strand"] == "-":

265

from gfftk.fasta import RevComp

266

cds_sequence = RevComp(cds_sequence)

267

268

# Translate to protein

269

protein = translate(cds_sequence, gene_info["strand"], 0, table=1)

270

271

# Format output

272

wrapped_protein = softwrap(protein, every=60)

273

print(f">{gene_id}\n{wrapped_protein}")

274

```

275

276

### Working with Compressed Files

277

278

```python

279

from gfftk.fasta import fasta2dict

280

281

# Works with compressed FASTA files automatically

282

genome = fasta2dict("genome.fasta.gz")

283

genome2 = fasta2dict("genome.fasta.bz2")

284

285

print(f"Loaded {len(genome)} sequences from compressed file")

286

```

287

288

## Types

289

290

```python { .api }

291

# Sequence dictionary format

292

SequenceDict = dict[str, str] # {header: sequence}

293

294

# Sequence header set

295

HeaderSet = set[str]

296

297

# Sequence length dictionary

298

LengthDict = dict[str, int] # {header: length}

299

300

# Coordinate tuple format

301

CoordinateTuple = tuple[int, int] # (start, end) in 1-based coordinates

302

303

# Coordinate list

304

CoordinateList = list[CoordinateTuple]

305

306

# Strand orientation

307

Strand = str # "+" or "-"

308

309

# Reading frame phase

310

Phase = int # 0, 1, or 2

311

312

# Genetic code table identifier

313

GeneticCodeTable = int # 1 (standard) or 11 (bacterial/archaeal/plant plastid)

314

315

# DNA sequence

316

DNASequence = str # String containing A, T, G, C, N characters

317

318

# Protein sequence

319

ProteinSequence = str # String containing single-letter amino acid codes

320

321

# Codon table structure

322

CodonTable = dict[str, str] # {codon: amino_acid}

323

324

# Complete genetic code tables

325

GeneticCodeTables = dict[str, CodonTable]

326

```