or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

clustering.md · core-tree.md · data-tables.md · external-formats.md · index.md · ncbi-taxonomy.md · phylogenetic.md · sequences.md · visualization.md

docs/sequences.md

0

# Sequence Handling

1

2

This page covers multiple sequence alignments and sequence-group operations for managing the molecular data associated with phylogenetic trees. ETE3 reads and writes several sequence formats (FASTA, PHYLIP, and PAML) and can link the sequences to tree nodes for sequence-based analysis.

3

4

## Capabilities

5

6

### SeqGroup Class

7

8

Main class for handling collections of sequences, supporting multiple file formats and sequence operations.

9

10

```python { .api }

11

class SeqGroup:

12

"""

13

Container for multiple sequences with format support and manipulation methods.

14

"""

15

16

def __init__(self, sequences=None, format="fasta", fix_duplicates=True, **kwargs):

17

"""

18

Initialize sequence group.

19

20

Parameters:

21

- sequences (str): File path or sequence string data

22

- format (str): Sequence format ("fasta", "phylip", "iphylip",

23

"phylip_relaxed", "iphylip_relaxed", "paml")

24

- fix_duplicates (bool): Handle duplicate sequence names

25

- kwargs: Format-specific parameters

26

"""

27

28

def __len__(self):

29

"""Number of sequences in group."""

30

31

def __contains__(self, item):

32

"""Check if sequence name exists."""

33

34

def __str__(self):

35

"""String representation in FASTA format."""

36

37

def __iter__(self):

38

"""Iterate over sequence entries."""

39

```

40

41

### Sequence Access and Retrieval

42

43

Methods for accessing individual sequences and sequence metadata.

44

45

```python { .api }

46

def get_seq(self, name):

47

"""

48

Get sequence by name.

49

50

Parameters:

51

- name (str): Sequence name/identifier

52

53

Returns:

54

str: Sequence string

55

"""

56

57

def get_seqname(self, index):

58

"""

59

Get sequence name by index position.

60

61

Parameters:

62

- index (int): Index position in sequence group

63

64

Returns:

65

str: Sequence name

66

"""

67

68

def iter_entries(self):

69

"""

70

Iterator over sequence entries.

71

72

Yields:

73

tuple: (name, sequence) for each entry

74

"""

75

76

# Properties

77

id2seq: dict # Dictionary mapping sequence IDs to sequences

78

name2id: dict # Dictionary mapping sequence names to IDs

79

```

80

81

### Sequence Modification

82

83

Add, modify, and remove sequences from the group.

84

85

```python { .api }

86

def set_seq(self, name, seq, append=True):

87

"""

88

Set or update sequence.

89

90

Parameters:

91

- name (str): Sequence name/identifier

92

- seq (str): Sequence string

93

- append (bool): Append if name doesn't exist, otherwise update

94

"""

95

96

def remove_seq(self, name):

97

"""

98

Remove sequence by name.

99

100

Parameters:

101

- name (str): Sequence name to remove

102

"""

103

```

104

105

### File I/O Operations

106

107

Read and write sequences in various standard formats.

108

109

```python { .api }

110

def write(self, format="fasta", outfile=None):

111

"""

112

Write sequences to file or return as string.

113

114

Parameters:

115

- format (str): Output format ("fasta", "phylip", "iphylip",

116

"phylip_relaxed", "iphylip_relaxed", "paml")

117

- outfile (str): Output file path, if None returns string

118

119

Returns:

120

str: Formatted sequence string (if outfile is None)

121

"""

122

```

123

124

### Supported Sequence Formats

125

126

ETE3 supports multiple sequence formats with specific parsing options.

127

128

```python { .api }

129

# Available formats and their parsers

130

FORMATS = {

131

"fasta": "Standard FASTA format",

132

"phylip": "PHYLIP sequential format (10-char name limit)",

133

"iphylip": "PHYLIP interleaved format (10-char name limit)",

134

"phylip_relaxed": "PHYLIP sequential format (no name length limit)",

135

"iphylip_relaxed": "PHYLIP interleaved format (no name length limit)",

136

"paml": "PAML format for phylogenetic analysis"

137

}

138

```

139

140

## Parser Functions

141

142

Direct access to format-specific parsers for advanced usage.

143

144

### FASTA Format

145

146

```python { .api }

147

def read_fasta(source, header_delimiter=None, **kwargs):

148

"""

149

Parse FASTA format sequences.

150

151

Parameters:

152

- source (str): File path or sequence string

153

- header_delimiter (str): Character to split header at

154

155

Returns:

156

dict: Sequence name to sequence mapping

157

"""

158

159

def write_fasta(sequences, outfile=None, **kwargs):

160

"""

161

Write sequences in FASTA format.

162

163

Parameters:

164

- sequences: Sequence collection or SeqGroup

165

- outfile (str): Output file path

166

167

Returns:

168

str: FASTA formatted string (if outfile is None)

169

"""

170

```

171

172

### PHYLIP Format

173

174

```python { .api }

175

def read_phylip(source, interleaved=False, relaxed=False, **kwargs):

176

"""

177

Parse PHYLIP format sequences.

178

179

Parameters:

180

- source (str): File path or sequence string

181

- interleaved (bool): PHYLIP interleaved format

182

- relaxed (bool): Allow names longer than 10 characters

183

184

Returns:

185

dict: Sequence name to sequence mapping

186

"""

187

188

def write_phylip(sequences, outfile=None, interleaved=False, relaxed=False, **kwargs):

189

"""

190

Write sequences in PHYLIP format.

191

192

Parameters:

193

- sequences: Sequence collection or SeqGroup

194

- outfile (str): Output file path

195

- interleaved (bool): Use interleaved format

196

- relaxed (bool): Allow long sequence names

197

198

Returns:

199

str: PHYLIP formatted string (if outfile is None)

200

"""

201

```

202

203

### PAML Format

204

205

```python { .api }

206

def read_paml(source, **kwargs):

207

"""

208

Parse PAML format sequences.

209

210

Parameters:

211

- source (str): File path or sequence string

212

213

Returns:

214

dict: Sequence name to sequence mapping

215

"""

216

217

def write_paml(sequences, outfile=None, **kwargs):

218

"""

219

Write sequences in PAML format.

220

221

Parameters:

222

- sequences: Sequence collection or SeqGroup

223

- outfile (str): Output file path

224

225

Returns:

226

str: PAML formatted string (if outfile is None)

227

"""

228

```

229

230

## Integration with Trees

231

232

### Linking Sequences to Trees

233

234

```python { .api }

235

# In PhyloTree class

236

def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):

237

"""

238

Associate sequence alignment with phylogenetic tree.

239

240

Parameters:

241

- alignment (str or SeqGroup): Alignment file/string or SeqGroup object

242

- alg_format (str): Alignment format

243

- kwargs: Format-specific parameters

244

"""

245

246

# Access linked sequences

247

sequence: str # Node property containing associated sequence (when linked)

248

```

249

250

## Usage Examples

251

252

### Basic Sequence Operations

253

254

```python

255

from ete3 import SeqGroup

256

257

# Load sequences from FASTA file

258

seqs = SeqGroup("sequences.fasta", format="fasta")

259

260

# Basic operations

261

print(f"Number of sequences: {len(seqs)}")

262

print(f"Sequence names: {list(seqs.name2id.keys())}")

263

264

# Access specific sequence

265

seq1 = seqs.get_seq("sequence_1")

266

print(f"Sequence 1: {seq1}")

267

268

# Iterate over all sequences

269

for name, seq in seqs.iter_entries():

270

print(f"{name}: {len(seq)} bp")

271

```

272

273

### Format Conversion

274

275

```python

276

from ete3 import SeqGroup

277

278

# Load FASTA and convert to PHYLIP

279

seqs = SeqGroup("input.fasta", format="fasta")

280

phylip_output = seqs.write(format="phylip")

281

282

# Save to file

283

seqs.write(format="phylip", outfile="output.phy")

284

285

# Handle relaxed PHYLIP for long names

286

seqs.write(format="phylip_relaxed", outfile="output_relaxed.phy")

287

```

288

289

### Sequence Manipulation

290

291

```python

292

from ete3 import SeqGroup

293

294

# Create empty sequence group

295

seqs = SeqGroup()

296

297

# Add sequences

298

seqs.set_seq("species1", "ATCGATCGATCG")

299

seqs.set_seq("species2", "ATCGATCGATCG")

300

seqs.set_seq("species3", "ATCGATCCATCG")

301

302

# Modify existing sequence

303

seqs.set_seq("species1", "ATCGATCGATCGAAAA")

304

305

# Remove sequence

306

seqs.remove_seq("species3")

307

308

# Export modified sequences

309

fasta_output = seqs.write(format="fasta")

310

```

311

312

### Integration with Phylogenetic Trees

313

314

```python

315

from ete3 import PhyloTree, SeqGroup

316

317

# Create phylogenetic tree

318

tree = PhyloTree("(A:0.1,(B:0.2,C:0.2):0.1);")

319

320

# Link to sequence alignment

321

tree.link_to_alignment("alignment.fasta", alg_format="fasta")

322

323

# Access sequence data through tree nodes

324

for leaf in tree.get_leaves():

325

if hasattr(leaf, 'sequence'):

326

print(f"{leaf.name}: {leaf.sequence}")

327

328

# Alternative: Load sequences separately and manually associate

329

seqs = SeqGroup("alignment.fasta")

330

for leaf in tree.get_leaves():

331

if leaf.name in seqs:

332

leaf.sequence = seqs.get_seq(leaf.name)

333

```

334

335

### Advanced Format Handling

336

337

```python

338

from ete3 import SeqGroup

339

340

# Handle PHYLIP interleaved format

341

seqs_interleaved = SeqGroup("data.phy", format="iphylip")

342

343

# PAML format for evolutionary analysis

344

paml_seqs = SeqGroup("paml_data.txt", format="paml")

345

346

# Custom format parameters

347

seqs_custom = SeqGroup(

348

"sequences.fasta",

349

format="fasta",

350

header_delimiter="|" # Split headers at |

351

)

352

353

# Write with specific options

354

seqs.write(

355

format="phylip",

356

outfile="output.phy",

357

interleaved=True,

358

relaxed=True

359

)

360

```