A Python Environment for (phylogenetic) Tree Exploration
—
Multiple sequence alignment and sequence group operations for managing molecular data associated with phylogenetic trees. ETE3 provides comprehensive support for various sequence formats and sequence-based analysis.
Main class for handling collections of sequences, supporting multiple file formats and sequence operations.
class SeqGroup:
    """
    Container for multiple sequences with format support and manipulation methods.

    This listing shows signatures and docstrings only; no implementation is
    included here.
    """

    def __init__(self, sequences=None, format="fasta", fix_duplicates=True, **kwargs):
        """
        Initialize sequence group.

        Parameters:
        - sequences (str): File path or sequence string data
        - format (str): Sequence format ("fasta", "phylip", "iphylip",
          "phylip_relaxed", "iphylip_relaxed", "paml")
        - fix_duplicates (bool): Handle duplicate sequence names
        - kwargs: Format-specific parameters
        """

    def __len__(self):
        """Number of sequences in group."""

    def __contains__(self, item):
        """Check if sequence name exists."""

    def __str__(self):
        """String representation in FASTA format."""

    def __iter__(self):
        """Iterate over sequence entries."""

    # Methods for accessing individual sequences and sequence metadata.

    def get_seq(self, name):
        """
        Get sequence by name.

        Parameters:
        - name (str): Sequence name/identifier

        Returns:
        str: Sequence string
        """

    def get_seqname(self, index):
        """
        Get sequence name by index position.

        Parameters:
        - index (int): Index position in sequence group

        Returns:
        str: Sequence name
        """

    def iter_entries(self):
        """
        Iterator over sequence entries.

        Yields:
        tuple: (name, sequence) for each entry

        NOTE(review): ETE3's own implementation appears to yield a third item
        (the entry's comments) as well -- confirm against the installed
        version before relying on two-item unpacking.
        """

    # Properties
    id2seq: dict  # Dictionary mapping sequence IDs to sequences
    name2id: dict  # Dictionary mapping sequence names to IDs

    # Add, modify, and remove sequences from the group.

    def set_seq(self, name, seq, append=True):
        """
        Set or update sequence.

        Parameters:
        - name (str): Sequence name/identifier
        - seq (str): Sequence string
        - append (bool): Append if name doesn't exist, otherwise update
        """

    def remove_seq(self, name):
        """
        Remove sequence by name.

        Parameters:
        - name (str): Sequence name to remove
        """

    # Read and write sequences in various standard formats.

    def write(self, format="fasta", outfile=None):
        """
        Write sequences to file or return as string.

        Parameters:
        - format (str): Output format ("fasta", "phylip", "iphylip",
          "phylip_relaxed", "iphylip_relaxed", "paml")
        - outfile (str): Output file path, if None returns string

        Returns:
        str: Formatted sequence string (if outfile is None)
        """


# ETE3 supports multiple sequence formats with specific parsing options.
# Available formats and their parsers
# Maps each format name accepted by SeqGroup to a short description.
FORMATS = {
    "fasta": "Standard FASTA format",
    "phylip": "PHYLIP sequential format (10-char name limit)",
    "iphylip": "PHYLIP interleaved format (10-char name limit)",
    "phylip_relaxed": "PHYLIP sequential format (no name length limit)",
    "iphylip_relaxed": "PHYLIP interleaved format (no name length limit)",
    "paml": "PAML format for phylogenetic analysis",
}

# Direct access to format-specific parsers for advanced usage.
def read_fasta(source, header_delimiter=None, **kwargs):
    """
    Parse FASTA format sequences.

    Parameters:
    - source (str): File path or sequence string
    - header_delimiter (str): Character to split header at

    Returns:
    dict: Sequence name to sequence mapping
    """


def write_fasta(sequences, outfile=None, **kwargs):
    """
    Write sequences in FASTA format.

    Parameters:
    - sequences: Sequence collection or SeqGroup
    - outfile (str): Output file path

    Returns:
    str: FASTA formatted string (if outfile is None)
    """


def read_phylip(source, interleaved=False, relaxed=False, **kwargs):
    """
    Parse PHYLIP format sequences.

    Parameters:
    - source (str): File path or sequence string
    - interleaved (bool): PHYLIP interleaved format
    - relaxed (bool): Allow names longer than 10 characters

    Returns:
    dict: Sequence name to sequence mapping
    """


def write_phylip(sequences, outfile=None, interleaved=False, relaxed=False, **kwargs):
    """
    Write sequences in PHYLIP format.

    Parameters:
    - sequences: Sequence collection or SeqGroup
    - outfile (str): Output file path
    - interleaved (bool): Use interleaved format
    - relaxed (bool): Allow long sequence names

    Returns:
    str: PHYLIP formatted string (if outfile is None)
    """


def read_paml(source, **kwargs):
    """
    Parse PAML format sequences.

    Parameters:
    - source (str): File path or sequence string

    Returns:
    dict: Sequence name to sequence mapping
    """


def write_paml(sequences, outfile=None, **kwargs):
    """
    Write sequences in PAML format.

    Parameters:
    - sequences: Sequence collection or SeqGroup
    - outfile (str): Output file path

    Returns:
    str: PAML formatted string (if outfile is None)
    """


# In PhyloTree class
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """
    Associate sequence alignment with phylogenetic tree.

    Parameters:
    - alignment (str or SeqGroup): Alignment file/string or SeqGroup object
    - alg_format (str): Alignment format
    - kwargs: Format-specific parameters
    """
# Access linked sequences
sequence: str  # Node property containing associated sequence (when linked)

from ete3 import SeqGroup

# Load sequences from FASTA file
seqs = SeqGroup("sequences.fasta", format="fasta")

# Basic operations
print(f"Number of sequences: {len(seqs)}")
print(f"Sequence names: {list(seqs.name2id.keys())}")

# Access specific sequence
seq1 = seqs.get_seq("sequence_1")
print(f"Sequence 1: {seq1}")

# Iterate over all sequences
# `*_` absorbs any extra per-entry fields (ETE3 may also yield the entry's
# comments), so the loop works whether entries are 2- or 3-tuples.
for name, seq, *_ in seqs.iter_entries():
    print(f"{name}: {len(seq)} bp")

from ete3 import SeqGroup
# Load FASTA and convert to PHYLIP
seqs = SeqGroup("input.fasta", format="fasta")
phylip_output = seqs.write(format="phylip")

# Save to file
seqs.write(format="phylip", outfile="output.phy")

# Handle relaxed PHYLIP for long names
seqs.write(format="phylip_relaxed", outfile="output_relaxed.phy")

from ete3 import SeqGroup
# Create empty sequence group
seqs = SeqGroup()

# Add sequences
seqs.set_seq("species1", "ATCGATCGATCG")
seqs.set_seq("species2", "ATCGATCGATCG")
seqs.set_seq("species3", "ATCGATCCATCG")

# Modify existing sequence
seqs.set_seq("species1", "ATCGATCGATCGAAAA")

# Remove sequence
seqs.remove_seq("species3")

# Export modified sequences
fasta_output = seqs.write(format="fasta")

from ete3 import PhyloTree, SeqGroup
# Create phylogenetic tree
tree = PhyloTree("(A:0.1,(B:0.2,C:0.2):0.1);")

# Link to sequence alignment
tree.link_to_alignment("alignment.fasta", alg_format="fasta")

# Access sequence data through tree nodes
for leaf in tree.get_leaves():
    # Only leaves whose name matched an alignment entry carry `sequence`.
    if hasattr(leaf, 'sequence'):
        print(f"{leaf.name}: {leaf.sequence}")

# Alternative: Load sequences separately and manually associate
seqs = SeqGroup("alignment.fasta")
for leaf in tree.get_leaves():
    if leaf.name in seqs:
        leaf.sequence = seqs.get_seq(leaf.name)

from ete3 import SeqGroup
# Handle PHYLIP interleaved format
seqs_interleaved = SeqGroup("data.phy", format="iphylip")

# PAML format for evolutionary analysis
paml_seqs = SeqGroup("paml_data.txt", format="paml")

# Custom format parameters
seqs_custom = SeqGroup(
    "sequences.fasta",
    format="fasta",
    header_delimiter="|",  # Split headers at |
)

# Write with specific options
# write() accepts only `format` and `outfile`; interleaved layout and relaxed
# (long) names are selected through the format name rather than keyword flags.
seqs_custom.write(format="iphylip_relaxed", outfile="output.phy")

# Install with Tessl CLI
npx tessl i tessl/pypi-ete3