A Python Environment for (phylogenetic) Tree Exploration
—
Comprehensive integration with the NCBI Taxonomy database for taxonomic annotation, lineage retrieval, species tree construction, and taxonomic analysis. ETE3 provides seamless access to taxonomic information and tree-based taxonomic operations.
Main interface for accessing and working with NCBI Taxonomy data.
class NCBITaxa:
    """
    Interface to NCBI Taxonomy database with local caching and tree integration.

    This is an API outline: every method below carries only its signature
    and docstring, not an implementation.
    """

    def __init__(self, dbfile=None, taxdump_file=None, update=True):
        """
        Initialize NCBI Taxonomy database interface.

        Parameters:
        - dbfile (str): Path to local taxonomy database file.
          If None, uses default location (~/.etetoolkit/taxa.sqlite)
        - taxdump_file (str): Path to custom taxdump file for database initialization
        - update (bool): Whether to automatically update database if outdated
        """

    # Manage local taxonomy database and updates.

    def update_taxonomy_database(self):
        """
        Update local NCBI taxonomy database with latest data.

        Downloads and processes current NCBI taxonomy dump files.
        """

    def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, annotate=True):
        """
        Build taxonomic tree from list of taxonomic IDs.

        Parameters:
        - taxids (list): List of NCBI taxonomic IDs
        - intermediate_nodes (bool): Include intermediate taxonomic nodes
        - rank_limit (str): Limit tree to specific taxonomic rank
        - annotate (bool): Annotate nodes with taxonomic information

        Returns:
        Tree: Taxonomic tree with specified taxa
        """

    # Convert between taxonomic names and NCBI taxonomic IDs.

    def get_name_translator(self, names):
        """
        Translate organism names to NCBI taxonomic IDs.

        Parameters:
        - names (list): List of organism names to translate

        Returns:
        dict: Mapping from each name to a list of matching taxonomic IDs
        (e.g. {'Homo sapiens': [9606]}); index the list to get a single taxid
        """

    def get_taxid_translator(self, taxids):
        """
        Translate NCBI taxonomic IDs to organism names.

        Parameters:
        - taxids (list): List of taxonomic IDs to translate

        Returns:
        dict: Mapping from taxonomic IDs to names
        """

    def translate_to_names(self, taxids):
        """
        Convert taxonomic IDs to scientific names.

        Parameters:
        - taxids (list): List of taxonomic IDs

        Returns:
        list: List of corresponding scientific names
        """

    def get_fuzzy_name_translation(self, names, sim=0.9):
        """
        Fuzzy matching for organism names to taxonomic IDs.

        Parameters:
        - names (list): List of organism names (may contain typos/variations)
        - sim (float): Similarity threshold (0.0-1.0)

        Returns:
        dict: Best matches mapping names to taxonomic IDs

        NOTE(review): the ete3 reference documents this method as taking a
        single name and returning a (taxid, matched name, score) tuple --
        confirm against the installed release before relying on the
        list-in / dict-out contract described here.
        """

    # Retrieve taxonomic classifications and hierarchical relationships.

    def get_lineage(self, taxid):
        """
        Get complete taxonomic lineage for a taxonomic ID.

        Parameters:
        - taxid (int): NCBI taxonomic ID

        Returns:
        list: List of taxonomic IDs from root to target taxon
        """

    def get_rank(self, taxids):
        """
        Get taxonomic ranks for taxonomic IDs.

        Parameters:
        - taxids (list): List of taxonomic IDs

        Returns:
        dict: Mapping from taxonomic IDs to their ranks
        """

    def get_common_names(self, taxids):
        """
        Get common names for taxonomic IDs.

        Parameters:
        - taxids (list): List of taxonomic IDs

        Returns:
        dict: Mapping from taxonomic IDs to common names
        """

    def get_descendant_taxa(self, parent, collapse_subspecies=False, rank_limit=None):
        """
        Get all descendant taxa for a parent taxonomic ID.

        Parameters:
        - parent (int): Parent taxonomic ID
        - collapse_subspecies (bool): Exclude subspecies level taxa
        - rank_limit (str): Only include taxa at or above specified rank

        Returns:
        list: List of descendant taxonomic IDs

        NOTE(review): parameter order looks different from the ete3
        reference (which also lists intermediate_nodes and return_tree) --
        pass these options by keyword and verify.
        """

    # Annotate phylogenetic trees with taxonomic information.

    def annotate_tree(self, tree, taxid_attr="species", tax2name=None, tax2track=None):
        """
        Annotate tree nodes with taxonomic information.

        Parameters:
        - tree (Tree): Tree to annotate
        - taxid_attr (str): Node attribute containing taxonomic information
        - tax2name (dict): Custom mapping from taxids to names
        - tax2track (dict): Additional attributes to track

        Returns:
        Tree: Annotated tree with taxonomic data

        NOTE(review): the ete3 reference describes this method as annotating
        the tree in place and returning lookup dictionaries
        (tax2name, tax2track, tax2rank), with taxid_attr defaulting to
        "name" -- confirm before using the return value as a tree.
        """

    def get_broken_branches(self, tree, species_attr="species"):
        """
        Identify branches that break species monophyly.

        Parameters:
        - tree (Tree): Input phylogenetic tree
        - species_attr (str): Node attribute containing species information

        Returns:
        list: List of branches breaking monophyly

        NOTE(review): signature and return shape should be checked against
        the ete3 reference, where this method is marked experimental.
        """

    def annotate_tree_with_taxa(self, tree, taxid_attr="name", tax2name=None):
        """
        Add taxonomic annotations to all tree nodes.

        Parameters:
        - tree (Tree): Tree to annotate
        - taxid_attr (str): Attribute containing taxonomic identifiers
        - tax2name (dict): Custom taxonomic ID to name mapping

        Returns:
        Tree: Tree with taxonomic annotations added

        NOTE(review): presumably a module-level helper in ete3, not a
        method of NCBITaxa -- verify.
        """


from ete3 import NCBITaxa
# Initialize NCBI taxonomy
ncbi = NCBITaxa()

# Translate names to taxonomic IDs.
# Each value returned by get_name_translator is a list of taxids, so the
# first entry is taken when a single ID is wanted.
name2taxid = ncbi.get_name_translator(['Homo sapiens', 'Pan troglodytes', 'Gorilla gorilla'])
print(f"Human taxid: {name2taxid['Homo sapiens'][0]}")

# Translate taxonomic IDs to names
taxid2name = ncbi.get_taxid_translator([9606, 9598, 9593])
print(f"Taxid 9606: {taxid2name[9606]}")

# Get taxonomic lineage
lineage = ncbi.get_lineage(9606)  # Human
print(f"Human lineage: {lineage}")

# Get ranks for lineage
ranks = ncbi.get_rank(lineage)
for taxid in lineage:
    print(f"{taxid}: {ranks[taxid]}")

from ete3 import NCBITaxa
ncbi = NCBITaxa()

# Create taxonomic tree from species list
species_names = ['Homo sapiens', 'Pan troglodytes', 'Gorilla gorilla', 'Macaca mulatta']
name2taxid = ncbi.get_name_translator(species_names)
# The translator maps each name to a list of taxids; get_topology needs
# plain integer taxids, so take the first match for every species.
taxids = [name2taxid[name][0] for name in species_names]

# Build taxonomic tree
tree = ncbi.get_topology(taxids)
print(tree.get_ascii())

# Include intermediate nodes for complete taxonomy
full_tree = ncbi.get_topology(taxids, intermediate_nodes=True)
print(full_tree.get_ascii())

from ete3 import PhyloTree, NCBITaxa
# Create phylogenetic tree
tree = PhyloTree("(9606:1,(9598:0.5,9593:0.5):0.5);")  # Using taxids as names

# Initialize NCBI taxonomy
ncbi = NCBITaxa()

# Annotate tree with taxonomic information.
# annotate_tree decorates the nodes of `tree` in place; its return value is
# a set of lookup dictionaries, not a tree, so keep working with `tree`.
ncbi.annotate_tree(tree, taxid_attr="name")

# Access taxonomic information
for node in tree.traverse():
    if hasattr(node, 'sci_name'):
        print(f"Node {node.name}: {node.sci_name} ({node.rank})")

from ete3 import NCBITaxa
ncbi = NCBITaxa()

# Handle names with potential typos or variations
fuzzy_names = ['Homo sapian', 'chimpanzee', 'gorill']

# get_fuzzy_name_translation resolves one name per call and returns a
# (taxid, matched_name, score) tuple; taxid is None when no candidate
# reaches the similarity threshold.
for name in fuzzy_names:
    taxid, correct_name, score = ncbi.get_fuzzy_name_translation(name, sim=0.8)
    if taxid is None:
        print(f"'{name}' -> no match")
        continue
    print(f"'{name}' -> {taxid} ({correct_name})")

from ete3 import NCBITaxa, PhyloTree
ncbi = NCBITaxa()

# Get all primates.
# The translator returns a list of taxids per name; the first entry is the
# integer taxid that get_descendant_taxa expects as parent.
primate_taxid = ncbi.get_name_translator(['Primates'])['Primates'][0]
primate_descendants = ncbi.get_descendant_taxa(primate_taxid, rank_limit='species')

# Create comprehensive primate tree
primate_tree = ncbi.get_topology(primate_descendants[:50])  # Limit for example

# Analyze taxonomic ranks
ranks = ncbi.get_rank(primate_descendants[:20])
rank_counts = {}
for rank in ranks.values():
    rank_counts[rank] = rank_counts.get(rank, 0) + 1
print(f"Taxonomic rank distribution: {rank_counts}")

from ete3 import NCBITaxa
# Update local taxonomy database (run periodically)
ncbi = NCBITaxa()
# ncbi.update_taxonomy_database()  # Uncomment to actually update

# Use custom database file
ncbi_custom = NCBITaxa(dbfile="/path/to/custom/taxa.sqlite")

# Check database version/status
# Access internal database methods if needed for maintenance

from ete3 import PhyloTree, NCBITaxa
# Gene tree with species information
gene_tree = PhyloTree("(human_gene1:0.1,(chimp_gene1:0.05,gorilla_gene1:0.05):0.02);")

# Set up species naming: the species code is the text before the first "_"
gene_tree.set_species_naming_function(lambda x: x.split('_')[0])

# Get NCBI taxonomy for comparison
ncbi = NCBITaxa()
species_names = ['human', 'chimp', 'gorilla']
name_mapping = {'human': 'Homo sapiens', 'chimp': 'Pan troglodytes', 'gorilla': 'Gorilla gorilla'}
full_names = [name_mapping[sp] for sp in species_names]
# Resolve all names in one translator call. Each value is a list of taxids,
# so the first match is taken to obtain plain integers for get_topology.
name2taxid = ncbi.get_name_translator(full_names)
taxids = [name2taxid[name][0] for name in full_names]

# Create species tree from NCBI
species_tree = ncbi.get_topology(taxids)

# Compare gene tree topology with species tree
# (This would involve reconciliation analysis)
print("Gene tree topology:")
print(gene_tree.get_ascii())
print("Species tree topology:")
print(species_tree.get_ascii())

Install with Tessl CLI
npx tessl i tessl/pypi-ete3