CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-ete3

A Python Environment for (phylogenetic) Tree Exploration

Pending
Overview
Eval results
Files

docs/ncbi-taxonomy.md

NCBI Taxonomy Integration

Comprehensive integration with the NCBI Taxonomy database for taxonomic annotation, lineage retrieval, species tree construction, and taxonomic analysis. ETE3 provides seamless access to taxonomic information and tree-based taxonomic operations.

Capabilities

NCBITaxa Class

Main interface for accessing and working with NCBI Taxonomy data.

class NCBITaxa:
    """
    Interface to NCBI Taxonomy database with local caching and tree integration.

    Lookups are served from a local database file (see ``dbfile``) rather
    than from a remote service.
    """
    
    def __init__(self, dbfile=None, taxdump_file=None, update=True):
        """
        Initialize NCBI Taxonomy database interface.

        Parameters:
        - dbfile (str): Path to local taxonomy database file
                       If None, uses default location (~/.etetoolkit/taxa.sqlite)
        - taxdump_file (str): Path to custom taxdump file for database initialization
                       (presumably a local copy of NCBI's taxdump.tar.gz,
                       used instead of downloading one)
        - update (bool): Whether to automatically update database if outdated

        NOTE(review): in ete3 the ``update`` flag appears to govern upgrading
        a database whose on-disk *format* is outdated, not refreshing the
        taxonomy data itself; confirm against the installed version.
        """

Database Management

Manage the local taxonomy database and its updates, and build taxonomy trees from it.

def update_taxonomy_database(self, taxdump_file=None):
    """
    Update local NCBI taxonomy database with latest data.

    Rebuilds the local database from an NCBI taxonomy dump. When no file
    is given, the current dump is downloaded from NCBI first, so the call
    needs network access and may take a while.

    Parameters:
    - taxdump_file (str): Optional path to an already downloaded
                          taxdump.tar.gz to build from instead of
                          downloading a fresh copy

    Returns:
    None
    """

def get_topology(self, taxids, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, annotate=True):
    """
    Build the minimal pruned NCBI taxonomy tree containing the given taxa.

    Parameters:
    - taxids (list): List of NCBI taxonomic IDs (plain integers, not the
                     per-name lists returned by get_name_translator)
    - intermediate_nodes (bool): Keep single-child nodes so the full
                     lineage of every leaf is present; when False only
                     the branching ancestors are kept
    - rank_limit (str): Prune the tree at this NCBI rank, e.g. "species"
                     to drop subspecies/strain leaves
    - collapse_subspecies (bool): Collapse everything below species rank
                     into its species node
    - annotate (bool): Annotate nodes with taxonomic information

    Returns:
    Tree: Taxonomic tree with specified taxa
    """

Taxonomic ID Translation

Convert between taxonomic names and NCBI taxonomic IDs.

def get_name_translator(self, names):
    """
    Translate organism names to NCBI taxonomic IDs.

    Parameters:
    - names (list): List of organism names to translate

    Returns:
    dict: Mapping from each matched name to a LIST of taxonomic IDs,
          e.g. {'Homo sapiens': [9606]}. A name can match more than one
          taxon, so index the list (``result[name][0]``) to get a single
          ID. Names with no match are left out of the dict.
    """

def get_taxid_translator(self, taxids, try_synonyms=True):
    """
    Translate NCBI taxonomic IDs to scientific names.

    Parameters:
    - taxids (list): List of taxonomic IDs to translate
    - try_synonyms (bool): When True, IDs that are not found directly are
                           retried as merged (superseded) taxonomic IDs

    Returns:
    dict: Mapping from taxonomic IDs to scientific names; IDs that cannot
          be resolved are absent from the dict
    """

def translate_to_names(self, taxids):
    """
    Convert taxonomic IDs to scientific names.

    Unlike get_taxid_translator, the result is a list rather than a dict.

    Parameters:
    - taxids (list): List of taxonomic IDs

    Returns:
    list: List of corresponding scientific names

    NOTE(review): ete3 appears to keep the input order and to pass through
    unchanged any ID it cannot translate; confirm before relying on it.
    """

def get_fuzzy_name_translation(self, name, sim=0.9):
    """
    Find the best NCBI match for a single, possibly misspelled, name.

    The method handles one name per call; loop over a collection of names
    to translate several.

    Parameters:
    - name (str): One organism name (may contain typos/variations)
    - sim (float): Minimum similarity (0.0-1.0) required to report a match

    Returns:
    tuple: (taxid, matched_name, score). taxid and matched_name are None
           when no database name reaches the similarity threshold.

    NOTE(review): ete3 implements this with an SQLite Levenshtein
    extension, so it may be unavailable where the Python sqlite3 module
    cannot load extensions; confirm on the target platform.
    """

Taxonomic Hierarchy and Lineages

Retrieve taxonomic classifications and hierarchical relationships.

def get_lineage(self, taxid):
    """
    Get complete taxonomic lineage for a taxonomic ID.

    Parameters:
    - taxid (int): NCBI taxonomic ID

    Returns:
    list: List of taxonomic IDs from root to target taxon (the target
          taxid is the last element)

    NOTE(review): ete3 is expected to raise ValueError for a taxid that is
    missing from the local database; confirm for the installed version.
    """

def get_rank(self, taxids):
    """
    Get taxonomic ranks for taxonomic IDs.

    Parameters:
    - taxids (list): List of taxonomic IDs

    Returns:
    dict: Mapping from taxonomic IDs to their ranks, given as NCBI rank
          names (for example "species" or "genus")

    NOTE(review): IDs unknown to the database are presumably absent from
    the dict rather than mapped to None; confirm before indexing blindly.
    """

def get_common_names(self, taxids):
    """
    Get common names for taxonomic IDs.

    Parameters:
    - taxids (list): List of taxonomic IDs

    Returns:
    dict: Mapping from taxonomic IDs to common names

    NOTE(review): taxa without a registered common name are presumably
    left out of the dict; prefer ``result.get(taxid)`` over
    ``result[taxid]`` until confirmed.
    """

def get_descendant_taxa(self, parent, intermediate_nodes=False, rank_limit=None, collapse_subspecies=False, return_tree=False):
    """
    Get all descendant taxa for a parent taxon.

    Parameters:
    - parent (int or str): Parent taxonomic ID, or its scientific name.
                           Must be a single value, not the list that
                           get_name_translator returns for a name.
    - intermediate_nodes (bool): Also return internal (non-terminal)
                           descendants; by default only terminal taxa
                           are returned
    - rank_limit (str): Prune the result at this NCBI rank, e.g.
                           "species" to stop at species level
    - collapse_subspecies (bool): Collapse taxa below species rank into
                           their species
    - return_tree (bool): Return the descendants as a tree instead of a
                           flat list

    Returns:
    list: List of descendant taxonomic IDs (a Tree when return_tree=True)
    """

Tree Annotation

Annotate phylogenetic trees with taxonomic information.

def annotate_tree(self, t, taxid_attr="name", tax2name=None, tax2track=None, tax2rank=None):
    """
    Annotate tree nodes with taxonomic information, in place.

    Nodes gain the attributes 'taxid', 'sci_name', 'lineage',
    'named_lineage' and 'rank'. The tree passed in is modified; it is NOT
    the return value.

    Parameters:
    - t (Tree): Tree to annotate
    - taxid_attr (str): Node attribute holding each node's taxid.
                        Defaults to "name", i.e. leaves named by taxid.
    - tax2name (dict): Pre-computed mapping from taxids to names
    - tax2track (dict): Pre-computed mapping from taxids to lineage tracks
    - tax2rank (dict): Pre-computed mapping from taxids to ranks

    Returns:
    tuple: (tax2name, tax2track, tax2rank), the translation dictionaries
           used for the annotation
    """

Taxonomic Analysis Functions

Species Tree Construction

def get_broken_branches(self, t, taxa_lineages, n2content=None):
    """
    Identify NCBI lineages that are not monophyletic in a tree.

    The tree is expected to be annotated already (leaves carry 'taxid'
    and 'sci_name'), for example by annotate_tree.

    Parameters:
    - t (Tree): Input phylogenetic tree
    - taxa_lineages (dict): Mapping from each leaf taxid to its lineage
    - n2content (dict): Optional pre-computed node-to-leaves cache; built
                        from the tree when omitted

    Returns:
    tuple: (broken_branches, broken_clades, broken_clade_sizes), i.e. the
           affected branches with the lineages they break, the set of
           broken lineages, and the size of each broken lineage

    NOTE(review): ete3 labels this method experimental; confirm the exact
    return shape against the installed version before depending on it.
    """

def annotate_tree_with_taxa(self, tree, taxid_attr="name", tax2name=None):
    """
    Add taxonomic annotations to all tree nodes.

    NOTE(review): this could not be confirmed as a method of ete3's
    NCBITaxa class. Verify that it exists in the installed version before
    calling it; annotate_tree is the annotation entry point used in the
    usage examples.

    Parameters:
    - tree (Tree): Tree to annotate
    - taxid_attr (str): Attribute containing taxonomic identifiers
    - tax2name (dict): Custom taxonomic ID to name mapping

    Returns:
    Tree: Tree with taxonomic annotations added
    """

Usage Examples

Basic Taxonomy Operations

from ete3 import NCBITaxa

# Initialize NCBI taxonomy
ncbi = NCBITaxa()

# Translate names to taxonomic IDs.
# Each name maps to a LIST of taxids (one name can match several taxa),
# so take the first entry to get a single ID.
name2taxid = ncbi.get_name_translator(['Homo sapiens', 'Pan troglodytes', 'Gorilla gorilla'])
print(f"Human taxid: {name2taxid['Homo sapiens'][0]}")

# Translate taxonomic IDs to names
taxid2name = ncbi.get_taxid_translator([9606, 9598, 9593])
print(f"Taxid 9606: {taxid2name[9606]}")

# Get taxonomic lineage (ordered from the root down to the taxon itself)
lineage = ncbi.get_lineage(9606)  # Human
print(f"Human lineage: {lineage}")

# Get ranks for lineage
ranks = ncbi.get_rank(lineage)
for taxid in lineage:
    print(f"{taxid}: {ranks[taxid]}")

Building Taxonomic Trees

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# Create taxonomic tree from species list
species_names = ['Homo sapiens', 'Pan troglodytes', 'Gorilla gorilla', 'Macaca mulatta']
name2taxid = ncbi.get_name_translator(species_names)
# get_name_translator returns a list of taxids per name; get_topology
# needs a flat list of integers, so keep the first hit for each name.
taxids = [name2taxid[name][0] for name in species_names]

# Build taxonomic tree
tree = ncbi.get_topology(taxids)
print(tree.get_ascii())

# Include intermediate nodes for complete taxonomy
full_tree = ncbi.get_topology(taxids, intermediate_nodes=True)
print(full_tree.get_ascii())

Tree Annotation

from ete3 import PhyloTree, NCBITaxa

# Create phylogenetic tree
tree = PhyloTree("(9606:1,(9598:0.5,9593:0.5):0.5);")  # Using taxids as names

# Initialize NCBI taxonomy
ncbi = NCBITaxa()

# Annotate tree with taxonomic information.
# annotate_tree modifies the tree in place and returns the lookup
# dictionaries it used, not a tree, so keep working with `tree`.
ncbi.annotate_tree(tree, taxid_attr="name")

# Access taxonomic information
for node in tree.traverse():
    if hasattr(node, 'sci_name'):
        print(f"Node {node.name}: {node.sci_name} ({node.rank})")

Fuzzy Name Matching

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# Handle names with potential typos or variations
fuzzy_names = ['Homo sapian', 'chimpanzee', 'gorill']

# get_fuzzy_name_translation resolves ONE name per call and returns a
# (taxid, matched_name, score) tuple, so query each name separately.
for name in fuzzy_names:
    taxid, correct_name, score = ncbi.get_fuzzy_name_translation(name, sim=0.8)
    if taxid is None:
        # Nothing in the database reached the similarity threshold.
        print(f"'{name}' -> no match")
        continue
    print(f"'{name}' -> {taxid} ({correct_name})")

Advanced Taxonomic Analysis

from ete3 import NCBITaxa, PhyloTree

ncbi = NCBITaxa()

# Get all primates.
# The translator returns a list of taxids per name; get_descendant_taxa
# needs a single taxid, so take the first entry.
primate_taxid = ncbi.get_name_translator(['Primates'])['Primates'][0]
primate_descendants = ncbi.get_descendant_taxa(primate_taxid, rank_limit='species')

# Create comprehensive primate tree
primate_tree = ncbi.get_topology(primate_descendants[:50])  # Limit for example

# Analyze taxonomic ranks
ranks = ncbi.get_rank(primate_descendants[:20])
rank_counts = {}
for rank in ranks.values():
    rank_counts[rank] = rank_counts.get(rank, 0) + 1

print(f"Taxonomic rank distribution: {rank_counts}")

Database Updates and Management

from ete3 import NCBITaxa

# Open the default local taxonomy database.
# NOTE(review): on first use ete3 is expected to download and build this
# database, which needs network access; confirm for the installed version.
ncbi = NCBITaxa()
# Refresh the local database from the current NCBI dump (run periodically).
# Left commented out so that copying this example does not trigger a rebuild.
# ncbi.update_taxonomy_database()  # Uncomment to actually update

# Use a database file at a non-default location instead of
# ~/.etetoolkit/taxa.sqlite
ncbi_custom = NCBITaxa(dbfile="/path/to/custom/taxa.sqlite")

# Checking the database version/status is not shown here: this page
# documents no public call for it, so doing so would mean relying on
# ete3 internals.

Integration with Phylogenetic Analysis

from ete3 import PhyloTree, NCBITaxa

# Gene tree with species information
gene_tree = PhyloTree("(human_gene1:0.1,(chimp_gene1:0.05,gorilla_gene1:0.05):0.02);")

# Set up species naming: the species code is the part before the first "_"
gene_tree.set_species_naming_function(lambda x: x.split('_')[0])

# Get NCBI taxonomy for comparison
ncbi = NCBITaxa()
species_names = ['human', 'chimp', 'gorilla']
name_mapping = {'human': 'Homo sapiens', 'chimp': 'Pan troglodytes', 'gorilla': 'Gorilla gorilla'}
full_names = [name_mapping[sp] for sp in species_names]
# Translate all names in one query. Each name maps to a list of taxids,
# and get_topology needs plain integers, so keep the first hit per name.
name2taxid = ncbi.get_name_translator(full_names)
taxids = [name2taxid[name][0] for name in full_names]

# Create species tree from NCBI
species_tree = ncbi.get_topology(taxids)

# Compare gene tree topology with species tree
# (This would involve reconciliation analysis)
print("Gene tree topology:")
print(gene_tree.get_ascii())
print("Species tree topology:")
print(species_tree.get_ascii())

Install with Tessl CLI

npx tessl i tessl/pypi-ete3

docs

clustering.md

core-tree.md

data-tables.md

external-formats.md

index.md

ncbi-taxonomy.md

phylogenetic.md

sequences.md

visualization.md

tile.json