tessl/pypi-ete3

A Python Environment for (phylogenetic) Tree Exploration

—

Pending

Overview

Eval results

Files

Clustering Analysis

Name: tessl/pypi-ete3
Author: tessl

Specialized clustering tree operations for hierarchical clustering analysis, cluster validation, and dendrogram-based data exploration. ETE3 provides enhanced tree classes specifically designed for clustering workflows.

Capabilities

ClusterTree and ClusterNode Classes

Enhanced tree classes specialized for clustering analysis and validation.

class ClusterTree(Tree):
    """
    Tree specialized for hierarchical clustering analysis.

    Subclasses Tree, so all Tree functionality is inherited; the
    clustering-specific methods are added on top of it.
    """

    def __init__(self, newick=None, **kwargs):
        """
        Initialize clustering tree.

        Parameters:
        - newick (str): Newick format string or file path (defaults to None)
        - kwargs: Additional Tree initialization parameters
        """

class ClusterNode(ClusterTree):
    """Subclass of ClusterTree that adds nothing of its own - same functionality under a second name."""
    pass

Cluster Profile Analysis

Extract and analyze cluster profiles and characteristics.

def get_cluster_profile(self) -> dict:
    """
    Get profile characteristics for the cluster represented by this node.

    Returns:
    dict: Cluster profile with the keys:
        - 'size': Number of items in cluster
        - 'height': Cluster height/dissimilarity
        - 'members': List of cluster members
        - 'profile': Statistical summary of cluster data
    """

def get_cluster_size(self) -> int:
    """
    Get the number of items in the cluster rooted at this node.

    Returns:
    int: Cluster size (number of leaf nodes)
    """

def get_cluster_members(self) -> list:
    """
    Get all members (leaf names) of this cluster.

    Returns:
    list: Names of the leaves that belong to the cluster
    """

def get_cluster_height(self) -> float:
    """
    Get cluster height (the distance at which the cluster was formed).

    Returns:
    float: Cluster formation height/distance
    """

Cluster Validation Metrics

Calculate various cluster validation and quality metrics.

def get_silhouette(self) -> float:
    """
    Calculate the silhouette coefficient for this cluster.

    Measures how similar items are to their own cluster compared to other
    clusters. Values range from -1 to 1, where higher values indicate
    better clustering.

    Returns:
    float: Silhouette coefficient (-1 to 1)
    """

def get_intra_cluster_distance(self) -> float:
    """
    Calculate the average intra-cluster distance.

    Returns:
    float: Average distance between items within the cluster
    """

def get_inter_cluster_distance(self, other_cluster) -> float:
    """
    Calculate the distance between this cluster and another cluster.

    NOTE(review): how the between-cluster distance is defined (e.g. between
    centroids or averaged over member pairs) is not stated here - confirm.

    Parameters:
    - other_cluster (ClusterTree): Other cluster for comparison

    Returns:
    float: Distance between clusters
    """

def get_cluster_variance(self) -> float:
    """
    Calculate the within-cluster variance.

    Returns:
    float: Variance of distances within the cluster
    """

Cluster Cutting and Partitioning

Methods for extracting clusters at different levels of the hierarchy.

def get_clusters_at_height(self, height: float) -> list:
    """
    Get clusters by cutting the tree at the specified height.

    Parameters:
    - height (float): Height at which to cut the tree

    Returns:
    list: ClusterTree objects, one per cluster produced by the cut
    """

def get_clusters_by_size(self, min_size: int = 2, max_size=None) -> list:
    """
    Get clusters within the specified size range.

    NOTE(review): whether the bounds are inclusive is not stated - confirm.

    Parameters:
    - min_size (int): Minimum cluster size (default 2)
    - max_size (int): Maximum cluster size (None for no limit)

    Returns:
    list: List of clusters meeting size criteria
    """

def get_optimal_clusters(self, criterion: str = "silhouette") -> tuple:
    """
    Find the optimal number of clusters using the specified criterion.

    Parameters:
    - criterion (str): Optimization criterion ("silhouette", "gap", "elbow")

    Returns:
    tuple: (optimal_k, clusters_list, criterion_values), in that order
    """

Cluster Comparison and Analysis

Compare different clusterings and analyze cluster relationships.

def compare_clusters(self, other_clustering, method: str = "adjusted_rand") -> float:
    """
    Compare this clustering with another clustering.

    Parameters:
    - other_clustering (ClusterTree or dict): Other clustering to compare
    - method (str): Comparison metric ("adjusted_rand", "normalized_mutual_info", "homogeneity")

    Returns:
    float: Clustering similarity score
    """

def get_cluster_stability(self, bootstrap_samples: int = 100) -> dict:
    """
    Assess cluster stability through bootstrap resampling.

    Parameters:
    - bootstrap_samples (int): Number of bootstrap iterations (default 100)

    Returns:
    dict: Stability scores for each cluster
    """

Distance Matrix Integration

Work with distance matrices and clustering algorithms.

def from_distance_matrix(self, distance_matrix, labels=None, method: str = "average") -> "ClusterTree":
    """
    Create a clustering tree from a distance matrix.

    Parameters:
    - distance_matrix (array-like): Symmetric distance matrix
    - labels (list): Labels for matrix rows/columns (defaults to None)
    - method (str): Linkage method ("single", "complete", "average", "ward")

    Returns:
    ClusterTree: Hierarchical clustering result
    """

def get_distance_matrix(self) -> "numpy.ndarray":
    """
    Extract the distance matrix from the clustering tree.

    Returns:
    numpy.ndarray: Distance matrix between all leaf pairs
    """

Cluster Visualization

Specialized visualization methods for clustering results.

def show_cluster_heatmap(self, data_matrix=None, color_map: str = "viridis"):
    """
    Display clustering results with an associated data heatmap.

    Parameters:
    - data_matrix (array-like): Data matrix to display alongside tree
      (defaults to None)
    - color_map (str): Color scheme for heatmap (default "viridis")
    """

def render_dendrogram(self, orientation: str = "top", leaf_rotation: int = 90, **kwargs):
    """
    Render the tree as a dendrogram with clustering-specific formatting.

    Parameters:
    - orientation (str): Dendrogram orientation ("top", "bottom", "left", "right")
    - leaf_rotation (int): Rotation angle for leaf labels (default 90)
    - kwargs: Additional rendering parameters
    """

Integration with Data Analysis

ArrayTable Integration

Seamless integration with ETE3's ArrayTable for data-driven clustering.

# In ArrayTable class
def cluster_data(self, method: str = "ward", metric: str = "euclidean") -> "ClusterTree":
    """
    Perform hierarchical clustering on the table data.

    Parameters:
    - method (str): Linkage method ("ward", "complete", "average", "single")
    - metric (str): Distance metric ("euclidean", "manhattan", "cosine", "correlation")

    Returns:
    ClusterTree: Clustering result tree
    """

Usage Examples

Basic Clustering Analysis

from ete3 import ClusterTree
import numpy as np

# Parse a saved clustering result (produced from a distance matrix or linkage)
cluster_tree = ClusterTree("clustering_result.nw")

# Headline numbers for the whole hierarchy
print(f"Total items clustered: {len(cluster_tree.get_leaves())}")
print(f"Tree height: {cluster_tree.get_tree_root().get_cluster_height()}")

# Report every internal node; leaves are single items, not clusters
for node in cluster_tree.traverse():
    if node.is_leaf():
        continue
    profile = node.get_cluster_profile()
    print(f"Cluster size: {profile['size']}, height: {profile['height']:.3f}")

Cluster Validation

from ete3 import ClusterTree

cluster_tree = ClusterTree("hierarchical_clustering.nw")

# Score every internal node that groups more than one item
silhouette_scores = {}
for node in cluster_tree.traverse():
    if node.is_leaf() or not len(node.get_leaves()) > 1:
        continue
    silhouette = node.get_silhouette()
    silhouette_scores[node] = silhouette
    print(f"Cluster {len(node.get_leaves())} items: silhouette = {silhouette:.3f}")

# Keep only the clusters whose silhouette exceeds 0.5
best_clusters = []
for candidate, score in silhouette_scores.items():
    if score > 0.5:
        best_clusters.append(candidate)
print(f"Found {len(best_clusters)} high-quality clusters")

Cluster Cutting and Optimization

from ete3 import ClusterTree

cluster_tree = ClusterTree("clustering_dendrogram.nw")

# Cut the dendrogram at several heights and report the resulting partition
heights = [0.1, 0.2, 0.5, 1.0]
for height in heights:
    clusters = cluster_tree.get_clusters_at_height(height)
    print(f"Height {height}: {len(clusters)} clusters")

    # Size of each cluster = number of leaves beneath it
    sizes = [len(subtree.get_leaves()) for subtree in clusters]
    print(f"  Cluster sizes: {sizes}")

# Let the silhouette criterion pick the number of clusters
optimum = cluster_tree.get_optimal_clusters(criterion="silhouette")
optimal_k, optimal_clusters, scores = optimum
print(f"Optimal number of clusters: {optimal_k}")
print(f"Optimal clustering silhouette: {max(scores):.3f}")

Integration with Data Analysis

from ete3 import ArrayTable, ClusterTree
import numpy as np

# Read the expression table from disk
expression_data = ArrayTable("gene_expression.txt")

# Build the hierarchy with Ward linkage on Euclidean distances
cluster_result = expression_data.cluster_data(method="ward", metric="euclidean")

# Report quality metrics for every internal node holding at least 5 items
for node in cluster_result.traverse():
    if node.is_leaf():
        continue

    cluster_profile = node.get_cluster_profile()
    if not cluster_profile['size'] >= 5:  # Skip small clusters
        continue

    silhouette = node.get_silhouette()
    variance = node.get_cluster_variance()
    print(f"Cluster {cluster_profile['size']} genes:")
    print(f"  Silhouette: {silhouette:.3f}")
    print(f"  Variance: {variance:.3f}")
    print(f"  Members: {node.get_cluster_members()[:5]}...")  # Show first 5

Cluster Comparison

from ete3 import ClusterTree

# Two hierarchies built from the same items by different methods
clustering1 = ClusterTree("method1_clustering.nw")
clustering2 = ClusterTree("method2_clustering.nw")

# Agreement between the two clusterings
similarity = clustering1.compare_clusters(clustering2, method="adjusted_rand")
print(f"Clustering similarity (Adjusted Rand Index): {similarity:.3f}")

# Bootstrap stability of the first clustering; only the scores are printed
stability_scores = clustering1.get_cluster_stability(bootstrap_samples=50)
for stability in stability_scores.values():
    print(f"Cluster stability: {stability:.3f}")

Advanced Clustering Workflow

from ete3 import ArrayTable, ClusterTree
import numpy as np

# Complete clustering analysis workflow
def analyze_clustering(data_file, methods=("ward", "complete", "average")):
    """
    Cluster a data table with several linkage methods and return the best one.

    Each method is run with the Euclidean metric, the resulting tree is
    partitioned with get_optimal_clusters(), and the methods are ranked by
    the mean silhouette of the resulting clusters that hold more than one
    leaf. A one-line summary is printed per method, followed by the winner.

    Parameters:
    - data_file (str): Path handed to ArrayTable
    - methods (iterable of str): Linkage methods to try, in order

    Returns:
    dict: Entry for the winning method with the keys 'tree', 'optimal_k',
        'avg_silhouette' and 'clusters'

    Raises:
    ValueError: If methods is empty
    """
    # Materialise once: the default is an immutable tuple (no shared mutable
    # default), and callers may still pass any iterable.
    methods = tuple(methods)
    if not methods:
        raise ValueError("methods must contain at least one linkage method")

    # Load data
    data = ArrayTable(data_file)

    # Try different clustering methods
    results = {}
    for method in methods:
        cluster_tree = data.cluster_data(method=method, metric="euclidean")

        # Find optimal clusters (the per-k criterion values are not needed)
        opt_k, opt_clusters, _ = cluster_tree.get_optimal_clusters()

        # Silhouette is only computed for clusters with more than one leaf
        silhouettes = [cluster.get_silhouette()
                       for cluster in opt_clusters
                       if len(cluster.get_leaves()) > 1]
        # np.mean([]) emits a RuntimeWarning and yields nan; produce the nan
        # explicitly when there is nothing to average.
        avg_silhouette = np.mean(silhouettes) if silhouettes else float("nan")

        results[method] = {
            'tree': cluster_tree,
            'optimal_k': opt_k,
            'avg_silhouette': avg_silhouette,
            'clusters': opt_clusters
        }

        print(f"{method}: k={opt_k}, silhouette={avg_silhouette:.3f}")

    def ranking_key(method):
        # nan compares False against everything, which would make max()
        # order-dependent; rank undefined scores below every real score.
        score = results[method]['avg_silhouette']
        return float("-inf") if np.isnan(score) else score

    # Select best method
    best_method = max(results, key=ranking_key)

    print(f"\nBest method: {best_method}")
    return results[best_method]

# Pick the best-scoring linkage method for the expression matrix
best_clustering = analyze_clustering("expression_matrix.txt")

# Show the winning tree next to its heatmap
best_tree = best_clustering['tree']
best_tree.show_cluster_heatmap()

Custom Distance Metrics

from ete3 import ClusterTree
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, to_tree

# Custom clustering with correlation distance
def newick_from_scipy_tree(node, labels=None):
    """
    Serialize a SciPy dendrogram (the node returned by to_tree) as Newick.

    Branch lengths are the drop in merge height between a node and its
    parent, so leaf depth in the Newick tree matches the dendrogram height.

    Parameters:
    - node: Root node returned by scipy.cluster.hierarchy.to_tree
    - labels (sequence): Optional leaf names indexed by observation id;
      when omitted the observation ids themselves are used as names

    Returns:
    str: Newick string terminated by ";"
    """
    def leaf_name(leaf):
        return str(leaf.id) if labels is None else str(labels[leaf.id])

    def subtree(current, parent_height):
        # Clamp at zero: some linkage methods (centroid, median) can produce
        # inversions, and a negative branch length is not meaningful.
        length = max(parent_height - current.dist, 0.0)
        if current.is_leaf():
            return f"{leaf_name(current)}:{length:.6f}"
        left = subtree(current.get_left(), current.dist)
        right = subtree(current.get_right(), current.dist)
        return f"({left},{right}):{length:.6f}"

    # The root has no parent, so it gets a zero-length branch.
    return subtree(node, node.dist) + ";"


def correlation_clustering(data_matrix, method="average"):
    """
    Hierarchically cluster the rows of data_matrix by correlation distance.

    Parameters:
    - data_matrix (array-like): One row per observation, one column per feature
    - method (str): Linkage method passed to scipy.cluster.hierarchy.linkage

    Returns:
    ClusterTree: Tree built from the Newick form of the SciPy dendrogram
    """
    # Correlation distance (1 - Pearson r) between every pair of rows, in the
    # condensed form that linkage() expects.
    condensed_distances = pdist(data_matrix, metric='correlation')
    linkage_matrix = linkage(condensed_distances, method=method)

    # Convert the linkage matrix to a node hierarchy, then to Newick text
    # that ClusterTree can parse.
    scipy_tree = to_tree(linkage_matrix)
    return ClusterTree(newick_from_scipy_tree(scipy_tree))

# Cluster a random matrix of 50 samples by 100 features
n_samples, n_features = 50, 100
data = np.random.rand(n_samples, n_features)
custom_cluster_tree = correlation_clustering(data)

# get_optimal_clusters() returns a tuple; the list of clusters is at index 1
optimal_clusters = custom_cluster_tree.get_optimal_clusters()
print(f"Custom clustering found {len(optimal_clusters[1])} optimal clusters")

Install with Tessl CLI