A Python Environment for (phylogenetic) Tree Exploration
—
Specialized clustering tree operations for hierarchical clustering analysis, cluster validation, and dendrogram-based data exploration. ETE3 provides enhanced tree classes specifically designed for clustering workflows.
Enhanced tree classes specialized for clustering analysis and validation.
class ClusterTree(Tree):
"""
Tree specialized for hierarchical clustering analysis.
Inherits all Tree functionality plus clustering-specific methods.
"""
def __init__(self, newick=None, **kwargs):
"""
Initialize clustering tree.
Parameters:
- newick (str): Newick format string or file path
- kwargs: Additional Tree initialization parameters
"""
class ClusterNode(ClusterTree):
"""Alias for ClusterTree - same functionality."""
pass

Extract and analyze cluster profiles and characteristics.
def get_cluster_profile(self):
"""
Get profile characteristics for cluster represented by this node.
Returns:
dict: Cluster profile including:
- size: Number of items in cluster
- height: Cluster height/dissimilarity
- members: List of cluster members
- profile: Statistical summary of cluster data
"""
def get_cluster_size(self):
"""
Get number of items in cluster.
Returns:
int: Cluster size (number of leaf nodes)
"""
def get_cluster_members(self):
"""
Get all members (leaf names) of this cluster.
Returns:
list: List of member names in cluster
"""
def get_cluster_height(self):
"""
Get cluster height (distance at which cluster was formed).
Returns:
float: Cluster formation height/distance
"""

Calculate various cluster validation and quality metrics.
def get_silhouette(self):
"""
Calculate silhouette coefficient for cluster.
Measures how similar items are to their own cluster compared to other clusters.
Values range from -1 to 1, where higher values indicate better clustering.
Returns:
float: Silhouette coefficient (-1 to 1)
"""
def get_intra_cluster_distance(self):
"""
Calculate average intra-cluster distance.
Returns:
float: Average distance between items within cluster
"""
def get_inter_cluster_distance(self, other_cluster):
"""
Calculate distance between this cluster and another cluster.
Parameters:
- other_cluster (ClusterTree): Other cluster for comparison
Returns:
float: Distance between clusters
"""
def get_cluster_variance(self):
"""
Calculate within-cluster variance.
Returns:
float: Variance of distances within cluster
"""

Methods for extracting clusters at different levels of the hierarchy.
def get_clusters_at_height(self, height):
"""
Get clusters by cutting tree at specified height.
Parameters:
- height (float): Height at which to cut the tree
Returns:
list: List of ClusterTree objects representing clusters
"""
def get_clusters_by_size(self, min_size=2, max_size=None):
"""
Get clusters within specified size range.
Parameters:
- min_size (int): Minimum cluster size
- max_size (int): Maximum cluster size (None for no limit)
Returns:
list: List of clusters meeting size criteria
"""
def get_optimal_clusters(self, criterion="silhouette"):
"""
Find optimal number of clusters using specified criterion.
Parameters:
- criterion (str): Optimization criterion ("silhouette", "gap", "elbow")
Returns:
tuple: (optimal_k, clusters_list, criterion_values)
"""

Compare different clusterings and analyze cluster relationships.
def compare_clusters(self, other_clustering, method="adjusted_rand"):
"""
Compare this clustering with another clustering.
Parameters:
- other_clustering (ClusterTree or dict): Other clustering to compare
- method (str): Comparison metric ("adjusted_rand", "normalized_mutual_info", "homogeneity")
Returns:
float: Clustering similarity score
"""
def get_cluster_stability(self, bootstrap_samples=100):
"""
Assess cluster stability through bootstrap resampling.
Parameters:
- bootstrap_samples (int): Number of bootstrap iterations
Returns:
dict: Stability scores for each cluster
"""

Work with distance matrices and clustering algorithms.
def from_distance_matrix(self, distance_matrix, labels=None, method="average"):
"""
Create clustering tree from distance matrix.
Parameters:
- distance_matrix (array-like): Symmetric distance matrix
- labels (list): Labels for matrix rows/columns
- method (str): Linkage method ("single", "complete", "average", "ward")
Returns:
ClusterTree: Hierarchical clustering result
"""
def get_distance_matrix(self):
"""
Extract distance matrix from clustering tree.
Returns:
numpy.ndarray: Distance matrix between all leaf pairs
"""

Specialized visualization methods for clustering results.
def show_cluster_heatmap(self, data_matrix=None, color_map="viridis"):
"""
Display clustering results with associated data heatmap.
Parameters:
- data_matrix (array-like): Data matrix to display alongside tree
- color_map (str): Color scheme for heatmap
"""
def render_dendrogram(self, orientation="top", leaf_rotation=90, **kwargs):
"""
Render tree as dendrogram with clustering-specific formatting.
Parameters:
- orientation (str): Dendrogram orientation ("top", "bottom", "left", "right")
- leaf_rotation (int): Rotation angle for leaf labels
- kwargs: Additional rendering parameters
"""

Seamless integration with ETE3's ArrayTable for data-driven clustering.
# In ArrayTable class
def cluster_data(self, method="ward", metric="euclidean"):
"""
Perform hierarchical clustering on table data.
Parameters:
- method (str): Linkage method ("ward", "complete", "average", "single")
- metric (str): Distance metric ("euclidean", "manhattan", "cosine", "correlation")
Returns:
ClusterTree: Clustering result tree
"""

from ete3 import ClusterTree
import numpy as np
# Load clustering result (from distance matrix or linkage)
cluster_tree = ClusterTree("clustering_result.nw")

# Basic cluster information
print(f"Total items clustered: {len(cluster_tree.get_leaves())}")
print(f"Tree height: {cluster_tree.get_tree_root().get_cluster_height()}")

# Analyze individual clusters: skip leaf nodes and report the profile of
# every internal node visited by the traversal.
for node in cluster_tree.traverse():
    if not node.is_leaf():
        profile = node.get_cluster_profile()
        print(f"Cluster size: {profile['size']}, height: {profile['height']:.3f}")


from ete3 import ClusterTree
cluster_tree = ClusterTree("hierarchical_clustering.nw")

# Calculate silhouette scores for all clusters (internal nodes holding more
# than one leaf), keyed by the node object itself.
silhouette_scores = {}
for node in cluster_tree.traverse():
    if not node.is_leaf() and len(node.get_leaves()) > 1:
        silhouette = node.get_silhouette()
        silhouette_scores[node] = silhouette
        print(f"Cluster {len(node.get_leaves())} items: silhouette = {silhouette:.3f}")

# Find best clusters based on silhouette: keep nodes scoring above 0.5.
best_clusters = [node for node, score in silhouette_scores.items() if score > 0.5]
print(f"Found {len(best_clusters)} high-quality clusters")


from ete3 import ClusterTree
cluster_tree = ClusterTree("clustering_dendrogram.nw")

# Cut tree at different heights and report how many clusters each cut yields.
heights = [0.1, 0.2, 0.5, 1.0]
for height in heights:
    clusters = cluster_tree.get_clusters_at_height(height)
    print(f"Height {height}: {len(clusters)} clusters")

    # Analyze cluster sizes (number of leaves under each cluster)
    sizes = [len(cluster.get_leaves()) for cluster in clusters]
    print(f" Cluster sizes: {sizes}")

# Find optimal clustering; the call returns (k, clusters, criterion values).
optimal_k, optimal_clusters, scores = cluster_tree.get_optimal_clusters(criterion="silhouette")
print(f"Optimal number of clusters: {optimal_k}")
print(f"Optimal clustering silhouette: {max(scores):.3f}")


from ete3 import ArrayTable, ClusterTree
import numpy as np
# Load expression data
expression_data = ArrayTable("gene_expression.txt")

# Perform clustering
cluster_result = expression_data.cluster_data(method="ward", metric="euclidean")

# Analyze clustering quality for every internal node of the result tree.
for node in cluster_result.traverse():
    if not node.is_leaf():
        cluster_profile = node.get_cluster_profile()

        if cluster_profile['size'] >= 5:  # Focus on larger clusters
            silhouette = node.get_silhouette()
            variance = node.get_cluster_variance()

            print(f"Cluster {cluster_profile['size']} genes:")
            print(f" Silhouette: {silhouette:.3f}")
            print(f" Variance: {variance:.3f}")
            print(f" Members: {node.get_cluster_members()[:5]}...")  # Show first 5


from ete3 import ClusterTree
# Load two different clustering results
clustering1 = ClusterTree("method1_clustering.nw")
clustering2 = ClusterTree("method2_clustering.nw")

# Compare clusterings
similarity = clustering1.compare_clusters(clustering2, method="adjusted_rand")
print(f"Clustering similarity (Adjusted Rand Index): {similarity:.3f}")

# Assess stability; only the scores are printed, so iterate the values.
stability_scores = clustering1.get_cluster_stability(bootstrap_samples=50)
for stability in stability_scores.values():
    print(f"Cluster stability: {stability:.3f}")


from ete3 import ArrayTable, ClusterTree
import numpy as np
# Complete clustering analysis workflow
def analyze_clustering(data_file, methods=("ward", "complete", "average")):
    """
    Cluster a data table with several linkage methods and return the best one.

    For each linkage method the table is clustered with the euclidean metric,
    the optimal clusters are requested from the resulting tree, and the mean
    silhouette over the optimal clusters that hold more than one leaf is used
    as the quality score. One summary line is printed per method, followed by
    the name of the best-scoring method.

    Parameters:
    - data_file (str): Path handed to ArrayTable to load the data
    - methods (iterable of str): Linkage methods to try, in order

    Returns:
    dict: Entry for the best method with keys 'tree', 'optimal_k',
    'avg_silhouette' and 'clusters'

    Raises:
    ValueError: If `methods` is empty
    """
    # Materialize once so generators work, and fail before any data is loaded.
    methods = list(methods)
    if not methods:
        raise ValueError("methods must contain at least one linkage method")

    # Load data
    data = ArrayTable(data_file)

    # Try different clustering methods
    results = {}
    for method in methods:
        cluster_tree = data.cluster_data(method=method, metric="euclidean")

        # Find optimal clusters (the per-k criterion values are not needed here)
        opt_k, opt_clusters, _criterion_values = cluster_tree.get_optimal_clusters()

        # Calculate overall quality metrics. Single-leaf clusters are skipped;
        # if nothing is left the score is NaN, produced explicitly so that
        # np.mean is never called on an empty list.
        silhouettes = [cluster.get_silhouette()
                       for cluster in opt_clusters
                       if len(cluster.get_leaves()) > 1]
        avg_silhouette = np.mean(silhouettes) if silhouettes else float("nan")

        results[method] = {
            'tree': cluster_tree,
            'optimal_k': opt_k,
            'avg_silhouette': avg_silhouette,
            'clusters': opt_clusters
        }

        print(f"{method}: k={opt_k}, silhouette={avg_silhouette:.3f}")

    def _ranking_score(name):
        # NaN compares False against everything, which would make max()
        # order-dependent; rank unscored methods below every real score.
        score = results[name]['avg_silhouette']
        return float("-inf") if np.isnan(score) else score

    # Select best method
    best_method = max(results, key=_ranking_score)
    print(f"\nBest method: {best_method}")

    return results[best_method]
# Run analysis
best_clustering = analyze_clustering("expression_matrix.txt")
# Visualize best result
best_clustering['tree'].show_cluster_heatmap()

from ete3 import ClusterTree
import numpy as np
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, to_tree
# Custom clustering with correlation distance
def newick_from_scipy_tree(root):
    """
    Serialize a scipy dendrogram (as returned by `to_tree`) to a Newick string.

    Leaves are named after their scipy node id. Each branch length is the
    difference between the parent's merge distance and the node's own merge
    distance. The root carries no branch length.

    Parameters:
    - root (scipy.cluster.hierarchy.ClusterNode): Root node of the dendrogram

    Returns:
    str: Newick representation terminated by ';'
    """
    fragments = {}  # node id -> finished Newick fragment for that subtree
    # Explicit stack instead of recursion: chained merges can make the
    # dendrogram as deep as the number of observations.
    pending = [(root, root.dist, False)]
    while pending:
        node, parent_height, children_done = pending.pop()
        branch_length = parent_height - node.dist
        if node.is_leaf():
            fragments[node.get_id()] = f"{node.get_id()}:{branch_length:.6f}"
        elif children_done:
            left = fragments.pop(node.get_left().get_id())
            right = fragments.pop(node.get_right().get_id())
            fragments[node.get_id()] = f"({left},{right}):{branch_length:.6f}"
        else:
            # Revisit this node after both children have been serialized.
            pending.append((node, parent_height, True))
            pending.append((node.get_right(), node.dist, False))
            pending.append((node.get_left(), node.dist, False))
    # Drop the root's (always zero) branch length.
    return fragments[root.get_id()].rsplit(":", 1)[0] + ";"


# Custom clustering with correlation distance
def correlation_clustering(data_matrix, method="average"):
    """
    Hierarchically cluster the rows of a matrix using correlation distance.

    Parameters:
    - data_matrix (array-like): Observations in rows, features in columns
    - method (str): Linkage method passed to scipy's `linkage`

    Returns:
    ClusterTree: Tree built from the Newick form of the scipy dendrogram
    """
    # Calculate correlation-based distances between rows (condensed form).
    # pdist's 'correlation' metric is the only distance used for the linkage.
    condensed_distances = pdist(data_matrix, metric='correlation')

    # Perform hierarchical clustering
    linkage_matrix = linkage(condensed_distances, method=method)

    # Convert to ETE3 tree format
    scipy_tree = to_tree(linkage_matrix)
    return ClusterTree(newick_from_scipy_tree(scipy_tree))
# Use custom clustering
data = np.random.rand(50, 100) # 50 samples, 100 features
custom_cluster_tree = correlation_clustering(data)
# Analyze results
optimal_clusters = custom_cluster_tree.get_optimal_clusters()
print(f"Custom clustering found {len(optimal_clusters[1])} optimal clusters")

Install with Tessl CLI
npx tessl i tessl/pypi-ete3