A Python Environment for (phylogenetic) Tree Exploration
—
Efficient handling of numerical data associated with trees and sequences, supporting matrix operations, statistical analysis, and integration with scientific computing workflows. ETE3's ArrayTable provides high-performance data manipulation capabilities.
`ArrayTable` is the main class for handling 2D numerical data, with matrix operations and scientific computing integration.
class ArrayTable:
"""
Efficient 2D data table with matrix operations and scientific computing support.
Built on NumPy for high performance numerical operations.
"""
def __init__(self, matrix_file=None, mtype="float"):
"""
Initialize array table.
Parameters:
- matrix_file (str): Path to matrix data file
- mtype (str): Data type ("float", "int", "str")
"""
def __len__(self):
"""Number of rows in table."""
def __str__(self):
"""String representation of table."""

Methods for accessing rows, columns, and individual data elements.
def get_column_array(self, colname):
"""
Get column data as NumPy array.
Parameters:
- colname (str): Column name
Returns:
numpy.ndarray: Column data array
"""
def get_row_array(self, rowname):
"""
Get row data as NumPy array.
Parameters:
- rowname (str): Row name
Returns:
numpy.ndarray: Row data array
"""
def get_several_column_arrays(self, colnames):
"""
Get multiple columns as arrays.
Parameters:
- colnames (list): List of column names
Returns:
dict: Mapping from column names to arrays
"""
def get_several_row_arrays(self, rownames):
"""
Get multiple rows as arrays.
Parameters:
- rownames (list): List of row names
Returns:
dict: Mapping from row names to arrays
"""
# Properties for data access
matrix: numpy.ndarray # Underlying data matrix
colNames: list # Column names
rowNames: list # Row names
colValues: dict # Column name to index mapping
rowValues: dict # Row name to index mapping

Mathematical operations and transformations on the data matrix.
def transpose(self):
"""
Transpose the matrix (swap rows and columns).
Returns:
ArrayTable: New transposed table
"""
def remove_column(self, colname):
"""
Remove column from table.
Parameters:
- colname (str): Column name to remove
"""
def remove_row(self, rowname):
"""
Remove row from table.
Parameters:
- rowname (str): Row name to remove
"""
def add_column(self, colname, colvalues):
"""
Add new column to table.
Parameters:
- colname (str): Name for new column
- colvalues (array-like): Column data values
"""
def add_row(self, rowname, rowvalues):
"""
Add new row to table.
Parameters:
- rowname (str): Name for new row
- rowvalues (array-like): Row data values
"""

Read and write table data in various formats.
def write(self, fname=None, colnames=None):
"""
Write table to file.
Parameters:
- fname (str): Output file path, if None returns string
- colnames (list): Specific columns to write
Returns:
str: Formatted table string (if fname is None)
"""
def read(self, matrix_file, mtype="float", **kwargs):
"""
Read table data from file.
Parameters:
- matrix_file (str): Input file path
- mtype (str): Data type for parsing
- kwargs: Additional parsing parameters
"""

Built-in statistical analysis and data summary methods.
def get_stats(self):
"""
Calculate basic statistics for all columns.
Returns:
dict: Statistics including mean, std, min, max for each column
"""
def get_column_stats(self, colname):
"""
Calculate statistics for specific column.
Parameters:
- colname (str): Column name
Returns:
dict: Column statistics (mean, std, min, max, etc.)
"""
def normalize(self, method="standard"):
"""
Normalize data using specified method.
Parameters:
- method (str): Normalization method ("standard", "minmax", "robust")
Returns:
ArrayTable: Normalized table
"""

Filter and select subsets of data based on criteria.
def filter_columns(self, condition_func):
"""
Filter columns based on condition function.
Parameters:
- condition_func (function): Function that takes column array, returns bool
Returns:
ArrayTable: Filtered table
"""
def filter_rows(self, condition_func):
"""
Filter rows based on condition function.
Parameters:
- condition_func (function): Function that takes row array, returns bool
Returns:
ArrayTable: Filtered table
"""
def select_columns(self, colnames):
"""
Select specific columns.
Parameters:
- colnames (list): Column names to select
Returns:
ArrayTable: Table with selected columns
"""
def select_rows(self, rownames):
"""
Select specific rows.
Parameters:
- rownames (list): Row names to select
Returns:
ArrayTable: Table with selected rows
"""

Methods for associating tabular data with tree structures.
def link_to_tree(self, tree, attr_name="profile"):
"""
Link table data to tree nodes.
Parameters:
- tree (Tree): Tree to link data to
- attr_name (str): Attribute name for storing data in nodes
"""
def get_tree_profile(self, tree, attr_name="profile"):
"""
Extract profile data from tree nodes.
Parameters:
- tree (Tree): Tree with profile data
- attr_name (str): Attribute name containing data
Returns:
ArrayTable: Table with tree profile data
"""

Enhanced clustering functionality when combined with data tables.
def get_distance_matrix(self):
"""
Calculate distance matrix between rows.
Returns:
numpy.ndarray: Symmetric distance matrix
"""
def cluster_data(self, method="ward", metric="euclidean"):
"""
Perform hierarchical clustering on data.
Parameters:
- method (str): Linkage method ("ward", "complete", "average", "single")
- metric (str): Distance metric ("euclidean", "manhattan", "cosine")
Returns:
ClusterTree: Tree representing clustering hierarchy
"""

from ete3 import ArrayTable
import numpy as np
# Create table from file
table = ArrayTable("data_matrix.txt", mtype="float")
# Basic properties
print(f"Table dimensions: {len(table.rowNames)} x {len(table.colNames)}")
print(f"Column names: {table.colNames}")
print(f"Row names: {table.rowNames}")
# Access data
col_data = table.get_column_array("column1")
row_data = table.get_row_array("row1")
print(f"Column1 stats: mean={np.mean(col_data):.2f}, std={np.std(col_data):.2f}")

from ete3 import ArrayTable
# Load data
table = ArrayTable("expression_data.txt")
# Remove unwanted columns/rows
table.remove_column("control_sample")
table.remove_row("uninformative_gene")
# Add new data
new_column_data = [1.5, 2.3, 0.8, 3.1, 1.9]
table.add_column("new_condition", new_column_data)
# Transpose for different analysis perspective
transposed = table.transpose()
# Save results
table.write("modified_data.txt")

from ete3 import ArrayTable
import numpy as np
table = ArrayTable("experimental_data.txt")
# Get overall statistics
stats = table.get_stats()
for col, col_stats in stats.items():
print(f"{col}: mean={col_stats['mean']:.2f}, std={col_stats['std']:.2f}")
# Normalize data
normalized_table = table.normalize(method="standard")
# Filter based on criteria
def high_variance_filter(col_array):
return np.var(col_array) > 1.0
high_var_table = table.filter_columns(high_variance_filter)
print(f"Filtered to {len(high_var_table.colNames)} high-variance columns")

from ete3 import ArrayTable, Tree
# Load data and tree
table = ArrayTable("gene_expression.txt")
tree = Tree("species_tree.nw")
# Link expression data to tree nodes
table.link_to_tree(tree, attr_name="expression")
# Access linked data
for leaf in tree.get_leaves():
if hasattr(leaf, 'expression'):
print(f"{leaf.name}: {leaf.expression[:5]}...") # First 5 values
# Extract profile data back from tree
extracted_table = table.get_tree_profile(tree, attr_name="expression")

from ete3 import ArrayTable
# Load expression data
expression_table = ArrayTable("gene_expression_matrix.txt")
# Perform hierarchical clustering
cluster_tree = expression_table.cluster_data(method="ward", metric="euclidean")
# Analyze clustering results
print(f"Clustering tree: {cluster_tree.get_ascii()}")
# Get distance matrix for further analysis
dist_matrix = expression_table.get_distance_matrix()
print(f"Distance matrix shape: {dist_matrix.shape}")

from ete3 import ArrayTable, ClusterTree
import numpy as np
# Load and prepare data
table = ArrayTable("multi_condition_data.txt")
# Select specific conditions
selected_conditions = ["treatment1", "treatment2", "control"]
filtered_table = table.select_columns(selected_conditions)
# Normalize and filter
normalized = filtered_table.normalize(method="standard")
# Filter for genes with significant variation
def significant_variation(row_array):
return np.max(row_array) - np.min(row_array) > 2.0
variable_genes = normalized.filter_rows(significant_variation)
# Cluster the filtered, normalized data
cluster_result = variable_genes.cluster_data(method="complete")
# Visualize clustering
cluster_result.show()
# Save processed data
variable_genes.write("filtered_normalized_data.txt")

from ete3 import ArrayTable
import numpy as np
# Create table from Python data
data_matrix = np.random.rand(100, 20) # 100 genes, 20 samples
row_names = [f"gene_{i}" for i in range(100)]
col_names = [f"sample_{i}" for i in range(20)]
# Initialize empty table and populate
table = ArrayTable()
table.matrix = data_matrix
table.rowNames = row_names
table.colNames = col_names
table.rowValues = {name: i for i, name in enumerate(row_names)}
table.colValues = {name: i for i, name in enumerate(col_names)}
# Apply custom transformations
log_transformed = table.matrix.copy()
log_transformed = np.log2(log_transformed + 1) # log2(x+1) transformation
# Create new table with transformed data
log_table = ArrayTable()
log_table.matrix = log_transformed
log_table.rowNames = table.rowNames
log_table.colNames = table.colNames
log_table.rowValues = table.rowValues
log_table.colValues = table.colValues
# Save transformed data
log_table.write("log_transformed_data.txt")

Install with Tessl CLI
npx tessl i tessl/pypi-ete3