tessl/pypi-ete3

A Python Environment for (phylogenetic) Tree Exploration

—

Pending

Overview

Eval results

Files

Data Tables and Arrays

Name: tessl/pypi-ete3
Author: tessl

This section covers the efficient handling of numerical data associated with trees and sequences, including matrix operations, statistical analysis, and integration with scientific computing workflows. ETE3's ArrayTable provides high-performance data manipulation capabilities.

Capabilities

ArrayTable Class

Main class for handling 2D numerical data with matrix operations and scientific computing integration.

class ArrayTable:
    """
    Efficient 2D data table with matrix operations and scientific computing support.

    Built on NumPy for high performance numerical operations.
    """

    def __init__(self, matrix_file: "str | None" = None, mtype: str = "float") -> None:
        """
        Initialize the array table.

        Parameters:
        - matrix_file (str): Path to the matrix data file. May be omitted
          (default None); the "Custom Data Processing" usage calls
          ArrayTable() with no arguments and fills the table in by hand.
        - mtype (str): Data type of the values ("float", "int", "str");
          defaults to "float"
        """

    def __len__(self) -> int:
        """Return the number of rows in the table."""

    def __str__(self) -> str:
        """Return the string representation of the table."""

Data Access and Retrieval

Methods for accessing rows, columns, and individual data elements.

def get_column_array(self, colname: str) -> "numpy.ndarray":
    """
    Return the data of a single column as a NumPy array.

    Parameters:
    - colname (str): Name of the column to fetch

    Returns:
    numpy.ndarray: Column data array

    NOTE(review): behavior for an unknown column name is not specified here.
    """

def get_row_array(self, rowname: str) -> "numpy.ndarray":
    """
    Return the data of a single row as a NumPy array.

    Parameters:
    - rowname (str): Name of the row to fetch

    Returns:
    numpy.ndarray: Row data array

    NOTE(review): behavior for an unknown row name is not specified here.
    """

def get_several_column_arrays(self, colnames: list) -> dict:
    """
    Return several columns at once, keyed by column name.

    Parameters:
    - colnames (list): List of column names to fetch

    Returns:
    dict: Mapping from each requested column name to its array
    """

def get_several_row_arrays(self, rownames: list) -> dict:
    """
    Return several rows at once, keyed by row name.

    Parameters:
    - rownames (list): List of row names to fetch

    Returns:
    dict: Mapping from each requested row name to its array
    """

# Properties for data access
# NOTE(review): the "Custom Data Processing" usage fills rowValues/colValues
# with name -> positional index. Confirm against the ete3 source that this is
# what the library itself stores in these dicts.
matrix: numpy.ndarray     # Underlying data matrix
colNames: list           # Column names
rowNames: list           # Row names
colValues: dict          # Column name to index mapping
rowValues: dict          # Row name to index mapping

Matrix Operations

Mathematical operations and transformations on the data matrix.

def transpose(self) -> "ArrayTable":
    """
    Transpose the matrix so that rows become columns and columns become rows.

    Returns:
    ArrayTable: New transposed table
    """

def remove_column(self, colname: str):
    """
    Remove a column from the table.

    Parameters:
    - colname (str): Name of the column to remove

    No return value is documented; the usage examples call this as a plain
    statement on the table.
    """

def remove_row(self, rowname: str):
    """
    Remove a row from the table.

    Parameters:
    - rowname (str): Name of the row to remove

    No return value is documented; the usage examples call this as a plain
    statement on the table.
    """

def add_column(self, colname: str, colvalues: "numpy.typing.ArrayLike"):
    """
    Add a new column to the table.

    Parameters:
    - colname (str): Name for the new column
    - colvalues (array-like): Column data values
      (presumably one value per existing row; confirm)
    """

def add_row(self, rowname: str, rowvalues: "numpy.typing.ArrayLike"):
    """
    Add a new row to the table.

    Parameters:
    - rowname (str): Name for the new row
    - rowvalues (array-like): Row data values
      (presumably one value per existing column; confirm)
    """

File I/O Operations

Read and write table data in various formats.

def write(self, fname: "str | None" = None, colnames: "list | None" = None):
    """
    Write the table to a file, or return it as a formatted string.

    Parameters:
    - fname (str): Output file path; if None, the formatted table is
      returned as a string instead of being written
    - colnames (list): Specific columns to write
      (presumably all columns when None; confirm)

    Returns:
    str: Formatted table string (only when fname is None)
    """

def read(self, matrix_file: str, mtype: str = "float", **kwargs):
    """
    Read table data from a file.

    Parameters:
    - matrix_file (str): Input file path
    - mtype (str): Data type used when parsing values; defaults to "float"
    - kwargs: Additional parsing parameters (the accepted keys are not
      listed here)
    """

Statistical Operations

Built-in statistical analysis and data summary methods.

def get_stats(self) -> dict:
    """
    Calculate basic statistics for all columns.

    Returns:
    dict: Statistics including mean, std, min, max for each column. The
    "Statistical Analysis" usage iterates it as column name -> dict and
    reads the 'mean' and 'std' keys.
    """

def get_column_stats(self, colname: str) -> dict:
    """
    Calculate statistics for one specific column.

    Parameters:
    - colname (str): Name of the column to summarize

    Returns:
    dict: Column statistics (mean, std, min, max, etc.)
    """

def normalize(self, method: str = "standard") -> "ArrayTable":
    """
    Normalize the data using the specified method.

    Parameters:
    - method (str): Normalization method ("standard", "minmax", "robust");
      defaults to "standard"

    Returns:
    ArrayTable: Normalized table
    """

Data Filtering and Selection

Filter and select subsets of data based on criteria.

def filter_columns(self, condition_func: "Callable[[numpy.ndarray], bool]") -> "ArrayTable":
    """
    Filter columns based on a condition function.

    Parameters:
    - condition_func (function): Function that takes a column array and
      returns bool. In the "Statistical Analysis" usage, columns for which
      it returns True are the ones kept.

    Returns:
    ArrayTable: Filtered table
    """

def filter_rows(self, condition_func: "Callable[[numpy.ndarray], bool]") -> "ArrayTable":
    """
    Filter rows based on a condition function.

    Parameters:
    - condition_func (function): Function that takes a row array and
      returns bool. In the "Advanced Data Analysis" usage, rows for which
      it returns True are the ones kept.

    Returns:
    ArrayTable: Filtered table
    """

def select_columns(self, colnames: list) -> "ArrayTable":
    """
    Select specific columns by name.

    Parameters:
    - colnames (list): Column names to select

    Returns:
    ArrayTable: Table with the selected columns
    """

def select_rows(self, rownames: list) -> "ArrayTable":
    """
    Select specific rows by name.

    Parameters:
    - rownames (list): Row names to select

    Returns:
    ArrayTable: Table with the selected rows
    """

Integration with Trees

Methods for associating tabular data with tree structures.

def link_to_tree(self, tree: "Tree", attr_name: str = "profile"):
    """
    Link table data to tree nodes.

    Parameters:
    - tree (Tree): Tree to link data to
    - attr_name (str): Attribute name under which the data is stored on the
      nodes; defaults to "profile"

    NOTE(review): how table rows are matched to nodes (presumably by node
    name) is not specified here; confirm. The "Integration with Trees" usage
    guards with hasattr(), so not every leaf is assumed to receive data.
    """

def get_tree_profile(self, tree: "Tree", attr_name: str = "profile") -> "ArrayTable":
    """
    Extract profile data from tree nodes.

    Parameters:
    - tree (Tree): Tree with profile data
    - attr_name (str): Name of the node attribute containing the data;
      defaults to "profile"

    Returns:
    ArrayTable: Table with tree profile data
    """

Clustering Integration

ClusterTree with ArrayTable

Enhanced clustering functionality when combined with data tables.

def get_distance_matrix(self) -> "numpy.ndarray":
    """
    Calculate the distance matrix between rows.

    Returns:
    numpy.ndarray: Symmetric distance matrix

    NOTE(review): the distance metric used is not specified here.
    """

def cluster_data(self, method: str = "ward", metric: str = "euclidean") -> "ClusterTree":
    """
    Perform hierarchical clustering on the data.

    Parameters:
    - method (str): Linkage method ("ward", "complete", "average", "single");
      defaults to "ward"
    - metric (str): Distance metric ("euclidean", "manhattan", "cosine");
      defaults to "euclidean"

    Returns:
    ClusterTree: Tree representing the clustering hierarchy
    """

Usage Examples

Basic Table Operations

from ete3 import ArrayTable
import numpy as np

# Build the table from a matrix file, reading the values as floats
table = ArrayTable("data_matrix.txt", mtype="float")

# Report the table's dimensions and its labels
row_labels = table.rowNames
col_labels = table.colNames
print(f"Table dimensions: {len(row_labels)} x {len(col_labels)}")
print(f"Column names: {col_labels}")
print(f"Row names: {row_labels}")

# Fetch one column and one row by name
col_data = table.get_column_array("column1")
row_data = table.get_row_array("row1")

# Summarize the fetched column
col_mean = np.mean(col_data)
col_std = np.std(col_data)
print(f"Column1 stats: mean={col_mean:.2f}, std={col_std:.2f}")

Data Manipulation

from ete3 import ArrayTable

# Read the expression matrix
table = ArrayTable("expression_data.txt")

# Drop the column and the row that are not wanted
table.remove_column("control_sample")
table.remove_row("uninformative_gene")

# Append a new condition as an extra column
new_column_data = [
    1.5,
    2.3,
    0.8,
    3.1,
    1.9,
]
table.add_column("new_condition", new_column_data)

# A transposed view for a different analysis perspective
transposed = table.transpose()

# Persist the modified table (the untransposed one)
table.write("modified_data.txt")

Statistical Analysis

from ete3 import ArrayTable
import numpy as np  # needed by high_variance_filter; was missing, causing NameError

table = ArrayTable("experimental_data.txt")

# Get overall statistics: one dict of summary values per column
stats = table.get_stats()
for col, col_stats in stats.items():
    print(f"{col}: mean={col_stats['mean']:.2f}, std={col_stats['std']:.2f}")

# Normalize data
normalized_table = table.normalize(method="standard")


# Filter based on criteria
def high_variance_filter(col_array):
    """Return True when the column's variance is greater than 1.0."""
    return np.var(col_array) > 1.0


# The filter is applied to the original table, not the normalized one
high_var_table = table.filter_columns(high_variance_filter)
print(f"Filtered to {len(high_var_table.colNames)} high-variance columns")

Integration with Trees

from ete3 import ArrayTable, Tree

# Read the expression table and the species tree
table = ArrayTable("gene_expression.txt")
tree = Tree("species_tree.nw")

# Attach the expression data to the tree under the "expression" attribute
table.link_to_tree(tree, attr_name="expression")

# Show a short preview for every leaf that carries linked data
for leaf in tree.get_leaves():
    if not hasattr(leaf, 'expression'):
        continue
    preview = leaf.expression[:5]  # First 5 values
    print(f"{leaf.name}: {preview}...")

# Read the profile data back out of the tree into a table
extracted_table = table.get_tree_profile(tree, attr_name="expression")

Clustering Analysis

from ete3 import ArrayTable

# Read the expression matrix
expression_table = ArrayTable("gene_expression_matrix.txt")

# Run hierarchical clustering with Ward linkage on Euclidean distances
cluster_tree = expression_table.cluster_data(method="ward", metric="euclidean")

# Print the resulting hierarchy as ASCII art
tree_ascii = cluster_tree.get_ascii()
print(f"Clustering tree: {tree_ascii}")

# Compute the row-to-row distance matrix for further analysis
dist_matrix = expression_table.get_distance_matrix()
matrix_shape = dist_matrix.shape
print(f"Distance matrix shape: {matrix_shape}")

Advanced Data Analysis

from ete3 import ArrayTable, ClusterTree
import numpy as np


def significant_variation(row_array):
    """Return True when the row's max-minus-min spread is greater than 2.0."""
    spread = np.max(row_array) - np.min(row_array)
    return spread > 2.0


# Read the full multi-condition matrix
table = ArrayTable("multi_condition_data.txt")

# Restrict the table to the conditions of interest
selected_conditions = ["treatment1", "treatment2", "control"]
filtered_table = table.select_columns(selected_conditions)

# Normalize, then keep only the genes (rows) with significant variation
normalized = filtered_table.normalize(method="standard")
variable_genes = normalized.filter_rows(significant_variation)

# Cluster the filtered, normalized data and display the result
cluster_result = variable_genes.cluster_data(method="complete")
cluster_result.show()

# Persist the processed table
variable_genes.write("filtered_normalized_data.txt")

Custom Data Processing

from ete3 import ArrayTable
import numpy as np


def _build_table(matrix, row_names, col_names):
    """
    Assemble an ArrayTable from an in-memory matrix and its labels.

    Every table gets its own label lists and its own name -> index dicts,
    so two tables built from the same labels never share mutable state.

    Parameters:
    - matrix (numpy.ndarray): Data matrix to store in the table
    - row_names (list): Row labels, one per matrix row
    - col_names (list): Column labels, one per matrix column

    Returns:
    ArrayTable: The populated table
    """
    built = ArrayTable()
    built.matrix = matrix
    # Copy the labels so later edits to one table cannot leak into another
    built.rowNames = list(row_names)
    built.colNames = list(col_names)
    built.rowValues = {name: i for i, name in enumerate(built.rowNames)}
    built.colValues = {name: i for i, name in enumerate(built.colNames)}
    return built


# Create table from Python data
data_matrix = np.random.rand(100, 20)  # 100 genes, 20 samples
row_names = [f"gene_{i}" for i in range(100)]
col_names = [f"sample_{i}" for i in range(20)]

# Initialize empty table and populate
table = _build_table(data_matrix, row_names, col_names)

# Apply custom transformations: log2(x+1).
# np.log2 returns a new array, so table.matrix is left untouched.
log_transformed = np.log2(table.matrix + 1)

# Create new table with transformed data and independent label containers
log_table = _build_table(log_transformed, table.rowNames, table.colNames)

# Save transformed data
log_table.write("log_transformed_data.txt")

Install with Tessl CLI