CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-orange3

Orange, a component-based data mining framework.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

distance.mddocs/

Distance Metrics

Orange3 provides a comprehensive collection of distance and similarity measures for various data types and analysis tasks.

Capabilities

Base Distance Classes

Foundation classes for distance computation.

class Distance:
    """Base class for all distance measures."""
    def __call__(self, data):
        """
        Compute distance matrix for data.
        
        Args:
            data: Orange Table
            
        Returns:
            Distance matrix
        """

class DistanceModel:
    """Fitted distance computation model."""
    def __call__(self, data1, data2=None):
        """Compute distances between data points."""

Euclidean Distance

Standard geometric distance in multidimensional space.

class Euclidean(Distance):
    """
    Euclidean distance metric.
    
    The straight-line distance between two points in Euclidean space.
    """
    def __call__(self, data):
        """Compute Euclidean distance matrix."""

Manhattan Distance

City-block or L1 distance.

class Manhattan(Distance):
    """
    Manhattan (city-block) distance metric.
    
    Sum of absolute differences between coordinates.
    """
    def __call__(self, data):
        """Compute Manhattan distance matrix."""

Cosine Distance

Angular distance based on cosine similarity.

class Cosine(Distance):
    """
    Cosine distance metric.
    
    Based on cosine similarity between vectors, measures angle rather than magnitude.
    """
    def __call__(self, data):
        """Compute cosine distance matrix."""

Correlation-Based Distances

Distances based on statistical correlation.

class PearsonR(Distance):
    """
    Pearson correlation distance.
    
    Distance based on Pearson correlation coefficient (1 - correlation).
    """
    def __call__(self, data):
        """Compute Pearson correlation distance matrix."""

class PearsonRAbsolute(Distance):
    """
    Absolute Pearson correlation distance.
    
    Distance based on absolute Pearson correlation (1 - |correlation|).
    """
    def __call__(self, data):
        """Compute absolute Pearson correlation distance matrix."""

class SpearmanR(Distance):
    """
    Spearman rank correlation distance.
    
    Distance based on Spearman rank correlation coefficient.
    """
    def __call__(self, data):
        """Compute Spearman correlation distance matrix."""

class SpearmanRAbsolute(Distance):
    """
    Absolute Spearman rank correlation distance.
    
    Distance based on absolute Spearman rank correlation.
    """
    def __call__(self, data):
        """Compute absolute Spearman correlation distance matrix."""

Jaccard Distance

Distance for binary and categorical data.

class Jaccard(Distance):
    """
    Jaccard distance metric.
    
    For binary data: 1 - (intersection / union)
    Measures dissimilarity between sets.
    """
    def __call__(self, data):
        """Compute Jaccard distance matrix."""

Hamming Distance

Distance for categorical data.

class Hamming(Distance):
    """
    Hamming distance metric.
    
    Proportion of differing categorical attributes.
    """
    def __call__(self, data):
        """Compute Hamming distance matrix."""

Mahalanobis Distance

Distance accounting for data covariance.

class Mahalanobis(Distance):
    """
    Mahalanobis distance metric.
    
    Distance that accounts for covariance structure of the data.
    """
    def __call__(self, data):
        """Compute Mahalanobis distance matrix."""

class MahalanobisDistance:
    """Mahalanobis distance computation utilities."""
    def __init__(self, data): ...
    
    def __call__(self, data1, data2=None):
        """Compute Mahalanobis distances."""

Bhattacharyya Distance

Distance for probability distributions.

class Bhattacharyya(Distance):
    """
    Bhattacharyya distance metric.
    
    Measures similarity between probability distributions.
    """
    def __call__(self, data):
        """Compute Bhattacharyya distance matrix."""

Distance Preprocessing Utilities

Helper functions for distance computation.

def _preprocess(data, remove_discrete=False, remove_nonbinary=False, 
                impute=True, normalize=False):
    """
    Preprocess data for distance computation.
    
    Args:
        data: Orange Table
        remove_discrete: Remove discrete attributes
        remove_nonbinary: Remove non-binary discrete attributes
        impute: Impute missing values
        normalize: Normalize features
        
    Returns:
        Preprocessed data
    """

def remove_discrete_features(data):
    """Remove discrete attributes from data."""

def remove_nonbinary_features(data):
    """Remove non-binary discrete attributes from data."""

def impute(data, method='average'):
    """
    Impute missing values for distance computation.
    
    Args:
        data: Orange Table
        method: Imputation method
        
    Returns:
        Data with imputed values
    """

Usage Examples

# Basic distance computation
from Orange.data import Table
from Orange.distance import Euclidean, Manhattan, Cosine

# Load data
data = Table("iris")

# Compute different distance matrices
euclidean = Euclidean()
manhattan = Manhattan()
cosine = Cosine()

euclidean_dist = euclidean(data)
manhattan_dist = manhattan(data)
cosine_dist = cosine(data)

print(f"Euclidean distance matrix shape: {euclidean_dist.shape}")
print(f"Manhattan distance matrix shape: {manhattan_dist.shape}")
print(f"Cosine distance matrix shape: {cosine_dist.shape}")

# Correlation-based distances
from Orange.distance import PearsonR, SpearmanR

pearson_dist = PearsonR()(data)
spearman_dist = SpearmanR()(data)

print(f"Pearson correlation distance range: {pearson_dist.min():.3f} - {pearson_dist.max():.3f}")

# Distances for categorical data
from Orange.distance import Jaccard, Hamming

# Create categorical data example
categorical_data = Table("zoo")  # Assuming zoo dataset has categorical features

jaccard_dist = Jaccard()(categorical_data)
hamming_dist = Hamming()(categorical_data)

print(f"Jaccard distance matrix shape: {jaccard_dist.shape}")
print(f"Hamming distance matrix shape: {hamming_dist.shape}")

# Mahalanobis distance
from Orange.distance import Mahalanobis

mahalanobis_dist = Mahalanobis()(data)
print(f"Mahalanobis distance matrix shape: {mahalanobis_dist.shape}")

# Distance preprocessing
from Orange.distance import _preprocess, remove_discrete_features

# Remove discrete features before computing distance
continuous_data = remove_discrete_features(data)
print(f"Original features: {len(data.domain.attributes)}")
print(f"Continuous features: {len(continuous_data.domain.attributes)}")

# Preprocess data for distance computation
preprocessed_data = _preprocess(data, remove_discrete=True, 
                               impute=True, normalize=True)

# Compute distance on preprocessed data
preprocessed_dist = euclidean(preprocessed_data)

# Compare distances between first few samples
print("Distance comparison (first 3x3 submatrix):")
print("Original Euclidean:")
print(euclidean_dist[:3, :3])
print("Preprocessed Euclidean:")
print(preprocessed_dist[:3, :3])

# Use distances with clustering
from Orange.clustering import HierarchicalClustering
import numpy as np

# Convert distance matrix to format suitable for clustering
dist_array = np.array(euclidean_dist)

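# Note: Orange's HierarchicalClustering is fitted directly on a precomputed
# distance matrix (it has no metric='precomputed' argument; verify the
# constructor arguments against your installed Orange version)
# hierarchical = HierarchicalClustering(n_clusters=3, linkage='average')
# labels = hierarchical.fit_predict(dist_array)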

# Find nearest neighbors using distance matrix
def find_k_nearest(distance_matrix, point_idx, k=5):
    """Find the k nearest neighbors of a given point.

    Args:
        distance_matrix: Pairwise distance matrix indexable by row; row i
            holds the distances from point i to every point. Assumed to be
            square, so that column i of row i is the point itself.
        point_idx: Row index of the query point (negative indices count
            from the end).
        k: Maximum number of neighbors to return.

    Returns:
        A tuple ``(indices, distances)``: the neighbor indices ordered by
        increasing distance and the matching distances. The query point
        itself is never part of the result.
    """
    distances = distance_matrix[point_idx]
    # Normalize a negative index so it can be compared with argsort output.
    self_idx = point_idx + len(distances) if point_idx < 0 else point_idx
    # A stable sort keeps ties in index order, making the result deterministic.
    order = np.argsort(distances, kind="stable")
    # Drop the query point by index rather than by sorted position: when
    # another point lies at distance 0 (a duplicate row), the query point is
    # not guaranteed to be first in the sorted order.
    nearest_indices = order[order != self_idx][:max(k, 0)]
    return nearest_indices, distances[nearest_indices]

# Example: Find 5 nearest neighbors for first data point
nearest_idx, nearest_dist = find_k_nearest(euclidean_dist, 0, k=5)
print(f"5 nearest neighbors of point 0: {nearest_idx}")
print(f"Their distances: {nearest_dist}")

Install with Tessl CLI

npx tessl i tessl/pypi-orange3

docs

classification.md

clustering.md

data-handling.md

distance.md

evaluation.md

index.md

preprocessing.md

projection.md

regression.md

widgets.md

tile.json