Orange, a component-based data mining framework.
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Orange3 provides a comprehensive collection of distance and similarity measures for various data types and analysis tasks.
Foundation classes for distance computation.
class Distance:
"""Base class for all distance measures."""
def __call__(self, data):
"""
Compute distance matrix for data.
Args:
data: Orange Table
Returns:
Distance matrix
"""
class DistanceModel:
"""Fitted distance computation model."""
def __call__(self, data1, data2=None):
"""Compute distances between data points."""
Standard geometric distance in multidimensional space.
class Euclidean(Distance):
"""
Euclidean distance metric.
The straight-line distance between two points in Euclidean space.
"""
def __call__(self, data):
"""Compute Euclidean distance matrix."""
City-block or L1 distance.
class Manhattan(Distance):
"""
Manhattan (city-block) distance metric.
Sum of absolute differences between coordinates.
"""
def __call__(self, data):
"""Compute Manhattan distance matrix."""
Angular distance based on cosine similarity.
class Cosine(Distance):
"""
Cosine distance metric.
Based on cosine similarity between vectors, measures angle rather than magnitude.
"""
def __call__(self, data):
"""Compute cosine distance matrix."""
Distances based on statistical correlation.
class PearsonR(Distance):
"""
Pearson correlation distance.
Distance based on Pearson correlation coefficient (1 - correlation).
"""
def __call__(self, data):
"""Compute Pearson correlation distance matrix."""
class PearsonRAbsolute(Distance):
"""
Absolute Pearson correlation distance.
Distance based on absolute Pearson correlation (1 - |correlation|).
"""
def __call__(self, data):
"""Compute absolute Pearson correlation distance matrix."""
class SpearmanR(Distance):
"""
Spearman rank correlation distance.
Distance based on Spearman rank correlation coefficient.
"""
def __call__(self, data):
"""Compute Spearman correlation distance matrix."""
class SpearmanRAbsolute(Distance):
"""
Absolute Spearman rank correlation distance.
Distance based on absolute Spearman rank correlation.
"""
def __call__(self, data):
"""Compute absolute Spearman correlation distance matrix."""
Distance for binary and categorical data.
class Jaccard(Distance):
"""
Jaccard distance metric.
For binary data: 1 - (intersection / union)
Measures dissimilarity between sets.
"""
def __call__(self, data):
"""Compute Jaccard distance matrix."""
Distance for categorical data.
class Hamming(Distance):
"""
Hamming distance metric.
Proportion of differing categorical attributes.
"""
def __call__(self, data):
"""Compute Hamming distance matrix."""
Distance accounting for data covariance.
class Mahalanobis(Distance):
"""
Mahalanobis distance metric.
Distance that accounts for covariance structure of the data.
"""
def __call__(self, data):
"""Compute Mahalanobis distance matrix."""
class MahalanobisDistance:
"""Mahalanobis distance computation utilities."""
def __init__(self, data): ...
def __call__(self, data1, data2=None):
"""Compute Mahalanobis distances."""
Distance for probability distributions.
class Bhattacharyya(Distance):
"""
Bhattacharyya distance metric.
Measures similarity between probability distributions.
"""
def __call__(self, data):
"""Compute Bhattacharyya distance matrix."""
Helper functions for distance computation.
def _preprocess(data, remove_discrete=False, remove_nonbinary=False,
impute=True, normalize=False):
"""
Preprocess data for distance computation.
Args:
data: Orange Table
remove_discrete: Remove discrete attributes
remove_nonbinary: Remove non-binary discrete attributes
impute: Impute missing values
normalize: Normalize features
Returns:
Preprocessed data
"""
def remove_discrete_features(data):
"""Remove discrete attributes from data."""
def remove_nonbinary_features(data):
"""Remove non-binary discrete attributes from data."""
def impute(data, method='average'):
"""
Impute missing values for distance computation.
Args:
data: Orange Table
method: Imputation method
Returns:
Data with imputed values
"""
# Basic distance computation
from Orange.data import Table
from Orange.distance import Euclidean, Manhattan, Cosine
# Load data
data = Table("iris")
# Compute different distance matrices
euclidean = Euclidean()
manhattan = Manhattan()
cosine = Cosine()
euclidean_dist = euclidean(data)
manhattan_dist = manhattan(data)
cosine_dist = cosine(data)
print(f"Euclidean distance matrix shape: {euclidean_dist.shape}")
print(f"Manhattan distance matrix shape: {manhattan_dist.shape}")
print(f"Cosine distance matrix shape: {cosine_dist.shape}")
# Correlation-based distances
from Orange.distance import PearsonR, SpearmanR
pearson_dist = PearsonR()(data)
spearman_dist = SpearmanR()(data)
print(f"Pearson correlation distance range: {pearson_dist.min():.3f} - {pearson_dist.max():.3f}")
# Distances for categorical data
from Orange.distance import Jaccard, Hamming
# Create categorical data example
categorical_data = Table("zoo") # Assuming zoo dataset has categorical features
jaccard_dist = Jaccard()(categorical_data)
hamming_dist = Hamming()(categorical_data)
print(f"Jaccard distance matrix shape: {jaccard_dist.shape}")
print(f"Hamming distance matrix shape: {hamming_dist.shape}")
# Mahalanobis distance
from Orange.distance import Mahalanobis
mahalanobis_dist = Mahalanobis()(data)
print(f"Mahalanobis distance matrix shape: {mahalanobis_dist.shape}")
# Distance preprocessing
from Orange.distance import _preprocess, remove_discrete_features
# Remove discrete features before computing distance
continuous_data = remove_discrete_features(data)
print(f"Original features: {len(data.domain.attributes)}")
print(f"Continuous features: {len(continuous_data.domain.attributes)}")
# Preprocess data for distance computation
preprocessed_data = _preprocess(data, remove_discrete=True,
impute=True, normalize=True)
# Compute distance on preprocessed data
preprocessed_dist = euclidean(preprocessed_data)
# Compare distances between first few samples
print("Distance comparison (first 3x3 submatrix):")
print("Original Euclidean:")
print(euclidean_dist[:3, :3])
print("Preprocessed Euclidean:")
print(preprocessed_dist[:3, :3])
# Use distances with clustering
from Orange.clustering import HierarchicalClustering
import numpy as np
# Convert distance matrix to format suitable for clustering
dist_array = np.array(euclidean_dist)
# Note: Hierarchical clustering can use precomputed distances
# hierarchical = HierarchicalClustering(linkage='average', metric='precomputed')
# clusters = hierarchical.fit(dist_array)
# Find nearest neighbors using distance matrix
def find_k_nearest(distance_matrix, point_idx, k=5):
    """Find the k nearest neighbors of one point in a distance matrix.

    Args:
        distance_matrix: Pairwise distance matrix indexable by row
            (assumed square, so row ``i`` and column ``i`` are the same point).
        point_idx: Row index of the query point; negative indices are allowed.
        k: Maximum number of neighbors to return.

    Returns:
        Tuple ``(indices, distances)`` with the up-to-``k`` closest *other*
        points, ordered from nearest to farthest.
    """
    distances = distance_matrix[point_idx]
    # Normalize a negative index so the self-exclusion comparison below works.
    self_idx = point_idx % len(distances)
    # A stable sort keeps tied distances in index order (deterministic output).
    order = np.argsort(distances, kind="stable")
    # Exclude the query point by index, not by sorted position: when another
    # point is at distance 0 (duplicate rows), the query point is not
    # guaranteed to sort first, so slicing off position 0 could drop a real
    # neighbor and return the query point itself.
    nearest_indices = order[order != self_idx][:k]
    return nearest_indices, distances[nearest_indices]
# Example: Find 5 nearest neighbors for first data point
nearest_idx, nearest_dist = find_k_nearest(euclidean_dist, 0, k=5)
print(f"5 nearest neighbors of point 0: {nearest_idx}")
print(f"Their distances: {nearest_dist}")
Install with Tessl CLI
npx tessl i tessl/pypi-orange3