or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md clustering.md data-handling.md distance.md evaluation.md index.md preprocessing.md projection.md regression.md widgets.md

docs/distance.md

0

# Distance Metrics

1

2

Orange3 provides a comprehensive collection of distance and similarity measures for various data types and analysis tasks.

3

4

## Capabilities

5

6

### Base Distance Classes

7

8

Foundation classes for distance computation.

9

10

```python { .api }

11

class Distance:
    """Base class for all distance measures."""

    def __call__(self, data):
        """
        Compute distance matrix for data.

        Args:
            data: Orange Table

        Returns:
            Distance matrix
        """

23

24

class DistanceModel:
    """Fitted distance computation model."""

    def __call__(self, data1, data2=None):
        """Compute distances between data points."""

28

```

29

30

### Euclidean Distance

31

32

Standard geometric distance in multidimensional space.

33

34

```python { .api }

35

class Euclidean(Distance):
    """
    Euclidean distance metric.

    The straight-line distance between two points in Euclidean space.
    """

    def __call__(self, data):
        """Compute Euclidean distance matrix."""

43

```

44

45

### Manhattan Distance

46

47

City-block or L1 distance.

48

49

```python { .api }

50

class Manhattan(Distance):
    """
    Manhattan (city-block) distance metric.

    Sum of absolute differences between coordinates.
    """

    def __call__(self, data):
        """Compute Manhattan distance matrix."""

58

```

59

60

### Cosine Distance

61

62

Angular distance based on cosine similarity.

63

64

```python { .api }

65

class Cosine(Distance):
    """
    Cosine distance metric.

    Based on cosine similarity between vectors, measures angle rather than magnitude.
    """

    def __call__(self, data):
        """Compute cosine distance matrix."""

73

```

74

75

### Correlation-Based Distances

76

77

Distances based on statistical correlation.

78

79

```python { .api }

80

class PearsonR(Distance):
    """
    Pearson correlation distance.

    Distance based on Pearson correlation coefficient (1 - correlation).
    """

    def __call__(self, data):
        """Compute Pearson correlation distance matrix."""

88

89

class PearsonRAbsolute(Distance):
    """
    Absolute Pearson correlation distance.

    Distance based on absolute Pearson correlation (1 - |correlation|).
    """

    def __call__(self, data):
        """Compute absolute Pearson correlation distance matrix."""

97

98

class SpearmanR(Distance):
    """
    Spearman rank correlation distance.

    Distance based on Spearman rank correlation coefficient.
    """

    def __call__(self, data):
        """Compute Spearman correlation distance matrix."""

106

107

class SpearmanRAbsolute(Distance):
    """
    Absolute Spearman rank correlation distance.

    Distance based on absolute Spearman rank correlation.
    """

    def __call__(self, data):
        """Compute absolute Spearman correlation distance matrix."""

115

```

116

117

### Jaccard Distance

118

119

Distance for binary and categorical data.

120

121

```python { .api }

122

class Jaccard(Distance):
    """
    Jaccard distance metric.

    For binary data: 1 - (intersection / union)
    Measures dissimilarity between sets.
    """

    def __call__(self, data):
        """Compute Jaccard distance matrix."""

131

```

132

133

### Hamming Distance

134

135

Distance for categorical data.

136

137

```python { .api }

138

class Hamming(Distance):
    """
    Hamming distance metric.

    Proportion of differing categorical attributes.
    """

    def __call__(self, data):
        """Compute Hamming distance matrix."""

146

```

147

148

### Mahalanobis Distance

149

150

Distance accounting for data covariance.

151

152

```python { .api }

153

class Mahalanobis(Distance):
    """
    Mahalanobis distance metric.

    Distance that accounts for covariance structure of the data.
    """

    def __call__(self, data):
        """Compute Mahalanobis distance matrix."""

161

162

class MahalanobisDistance:
    """Mahalanobis distance computation utilities."""

    def __init__(self, data): ...

    def __call__(self, data1, data2=None):
        """Compute Mahalanobis distances."""

168

```

169

170

### Bhattacharyya Distance

171

172

Distance for probability distributions.

173

174

```python { .api }

175

class Bhattacharyya(Distance):
    """
    Bhattacharyya distance metric.

    Measures similarity between probability distributions.
    """

    def __call__(self, data):
        """Compute Bhattacharyya distance matrix."""

183

```

184

185

### Distance Preprocessing Utilities

186

187

Helper functions for distance computation.

188

189

```python { .api }

190

def _preprocess(data, remove_discrete=False, remove_nonbinary=False,
                impute=True, normalize=False):
    """
    Preprocess data for distance computation.

    Args:
        data: Orange Table
        remove_discrete: Remove discrete attributes
        remove_nonbinary: Remove non-binary discrete attributes
        impute: Impute missing values
        normalize: Normalize features

    Returns:
        Preprocessed data
    """

205

206

def remove_discrete_features(data):
    """Remove discrete attributes from data."""

208

209

def remove_nonbinary_features(data):
    """Remove non-binary discrete attributes from data."""

211

212

def impute(data, method='average'):
    """
    Impute missing values for distance computation.

    Args:
        data: Orange Table
        method: Imputation method

    Returns:
        Data with imputed values
    """

223

```

224

225

### Usage Examples

226

227

```python

228

# Basic distance computation

229

from Orange.data import Table

230

from Orange.distance import Euclidean, Manhattan, Cosine

231

232

# Load data

233

data = Table("iris")

234

235

# Compute different distance matrices

236

euclidean = Euclidean()

237

manhattan = Manhattan()

238

cosine = Cosine()

239

240

euclidean_dist = euclidean(data)

241

manhattan_dist = manhattan(data)

242

cosine_dist = cosine(data)

243

244

print(f"Euclidean distance matrix shape: {euclidean_dist.shape}")

245

print(f"Manhattan distance matrix shape: {manhattan_dist.shape}")

246

print(f"Cosine distance matrix shape: {cosine_dist.shape}")

247

248

# Correlation-based distances

249

from Orange.distance import PearsonR, SpearmanR

250

251

pearson_dist = PearsonR()(data)

252

spearman_dist = SpearmanR()(data)

253

254

print(f"Pearson correlation distance range: {pearson_dist.min():.3f} - {pearson_dist.max():.3f}")

255

256

# Distances for categorical data

257

from Orange.distance import Jaccard, Hamming

258

259

# Create categorical data example

260

categorical_data = Table("zoo") # Assuming zoo dataset has categorical features

261

262

jaccard_dist = Jaccard()(categorical_data)

263

hamming_dist = Hamming()(categorical_data)

264

265

print(f"Jaccard distance matrix shape: {jaccard_dist.shape}")

266

print(f"Hamming distance matrix shape: {hamming_dist.shape}")

267

268

# Mahalanobis distance

269

from Orange.distance import Mahalanobis

270

271

mahalanobis_dist = Mahalanobis()(data)

272

print(f"Mahalanobis distance matrix shape: {mahalanobis_dist.shape}")

273

274

# Distance preprocessing

275

from Orange.distance import _preprocess, remove_discrete_features

276

277

# Remove discrete features before computing distance

278

continuous_data = remove_discrete_features(data)

279

print(f"Original features: {len(data.domain.attributes)}")

280

print(f"Continuous features: {len(continuous_data.domain.attributes)}")

281

282

# Preprocess data for distance computation

283

preprocessed_data = _preprocess(data, remove_discrete=True,

284

impute=True, normalize=True)

285

286

# Compute distance on preprocessed data

287

preprocessed_dist = euclidean(preprocessed_data)

288

289

# Compare distances between first few samples

290

print("Distance comparison (first 3x3 submatrix):")

291

print("Original Euclidean:")

292

print(euclidean_dist[:3, :3])

293

print("Preprocessed Euclidean:")

294

print(preprocessed_dist[:3, :3])

295

296

# Use distances with clustering

297

from Orange.clustering import HierarchicalClustering

298

import numpy as np

299

300

# Convert distance matrix to format suitable for clustering

301

dist_array = np.array(euclidean_dist)

302

303

# Note: Hierarchical clustering can use precomputed distances

304

# hierarchical = HierarchicalClustering(linkage='average', metric='precomputed')

305

# clusters = hierarchical.fit(dist_array)

306

307

# Find nearest neighbors using distance matrix

308

def find_k_nearest(distance_matrix, point_idx, k=5):
    """Find the k nearest neighbors of a given point.

    Args:
        distance_matrix: Matrix of pairwise distances supporting NumPy-style
            indexing; row ``point_idx`` holds the distances from the query
            point to every point.
        point_idx: Row index of the query point.
        k: Maximum number of neighbors to return.

    Returns:
        A ``(indices, distances)`` pair for the nearest neighbors, ordered
        by increasing distance. The query point itself is never included.
    """
    distances = distance_matrix[point_idx]
    # Normalize a negative index so it can be compared with argsort output.
    self_idx = point_idx % len(distances)
    # A stable sort keeps ties in index order, so results are deterministic.
    order = np.argsort(distances, kind="stable")
    # Drop the query point by index instead of assuming it sorts first:
    # a duplicate row is also at distance 0 and may precede it.
    neighbors = order[order != self_idx]
    nearest_indices = neighbors[:k]
    return nearest_indices, distances[nearest_indices]

313

314

# Example: Find 5 nearest neighbors for first data point

315

nearest_idx, nearest_dist = find_k_nearest(euclidean_dist, 0, k=5)

316

print(f"5 nearest neighbors of point 0: {nearest_idx}")

317

print(f"Their distances: {nearest_dist}")

318

```