or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

corpus-management.md · data-downloading.md · index.md · mathematical-utilities.md · nlp-models.md · similarity-computations.md · text-preprocessing.md

docs/mathematical-utilities.md

0

# Mathematical Utilities

1

2

Linear algebra operations, vector manipulations, and distance metrics optimized for NLP tasks. Gensim's mathematical utilities provide efficient implementations of common operations needed for text processing and machine learning.

3

4

## Capabilities

5

6

### Vector Operations

7

8

Core vector operations for normalizing, measuring, and manipulating document vectors.

9

10

```python { .api }

11

def unitvec(vec, norm='l2', return_norm=False):

12

"""

13

Scale vector to unit length.

14

15

Parameters:

16

- vec: Input vector (scipy.sparse, numpy array, or BOW list of (id, value) tuples)

17

- norm: Normalization method ('l1', 'l2', or 'unique')

18

- return_norm: Whether to return the original norm

19

20

Returns:

21

Normalized vector, optionally with original norm

22

"""

23

24

def veclen(vec):

25

"""

26

Calculate length/magnitude of vector.

27

28

Parameters:

29

- vec: Input vector in BOW format (list of (id, value) tuples)

30

31

Returns:

32

Vector length as float

33

"""

34

35

def cossim(vec1, vec2):

36

"""

37

Calculate cosine similarity between two vectors.

38

39

Parameters:

40

- vec1: First vector

41

- vec2: Second vector

42

43

Returns:

44

Cosine similarity as float (-1 to 1)

45

"""

46

47

def ret_normalized_vec(vec, length):

48

"""

49

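Normalize a BOW vector by dividing each of its values by the given length.

50

51

Parameters:

52

- vec: Input vector in BOW format

53

- length: Current length (norm) of the vector, e.g. as returned by veclen(); used as the divisor

54

55

Returns:

56

Normalized vector in BOW format (unit length when `length` is the vector's own norm)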

57

"""

58

59

def ret_log_normalize_vec(vec, axis=1):

60

"""

61

Log-normalize vector values.

62

63

Parameters:

64

- vec: Input vector

65

- axis: Normalization axis

66

67

Returns:

68

Tuple of (log-normalized vector, log of the normalization constant)

69

"""

70

71

def isbow(vec):

72

"""

73

Check if vector is in bag-of-words format.

74

75

Parameters:

76

- vec: Input vector

77

78

Returns:

79

Boolean indicating if vector is BOW format

80

"""

81

```

82

83

### Matrix Operations

84

85

Efficient matrix operations for corpus processing and linear algebra.

86

87

```python { .api }

88

def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):

89

"""

90

Convert corpus to scipy.sparse.csc_matrix format.

91

92

Parameters:

93

- corpus: Input corpus in BOW format

94

- num_terms: Number of terms (optional)

95

- dtype: Data type for matrix values

96

- num_docs: Number of documents (optional)

97

- num_nnz: Number of non-zero elements (optional)

98

- printprogress: Progress reporting frequency

99

100

Returns:

101

CSC sparse matrix with documents as columns

102

"""

103

104

def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):

105

"""

106

Convert corpus to dense numpy matrix.

107

108

Parameters:

109

- corpus: Input corpus in BOW format

110

- num_terms: Number of terms

111

- num_docs: Number of documents (optional)

112

- dtype: Data type for matrix values

113

114

Returns:

115

Dense numpy matrix

116

"""

117

118

def pad(mat, padrow, padcol):

119

"""

120

Pad matrix with additional zero-filled rows and/or columns.

121

122

Parameters:

123

- mat: Input matrix

124

- padrow: Number of zero rows to append (int)

125

- padcol: Number of zero columns to append (int)

126

127

Returns:

128

Padded matrix

129

"""

130

131

def zeros_aligned(shape, dtype, order='C', align=128):

132

"""

133

Create aligned zero array for optimized operations.

134

135

Parameters:

136

- shape: Array shape

137

- dtype: Data type

138

- order: Memory layout ('C' or 'F')

139

- align: Memory alignment in bytes

140

141

Returns:

142

Aligned zero array

143

"""

144

145

def ismatrix(m):

146

"""

147

Check if object is a matrix.

148

149

Parameters:

150

- m: Object to check

151

152

Returns:

153

Boolean indicating if object is matrix-like

154

"""

155

```

156

157

### Sparse/Dense Conversions

158

159

Functions for converting between sparse and dense vector representations.

160

161

```python { .api }

162

def sparse2full(vec, length):

163

"""

164

Convert sparse vector to dense representation.

165

166

Parameters:

167

- vec: Sparse vector in BOW format

168

- length: Length of full vector

169

170

Returns:

171

Dense numpy array

172

"""

173

174

def full2sparse(vec, eps=1e-9):

175

"""

176

Convert dense vector to sparse BOW format.

177

178

Parameters:

179

- vec: Dense vector

180

- eps: Minimum value threshold

181

182

Returns:

183

Sparse vector in BOW format

184

"""

185

186

def full2sparse_clipped(vec, topn, eps=1e-9):

187

"""

188

Convert dense vector to sparse format, keeping only top-N values.

189

190

Parameters:

191

- vec: Dense vector

192

- topn: Number of top values to keep

193

- eps: Minimum value threshold

194

195

Returns:

196

Clipped sparse vector in BOW format

197

"""

198

199

def any2sparse(vec, eps=1e-9):

200

"""

201

Convert vector to sparse format regardless of input type.

202

203

Parameters:

204

- vec: Input vector (any format)

205

- eps: Minimum value threshold

206

207

Returns:

208

Sparse vector in BOW format

209

"""

210

211

def scipy2sparse(vec, eps=1e-9):

212

"""

213

Convert scipy sparse vector to gensim sparse format.

214

215

Parameters:

216

- vec: Scipy sparse vector (a single-row sparse matrix)

- eps: Minimum value threshold

217

218

Returns:

219

Gensim sparse vector (BOW format)

220

"""

221

222

def scipy2scipy_clipped(matrix, topn, eps=1e-9):

223

"""

224

Clip scipy sparse matrix to top-N values per row/column.

225

226

Parameters:

227

- matrix: Scipy sparse matrix

228

- topn: Number of top values to keep

229

- eps: Minimum value threshold

230

231

Returns:

232

Clipped scipy sparse matrix

233

"""

234

```

235

236

### Distance Metrics

237

238

Statistical distance measures for comparing probability distributions and vectors.

239

240

```python { .api }

241

def kullback_leibler(vec1, vec2, num_features=None):

242

"""

243

Calculate Kullback-Leibler divergence between two probability distributions.

244

245

Parameters:

246

- vec1: First probability distribution

247

- vec2: Second probability distribution

248

- num_features: Number of features (optional)

249

250

Returns:

251

KL divergence as float

252

"""

253

254

def jensen_shannon(vec1, vec2, num_features=None):

255

"""

256

Calculate Jensen-Shannon distance between two probability distributions.

257

258

Parameters:

259

- vec1: First probability distribution

260

- vec2: Second probability distribution

261

- num_features: Number of features (optional)

262

263

Returns:

264

JS distance as float (0 to 1)

265

"""

266

267

def hellinger(vec1, vec2):

268

"""

269

Calculate Hellinger distance between two probability distributions.

270

271

Parameters:

272

- vec1: First probability distribution

273

- vec2: Second probability distribution

274

275

Returns:

276

Hellinger distance as float (0 to 1)

277

"""

278

279

def jaccard(vec1, vec2):

280

"""

281

Calculate Jaccard distance between two vectors (1 - Jaccard similarity).

282

283

Parameters:

284

- vec1: First vector

285

- vec2: Second vector

286

287

Returns:

288

Jaccard distance as float (0 to 1; 0 means identical, 1 means no overlap)

289

"""

290

291

def jaccard_distance(set1, set2):

292

"""

293

Calculate Jaccard distance between two sets.

294

295

Parameters:

296

- set1: First set

297

- set2: Second set

298

299

Returns:

300

Jaccard distance as float (0 to 1)

301

"""

302

```

303

304

### Linear Algebra

305

306

Advanced linear algebra operations with BLAS integration.

307

308

```python { .api }

309

def blas(name, ndarray):

310

"""

311

Get appropriate BLAS function for array operations.

312

313

Parameters:

314

- name: BLAS function name

315

- ndarray: Input array to determine data type

316

317

Returns:

318

BLAS function object

319

"""

320

321

def argsort(x, topn=None, reverse=False):

322

"""

323

Efficiently find indices of smallest/largest elements.

324

325

Parameters:

326

- x: Input array

327

- topn: Number of top elements to return

328

- reverse: Whether to return largest elements

329

330

Returns:

331

Array of indices

332

"""

333

334

def qr_destroy(la):

335

"""

336

QR decomposition that destroys input matrix for memory efficiency.

337

338

Parameters:

339

- la: List containing the input matrix (the matrix is destroyed in the process)

340

341

Returns:

342

Q and R matrices from QR decomposition

343

"""

344

```

345

346

## Usage Examples

347

348

### Basic Vector Operations

349

350

```python

351

import numpy as np

352

from gensim import matutils

353

354

# Create sample vectors

355

vec1 = [(0, 1.0), (1, 2.0), (2, 3.0)] # BOW format

356

vec2 = [(0, 2.0), (1, 1.0), (3, 1.0)] # BOW format

357

358

# Calculate vector length

359

length1 = matutils.veclen(vec1)

360

print(f"Vector 1 length: {length1}")

361

362

# Normalize vector to unit length

363

unit_vec1 = matutils.unitvec(vec1)

364

print(f"Unit vector 1: {unit_vec1}")

365

366

# Calculate cosine similarity

367

similarity = matutils.cossim(vec1, vec2)

368

print(f"Cosine similarity: {similarity}")

369

370

# Check if vector is BOW format

371

is_bow = matutils.isbow(vec1)

372

print(f"Is BOW format: {is_bow}")

373

```

374

375

### Sparse/Dense Conversions

376

377

```python

378

# Convert sparse to dense

379

dense_vec1 = matutils.sparse2full(vec1, length=5)

380

print(f"Dense vector: {dense_vec1}")

381

382

# Convert dense to sparse

383

dense_array = np.array([1.0, 2.0, 0.0, 3.0, 0.0])

384

sparse_vec = matutils.full2sparse(dense_array)

385

print(f"Sparse vector: {sparse_vec}")

386

387

# Keep only top-N values

388

top2_sparse = matutils.full2sparse_clipped(dense_array, topn=2)

389

print(f"Top-2 sparse: {top2_sparse}")

390

```

391

392

### Matrix Operations with Corpus

393

394

```python

395

from gensim import corpora

396

from gensim.test.utils import common_texts

397

398

# Create sample corpus

399

dictionary = corpora.Dictionary(common_texts)

400

corpus = [dictionary.doc2bow(text) for text in common_texts]

401

402

# Convert corpus to CSC matrix

403

csc_matrix = matutils.corpus2csc(corpus, num_terms=len(dictionary))

404

print(f"CSC matrix shape: {csc_matrix.shape}")

405

print(f"CSC matrix type: {type(csc_matrix)}")

406

407

# Convert corpus to dense matrix

408

dense_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))

409

print(f"Dense matrix shape: {dense_matrix.shape}")

410

print(f"Dense matrix type: {type(dense_matrix)}")

411

```

412

413

### Distance Metrics

414

415

```python

416

# Create probability distributions

417

prob1 = [(0, 0.3), (1, 0.4), (2, 0.3)]

418

prob2 = [(0, 0.2), (1, 0.5), (2, 0.3)]

419

420

# Calculate various distance metrics

421

kl_div = matutils.kullback_leibler(prob1, prob2)

422

print(f"KL divergence: {kl_div}")

423

424

js_dist = matutils.jensen_shannon(prob1, prob2)

425

print(f"Jensen-Shannon distance: {js_dist}")

426

427

hellinger_dist = matutils.hellinger(prob1, prob2)

428

print(f"Hellinger distance: {hellinger_dist}")

429

430

# Jaccard similarity for binary vectors

431

binary1 = [(0, 1), (1, 1), (3, 1)]

432

binary2 = [(0, 1), (2, 1), (3, 1)]

433

434

jaccard_sim = 1 - matutils.jaccard(binary1, binary2)  # jaccard() returns a distance

435

jaccard_dist = matutils.jaccard_distance(set(binary1), set(binary2))  # operates on sets

436

print(f"Jaccard similarity: {jaccard_sim}")

437

print(f"Jaccard distance: {jaccard_dist}")

438

```

439

440

### Efficient Sorting Operations

441

442

```python

443

# Create large array for demonstration

444

large_array = np.random.rand(10000)

445

446

# Find indices of top 10 largest values efficiently

447

top10_indices = matutils.argsort(large_array, topn=10, reverse=True)

448

print(f"Top 10 indices: {top10_indices}")

449

print(f"Top 10 values: {large_array[top10_indices]}")

450

451

# Find indices of the 5 smallest values

452

bottom5_indices = matutils.argsort(large_array, topn=5, reverse=False)

453

print(f"Bottom 5 indices: {bottom5_indices}")

454

print(f"Bottom 5 values: {large_array[bottom5_indices]}")

455

```

456

457

### BLAS Integration

458

459

```python

460

# Get BLAS function for dot product

461

test_array = np.array([1.0, 2.0, 3.0], dtype=np.float64)

462

dot_func = matutils.blas('dot', test_array)

463

print(f"BLAS dot function: {dot_func}")

464

465

# Use BLAS function for efficient computation

466

result = dot_func(test_array, test_array)

467

print(f"Dot product result: {result}")

468

```

469

470

### Memory-Efficient Operations

471

472

```python

473

# Create aligned zero array for optimized operations

474

aligned_zeros = matutils.zeros_aligned((1000, 100), dtype=np.float32)

475

print(f"Aligned array shape: {aligned_zeros.shape}")

476

print(f"Aligned array dtype: {aligned_zeros.dtype}")

477

478

# Check if object is matrix-like

479

is_matrix = matutils.ismatrix(aligned_zeros)

480

print(f"Is matrix: {is_matrix}")

481

482

# Pad matrix with zeros

483

small_matrix = np.array([[1, 2], [3, 4]])

484

padded_matrix = matutils.pad(small_matrix, padrow=1, padcol=1)

485

print(f"Original matrix:\n{small_matrix}")

486

print(f"Padded matrix:\n{padded_matrix}")

487

```

488

489

### Working with Scipy Sparse Matrices

490

491

```python

492

from scipy import sparse

493

494

# Create scipy sparse matrix

495

scipy_matrix = sparse.csr_matrix([[1, 0, 2], [0, 3, 0], [4, 0, 5]])

496

497

# Convert scipy sparse to gensim format (for first row)

498

gensim_sparse = matutils.scipy2sparse(scipy_matrix.getrow(0))

499

print(f"Scipy to gensim: {gensim_sparse}")

500

501

# Clip scipy matrix to top values

502

clipped_matrix = matutils.scipy2scipy_clipped(scipy_matrix, topn=2)

503

print(f"Original matrix:\n{scipy_matrix.toarray()}")

504

print(f"Clipped matrix:\n{clipped_matrix.toarray()}")

505

```

506

507

### Vector Normalization Variations

508

509

```python

510

# L2 normalization (default)

511

l2_normalized = matutils.unitvec(vec1, norm='l2')

512

print(f"L2 normalized: {l2_normalized}")

513

514

# L1 normalization

515

l1_normalized = matutils.unitvec(vec1, norm='l1')

516

print(f"L1 normalized: {l1_normalized}")

517

518

# Get normalized vector with original norm

519

normalized_with_norm = matutils.unitvec(vec1, return_norm=True)

520

print(f"Normalized vector: {normalized_with_norm[0]}")

521

print(f"Original norm: {normalized_with_norm[1]}")

522

523

# Log normalization

524

dense_vec = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])

525

log_normalized, log_norm = matutils.ret_log_normalize_vec(dense_vec)

526

print(f"Log normalized:\n{log_normalized}")

527

```