0
# Mathematical Utilities
1
2
The `gensim.matutils` module provides linear algebra operations, vector manipulations, and distance metrics optimized for NLP tasks. These utilities offer efficient implementations of common operations needed for text processing and machine learning.
3
4
## Capabilities
5
6
### Vector Operations
7
8
Core vector operations for normalizing, measuring, and manipulating document vectors.
9
10
```python { .api }
11
def unitvec(vec, norm='l2', return_norm=False):
12
"""
13
Scale vector to unit length.
14
15
Parameters:
16
- vec: Input vector (scipy.sparse matrix, numpy array, or BOW list of (id, value) tuples)
17
- norm: Normalization method ('l1', 'l2', or 'unique')
18
- return_norm: Whether to return the original norm
19
20
Returns:
21
Normalized vector, optionally with original norm
22
"""
23
24
def veclen(vec):
25
"""
26
Calculate length/magnitude of vector.
27
28
Parameters:
29
- vec: Input vector in BOW format (list of (id, value) tuples)
30
31
Returns:
32
Vector length as float
33
"""
34
35
def cossim(vec1, vec2):
36
"""
37
Calculate cosine similarity between two vectors.
38
39
Parameters:
40
- vec1: First vector in BOW format (list of (id, value) tuples)
41
- vec2: Second vector in BOW format (list of (id, value) tuples)
42
43
Returns:
44
Cosine similarity as float (-1 to 1)
45
"""
46
47
def ret_normalized_vec(vec, length):
48
"""
49
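Normalize a BOW vector by dividing each of its values by the given length.
50
51
Parameters:
52
- vec: Input vector in BOW format
53
- length: Current length (norm) of `vec`; every value is divided by it
54
55
Returns:
56
Normalized vector in BOW format (unit length when `length` is the L2 norm of `vec`)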
57
"""
58
59
def ret_log_normalize_vec(vec, axis=1):
60
"""
61
Log-normalize vector values.
62
63
Parameters:
64
- vec: Input vector
65
- axis: Normalization axis
66
67
Returns:
68
Tuple of (log-normalized vector, log normalization constant(s))
69
"""
70
71
def isbow(vec):
72
"""
73
Check if vector is in bag-of-words format.
74
75
Parameters:
76
- vec: Input vector
77
78
Returns:
79
Boolean indicating if vector is BOW format
80
"""
81
```
82
83
### Matrix Operations
84
85
Efficient matrix operations for corpus processing and linear algebra.
86
87
```python { .api }
88
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
89
"""
90
Convert corpus to scipy.sparse.csc_matrix format.
91
92
Parameters:
93
- corpus: Input corpus in BOW format
94
- num_terms: Number of terms (optional)
95
- dtype: Data type for matrix values
96
- num_docs: Number of documents (optional)
97
- num_nnz: Number of non-zero elements (optional)
98
- printprogress: Progress reporting frequency
99
100
Returns:
101
CSC sparse matrix with documents as columns
102
"""
103
104
def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
105
"""
106
Convert corpus to dense numpy matrix.
107
108
Parameters:
109
- corpus: Input corpus in BOW format
110
- num_terms: Number of terms
111
- num_docs: Number of documents (optional)
112
- dtype: Data type for matrix values
113
114
Returns:
115
Dense 2D numpy array of shape (num_terms, num_docs), with documents as columns
116
"""
117
118
def pad(mat, padrow=False, padcol=False):
119
"""
120
Append zero-filled rows and/or columns to a matrix.
121
122
Parameters:
123
- mat: Input matrix
124
- padrow: Number of zero rows to append
125
- padcol: Number of zero columns to append
126
127
Returns:
128
Padded matrix
129
"""
130
131
def zeros_aligned(shape, dtype, order='C', align=128):
132
"""
133
Create aligned zero array for optimized operations.
134
135
Parameters:
136
- shape: Array shape
137
- dtype: Data type
138
- order: Memory layout ('C' or 'F')
139
- align: Memory alignment in bytes
140
141
Returns:
142
Aligned zero array
143
"""
144
145
def ismatrix(m):
146
"""
147
Check if object is a matrix.
148
149
Parameters:
150
- m: Object to check
151
152
Returns:
153
Boolean indicating if object is matrix-like
154
"""
155
```
156
157
### Sparse/Dense Conversions
158
159
Functions for converting between sparse and dense vector representations.
160
161
```python { .api }
162
def sparse2full(vec, length):
163
"""
164
Convert sparse vector to dense representation.
165
166
Parameters:
167
- vec: Sparse vector in BOW format
168
- length: Length of full vector
169
170
Returns:
171
Dense numpy array
172
"""
173
174
def full2sparse(vec, eps=1e-9):
175
"""
176
Convert dense vector to sparse BOW format.
177
178
Parameters:
179
- vec: Dense vector
180
- eps: Minimum value threshold
181
182
Returns:
183
Sparse vector in BOW format
184
"""
185
186
def full2sparse_clipped(vec, topn, eps=1e-9):
187
"""
188
Convert dense vector to sparse format, keeping only the top-N values of greatest absolute magnitude.
189
190
Parameters:
191
- vec: Dense vector
192
- topn: Number of top values to keep
193
- eps: Minimum value threshold
194
195
Returns:
196
Clipped sparse vector in BOW format
197
"""
198
199
def any2sparse(vec, eps=1e-9):
200
"""
201
Convert vector to sparse format regardless of input type.
202
203
Parameters:
204
- vec: Input vector (any format)
205
- eps: Minimum value threshold
206
207
Returns:
208
Sparse vector in BOW format
209
"""
210
211
def scipy2sparse(vec):
212
"""
213
Convert scipy sparse vector to gensim sparse format.
214
215
Parameters:
216
- vec: Scipy sparse matrix/vector
217
218
Returns:
219
Gensim sparse vector (BOW format)
220
"""
221
222
def scipy2scipy_clipped(matrix, topn, eps=1e-9):
223
"""
224
Clip scipy sparse matrix to the top-N values (by absolute magnitude) in each row.
225
226
Parameters:
227
- matrix: Scipy sparse matrix
228
- topn: Number of top values to keep
229
- eps: Minimum value threshold
230
231
Returns:
232
Clipped scipy sparse matrix
233
"""
234
```
235
236
### Distance Metrics
237
238
Statistical distance measures for comparing probability distributions and vectors.
239
240
```python { .api }
241
def kullback_leibler(vec1, vec2, num_features=None):
242
"""
243
Calculate Kullback-Leibler divergence between two probability distributions.
244
245
Parameters:
246
- vec1: First probability distribution
247
- vec2: Second probability distribution
248
- num_features: Number of features (optional)
249
250
Returns:
251
KL divergence as float
252
"""
253
254
def jensen_shannon(vec1, vec2, num_features=None):
255
"""
256
Calculate Jensen-Shannon distance between two probability distributions.
257
258
Parameters:
259
- vec1: First probability distribution
260
- vec2: Second probability distribution
261
- num_features: Number of features (optional)
262
263
Returns:
264
JS distance as float (0 to ln 2 ≈ 0.693, because natural logarithms are used)
265
"""
266
267
def hellinger(vec1, vec2):
268
"""
269
Calculate Hellinger distance between two probability distributions.
270
271
Parameters:
272
- vec1: First probability distribution
273
- vec2: Second probability distribution
274
275
Returns:
276
Hellinger distance as float (0 to 1)
277
"""
278
279
def jaccard(vec1, vec2):
280
"""
281
Calculate Jaccard distance between two vectors (note: a distance, not a similarity).
282
283
Parameters:
284
- vec1: First vector
285
- vec2: Second vector
286
287
Returns:
288
Jaccard distance as float (0 to 1; 0 means identical, 1 means no overlap)
289
"""
290
291
def jaccard_distance(vec1, vec2):
292
"""
293
Calculate Jaccard distance between two sets.
294
295
Parameters:
296
- vec1: First set (a Python `set`, not a BOW vector)
297
- vec2: Second set (a Python `set`, not a BOW vector)
298
299
Returns:
300
Jaccard distance as float (0 to 1)
301
"""
302
```
303
304
### Linear Algebra
305
306
Advanced linear algebra operations with BLAS integration.
307
308
```python { .api }
309
def blas(name, ndarray):
310
"""
311
Get appropriate BLAS function for array operations.
312
313
Parameters:
314
- name: BLAS function name
315
- ndarray: Input array to determine data type
316
317
Returns:
318
BLAS function object
319
"""
320
321
def argsort(x, topn=None, reverse=False):
322
"""
323
Efficiently find indices of smallest/largest elements.
324
325
Parameters:
326
- x: Input array
327
- topn: Number of top elements to return
328
- reverse: Whether to return largest elements
329
330
Returns:
331
Array of indices
332
"""
333
334
def qr_destroy(la):
335
"""
336
QR decomposition that destroys input matrix for memory efficiency.
337
338
Parameters:
339
- la: List containing a single matrix; the matrix is removed from the list and destroyed during the computation
340
341
Returns:
342
Q and R matrices from QR decomposition
343
"""
344
```
345
346
## Usage Examples
347
348
### Basic Vector Operations
349
350
```python
351
import numpy as np
352
from gensim import matutils
353
354
# Create sample vectors
355
vec1 = [(0, 1.0), (1, 2.0), (2, 3.0)] # BOW format
356
vec2 = [(0, 2.0), (1, 1.0), (3, 1.0)] # BOW format
357
358
# Calculate vector length
359
length1 = matutils.veclen(vec1)
360
print(f"Vector 1 length: {length1}")
361
362
# Normalize vector to unit length
363
unit_vec1 = matutils.unitvec(vec1)
364
print(f"Unit vector 1: {unit_vec1}")
365
366
# Calculate cosine similarity
367
similarity = matutils.cossim(vec1, vec2)
368
print(f"Cosine similarity: {similarity}")
369
370
# Check if vector is BOW format
371
is_bow = matutils.isbow(vec1)
372
print(f"Is BOW format: {is_bow}")
373
```
374
375
### Sparse/Dense Conversions
376
377
```python
378
# Convert sparse to dense
379
dense_vec1 = matutils.sparse2full(vec1, length=5)
380
print(f"Dense vector: {dense_vec1}")
381
382
# Convert dense to sparse
383
dense_array = np.array([1.0, 2.0, 0.0, 3.0, 0.0])
384
sparse_vec = matutils.full2sparse(dense_array)
385
print(f"Sparse vector: {sparse_vec}")
386
387
# Keep only top-N values
388
top2_sparse = matutils.full2sparse_clipped(dense_array, topn=2)
389
print(f"Top-2 sparse: {top2_sparse}")
390
```
391
392
### Matrix Operations with Corpus
393
394
```python
395
from gensim import corpora
396
from gensim.test.utils import common_texts
397
398
# Create sample corpus
399
dictionary = corpora.Dictionary(common_texts)
400
corpus = [dictionary.doc2bow(text) for text in common_texts]
401
402
# Convert corpus to CSC matrix
403
csc_matrix = matutils.corpus2csc(corpus, num_terms=len(dictionary))
404
print(f"CSC matrix shape: {csc_matrix.shape}")
405
print(f"CSC matrix type: {type(csc_matrix)}")
406
407
# Convert corpus to dense matrix
408
dense_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
409
print(f"Dense matrix shape: {dense_matrix.shape}")
410
print(f"Dense matrix type: {type(dense_matrix)}")
411
```
412
413
### Distance Metrics
414
415
```python
416
# Create probability distributions
417
prob1 = [(0, 0.3), (1, 0.4), (2, 0.3)]
418
prob2 = [(0, 0.2), (1, 0.5), (2, 0.3)]
419
420
# Calculate various distance metrics
421
kl_div = matutils.kullback_leibler(prob1, prob2)
422
print(f"KL divergence: {kl_div}")
423
424
js_dist = matutils.jensen_shannon(prob1, prob2)
425
print(f"Jensen-Shannon distance: {js_dist}")
426
427
hellinger_dist = matutils.hellinger(prob1, prob2)
428
print(f"Hellinger distance: {hellinger_dist}")
429
430
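# Jaccard distance for binary BOW vectors (matutils.jaccard returns a distance, not a similarity)
431
binary1 = [(0, 1), (1, 1), (3, 1)]
432
binary2 = [(0, 1), (2, 1), (3, 1)]
433
434
jaccard_dist = matutils.jaccard(binary1, binary2)
435
jaccard_sim = 1 - jaccard_dist
436
print(f"Jaccard similarity: {jaccard_sim}")
437
print(f"Jaccard distance: {jaccard_dist}")

# jaccard_distance operates on Python sets rather than BOW vectors
set_dist = matutils.jaccard_distance({0, 1, 3}, {0, 2, 3})
print(f"Jaccard distance between sets: {set_dist}")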
438
```
439
440
### Efficient Sorting Operations
441
442
```python
443
# Create large array for demonstration
444
large_array = np.random.rand(10000)
445
446
# Find indices of top 10 largest values efficiently
447
top10_indices = matutils.argsort(large_array, topn=10, reverse=True)
448
print(f"Top 10 indices: {top10_indices}")
449
print(f"Top 10 values: {large_array[top10_indices]}")
450
451
# Find indices of the 5 smallest values
452
bottom5_indices = matutils.argsort(large_array, topn=5, reverse=False)
453
print(f"Bottom 5 indices: {bottom5_indices}")
454
print(f"Bottom 5 values: {large_array[bottom5_indices]}")
455
```
456
457
### BLAS Integration
458
459
```python
460
# Get BLAS function for dot product
461
test_array = np.array([1.0, 2.0, 3.0], dtype=np.float64)
462
dot_func = matutils.blas('dot', test_array)
463
print(f"BLAS dot function: {dot_func}")
464
465
# Use BLAS function for efficient computation
466
result = dot_func(test_array, test_array)
467
print(f"Dot product result: {result}")
468
```
469
470
### Memory-Efficient Operations
471
472
```python
473
# Create aligned zero array for optimized operations
474
aligned_zeros = matutils.zeros_aligned((1000, 100), dtype=np.float32)
475
print(f"Aligned array shape: {aligned_zeros.shape}")
476
print(f"Aligned array dtype: {aligned_zeros.dtype}")
477
478
# Check if object is matrix-like
479
is_matrix = matutils.ismatrix(aligned_zeros)
480
print(f"Is matrix: {is_matrix}")
481
482
# Pad matrix with zeros
483
small_matrix = np.array([[1, 2], [3, 4]])
484
padded_matrix = matutils.pad(small_matrix, padrow=True, padcol=True)
485
print(f"Original matrix:\n{small_matrix}")
486
print(f"Padded matrix:\n{padded_matrix}")
487
```
488
489
### Working with Scipy Sparse Matrices
490
491
```python
492
from scipy import sparse
493
494
# Create scipy sparse matrix
495
scipy_matrix = sparse.csr_matrix([[1, 0, 2], [0, 3, 0], [4, 0, 5]])
496
497
# Convert scipy sparse to gensim format (for first row)
498
gensim_sparse = matutils.scipy2sparse(scipy_matrix.getrow(0))
499
print(f"Scipy to gensim: {gensim_sparse}")
500
501
# Clip scipy matrix to top values
502
clipped_matrix = matutils.scipy2scipy_clipped(scipy_matrix, topn=2)
503
print(f"Original matrix:\n{scipy_matrix.toarray()}")
504
print(f"Clipped matrix:\n{clipped_matrix.toarray()}")
505
```
506
507
### Vector Normalization Variations
508
509
```python
510
# L2 normalization (default)
511
l2_normalized = matutils.unitvec(vec1, norm='l2')
512
print(f"L2 normalized: {l2_normalized}")
513
514
# L1 normalization
515
l1_normalized = matutils.unitvec(vec1, norm='l1')
516
print(f"L1 normalized: {l1_normalized}")
517
518
# Get normalized vector with original norm
519
normalized_with_norm = matutils.unitvec(vec1, return_norm=True)
520
print(f"Normalized vector: {normalized_with_norm[0]}")
521
print(f"Original norm: {normalized_with_norm[1]}")
522
523
# Log normalization (returns the normalized array and the log normalization constants)
524
dense_vec = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
525
log_normalized, log_norm = matutils.ret_log_normalize_vec(dense_vec)
526
print(f"Log normalized:\n{log_normalized}")
527
```