Tessl Tile for pypi/gensim@4.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

corpus-management.md data-downloading.md index.md mathematical-utilities.md nlp-models.md similarity-computations.md text-preprocessing.md

docs/mathematical-utilities.md

0
# Mathematical Utilities
1

2
Linear algebra operations, vector manipulations, and distance metrics optimized for NLP tasks. Gensim's mathematical utilities provide efficient implementations of common operations needed for text processing and machine learning.
3

4
## Capabilities
5

6
### Vector Operations
7

8
Core vector operations for normalizing, measuring, and manipulating document vectors.
9

10
```python { .api }
11
def unitvec(vec, norm='l2', return_norm=False):
12
    """
13
    Scale vector to unit length.
14
    
15
    Parameters:
16
    - vec: Input vector (numpy array, scipy.sparse, or BOW list of (id, value) tuples)
17
    - norm: Normalization method ('l1', 'l2', or 'unique')
18
    - return_norm: Whether to return the original norm
19
    
20
    Returns:
21
    Normalized vector, optionally with original norm
22
    """
23

24
def veclen(vec):
25
    """
26
    Calculate length/magnitude of vector.
27
    
28
    Parameters:
29
    - vec: Input vector in BOW format (list of (id, value) tuples)
30
    
31
    Returns:
32
    Vector length as float
33
    """
34

35
def cossim(vec1, vec2):
36
    """
37
    Calculate cosine similarity between two vectors.
38
    
39
    Parameters:
40
    - vec1: First vector
41
    - vec2: Second vector
42
    
43
    Returns:
44
    Cosine similarity as float (-1 to 1)
45
    """
46

47
def ret_normalized_vec(vec, length):
48
    """
49
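    Normalize a BOW vector by dividing each value by the given length.
50
    
51
    Parameters:
52
    - vec: Input vector in BOW format
53
    - length: Current length (norm) of vec, e.g. as returned by veclen()
54
    
55
    Returns:
56
    Normalized vector in BOW format (unit length when `length` is the true norm of vec)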
57
    """
58

59
def ret_log_normalize_vec(vec, axis=1):
60
    """
61
    Log-normalize vector values.
62
    
63
    Parameters:
64
    - vec: Input vector
65
    - axis: Normalization axis
66
    
67
    Returns:
68
    Tuple of (log-normalized vector, log of the normalization constant)
69
    """
70

71
def isbow(vec):
72
    """
73
    Check if vector is in bag-of-words format.
74
    
75
    Parameters:
76
    - vec: Input vector
77
    
78
    Returns:
79
    Boolean indicating if vector is BOW format
80
    """
81
```
82

83
### Matrix Operations
84

85
Efficient matrix operations for corpus processing and linear algebra.
86

87
```python { .api }
88
def corpus2csc(corpus, num_terms=None, dtype=np.float64, num_docs=None, num_nnz=None, printprogress=0):
89
    """
90
    Convert corpus to scipy.sparse.csc_matrix format.
91
    
92
    Parameters:
93
    - corpus: Input corpus in BOW format
94
    - num_terms: Number of terms (optional)
95
    - dtype: Data type for matrix values
96
    - num_docs: Number of documents (optional)
97
    - num_nnz: Number of non-zero elements (optional)
98
    - printprogress: Progress reporting frequency
99
    
100
    Returns:
101
    CSC sparse matrix with documents as columns
102
    """
103

104
def corpus2dense(corpus, num_terms, num_docs=None, dtype=np.float32):
105
    """
106
    Convert corpus to dense numpy matrix.
107
    
108
    Parameters:
109
    - corpus: Input corpus in BOW format
110
    - num_terms: Number of terms
111
    - num_docs: Number of documents (optional)
112
    - dtype: Data type for matrix values
113
    
114
    Returns:
115
    Dense numpy matrix
116
    """
117

118
def pad(mat, padrow=False, padcol=False):
119
    """
120
    Pad matrix with zeros.
121
    
122
    Parameters:
123
    - mat: Input matrix
124
    - padrow: Number of zero rows to append (int)
124
    - padcol: Number of zero columns to append (int)
126
    
127
    Returns:
128
    Padded matrix
129
    """
130

131
def zeros_aligned(shape, dtype, order='C', align=128):
132
    """
133
    Create aligned zero array for optimized operations.
134
    
135
    Parameters:
136
    - shape: Array shape
137
    - dtype: Data type
138
    - order: Memory layout ('C' or 'F')
139
    - align: Memory alignment in bytes
140
    
141
    Returns:
142
    Aligned zero array
143
    """
144

145
def ismatrix(m):
146
    """
147
    Check if object is a matrix.
148
    
149
    Parameters:
150
    - m: Object to check
151
    
152
    Returns:
153
    Boolean indicating if object is matrix-like
154
    """
155
```
156

157
### Sparse/Dense Conversions
158

159
Functions for converting between sparse and dense vector representations.
160

161
```python { .api }
162
def sparse2full(vec, length):
163
    """
164
    Convert sparse vector to dense representation.
165
    
166
    Parameters:
167
    - vec: Sparse vector in BOW format
168
    - length: Length of full vector
169
    
170
    Returns:
171
    Dense numpy array
172
    """
173

174
def full2sparse(vec, eps=1e-9):
175
    """
176
    Convert dense vector to sparse BOW format.
177
    
178
    Parameters:
179
    - vec: Dense vector
180
    - eps: Minimum value threshold
181
    
182
    Returns:
183
    Sparse vector in BOW format
184
    """
185

186
def full2sparse_clipped(vec, topn, eps=1e-9):
187
    """
188
    Convert dense vector to sparse format, keeping only top-N values.
189
    
190
    Parameters:
191
    - vec: Dense vector
192
    - topn: Number of top values to keep
193
    - eps: Minimum value threshold
194
    
195
    Returns:
196
    Clipped sparse vector in BOW format
197
    """
198

199
def any2sparse(vec, eps=1e-9):
200
    """
201
    Convert vector to sparse format regardless of input type.
202
    
203
    Parameters:
204
    - vec: Input vector (any format)
205
    - eps: Minimum value threshold
206
    
207
    Returns:
208
    Sparse vector in BOW format
209
    """
210

211
def scipy2sparse(vec):
212
    """
213
    Convert scipy sparse vector to gensim sparse format.
214
    
215
    Parameters:
216
    - vec: Scipy sparse matrix/vector
217
    
218
    Returns:
219
    Gensim sparse vector (BOW format)
220
    """
221

222
def scipy2scipy_clipped(matrix, topn, eps=1e-9):
223
    """
224
    Clip scipy sparse matrix to top-N values per row/column.
225
    
226
    Parameters:
227
    - matrix: Scipy sparse matrix
228
    - topn: Number of top values to keep
229
    - eps: Minimum value threshold
230
    
231
    Returns:
232
    Clipped scipy sparse matrix
233
    """
234
```
235

236
### Distance Metrics
237

238
Statistical distance measures for comparing probability distributions and vectors.
239

240
```python { .api }
241
def kullback_leibler(vec1, vec2, num_features=None):
242
    """
243
    Calculate Kullback-Leibler divergence between two probability distributions.
244
    
245
    Parameters:
246
    - vec1: First probability distribution
247
    - vec2: Second probability distribution  
248
    - num_features: Number of features (optional)
249
    
250
    Returns:
251
    KL divergence as float
252
    """
253

254
def jensen_shannon(vec1, vec2, num_features=None):
255
    """
256
    Calculate Jensen-Shannon distance between two probability distributions.
257
    
258
    Parameters:
259
    - vec1: First probability distribution
260
    - vec2: Second probability distribution
261
    - num_features: Number of features (optional)
262
    
263
    Returns:
264
    JS distance as float (0 to 1)
265
    """
266

267
def hellinger(vec1, vec2):
268
    """
269
    Calculate Hellinger distance between two probability distributions.
270
    
271
    Parameters:
272
    - vec1: First probability distribution
273
    - vec2: Second probability distribution
274
    
275
    Returns:
276
    Hellinger distance as float (0 to 1)
277
    """
278

279
def jaccard(vec1, vec2):
280
    """
281
    Calculate Jaccard distance between two vectors.
282
    
283
    Parameters:
284
    - vec1: First vector
285
    - vec2: Second vector
286
    
287
    Returns:
288
    Jaccard distance as float (0 to 1; 0 means identical, 1 means no overlap)
289
    """
290

291
def jaccard_distance(vec1, vec2):
292
    """
293
    Calculate Jaccard distance between two sets.
294
    
295
    Parameters:
296
    - vec1: First set (a Python set, not a BOW list)
297
    - vec2: Second set (a Python set, not a BOW list)
298
    
299
    Returns:
300
    Jaccard distance as float (0 to 1)
301
    """
302
```
303

304
### Linear Algebra
305

306
Advanced linear algebra operations with BLAS integration.
307

308
```python { .api }
309
def blas(name, ndarray):
310
    """
311
    Get appropriate BLAS function for array operations.
312
    
313
    Parameters:
314
    - name: BLAS function name
315
    - ndarray: Input array to determine data type
316
    
317
    Returns:
318
    BLAS function object
319
    """
320

321
def argsort(x, topn=None, reverse=False):
322
    """
323
    Efficiently find indices of smallest/largest elements.
324
    
325
    Parameters:
326
    - x: Input array
327
    - topn: Number of top elements to return
328
    - reverse: Whether to return largest elements
329
    
330
    Returns:
331
    Array of indices
332
    """
333

334
def qr_destroy(la):
335
    """
336
    QR decomposition that destroys input matrix for memory efficiency.
337
    
338
    Parameters:
339
    - la: Single-element list holding the input matrix (both the list and the matrix are destroyed)
340
    
341
    Returns:
342
    Q and R matrices from QR decomposition
343
    """
344
```
345

346
## Usage Examples
347

348
### Basic Vector Operations
349

350
```python
351
import numpy as np
352
from gensim import matutils
353

354
# Create sample vectors
355
vec1 = [(0, 1.0), (1, 2.0), (2, 3.0)]  # BOW format
356
vec2 = [(0, 2.0), (1, 1.0), (3, 1.0)]  # BOW format
357

358
# Calculate vector length
359
length1 = matutils.veclen(vec1)
360
print(f"Vector 1 length: {length1}")
361

362
# Normalize vector to unit length
363
unit_vec1 = matutils.unitvec(vec1)
364
print(f"Unit vector 1: {unit_vec1}")
365

366
# Calculate cosine similarity
367
similarity = matutils.cossim(vec1, vec2)
368
print(f"Cosine similarity: {similarity}")
369

370
# Check if vector is BOW format
371
is_bow = matutils.isbow(vec1)
372
print(f"Is BOW format: {is_bow}")
373
```
374

375
### Sparse/Dense Conversions
376

377
```python
378
# Convert sparse to dense
379
dense_vec1 = matutils.sparse2full(vec1, length=5)
380
print(f"Dense vector: {dense_vec1}")
381

382
# Convert dense to sparse
383
dense_array = np.array([1.0, 2.0, 0.0, 3.0, 0.0])
384
sparse_vec = matutils.full2sparse(dense_array)
385
print(f"Sparse vector: {sparse_vec}")
386

387
# Keep only top-N values
388
top2_sparse = matutils.full2sparse_clipped(dense_array, topn=2)
389
print(f"Top-2 sparse: {top2_sparse}")
390
```
391

392
### Matrix Operations with Corpus
393

394
```python
395
from gensim import corpora
396
from gensim.test.utils import common_texts
397

398
# Create sample corpus
399
dictionary = corpora.Dictionary(common_texts)
400
corpus = [dictionary.doc2bow(text) for text in common_texts]
401

402
# Convert corpus to CSC matrix
403
csc_matrix = matutils.corpus2csc(corpus, num_terms=len(dictionary))
404
print(f"CSC matrix shape: {csc_matrix.shape}")
405
print(f"CSC matrix type: {type(csc_matrix)}")
406

407
# Convert corpus to dense matrix
408
dense_matrix = matutils.corpus2dense(corpus, num_terms=len(dictionary))
409
print(f"Dense matrix shape: {dense_matrix.shape}")
410
print(f"Dense matrix type: {type(dense_matrix)}")
411
```
412

413
### Distance Metrics
414

415
```python
416
# Create probability distributions
417
prob1 = [(0, 0.3), (1, 0.4), (2, 0.3)]
418
prob2 = [(0, 0.2), (1, 0.5), (2, 0.3)]
419

420
# Calculate various distance metrics
421
kl_div = matutils.kullback_leibler(prob1, prob2)
422
print(f"KL divergence: {kl_div}")
423

424
js_dist = matutils.jensen_shannon(prob1, prob2)
425
print(f"Jensen-Shannon distance: {js_dist}")
426

427
hellinger_dist = matutils.hellinger(prob1, prob2)
428
print(f"Hellinger distance: {hellinger_dist}")
429

430
# Jaccard similarity for binary vectors
431
binary1 = [(0, 1), (1, 1), (3, 1)]
432
binary2 = [(0, 1), (2, 1), (3, 1)]
433

434
jaccard_sim = matutils.jaccard(binary1, binary2)
435
jaccard_dist = matutils.jaccard_distance(binary1, binary2)
436
print(f"Jaccard similarity: {jaccard_sim}")
437
print(f"Jaccard distance: {jaccard_dist}")
438
```
439

440
### Efficient Sorting Operations
441

442
```python
443
# Create large array for demonstration
444
large_array = np.random.rand(10000)
445

446
# Find indices of top 10 largest values efficiently
447
top10_indices = matutils.argsort(large_array, topn=10, reverse=True)
448
print(f"Top 10 indices: {top10_indices}")
449
print(f"Top 10 values: {large_array[top10_indices]}")
450

451
# Find indices of top 5 smallest values
452
bottom5_indices = matutils.argsort(large_array, topn=5, reverse=False)
453
print(f"Bottom 5 indices: {bottom5_indices}")
454
print(f"Bottom 5 values: {large_array[bottom5_indices]}")
455
```
456

457
### BLAS Integration
458

459
```python
460
# Get BLAS function for dot product
461
test_array = np.array([1.0, 2.0, 3.0], dtype=np.float64)
462
dot_func = matutils.blas('dot', test_array)
463
print(f"BLAS dot function: {dot_func}")
464

465
# Use BLAS function for efficient computation
466
result = dot_func(test_array, test_array)
467
print(f"Dot product result: {result}")
468
```
469

470
### Memory-Efficient Operations
471

472
```python
473
# Create aligned zero array for optimized operations
474
aligned_zeros = matutils.zeros_aligned((1000, 100), dtype=np.float32)
475
print(f"Aligned array shape: {aligned_zeros.shape}")
476
print(f"Aligned array dtype: {aligned_zeros.dtype}")
477

478
# Check if object is matrix-like
479
is_matrix = matutils.ismatrix(aligned_zeros)
480
print(f"Is matrix: {is_matrix}")
481

482
# Pad matrix with zeros
483
small_matrix = np.array([[1, 2], [3, 4]])
484
padded_matrix = matutils.pad(small_matrix, padrow=True, padcol=True)
485
print(f"Original matrix:\n{small_matrix}")
486
print(f"Padded matrix:\n{padded_matrix}")
487
```
488

489
### Working with Scipy Sparse Matrices
490

491
```python
492
from scipy import sparse
493

494
# Create scipy sparse matrix
495
scipy_matrix = sparse.csr_matrix([[1, 0, 2], [0, 3, 0], [4, 0, 5]])
496

497
# Convert scipy sparse to gensim format (for first row)
498
gensim_sparse = matutils.scipy2sparse(scipy_matrix.getrow(0))
499
print(f"Scipy to gensim: {gensim_sparse}")
500

501
# Clip scipy matrix to top values
502
clipped_matrix = matutils.scipy2scipy_clipped(scipy_matrix, topn=2)
503
print(f"Original matrix:\n{scipy_matrix.toarray()}")
504
print(f"Clipped matrix:\n{clipped_matrix.toarray()}")
505
```
506

507
### Vector Normalization Variations
508

509
```python
510
# L2 normalization (default)
511
l2_normalized = matutils.unitvec(vec1, norm='l2')
512
print(f"L2 normalized: {l2_normalized}")
513

514
# L1 normalization
515
l1_normalized = matutils.unitvec(vec1, norm='l1')
516
print(f"L1 normalized: {l1_normalized}")
517

518
# Get normalized vector with original norm
519
normalized_with_norm = matutils.unitvec(vec1, return_norm=True)
520
print(f"Normalized vector: {normalized_with_norm[0]}")
521
print(f"Original norm: {normalized_with_norm[1]}")
522

523
# Log normalization
524
dense_vec = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
525
log_normalized = matutils.ret_log_normalize_vec(dense_vec)
526
print(f"Log normalized:\n{log_normalized}")
527
```

Version

Tile

Files

docs/mathematical-utilities.md

docs/mathematical-utilities.md