Tessl Tile for pypi/fasttext@0.9.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

classification.md index.md training.md utilities.md word-vectors.md

docs/word-vectors.md

0
# Word Vector Operations
1

2
FastText provides comprehensive access to word and sentence vector representations, enabling semantic similarity analysis, analogies, and vector arithmetic operations. The model handles out-of-vocabulary words through subword information.
3

4
## Capabilities
5

6
### Vector Retrieval
7

8
Access vector representations for words, sentences, and subword components.
9

10
```python { .api }
11
def get_word_vector(word):
12
    """
13
    Get vector representation of a word.
14
    
15
    Args:
16
        word (str): Input word
17
        
18
    Returns:
19
        numpy.ndarray: Word vector of shape (dim,)
20
        
21
    Note:
22
        Handles out-of-vocabulary words using subword information
23
    """
24

25
def get_sentence_vector(text):
26
    """
27
    Get vector representation of a sentence.
28
    
29
    Args:
30
        text (str): Input text/sentence (must not contain newlines)
31
        
32
    Returns:
33
        numpy.ndarray: Sentence vector of shape (dim,)
34
        
35
    Raises:
36
        ValueError: If text contains newline characters
37
    """
38

39
def get_input_vector(ind):
40
    """
41
    Get input matrix vector by index.
42
    
43
    Args:
44
        ind (int): Word index in vocabulary
45
        
46
    Returns:
47
        numpy.ndarray: Input vector of shape (dim,)
48
        
49
    Note:
50
        Direct access to input matrix vectors for advanced use cases
51
    """
52
```
53

54
#### Usage Example
55

56
```python
57
import fasttext
58
import numpy as np
59

60
# Load model
61
model = fasttext.load_model('model.bin')
62

63
# Get word vectors
64
king_vector = model.get_word_vector('king')
65
queen_vector = model.get_word_vector('queen')
66

67
# Get sentence vector
68
sentence = "The quick brown fox jumps over the lazy dog"
69
sentence_vector = model.get_sentence_vector(sentence)
70

71
# Vector arithmetic
72
man_vector = model.get_word_vector('man')
73
woman_vector = model.get_word_vector('woman')
74
result = king_vector - man_vector + woman_vector
75

76
print(f"Word vector shape: {king_vector.shape}")
77
print(f"Sentence vector shape: {sentence_vector.shape}")
78
```
79

80
### Matrix Access
81

82
Access the full input and output matrices for advanced operations (non-quantized models only).
83

84
```python { .api }
85
def get_input_matrix():
86
    """
87
    Get the full input matrix.
88
    
89
    Returns:
90
        numpy.ndarray: Input matrix of shape (vocab_size + bucket, dim); rows after the vocabulary hold the hashed subword n-gram vectors
91
        
92
    Raises:
93
        ValueError: If model is quantized
94
    """
95

96
def get_output_matrix():
97
    """
98
    Get the full output matrix.
99
    
100
    Returns:
101
        numpy.ndarray: Output matrix of shape (n_outputs, dim), with one row per label for supervised models and one row per vocabulary word otherwise
102
        
103
    Raises:
104
        ValueError: If model is quantized
105
    """
106
```
107

108
#### Usage Example
109

110
```python
111
import fasttext
112

113
model = fasttext.load_model('model.bin')
114

115
if not model.is_quantized():
116
    # Get full matrices for analysis
117
    input_matrix = model.get_input_matrix()
118
    output_matrix = model.get_output_matrix()
119
    
120
    print(f"Input matrix shape: {input_matrix.shape}")
121
    print(f"Output matrix shape: {output_matrix.shape}")
122
    
123
    # Custom matrix operations
124
    custom_input = input_matrix * 0.5
125
    custom_output = output_matrix * 2.0
126
    model.set_matrices(custom_input, custom_output)
127
```
128

129
### Similarity and Analogies
130

131
Find semantically similar words and solve word analogies using vector arithmetic.
132

133
```python { .api }
134
def get_nearest_neighbors(word, k=10, on_unicode_error='strict'):
135
    """
136
    Find k nearest neighbors of a word.
137
    
138
    Args:
139
        word (str): Query word
140
        k (int): Number of neighbors to return (default: 10)
141
        on_unicode_error (str): Unicode error handling (default: 'strict')
142
        
143
    Returns:
144
        list: List of (similarity_score, neighbor_word) tuples
145
        
146
    Raises:
147
        UnicodeError: If word contains invalid Unicode and on_unicode_error='strict'
148
    """
149

150
def get_analogies(wordA, wordB, wordC, k=10, on_unicode_error='strict'):
151
    """
152
    Find analogies by vector arithmetic: return the words nearest to wordA - wordB + wordC (e.g. king - man + woman).
153
    
154
    Args:
155
        wordA (str): First word in analogy
156
        wordB (str): Second word in analogy  
157
        wordC (str): Third word in analogy
158
        k (int): Number of analogies to return (default: 10)
159
        on_unicode_error (str): Unicode error handling (default: 'strict')
160
        
161
    Returns:
162
        list: List of (similarity_score, word) tuples, the words closest to wordA - wordB + wordC
163
    """
164
```
165

166
#### Usage Example
167

168
```python
169
import fasttext
170

171
model = fasttext.load_model('model.bin')
172

173
# Find similar words
174
neighbors = model.get_nearest_neighbors('king', k=5)
175
print("Words similar to 'king':")
176
for score, word in neighbors:
177
    print(f"  {word}: {score:.4f}")
178

179
# Solve analogies: king - man + woman = ?
180
analogies = model.get_analogies('king', 'man', 'woman', k=3)
181
print("king:man::woman:?")
182
for score, word in analogies:
183
    print(f"  {word}: {score:.4f}")
184

185
# Handle Unicode errors gracefully
186
try:
187
    neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='strict')
188
except UnicodeError:
189
    neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='replace')
190
```
191

192
### Word and Label Information
193

194
Access vocabulary, labels, and internal model structure information.
195

196
```python { .api }
197
def get_words(include_freq=False, on_unicode_error='strict'):
198
    """
199
    Get vocabulary words.
200
    
201
    Args:
202
        include_freq (bool): Include word frequencies (default: False)
203
        on_unicode_error (str): Unicode error handling (default: 'strict')
204
        
205
    Returns:
206
        list: List of words or (word, frequency) tuples if include_freq=True
207
    """
208

209
def get_labels(include_freq=False, on_unicode_error='strict'):
210
    """
211
    Get classification labels (supervised models only).
212
    
213
    Args:
214
        include_freq (bool): Include label frequencies (default: False)
215
        on_unicode_error (str): Unicode error handling (default: 'strict')
216
        
217
    Returns:
218
        list: List of labels or (label, frequency) tuples if include_freq=True
219
    """
220

221
def get_word_id(word):
222
    """
223
    Get word ID in internal dictionary.
224
    
225
    Args:
226
        word (str): Input word
227
        
228
    Returns:
229
        int: Word ID or -1 if not found
230
    """
231

232
def get_label_id(label):
233
    """
234
    Get label ID in internal dictionary.
235
    
236
    Args:
237
        label (str): Input label
238
        
239
    Returns:
240
        int: Label ID or -1 if not found
241
    """
242
```
243

244
#### Usage Example
245

246
```python
247
import fasttext
248

249
model = fasttext.load_model('model.bin')
250

251
# Get vocabulary information
252
vocab = model.get_words()
253
print(f"Vocabulary size: {len(vocab)}")
254
print(f"First 10 words: {vocab[:10]}")
255

256
# Get word frequencies
257
vocab_freq = model.get_words(include_freq=True)
258
print("Most frequent words:")
259
for word, freq in sorted(vocab_freq, key=lambda x: x[1], reverse=True)[:10]:
260
    print(f"  {word}: {freq}")
261

262
# Check if words exist
263
word_id = model.get_word_id('king')
264
if word_id != -1:
265
    print(f"'king' is in vocabulary with ID: {word_id}")
266
else:
267
    print("'king' is not in vocabulary")
268

269
# For supervised models, get labels
270
if hasattr(model, 'get_labels'):
271
    labels = model.get_labels()
272
    print(f"Available labels: {labels}")
273
```
274

275
### Subword Information
276

277
Access subword components and character n-gram information for handling out-of-vocabulary words.
278

279
```python { .api }
280
def get_subwords(word, on_unicode_error='strict'):
281
    """
282
    Get subwords and their indices for a word.
283
    
284
    Args:
285
        word (str): Input word
286
        on_unicode_error (str): Unicode error handling (default: 'strict')
287
        
288
    Returns:
289
        tuple: (subwords_list, indices_list) where subwords_list contains
290
               character n-grams and indices_list contains their hash indices
291
    """
292

293
def get_subword_id(subword):
294
    """
295
    Get hash index for a subword.
296
    
297
    Args:
298
        subword (str): Character n-gram subword
299
        
300
    Returns:
301
        int: Hash index for the subword
302
    """
303
```
304

305
#### Usage Example
306

307
```python
308
import fasttext
309

310
model = fasttext.load_model('model.bin')
311

312
# Analyze subword structure
313
word = 'running'
314
subwords, indices = model.get_subwords(word)
315

316
print(f"Subwords for '{word}':")
317
for subword, idx in zip(subwords, indices):
318
    print(f"  {subword}: {idx}")
319

320
# This is especially useful for out-of-vocabulary words
321
oov_word = 'unknownword'
322
if model.get_word_id(oov_word) == -1:
323
    print(f"'{oov_word}' is OOV, using subword information")
324
    vector = model.get_word_vector(oov_word)  # Still works via subwords
325
    print(f"OOV vector shape: {vector.shape}")
326
```
327

328
### Model Properties
329

330
Access model metadata and cached properties.
331

332
```python { .api }
333
@property
334
def words(self):
335
    """Cached list of vocabulary words."""
336

337
@property  
338
def labels(self):
339
    """Cached list of labels (supervised models only)."""
340

341
def get_dimension():
342
    """
343
    Get vector dimension size.
344
    
345
    Returns:
346
        int: Dimension of word vectors
347
    """
348

349
def is_quantized():
350
    """
351
    Check if model is quantized.
352
    
353
    Returns:
354
        bool: True if model is quantized, False otherwise
355
    """
356

357
def __contains__(word):
358
    """Check if word is in vocabulary using 'in' operator."""
359

360
def __getitem__(word):
361
    """Get word vector using [] syntax."""
362
```
363

364
#### Usage Example
365

366
```python
367
import fasttext
368

369
model = fasttext.load_model('model.bin')
370

371
# Model information
372
print(f"Vector dimension: {model.get_dimension()}")
373
print(f"Is quantized: {model.is_quantized()}")
374
print(f"Vocabulary size: {len(model.words)}")
375

376
# Convenient access patterns
377
if 'king' in model:
378
    king_vector = model['king']  # Same as model.get_word_vector('king')
379
    print(f"King vector: {king_vector[:5]}...")  # First 5 dimensions
380

381
# Access cached vocabulary
382
frequent_words = model.words[:100]  # First 100 words
383
print(f"Sample vocabulary: {frequent_words[:10]}")
384
```

Version

Tile

Files

docs/word-vectors.md

docs/word-vectors.md