0
# Word Vector Operations
1
2
FastText exposes word and sentence vector representations, which enable semantic similarity analysis, analogies, and vector arithmetic. Out-of-vocabulary words are handled by composing vectors from subword (character n-gram) information.
3
4
## Capabilities
5
6
### Vector Retrieval
7
8
Access vector representations for words, sentences, and subword components.
9
10
```python { .api }
11
def get_word_vector(word):
12
"""
13
Get vector representation of a word.
14
15
Args:
16
word (str): Input word
17
18
Returns:
19
numpy.ndarray: Word vector of shape (dim,)
20
21
Note:
22
Handles out-of-vocabulary words using subword information
23
"""
24
25
def get_sentence_vector(text):
26
"""
27
Get vector representation of a sentence.
28
29
Args:
30
text (str): Input text/sentence (must not contain newlines)
31
32
Returns:
33
numpy.ndarray: Sentence vector of shape (dim,)
34
35
Raises:
36
ValueError: If text contains newline characters
37
"""
38
39
def get_input_vector(ind):
40
"""
41
Get input matrix vector by index.
42
43
Args:
44
ind (int): Row index into the input matrix (a word ID, or a subword index such as those returned by get_subwords)
45
46
Returns:
47
numpy.ndarray: Input vector of shape (dim,)
48
49
Note:
50
Direct access to input matrix vectors for advanced use cases
51
"""
52
```
53
54
#### Usage Example
55
56
```python
57
import fasttext
58
import numpy as np
59
60
# Load model
61
model = fasttext.load_model('model.bin')
62
63
# Get word vectors
64
king_vector = model.get_word_vector('king')
65
queen_vector = model.get_word_vector('queen')
66
67
# Get sentence vector
68
sentence = "The quick brown fox jumps over the lazy dog"
69
sentence_vector = model.get_sentence_vector(sentence)
70
71
# Vector arithmetic
72
man_vector = model.get_word_vector('man')
73
woman_vector = model.get_word_vector('woman')
74
result = king_vector - man_vector + woman_vector
75
76
print(f"Word vector shape: {king_vector.shape}")
77
print(f"Sentence vector shape: {sentence_vector.shape}")
78
```
79
80
### Matrix Access
81
82
Access the full input and output matrices for advanced operations (non-quantized models only).
83
84
```python { .api }
85
def get_input_matrix():
86
"""
87
Get the full input matrix.
88
89
Returns:
90
numpy.ndarray: Input matrix of shape (vocab_size + bucket, dim); rows after the vocabulary hold the hashed subword (n-gram) vectors
91
92
Raises:
93
ValueError: If model is quantized
94
"""
95
96
def get_output_matrix():
97
"""
98
Get the full output matrix.
99
100
Returns:
101
numpy.ndarray: Output matrix of shape (num_labels, dim) for supervised models, or (vocab_size, dim) for unsupervised models
102
103
Raises:
104
ValueError: If model is quantized
105
"""
106
```
107
108
#### Usage Example
109
110
```python
111
import fasttext
112
113
model = fasttext.load_model('model.bin')
114
115
if not model.is_quantized():
116
# Get full matrices for analysis
117
input_matrix = model.get_input_matrix()
118
output_matrix = model.get_output_matrix()
119
120
print(f"Input matrix shape: {input_matrix.shape}")
121
print(f"Output matrix shape: {output_matrix.shape}")
122
123
# Custom matrix operations
124
custom_input = input_matrix * 0.5
125
custom_output = output_matrix * 2.0
126
model.set_matrices(custom_input, custom_output)
127
```
128
129
### Similarity and Analogies
130
131
Find semantically similar words and solve word analogies using vector arithmetic.
132
133
```python { .api }
134
def get_nearest_neighbors(word, k=10, on_unicode_error='strict'):
135
"""
136
Find k nearest neighbors of a word.
137
138
Args:
139
word (str): Query word
140
k (int): Number of neighbors to return (default: 10)
141
on_unicode_error (str): Unicode error handling (default: 'strict')
142
143
Returns:
144
list: List of (similarity_score, neighbor_word) tuples
145
146
Raises:
147
UnicodeError: If word contains invalid Unicode and on_unicode_error='strict'
148
"""
149
150
def get_analogies(wordA, wordB, wordC, k=10, on_unicode_error='strict'):
151
"""
152
Find analogies by ranking words closest to the vector A - B + C (i.e. B is to A as C is to ?).
153
154
Args:
155
wordA (str): First word in analogy
156
wordB (str): Second word in analogy
157
wordC (str): Third word in analogy
158
k (int): Number of analogies to return (default: 10)
159
on_unicode_error (str): Unicode error handling (default: 'strict')
160
161
Returns:
162
list: List of (similarity_score, word) tuples, where each word is a candidate nearest to A - B + C (B:A::C:word)
163
"""
164
```
165
166
#### Usage Example
167
168
```python
169
import fasttext
170
171
model = fasttext.load_model('model.bin')
172
173
# Find similar words
174
neighbors = model.get_nearest_neighbors('king', k=5)
175
print("Words similar to 'king':")
176
for score, word in neighbors:
177
print(f" {word}: {score:.4f}")
178
179
# Solve analogies: king - man + woman = ?
180
analogies = model.get_analogies('king', 'man', 'woman', k=3)
181
print("king:man::woman:?")
182
for score, word in analogies:
183
print(f" {word}: {score:.4f}")
184
185
# Handle Unicode errors gracefully
186
try:
187
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='strict')
188
except UnicodeError:
189
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='replace')
190
```
191
192
### Word and Label Information
193
194
Access the vocabulary, the labels, and their IDs in the model's internal dictionary.
195
196
```python { .api }
197
def get_words(include_freq=False, on_unicode_error='strict'):
198
"""
199
Get vocabulary words.
200
201
Args:
202
include_freq (bool): Include word frequencies (default: False)
203
on_unicode_error (str): Unicode error handling (default: 'strict')
204
205
Returns:
206
list: List of words or (word, frequency) tuples if include_freq=True
207
"""
208
209
def get_labels(include_freq=False, on_unicode_error='strict'):
210
"""
211
Get classification labels (supervised models only).
212
213
Args:
214
include_freq (bool): Include label frequencies (default: False)
215
on_unicode_error (str): Unicode error handling (default: 'strict')
216
217
Returns:
218
list: List of labels or (label, frequency) tuples if include_freq=True
219
"""
220
221
def get_word_id(word):
222
"""
223
Get word ID in internal dictionary.
224
225
Args:
226
word (str): Input word
227
228
Returns:
229
int: Word ID or -1 if not found
230
"""
231
232
def get_label_id(label):
233
"""
234
Get label ID in internal dictionary.
235
236
Args:
237
label (str): Input label
238
239
Returns:
240
int: Label ID or -1 if not found
241
"""
242
```
243
244
#### Usage Example
245
246
```python
247
import fasttext
248
249
model = fasttext.load_model('model.bin')
250
251
# Get vocabulary information
252
vocab = model.get_words()
253
print(f"Vocabulary size: {len(vocab)}")
254
print(f"First 10 words: {vocab[:10]}")
255
256
# Get word frequencies
257
vocab_freq = model.get_words(include_freq=True)
258
print("Most frequent words:")
259
for word, freq in sorted(vocab_freq, key=lambda x: x[1], reverse=True)[:10]:
260
print(f" {word}: {freq}")
261
262
# Check if words exist
263
word_id = model.get_word_id('king')
264
if word_id != -1:
265
print(f"'king' is in vocabulary with ID: {word_id}")
266
else:
267
print("'king' is not in vocabulary")
268
269
# For supervised models, get labels
270
if hasattr(model, 'get_labels'):
271
labels = model.get_labels()
272
print(f"Available labels: {labels}")
273
```
274
275
### Subword Information
276
277
Access subword components and character n-gram information for handling out-of-vocabulary words.
278
279
```python { .api }
280
def get_subwords(word, on_unicode_error='strict'):
281
"""
282
Get subwords and their indices for a word.
283
284
Args:
285
word (str): Input word
286
on_unicode_error (str): Unicode error handling (default: 'strict')
287
288
Returns:
289
tuple: (subwords_list, indices_list) where subwords_list contains
290
character n-grams and indices_list contains their hash indices
291
"""
292
293
def get_subword_id(subword):
294
"""
295
Get hash index for a subword.
296
297
Args:
298
subword (str): Character n-gram subword
299
300
Returns:
301
int: Hash index for the subword
302
"""
303
```
304
305
#### Usage Example
306
307
```python
308
import fasttext
309
310
model = fasttext.load_model('model.bin')
311
312
# Analyze subword structure
313
word = 'running'
314
subwords, indices = model.get_subwords(word)
315
316
print(f"Subwords for '{word}':")
317
for subword, idx in zip(subwords, indices):
318
print(f" {subword}: {idx}")
319
320
# This is especially useful for out-of-vocabulary words
321
oov_word = 'unknownword'
322
if model.get_word_id(oov_word) == -1:
323
print(f"'{oov_word}' is OOV, using subword information")
324
vector = model.get_word_vector(oov_word) # Still works via subwords
325
print(f"OOV vector shape: {vector.shape}")
326
```
327
328
### Model Properties
329
330
Access model metadata and cached properties.
331
332
```python { .api }
333
@property
334
def words(self):
335
"""Cached list of vocabulary words."""
336
337
@property
338
def labels(self):
339
"""Cached list of labels (supervised models only)."""
340
341
def get_dimension():
342
"""
343
Get vector dimension size.
344
345
Returns:
346
int: Dimension of word vectors
347
"""
348
349
def is_quantized():
350
"""
351
Check if model is quantized.
352
353
Returns:
354
bool: True if model is quantized, False otherwise
355
"""
356
357
def __contains__(word):
358
"""Check if word is in vocabulary using 'in' operator."""
359
360
def __getitem__(word):
361
"""Get word vector using [] syntax."""
362
```
363
364
#### Usage Example
365
366
```python
367
import fasttext
368
369
model = fasttext.load_model('model.bin')
370
371
# Model information
372
print(f"Vector dimension: {model.get_dimension()}")
373
print(f"Is quantized: {model.is_quantized()}")
374
print(f"Vocabulary size: {len(model.words)}")
375
376
# Convenient access patterns
377
if 'king' in model:
378
king_vector = model['king'] # Same as model.get_word_vector('king')
379
print(f"King vector: {king_vector[:5]}...") # First 5 dimensions
380
381
# Access cached vocabulary
382
frequent_words = model.words[:100] # First 100 words
383
print(f"Sample vocabulary: {frequent_words[:10]}")
384
```