or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classification.md, index.md, training.md, utilities.md, word-vectors.md

docs/word-vectors.md

0

# Word Vector Operations

1

2

FastText provides comprehensive access to word and sentence vector representations, enabling semantic similarity analysis, analogies, and vector arithmetic operations. The model handles out-of-vocabulary words through subword information.

3

4

## Capabilities

5

6

### Vector Retrieval

7

8

Access vector representations for words, sentences, and subword components.

9

10

```python { .api }

11

def get_word_vector(word):

12

"""

13

Get vector representation of a word.

14

15

Args:

16

word (str): Input word

17

18

Returns:

19

numpy.ndarray: Word vector of shape (dim,)

20

21

Note:

22

Handles out-of-vocabulary words using subword information

23

"""

24

25

def get_sentence_vector(text):

26

"""

27

Get vector representation of a sentence.

28

29

Args:

30

text (str): Input text/sentence (must not contain newlines)

31

32

Returns:

33

numpy.ndarray: Sentence vector of shape (dim,)

34

35

Raises:

36

ValueError: If text contains newline characters

37

"""

38

39

def get_input_vector(ind):

40

"""

41

Get input matrix vector by index.

42

43

Args:

44

ind (int): Row index into the input matrix (word IDs come first, followed by subword/n-gram bucket indices)

45

46

Returns:

47

numpy.ndarray: Input vector of shape (dim,)

48

49

Note:

50

Direct access to input matrix vectors for advanced use cases

51

"""

52

```

53

54

#### Usage Example

55

56

```python

57

import fasttext

58

import numpy as np

59

60

# Load model

61

model = fasttext.load_model('model.bin')

62

63

# Get word vectors

64

king_vector = model.get_word_vector('king')

65

queen_vector = model.get_word_vector('queen')

66

67

# Get sentence vector

68

sentence = "The quick brown fox jumps over the lazy dog"

69

sentence_vector = model.get_sentence_vector(sentence)

70

71

# Vector arithmetic

72

man_vector = model.get_word_vector('man')

73

woman_vector = model.get_word_vector('woman')

74

result = king_vector - man_vector + woman_vector

75

76

print(f"Word vector shape: {king_vector.shape}")

77

print(f"Sentence vector shape: {sentence_vector.shape}")

78

```

79

80

### Matrix Access

81

82

Access the full input and output matrices for advanced operations (non-quantized models only).

83

84

```python { .api }

85

def get_input_matrix():

86

"""

87

Get the full input matrix.

88

89

Returns:

90

numpy.ndarray: Input matrix of shape (vocab_size + bucket, dim); rows after the vocabulary hold the hashed subword/n-gram vectors

91

92

Raises:

93

ValueError: If model is quantized

94

"""

95

96

def get_output_matrix():

97

"""

98

Get the full output matrix.

99

100

Returns:

101

numpy.ndarray: Output matrix of shape (vocab_size, dim) for unsupervised models, or (num_labels, dim) for supervised models

102

103

Raises:

104

ValueError: If model is quantized

105

"""

106

```

107

108

#### Usage Example

109

110

```python

111

import fasttext

112

113

model = fasttext.load_model('model.bin')

114

115

if not model.is_quantized():

116

# Get full matrices for analysis

117

input_matrix = model.get_input_matrix()

118

output_matrix = model.get_output_matrix()

119

120

print(f"Input matrix shape: {input_matrix.shape}")

121

print(f"Output matrix shape: {output_matrix.shape}")

122

123

# Custom matrix operations

124

custom_input = input_matrix * 0.5

125

custom_output = output_matrix * 2.0

126

model.set_matrices(custom_input, custom_output)

127

```

128

129

### Similarity and Analogies

130

131

Find semantically similar words and solve word analogies using vector arithmetic.

132

133

```python { .api }

134

def get_nearest_neighbors(word, k=10, on_unicode_error='strict'):

135

"""

136

Find k nearest neighbors of a word.

137

138

Args:

139

word (str): Query word

140

k (int): Number of neighbors to return (default: 10)

141

on_unicode_error (str): Unicode error handling (default: 'strict')

142

143

Returns:

144

list: List of (similarity_score, neighbor_word) tuples

145

146

Raises:

147

UnicodeError: If word contains invalid Unicode and on_unicode_error='strict'

148

"""

149

150

def get_analogies(wordA, wordB, wordC, k=10, on_unicode_error='strict'):

151

"""

152

Find words D that complete the analogy B:A::C:D, i.e. the nearest neighbors of the vector A - B + C.

153

154

Args:

155

wordA (str): First word in analogy

156

wordB (str): Second word in analogy

157

wordC (str): Third word in analogy

158

k (int): Number of analogies to return (default: 10)

159

on_unicode_error (str): Unicode error handling (default: 'strict')

160

161

Returns:

162

list: List of (similarity_score, word) tuples ranked by similarity to the vector A - B + C (wordA, wordB and wordC themselves are excluded)

163

"""

164

```

165

166

#### Usage Example

167

168

```python

169

import fasttext

170

171

model = fasttext.load_model('model.bin')

172

173

# Find similar words

174

neighbors = model.get_nearest_neighbors('king', k=5)

175

print("Words similar to 'king':")

176

for score, word in neighbors:

177

print(f" {word}: {score:.4f}")

178

179

# Solve analogies: king - man + woman = ?

180

analogies = model.get_analogies('king', 'man', 'woman', k=3)

181

print("king:man::woman:?")

182

for score, word in analogies:

183

print(f" {word}: {score:.4f}")

184

185

# Handle Unicode errors gracefully

186

try:

187

neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='strict')

188

except UnicodeError:

189

neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='replace')

190

```

191

192

### Word and Label Information

193

194

Access vocabulary, labels, and internal model structure information.

195

196

```python { .api }

197

def get_words(include_freq=False, on_unicode_error='strict'):

198

"""

199

Get vocabulary words.

200

201

Args:

202

include_freq (bool): Include word frequencies (default: False)

203

on_unicode_error (str): Unicode error handling (default: 'strict')

204

205

Returns:

206

list: List of words or (word, frequency) tuples if include_freq=True

207

"""

208

209

def get_labels(include_freq=False, on_unicode_error='strict'):

210

"""

211

Get classification labels (supervised models only).

212

213

Args:

214

include_freq (bool): Include label frequencies (default: False)

215

on_unicode_error (str): Unicode error handling (default: 'strict')

216

217

Returns:

218

list: List of labels or (label, frequency) tuples if include_freq=True

219

"""

220

221

def get_word_id(word):

222

"""

223

Get word ID in internal dictionary.

224

225

Args:

226

word (str): Input word

227

228

Returns:

229

int: Word ID or -1 if not found

230

"""

231

232

def get_label_id(label):

233

"""

234

Get label ID in internal dictionary.

235

236

Args:

237

label (str): Input label

238

239

Returns:

240

int: Label ID or -1 if not found

241

"""

242

```

243

244

#### Usage Example

245

246

```python

247

import fasttext

248

249

model = fasttext.load_model('model.bin')

250

251

# Get vocabulary information

252

vocab = model.get_words()

253

print(f"Vocabulary size: {len(vocab)}")

254

print(f"First 10 words: {vocab[:10]}")

255

256

# Get word frequencies

257

vocab_freq = model.get_words(include_freq=True)

258

print("Most frequent words:")

259

for word, freq in sorted(vocab_freq, key=lambda x: x[1], reverse=True)[:10]:

260

print(f" {word}: {freq}")

261

262

# Check if words exist

263

word_id = model.get_word_id('king')

264

if word_id != -1:

265

print(f"'king' is in vocabulary with ID: {word_id}")

266

else:

267

print("'king' is not in vocabulary")

268

269

# For supervised models, get labels

270

if hasattr(model, 'get_labels'):

271

labels = model.get_labels()

272

print(f"Available labels: {labels}")

273

```

274

275

### Subword Information

276

277

Access subword components and character n-gram information for handling out-of-vocabulary words.

278

279

```python { .api }

280

def get_subwords(word, on_unicode_error='strict'):

281

"""

282

Get subwords and their indices for a word.

283

284

Args:

285

word (str): Input word

286

on_unicode_error (str): Unicode error handling (default: 'strict')

287

288

Returns:

289

tuple: (subwords_list, indices_list) where subwords_list contains

290

character n-grams and indices_list contains their hash indices

291

"""

292

293

def get_subword_id(subword):

294

"""

295

Get hash index for a subword.

296

297

Args:

298

subword (str): Character n-gram subword

299

300

Returns:

301

int: Hash index for the subword

302

"""

303

```

304

305

#### Usage Example

306

307

```python

308

import fasttext

309

310

model = fasttext.load_model('model.bin')

311

312

# Analyze subword structure

313

word = 'running'

314

subwords, indices = model.get_subwords(word)

315

316

print(f"Subwords for '{word}':")

317

for subword, idx in zip(subwords, indices):

318

print(f" {subword}: {idx}")

319

320

# This is especially useful for out-of-vocabulary words

321

oov_word = 'unknownword'

322

if model.get_word_id(oov_word) == -1:

323

print(f"'{oov_word}' is OOV, using subword information")

324

vector = model.get_word_vector(oov_word) # Still works via subwords

325

print(f"OOV vector shape: {vector.shape}")

326

```

327

328

### Model Properties

329

330

Access model metadata and cached properties.

331

332

```python { .api }

333

@property

334

def words(self):

335

"""Cached list of vocabulary words."""

336

337

@property

338

def labels(self):

339

"""Cached list of labels (supervised models only)."""

340

341

def get_dimension():

342

"""

343

Get vector dimension size.

344

345

Returns:

346

int: Dimension of word vectors

347

"""

348

349

def is_quantized():

350

"""

351

Check if model is quantized.

352

353

Returns:

354

bool: True if model is quantized, False otherwise

355

"""

356

357

def __contains__(word):

358

"""Check if word is in vocabulary using 'in' operator."""

359

360

def __getitem__(word):

361

"""Get word vector using [] syntax."""

362

```

363

364

#### Usage Example

365

366

```python

367

import fasttext

368

369

model = fasttext.load_model('model.bin')

370

371

# Model information

372

print(f"Vector dimension: {model.get_dimension()}")

373

print(f"Is quantized: {model.is_quantized()}")

374

print(f"Vocabulary size: {len(model.words)}")

375

376

# Convenient access patterns

377

if 'king' in model:

378

king_vector = model['king'] # Same as model.get_word_vector('king')

379

print(f"King vector: {king_vector[:5]}...") # First 5 dimensions

380

381

# Access cached vocabulary

382

frequent_words = model.words[:100] # First 100 words

383

print(f"Sample vocabulary: {frequent_words[:10]}")

384

```