FastText library for efficient learning of word representations and sentence classification
---
FastText provides comprehensive access to word and sentence vector representations, enabling semantic similarity analysis, analogies, and vector arithmetic operations. The model handles out-of-vocabulary words through subword information.
Access vector representations for words, sentences, and subword components.
def get_word_vector(word: str):
    """
    Get vector representation of a word.

    Args:
        word (str): Input word

    Returns:
        numpy.ndarray: Word vector of shape (dim,)

    Note:
        Handles out-of-vocabulary words using subword information
    """
def get_sentence_vector(text: str):
    """
    Get vector representation of a sentence.

    Args:
        text (str): Input text/sentence (must not contain newlines)

    Returns:
        numpy.ndarray: Sentence vector of shape (dim,)

    Raises:
        ValueError: If text contains newline characters
    """
def get_input_vector(ind):
"""
Get input matrix vector by index.
Args:
ind (int): Word index in vocabulary
Returns:
numpy.ndarray: Input vector of shape (dim,)
Note:
Direct access to input matrix vectors for advanced use cases
"""import fasttext
import numpy as np
# Load model
model = fasttext.load_model('model.bin')
# Get word vectors
king_vector = model.get_word_vector('king')
queen_vector = model.get_word_vector('queen')
# Get sentence vector
sentence = "The quick brown fox jumps over the lazy dog"
sentence_vector = model.get_sentence_vector(sentence)
# Vector arithmetic
man_vector = model.get_word_vector('man')
woman_vector = model.get_word_vector('woman')
result = king_vector - man_vector + woman_vector
print(f"Word vector shape: {king_vector.shape}")
print(f"Sentence vector shape: {sentence_vector.shape}")Access the full input and output matrices for advanced operations (non-quantized models only).
def get_input_matrix():
    """
    Get the full input matrix.

    Returns:
        numpy.ndarray: Input matrix of shape (vocab_size, dim)

    Raises:
        ValueError: If model is quantized
    """
def get_output_matrix():
"""
Get the full output matrix.
Returns:
numpy.ndarray: Output matrix of shape (vocab_size, dim)
Raises:
ValueError: If model is quantized
"""import fasttext
model = fasttext.load_model('model.bin')
if not model.is_quantized():
# Get full matrices for analysis
input_matrix = model.get_input_matrix()
output_matrix = model.get_output_matrix()
print(f"Input matrix shape: {input_matrix.shape}")
print(f"Output matrix shape: {output_matrix.shape}")
# Custom matrix operations
custom_input = input_matrix * 0.5
custom_output = output_matrix * 2.0
model.set_matrices(custom_input, custom_output)Find semantically similar words and solve word analogies using vector arithmetic.
def get_nearest_neighbors(word: str, k: int = 10, on_unicode_error: str = 'strict'):
    """
    Find k nearest neighbors of a word.

    Args:
        word (str): Query word
        k (int): Number of neighbors to return (default: 10)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of (similarity_score, neighbor_word) tuples

    Raises:
        UnicodeError: If word contains invalid Unicode and on_unicode_error='strict'
    """
def get_analogies(wordA, wordB, wordC, k=10, on_unicode_error='strict'):
"""
Find analogies of the form A:B::C:?.
Args:
wordA (str): First word in analogy
wordB (str): Second word in analogy
wordC (str): Third word in analogy
k (int): Number of analogies to return (default: 10)
on_unicode_error (str): Unicode error handling (default: 'strict')
Returns:
list: List of (similarity_score, word) tuples solving A:B::C:word
"""import fasttext
model = fasttext.load_model('model.bin')
# Find similar words
neighbors = model.get_nearest_neighbors('king', k=5)
print("Words similar to 'king':")
for score, word in neighbors:
print(f" {word}: {score:.4f}")
# Solve analogies: king - man + woman = ?
analogies = model.get_analogies('king', 'man', 'woman', k=3)
print("king:man::woman:?")
for score, word in analogies:
print(f" {word}: {score:.4f}")
# Handle Unicode errors gracefully
try:
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='strict')
except UnicodeError:
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='replace')Access vocabulary, labels, and internal model structure information.
def get_words(include_freq: bool = False, on_unicode_error: str = 'strict'):
    """
    Get vocabulary words.

    Args:
        include_freq (bool): Include word frequencies (default: False)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of words or (word, frequency) tuples if include_freq=True
    """
def get_labels(include_freq: bool = False, on_unicode_error: str = 'strict'):
    """
    Get classification labels (supervised models only).

    Args:
        include_freq (bool): Include label frequencies (default: False)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of labels or (label, frequency) tuples if include_freq=True
    """
def get_word_id(word: str):
    """
    Get word ID in internal dictionary.

    Args:
        word (str): Input word

    Returns:
        int: Word ID or -1 if not found
    """
def get_label_id(label):
"""
Get label ID in internal dictionary.
Args:
label (str): Input label
Returns:
int: Label ID or -1 if not found
"""import fasttext
model = fasttext.load_model('model.bin')
# Get vocabulary information
vocab = model.get_words()
print(f"Vocabulary size: {len(vocab)}")
print(f"First 10 words: {vocab[:10]}")
# Get word frequencies
vocab_freq = model.get_words(include_freq=True)
print("Most frequent words:")
for word, freq in sorted(vocab_freq, key=lambda x: x[1], reverse=True)[:10]:
print(f" {word}: {freq}")
# Check if words exist
word_id = model.get_word_id('king')
if word_id != -1:
print(f"'king' is in vocabulary with ID: {word_id}")
else:
print("'king' is not in vocabulary")
# For supervised models, get labels
if hasattr(model, 'get_labels'):
labels = model.get_labels()
print(f"Available labels: {labels}")Access subword components and character n-gram information for handling out-of-vocabulary words.
def get_subwords(word: str, on_unicode_error: str = 'strict'):
    """
    Get subwords and their indices for a word.

    Args:
        word (str): Input word
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        tuple: (subwords_list, indices_list) where subwords_list contains
            character n-grams and indices_list contains their hash indices
    """
def get_subword_id(subword):
"""
Get hash index for a subword.
Args:
subword (str): Character n-gram subword
Returns:
int: Hash index for the subword
"""import fasttext
model = fasttext.load_model('model.bin')
# Analyze subword structure
word = 'running'
subwords, indices = model.get_subwords(word)
print(f"Subwords for '{word}':")
for subword, idx in zip(subwords, indices):
print(f" {subword}: {idx}")
# This is especially useful for out-of-vocabulary words
oov_word = 'unknownword'
if model.get_word_id(oov_word) == -1:
print(f"'{oov_word}' is OOV, using subword information")
vector = model.get_word_vector(oov_word) # Still works via subwords
print(f"OOV vector shape: {vector.shape}")Access model metadata and cached properties.
@property
def words(self):
    """Cached list of vocabulary words."""
@property
def labels(self):
    """Cached list of labels (supervised models only)."""
def get_dimension():
    """
    Get vector dimension size.

    Returns:
        int: Dimension of word vectors
    """
def is_quantized():
    """
    Check if model is quantized.

    Returns:
        bool: True if model is quantized, False otherwise
    """
def __contains__(word):
    """Check if word is in vocabulary using 'in' operator."""
def __getitem__(word):
"""Get word vector using [] syntax."""import fasttext
model = fasttext.load_model('model.bin')
# Model information
print(f"Vector dimension: {model.get_dimension()}")
print(f"Is quantized: {model.is_quantized()}")
print(f"Vocabulary size: {len(model.words)}")
# Convenient access patterns
if 'king' in model:
king_vector = model['king'] # Same as model.get_word_vector('king')
print(f"King vector: {king_vector[:5]}...") # First 5 dimensions
# Access cached vocabulary
frequent_words = model.words[:100] # First 100 words
print(f"Sample vocabulary: {frequent_words[:10]}")Install with Tessl CLI
npx tessl i tessl/pypi-fasttext