FastText: a library for efficient learning of word representations and sentence classification.

FastText provides utility functions for model optimization, text processing,
pre-trained model management, and advanced model manipulation. These utilities
extend the core functionality with performance optimizations and convenience
features.

Optimize model size and performance through quantization and matrix manipulation.
def quantize(input=None, qout=False, cutoff=0, retrain=False, epoch=None,
             lr=None, thread=None, verbose=None, dsub=2, qnorm=False):
    """
    Quantize the model to reduce memory usage and file size.

    Args:
        input (str, optional): Path to training data for retraining.
        qout (bool): Quantize the output matrix (default: False).
        cutoff (int): Vocabulary cutoff for quantization (default: 0).
        retrain (bool): Retrain the model after quantization (default: False).
        epoch (int, optional): Number of retraining epochs.
        lr (float, optional): Learning rate for retraining.
        thread (int, optional): Number of threads for retraining.
        verbose (int, optional): Verbosity level.
        dsub (int): Dimension of each subspace used for quantization (default: 2).
        qnorm (bool): Quantize the vector norms (default: False).

    Note:
        Quantization reduces model accuracy but significantly decreases size.
        Some operations (get_input_matrix, get_output_matrix) become
        unavailable on a quantized model.
    """
def set_matrices(input_matrix, output_matrix):
    """
    Set custom input and output matrices on the model.

    Args:
        input_matrix (numpy.ndarray): Custom input matrix of shape
            (vocab_size, dim), float32 type.
        output_matrix (numpy.ndarray): Custom output matrix of shape
            (vocab_size, dim), float32 type.

    Raises:
        ValueError: If the model is quantized or matrix dimensions don't match.

    Note:
        Matrices are automatically converted to float32. Use with caution, as
        this replaces the learned representations with custom values.
    """


import fasttext
import numpy as np

# --- Load and quantize a model ---
model = fasttext.load_model('large_model.bin')
print(f"Original model size: {model.get_dimension()} dimensions")

# Basic quantization
model.quantize()
print(f"Model quantized: {model.is_quantized()}")

# Advanced quantization with retraining
model = fasttext.load_model('model.bin')  # reload the original model
model.quantize(
    input='train.txt',  # retrain after quantization
    qout=True,          # quantize the output matrix
    retrain=True,       # enable retraining
    epoch=5,            # retraining epochs
    lr=0.01,            # lower learning rate
    dsub=2,             # subspace dimension
    verbose=2,          # show progress
)

# Save the quantized model (much smaller file)
model.save_model('quantized_model.ftz')

# --- Custom matrix manipulation (only possible before quantization) ---
model = fasttext.load_model('model.bin')
if not model.is_quantized():
    input_matrix = model.get_input_matrix()
    output_matrix = model.get_output_matrix()

    # Apply custom transformations
    scaled_input = input_matrix * 0.8
    normalized_output = output_matrix / np.linalg.norm(output_matrix, axis=1, keepdims=True)

    # Install the modified matrices
    model.set_matrices(scaled_input, normalized_output)

# Save and manage model files with different formats and compression levels.
def save_model(path):
    """
    Save the model to a file.

    Args:
        path (str): Output file path (.bin for uncompressed, .ftz for
            compressed).

    Note:
        The .bin format preserves full precision and all functionality.
        The .ftz format is compressed but may lose some precision.
    """


import fasttext
import os

# Train a model and save it in different formats
model = fasttext.train_unsupervised('data.txt')
model.save_model('model.bin')  # full-precision binary
model.save_model('model.ftz')  # compressed format

# Compare file sizes
bin_size = os.path.getsize('model.bin')
ftz_size = os.path.getsize('model.ftz')
compression_ratio = bin_size / ftz_size
print(f"Binary model: {bin_size / 1024 / 1024:.1f} MB")
print(f"Compressed model: {ftz_size / 1024 / 1024:.1f} MB")
print(f"Compression ratio: {compression_ratio:.1f}x")

# Quantize before saving for maximum compression
model.quantize()
model.save_model('quantized_model.ftz')
quantized_size = os.path.getsize('quantized_model.ftz')
print(f"Quantized model: {quantized_size / 1024 / 1024:.1f} MB")

# Download and manage pre-trained FastText models for multiple languages.
# Import the utility module
import fasttext.util


def download_model(lang_id, if_exists='strict'):
    """
    Download a pre-trained FastText model for the specified language.

    Args:
        lang_id (str): Language identifier (e.g., 'en', 'fr', 'de').
        if_exists (str): Action if the model already exists - 'strict',
            'ignore', or 'overwrite'.

    Returns:
        str: Path to the downloaded model file (cc.{lang_id}.300.bin).

    Raises:
        Exception: If the language ID is not supported.

    Note:
        Always downloads the 300-dimensional Common Crawl vectors.
    """


# Set of valid language IDs (157 languages supported)
valid_lang_ids = {"af", "sq", "als", "am", "ar", "an", "hy", "as", "ast",
                  "az", "ba", "eu", "bar", "be", "bn", "bh", "bpy", "bs",
                  "br", "bg", "my", "ca", "ceb", "bcl", "ce", "zh", "cv",
                  "co", "hr", "cs", "da", "dv", "nl", "pa", "arz", "eml",
                  "en", "myv", "eo", "et", "hif", "fi", "fr", "gl", "ka",
                  "de", "gom", "el", "gu", "ht", "he", "mrj", "hi", "hu",
                  "is", "io", "ilo", "id", "ia", "ga", "it", "ja", "jv",
                  "kn", "pam", "kk", "km", "ky", "ko", "ku", "ckb", "la",
                  "lv", "li", "lt", "lmo", "nds", "lb", "mk", "mai", "mg",
                  "ms", "ml", "mt", "gv", "mr", "mzn", "mhr", "min", "xmf",
                  "mwl", "mn", "nah", "nap", "ne", "new", "frr", "nso",
                  "no", "nn", "oc", "or", "os", "pfl", "ps", "fa", "pms",
                  "pl", "pt", "qu", "ro", "rm", "ru", "sah", "sa", "sc",
                  "sco", "gd", "sr", "sh", "scn", "sd", "si", "sk", "sl",
                  "so", "azb", "es", "su", "sw", "sv", "tl", "tg", "ta",
                  "tt", "te", "th", "bo", "tr", "tk", "uk", "hsb", "ur",
                  "ug", "uz", "vec", "vi", "vo", "wa", "war", "cy", "vls",
                  "fy", "pnb", "yi", "yo", "diq", "zea"}

import fasttext.util
# Download the English model
model_path = fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model(model_path)

# download_model always fetches the 300-dimensional Common Crawl vectors
# (it has no dimension parameter); use fasttext.util.reduce_model()
# afterwards if you need a smaller dimension.
fasttext.util.download_model('fr', if_exists='ignore')
fr_model = fasttext.load_model('cc.fr.300.bin')

# Check the available languages
print(f"Available languages: {len(fasttext.util.valid_lang_ids)}")
print(f"Sample languages: {list(fasttext.util.valid_lang_ids)[:10]}")

# Download several models
languages = ['en', 'es', 'fr', 'de', 'it']
models = {}
for lang in languages:
    try:
        path = fasttext.util.download_model(lang, if_exists='ignore')
        models[lang] = fasttext.load_model(path)
        print(f"Loaded {lang} model: {models[lang].get_dimension()} dimensions")
    except Exception as e:  # download_model raises a plain Exception for unsupported languages
        print(f"Failed to download {lang}: {e}")

# Use the multilingual models
text_samples = {
    'en': 'Hello world',
    'es': 'Hola mundo',
    'fr': 'Bonjour monde',
    'de': 'Hallo Welt'
}
for lang, text in text_samples.items():
    if lang in models:
        vector = models[lang].get_sentence_vector(text)
        print(f"{lang}: '{text}' -> vector shape {vector.shape}")

# Reduce model dimensions using Principal Component Analysis for memory efficiency.
def reduce_model(ft_model, target_dim):
    """
    Reduce model dimensions using PCA.

    Args:
        ft_model: FastText model object.
        target_dim (int): Target dimension size (must be smaller than the
            current dimension).

    Returns:
        _FastText: A new model with reduced dimensions.

    Note:
        Dimension reduction may impact model quality but reduces memory usage.
    """


import fasttext
import fasttext.util

# Load a high-dimensional model
model = fasttext.load_model('cc.en.300.bin')
print(f"Original dimensions: {model.get_dimension()}")

# Reduce its dimensions
reduced_model = fasttext.util.reduce_model(model, 100)
print(f"Reduced dimensions: {reduced_model.get_dimension()}")

# Compare nearest-neighbor quality before and after the reduction
original_neighbors = model.get_nearest_neighbors('king', k=5)
reduced_neighbors = reduced_model.get_nearest_neighbors('king', k=5)
print("Original model neighbors:")
for score, word in original_neighbors:
    print(f" {word}: {score:.4f}")
print("Reduced model neighbors:")
for score, word in reduced_neighbors:
    print(f" {word}: {score:.4f}")

# Save the reduced model
reduced_model.save_model('cc.en.100.reduced.bin')

# Utility functions for model evaluation and metric calculation.
def test(predictions, labels, k=1):
    """
    Calculate precision and recall from predictions and true labels.

    Args:
        predictions (list): List of prediction tuples (labels, probabilities).
        labels (list): List of true-label lists, one per sample.
        k (int): Number of top predictions to consider (default: 1).

    Returns:
        tuple: (precision, recall) at k.
    """
def find_nearest_neighbor(query, vectors, ban_set, cossims=None):
    """
    Find the vector nearest to the query, excluding banned items.

    Args:
        query (numpy.ndarray): Query vector.
        vectors (numpy.ndarray): Matrix of candidate vectors.
        ban_set (set): Set of row indices to exclude from the search.
        cossims (numpy.ndarray, optional): Pre-computed cosine similarities.

    Returns:
        int: Index of the nearest neighbor.
    """


import fasttext
import fasttext.util
import numpy as np

# Evaluate custom predictions
model = fasttext.load_model('classifier.bin')

# Generate predictions
test_texts = [
    "Great movie, loved it!",
    "Terrible film.",
    "It was okay."
]
predictions = []
true_labels = [
    ['__label__positive'],
    ['__label__negative'],
    ['__label__neutral']
]
for text in test_texts:
    pred_labels, pred_probs = model.predict(text, k=3)
    predictions.append((pred_labels, pred_probs))

# Calculate metrics
precision, recall = fasttext.util.test(predictions, true_labels, k=1)
print(f"Custom evaluation - Precision: {precision:.4f}, Recall: {recall:.4f}")

# Find nearest neighbors with exclusions
word_vectors = model.get_input_matrix()
query_word = 'king'
query_vector = model.get_word_vector(query_word)
query_id = model.get_word_id(query_word)

# Exclude the query word itself and some common words
ban_set = {query_id, model.get_word_id('the'), model.get_word_id('a')}
nearest_idx = fasttext.util.find_nearest_neighbor(
    query_vector,
    word_vectors,
    ban_set
)

# Convert the index back to a word
vocab = model.get_words()
if nearest_idx < len(vocab):
    nearest_word = vocab[nearest_idx]
    print(f"Nearest neighbor to '{query_word}': {nearest_word}")

# Additional text processing utilities for consistency and preprocessing.
def tokenize(text):
    """
    Tokenize text using FastText's internal tokenizer.

    Args:
        text (str): Input text to tokenize.

    Returns:
        list: List of tokens following FastText's tokenization rules.

    Note:
        Using this tokenizer ensures consistency with the preprocessing
        applied to training data.
    """


import fasttext
# Consistent tokenization
texts = [
    "Hello, world! How are you?",
    "E-mail: user@domain.com (important)",
    "123.45 is a number, isn't it?",
    "Visit https://example.com for more."
]
for text in texts:
    tokens = fasttext.tokenize(text)
    print(f"'{text}'")
    print(f" Tokens: {tokens}")
    print(f" Count: {len(tokens)}")
    print()

# Compare with the model's own preprocessing
model = fasttext.load_model('model.bin')
sample_text = "This is a test sentence."

# Method 1: direct tokenization
tokens1 = fasttext.tokenize(sample_text)

# Method 2: model preprocessing
words, labels = model.get_line(sample_text)
print(f"Direct tokenization: {tokens1}")
print(f"Model preprocessing: {words}")
print(f"Are they equal? {tokens1 == words}")

# Performance tips:
# - Use quantize() to reduce model size by 75-90%.
# - Use reduce_model() for further memory savings.
# - Use the .ftz format for compressed storage.
# - Use the thread parameter during training.
# Model formats:
- .bin: full precision; all features available.
- .ftz: compressed; may lose some precision. A quantized .ftz gives maximum compression but limited functionality.
- Pre-trained models: download once and reuse across projects.
- Temporary files: clean up downloaded models when no longer needed.

Install with the Tessl CLI:
npx tessl i tessl/pypi-fasttext