tessl/pypi-fasttext

FastText library for efficient learning of word representations and sentence classification

Utilities

FastText provides various utility functions for model optimization, text processing, pre-trained model management, and advanced model manipulation. These utilities enhance the core functionality with performance optimizations and convenience features.

Capabilities

Model Optimization

Optimize model size and performance through quantization and matrix manipulation.

def quantize(input=None, qout=False, cutoff=0, retrain=False, epoch=None,
             lr=None, thread=None, verbose=None, dsub=2, qnorm=False):
    """
    Quantize model to reduce memory usage and file size.
    
    Args:
        input (str, optional): Path to training data for retraining
        qout (bool): Quantize output matrix (default: False)
        cutoff (int): Vocabulary cutoff for quantization (default: 0)
        retrain (bool): Retrain model after quantization (default: False)
        epoch (int, optional): Number of retraining epochs
        lr (float, optional): Learning rate for retraining
        thread (int, optional): Number of threads for retraining
        verbose (int, optional): Verbosity level
        dsub (int): Dimension of subspace for quantization (default: 2)
        qnorm (bool): Quantize normalization (default: False)
    
    Note:
        Quantization reduces model accuracy but significantly decreases size.
        Some operations (get_input_matrix, get_output_matrix) become unavailable.
    """

def set_matrices(input_matrix, output_matrix):
    """
    Set custom input and output matrices.
    
    Args:
        input_matrix (numpy.ndarray): Custom input matrix, float32, matching the shape of get_input_matrix() (vocabulary plus subword buckets, by dimension)
        output_matrix (numpy.ndarray): Custom output matrix, float32, matching the shape of get_output_matrix()
        
    Raises:
        ValueError: If model is quantized or matrix dimensions don't match
        
    Note:
        Matrices are automatically converted to float32 type. Use with caution as this
        replaces the learned representations with custom values.
    """

Usage Example

import fasttext
import numpy as np

# Load and quantize model
model = fasttext.load_model('large_model.bin')
print(f"Model dimension: {model.get_dimension()}")

# Basic quantization
model.quantize()
print(f"Model quantized: {model.is_quantized()}")

# Advanced quantization with retraining
model = fasttext.load_model('model.bin')  # Reload original
model.quantize(
    input='train.txt',  # Retrain after quantization
    qout=True,          # Quantize output matrix
    retrain=True,       # Enable retraining
    epoch=5,            # Retraining epochs
    lr=0.01,            # Lower learning rate
    dsub=2,             # Subspace dimension
    verbose=2           # Show progress
)

# Save quantized model (much smaller file)
model.save_model('quantized_model.ftz')

# Custom matrix manipulation (before quantization)
model = fasttext.load_model('model.bin')
if not model.is_quantized():
    input_matrix = model.get_input_matrix()
    output_matrix = model.get_output_matrix()
    
    # Apply custom transformations
    scaled_input = input_matrix * 0.8
    normalized_output = output_matrix / np.linalg.norm(output_matrix, axis=1, keepdims=True)
    
    # Set modified matrices
    model.set_matrices(scaled_input, normalized_output)

Model Persistence

Save model files. The file extension does not change the format: compression comes from quantization, and .ftz is simply the conventional extension for quantized models.

def save_model(path):
    """
    Save model to file.
    
    Args:
        path (str): Output file path (.bin by convention for full-precision models, .ftz for quantized models)
        
    Note:
        save_model writes the model as-is regardless of extension. To get a
        compressed .ftz file, quantize the model first, then save it.
    """

Usage Example

import fasttext
import os

# Train and save model
model = fasttext.train_unsupervised('data.txt')

# Save full-precision binary
model.save_model('model.bin')
bin_size = os.path.getsize('model.bin')

# Quantize, then save; .ftz is the conventional extension for quantized models
model.quantize()
model.save_model('model.ftz')
ftz_size = os.path.getsize('model.ftz')

print(f"Binary model: {bin_size / 1024 / 1024:.1f} MB")
print(f"Quantized model: {ftz_size / 1024 / 1024:.1f} MB")
print(f"Compression ratio: {bin_size / ftz_size:.1f}x")

Pre-trained Model Management

Download and manage pre-trained FastText models for multiple languages.

# Import utility module
import fasttext.util

def download_model(lang_id, if_exists='strict'):
    """
    Download pre-trained FastText model for specified language.
    
    Args:
        lang_id (str): Language identifier (e.g., 'en', 'fr', 'de')
        if_exists (str): Action if model exists - 'strict', 'ignore', 'overwrite'
        
    Returns:
        str: Path to downloaded model file (cc.{lang_id}.300.bin)
        
    Raises:
        Exception: If language ID is not supported
        
    Note:
        Always downloads 300-dimensional models from Common Crawl vectors
    """

# Set of valid language IDs (157 languages supported)
valid_lang_ids = {"af", "sq", "als", "am", "ar", "an", "hy", "as", "ast",
                  "az", "ba", "eu", "bar", "be", "bn", "bh", "bpy", "bs",
                  "br", "bg", "my", "ca", "ceb", "bcl", "ce", "zh", "cv",
                  "co", "hr", "cs", "da", "dv", "nl", "pa", "arz", "eml",
                  "en", "myv", "eo", "et", "hif", "fi", "fr", "gl", "ka",
                  "de", "gom", "el", "gu", "ht", "he", "mrj", "hi", "hu",
                  "is", "io", "ilo", "id", "ia", "ga", "it", "ja", "jv",
                  "kn", "pam", "kk", "km", "ky", "ko", "ku", "ckb", "la",
                  "lv", "li", "lt", "lmo", "nds", "lb", "mk", "mai", "mg",
                  "ms", "ml", "mt", "gv", "mr", "mzn", "mhr", "min", "xmf",
                  "mwl", "mn", "nah", "nap", "ne", "new", "frr", "nso",
                  "no", "nn", "oc", "or", "os", "pfl", "ps", "fa", "pms",
                  "pl", "pt", "qu", "ro", "rm", "ru", "sah", "sa", "sc",
                  "sco", "gd", "sr", "sh", "scn", "sd", "si", "sk", "sl",
                  "so", "azb", "es", "su", "sw", "sv", "tl", "tg", "ta",
                  "tt", "te", "th", "bo", "tr", "tk", "uk", "hsb", "ur",
                  "ug", "uz", "vec", "vi", "vo", "wa", "war", "cy", "vls",
                  "fy", "pnb", "yi", "yo", "diq", "zea"}

Usage Example

import fasttext.util

# Download English model
model_path = fasttext.util.download_model('en', if_exists='ignore')
model = fasttext.load_model(model_path)

# All downloads are 300-dimensional; use reduce_model for smaller vectors
fasttext.util.download_model('fr', if_exists='ignore')
fr_model = fasttext.load_model('cc.fr.300.bin')
fasttext.util.reduce_model(fr_model, 100)

# Check available languages
print(f"Available languages: {len(fasttext.util.valid_lang_ids)}")
print(f"Sample languages: {list(fasttext.util.valid_lang_ids)[:10]}")

# Download multiple models
languages = ['en', 'es', 'fr', 'de', 'it']
models = {}

for lang in languages:
    try:
        path = fasttext.util.download_model(lang, if_exists='ignore')
        models[lang] = fasttext.load_model(path)
        print(f"Loaded {lang} model: {models[lang].get_dimension()} dimensions")
    except Exception as e:
        print(f"Failed to download {lang}: {e}")

# Use multilingual models
text_samples = {
    'en': 'Hello world',
    'es': 'Hola mundo', 
    'fr': 'Bonjour monde',
    'de': 'Hallo Welt'
}

for lang, text in text_samples.items():
    if lang in models:
        vector = models[lang].get_sentence_vector(text)
        print(f"{lang}: '{text}' -> vector shape {vector.shape}")

Model Dimension Reduction

Reduce model dimensions using Principal Component Analysis for memory efficiency.

def reduce_model(ft_model, target_dim):
    """
    Reduce model dimensions using PCA.
    
    Args:
        ft_model: FastText model object
        target_dim (int): Target dimension size (must be < current dimension)
        
    Returns:
        _FastText: The same model object, reduced in place and returned for convenience
        
    Note:
        Dimension reduction may impact model quality but reduces memory usage
    """

Usage Example

import fasttext
import fasttext.util

# Load high-dimensional model
model = fasttext.load_model('cc.en.300.bin')
print(f"Original dimensions: {model.get_dimension()}")

# Capture neighbors before reduction (reduce_model modifies the model in place)
original_neighbors = model.get_nearest_neighbors('king', k=5)

# Reduce dimensions; the returned object is the same model instance
reduced_model = fasttext.util.reduce_model(model, 100)
print(f"Reduced dimensions: {reduced_model.get_dimension()}")

# Compare performance
reduced_neighbors = reduced_model.get_nearest_neighbors('king', k=5)

print("Original model neighbors:")
for score, word in original_neighbors:
    print(f"  {word}: {score:.4f}")

print("Reduced model neighbors:")
for score, word in reduced_neighbors:
    print(f"  {word}: {score:.4f}")

# Save reduced model
reduced_model.save_model('cc.en.100.reduced.bin')

Evaluation Utilities

Utility functions for model evaluation and metric calculation.

def test(predictions, labels, k=1):
    """
    Calculate precision and recall from predictions and true labels.
    
    Args:
        predictions (list): List of predicted label lists, one per sample
        labels (list): List of true label lists for each sample
        k (int): Number of top predictions to consider (default: 1)
        
    Returns:
        tuple: (precision, recall) at k
    """

def find_nearest_neighbor(query, vectors, ban_set, cossims=None):
    """
    Find nearest vector to query, excluding banned items.
    
    Args:
        query (numpy.ndarray): Query vector
        vectors (numpy.ndarray): Matrix of candidate vectors
        ban_set (set): Set of indices to exclude from search
        cossims (numpy.ndarray, optional): Pre-computed cosine similarities
        
    Returns:
        int: Index of nearest neighbor
    """

Usage Example

import fasttext
import fasttext.util
import numpy as np

# Evaluate custom predictions
model = fasttext.load_model('classifier.bin')

# Generate predictions
test_texts = [
    "Great movie, loved it!",
    "Terrible film.",
    "It was okay."
]

predictions = []
true_labels = [
    ['__label__positive'],
    ['__label__negative'], 
    ['__label__neutral']
]

for text in test_texts:
    pred_labels, pred_probs = model.predict(text, k=1)
    predictions.append(list(pred_labels))

# Calculate metrics; predictions are lists of predicted labels
precision, recall = fasttext.util.test(predictions, true_labels, k=1)
print(f"Custom evaluation - Precision: {precision:.4f}, Recall: {recall:.4f}")

# Find nearest neighbors with exclusions
word_vectors = model.get_input_matrix()  # rows beyond len(get_words()) are subword buckets
query_word = 'king'
query_vector = model.get_word_vector(query_word)
query_id = model.get_word_id(query_word)

# Exclude the query word itself and some others
ban_set = {query_id, model.get_word_id('the'), model.get_word_id('a')}

nearest_idx = fasttext.util.find_nearest_neighbor(
    query_vector, 
    word_vectors, 
    ban_set
)

# Convert index back to word
vocab = model.get_words()
if nearest_idx < len(vocab):
    nearest_word = vocab[nearest_idx]
    print(f"Nearest neighbor to '{query_word}': {nearest_word}")

Text Processing

Additional text processing utilities for consistency and preprocessing.

def tokenize(text):
    """
    Tokenize text using FastText's internal tokenizer.
    
    Args:
        text (str): Input text to tokenize
        
    Returns:
        list: List of tokens following FastText's tokenization rules
        
    Note:
        This ensures consistency with training data preprocessing
    """

Usage Example

import fasttext

# Consistent tokenization
texts = [
    "Hello, world! How are you?",
    "E-mail: user@domain.com (important)",  
    "123.45 is a number, isn't it?",
    "Visit https://example.com for more."
]

for text in texts:
    tokens = fasttext.tokenize(text)
    print(f"'{text}'")
    print(f"  Tokens: {tokens}")
    print(f"  Count: {len(tokens)}")
    print()

# Compare with model preprocessing
model = fasttext.load_model('model.bin')
sample_text = "This is a test sentence."

# Method 1: Direct tokenization
tokens1 = fasttext.tokenize(sample_text)

# Method 2: Model preprocessing
words, labels = model.get_line(sample_text)

print(f"Direct tokenization: {tokens1}")
print(f"Model preprocessing: {words}")
print(f"Are they equal? {tokens1 == words}")

Performance Optimization Tips

Memory Usage

  • Quantization: Use quantize() to reduce model size by 75-90%
  • Dimension Reduction: Use reduce_model() for further memory savings
  • Model Format: Use .ftz format for compressed storage

Speed Optimization

  • Threading: Set appropriate thread parameter during training
  • Batch Processing: Process multiple texts together when possible
  • Caching: Cache frequently accessed vectors and model properties
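The caching tip can be sketched with functools.lru_cache. The expensive_lookup stand-in below is hypothetical; in practice it would wrap a real call such as model.get_word_vector, which is safe to cache because a word always maps to the same vector:

```python
from functools import lru_cache

# Stand-in for an expensive per-word computation such as
# model.get_word_vector(word); calls are counted to show the cache working.
call_count = 0

def expensive_lookup(word):
    global call_count
    call_count += 1
    return [float(ord(c)) for c in word]  # placeholder "vector"

@lru_cache(maxsize=100_000)
def cached_vector(word):
    # str keys are hashable; avoid mutating the returned value, since the
    # cache hands back the same object on every hit.
    return expensive_lookup(word)

for w in ["king", "queen", "king", "king"]:
    cached_vector(w)

print(call_count)  # 2: each unique word is computed once
```

On repetitive workloads (e.g., vectorizing a corpus word by word), this avoids recomputing vectors for frequent words at the cost of holding them in memory.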

Storage Management

  • Model Formats:

    • .bin: Full precision, all features available
    • .ftz: Compressed, may lose some precision
    • Quantized .ftz: Maximum compression, limited functionality
  • Pre-trained Models: Download once and reuse across projects

  • Temporary Files: Clean up downloaded models when no longer needed
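The cleanup tip might look like the following sketch; it assumes models were downloaded into the current working directory under download_model's cc.<lang>.300.bin naming convention:

```python
import glob
import os

# Remove downloaded pre-trained models (and their .gz archives) from the
# current directory; the patterns follow download_model's naming convention.
for path in glob.glob('cc.*.300.bin') + glob.glob('cc.*.300.bin.gz'):
    size_mb = os.path.getsize(path) / 1024 / 1024
    os.remove(path)
    print(f"Removed {path} ({size_mb:.1f} MB)")
```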

Install with Tessl CLI

npx tessl i tessl/pypi-fasttext
