FastText library for efficient learning of word representations and sentence classification
---
FastText provides comprehensive access to word and sentence vector representations, enabling semantic similarity analysis, analogies, and vector arithmetic operations. The model handles out-of-vocabulary words through subword information.
Access vector representations for words, sentences, and subword components.
def get_word_vector(word: str):
    """
    Get vector representation of a word.

    Args:
        word (str): Input word

    Returns:
        numpy.ndarray: Word vector of shape (dim,)

    Note:
        Handles out-of-vocabulary words using subword information
    """
def get_sentence_vector(text: str):
    """
    Get vector representation of a sentence.

    Args:
        text (str): Input text/sentence (must not contain newlines)

    Returns:
        numpy.ndarray: Sentence vector of shape (dim,)

    Raises:
        ValueError: If text contains newline characters
    """
def get_input_vector(ind):
"""
Get input matrix vector by index.
Args:
ind (int): Word index in vocabulary
Returns:
numpy.ndarray: Input vector of shape (dim,)
Note:
Direct access to input matrix vectors for advanced use cases
"""import fasttext
import numpy as np
# Load model
model = fasttext.load_model('model.bin')
# Get word vectors
king_vector = model.get_word_vector('king')
queen_vector = model.get_word_vector('queen')
# Get sentence vector
sentence = "The quick brown fox jumps over the lazy dog"
sentence_vector = model.get_sentence_vector(sentence)
# Vector arithmetic
man_vector = model.get_word_vector('man')
woman_vector = model.get_word_vector('woman')
result = king_vector - man_vector + woman_vector
print(f"Word vector shape: {king_vector.shape}")
print(f"Sentence vector shape: {sentence_vector.shape}")Access the full input and output matrices for advanced operations (non-quantized models only).
def get_input_matrix():
    """
    Get the full input matrix.

    Returns:
        numpy.ndarray: Input matrix of shape (vocab_size, dim)

    Raises:
        ValueError: If model is quantized
    """
def get_output_matrix():
"""
Get the full output matrix.
Returns:
numpy.ndarray: Output matrix of shape (vocab_size, dim)
Raises:
ValueError: If model is quantized
"""import fasttext
model = fasttext.load_model('model.bin')
if not model.is_quantized():
# Get full matrices for analysis
input_matrix = model.get_input_matrix()
output_matrix = model.get_output_matrix()
print(f"Input matrix shape: {input_matrix.shape}")
print(f"Output matrix shape: {output_matrix.shape}")
# Custom matrix operations
custom_input = input_matrix * 0.5
custom_output = output_matrix * 2.0
model.set_matrices(custom_input, custom_output)Find semantically similar words and solve word analogies using vector arithmetic.
def get_nearest_neighbors(word: str, k: int = 10, on_unicode_error: str = 'strict'):
    """
    Find k nearest neighbors of a word.

    Args:
        word (str): Query word
        k (int): Number of neighbors to return (default: 10)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of (similarity_score, neighbor_word) tuples

    Raises:
        UnicodeError: If word contains invalid Unicode and on_unicode_error='strict'
    """
def get_analogies(wordA, wordB, wordC, k=10, on_unicode_error='strict'):
"""
Find analogies of the form A:B::C:?.
Args:
wordA (str): First word in analogy
wordB (str): Second word in analogy
wordC (str): Third word in analogy
k (int): Number of analogies to return (default: 10)
on_unicode_error (str): Unicode error handling (default: 'strict')
Returns:
list: List of (similarity_score, word) tuples solving A:B::C:word
"""import fasttext
model = fasttext.load_model('model.bin')
# Find similar words
neighbors = model.get_nearest_neighbors('king', k=5)
print("Words similar to 'king':")
for score, word in neighbors:
print(f" {word}: {score:.4f}")
# Solve analogies: king - man + woman = ?
analogies = model.get_analogies('king', 'man', 'woman', k=3)
print("king:man::woman:?")
for score, word in analogies:
print(f" {word}: {score:.4f}")
# Handle Unicode errors gracefully
try:
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='strict')
except UnicodeError:
neighbors = model.get_nearest_neighbors('café', k=5, on_unicode_error='replace')Access vocabulary, labels, and internal model structure information.
def get_words(include_freq: bool = False, on_unicode_error: str = 'strict'):
    """
    Get vocabulary words.

    Args:
        include_freq (bool): Include word frequencies (default: False)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of words or (word, frequency) tuples if include_freq=True
    """
def get_labels(include_freq: bool = False, on_unicode_error: str = 'strict'):
    """
    Get classification labels (supervised models only).

    Args:
        include_freq (bool): Include label frequencies (default: False)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        list: List of labels or (label, frequency) tuples if include_freq=True
    """
def get_word_id(word: str):
    """
    Get word ID in internal dictionary.

    Args:
        word (str): Input word

    Returns:
        int: Word ID or -1 if not found
    """
def get_label_id(label):
"""
Get label ID in internal dictionary.
Args:
label (str): Input label
Returns:
int: Label ID or -1 if not found
"""import fasttext
model = fasttext.load_model('model.bin')
# Get vocabulary information
vocab = model.get_words()
print(f"Vocabulary size: {len(vocab)}")
print(f"First 10 words: {vocab[:10]}")
# Get word frequencies
vocab_freq = model.get_words(include_freq=True)
print("Most frequent words:")
for word, freq in sorted(vocab_freq, key=lambda x: x[1], reverse=True)[:10]:
print(f" {word}: {freq}")
# Check if words exist
word_id = model.get_word_id('king')
if word_id != -1:
print(f"'king' is in vocabulary with ID: {word_id}")
else:
print("'king' is not in vocabulary")
# For supervised models, get labels
if hasattr(model, 'get_labels'):
labels = model.get_labels()
print(f"Available labels: {labels}")Access subword components and character n-gram information for handling out-of-vocabulary words.
def get_subwords(word: str, on_unicode_error: str = 'strict'):
    """
    Get subwords and their indices for a word.

    Args:
        word (str): Input word
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        tuple: (subwords_list, indices_list) where subwords_list contains
            character n-grams and indices_list contains their hash indices
    """
def get_subword_id(subword):
"""
Get hash index for a subword.
Args:
subword (str): Character n-gram subword
Returns:
int: Hash index for the subword
"""import fasttext
model = fasttext.load_model('model.bin')
# Analyze subword structure
word = 'running'
subwords, indices = model.get_subwords(word)
print(f"Subwords for '{word}':")
for subword, idx in zip(subwords, indices):
print(f" {subword}: {idx}")
# This is especially useful for out-of-vocabulary words
oov_word = 'unknownword'
if model.get_word_id(oov_word) == -1:
print(f"'{oov_word}' is OOV, using subword information")
vector = model.get_word_vector(oov_word) # Still works via subwords
print(f"OOV vector shape: {vector.shape}")Access model metadata and cached properties.
@property
def words(self):
    """Cached list of vocabulary words."""
@property
def labels(self):
    """Cached list of labels (supervised models only)."""
def get_dimension():
    """
    Get vector dimension size.

    Returns:
        int: Dimension of word vectors
    """
def is_quantized():
    """
    Check if model is quantized.

    Returns:
        bool: True if model is quantized, False otherwise
    """
def __contains__(word):
    """Check if word is in vocabulary using 'in' operator."""
def __getitem__(word):
"""Get word vector using [] syntax."""import fasttext
model = fasttext.load_model('model.bin')
# Model information
print(f"Vector dimension: {model.get_dimension()}")
print(f"Is quantized: {model.is_quantized()}")
print(f"Vocabulary size: {len(model.words)}")
# Convenient access patterns
if 'king' in model:
king_vector = model['king'] # Same as model.get_word_vector('king')
print(f"King vector: {king_vector[:5]}...") # First 5 dimensions
# Access cached vocabulary
frequent_words = model.words[:100] # First 100 words
print(f"Sample vocabulary: {frequent_words[:10]}")Install with Tessl CLI
npx tessl i tessl/pypi-fasttext