tessl/pypi-fasttext

FastText library for efficient learning of word representations and sentence classification


docs/classification.md

Text Classification

FastText provides comprehensive text classification capabilities including prediction, evaluation, and detailed performance metrics. Supports multi-class and multi-label classification with confidence thresholds and top-k predictions.

Capabilities

Prediction

Classify text into predefined categories with confidence scores and threshold filtering.

def predict(text, k=1, threshold=0.0, on_unicode_error='strict'):
    """
    Predict labels for input text.
    
    Args:
        text (str or list): Input text to classify or list of texts for batch prediction
        k (int): Number of top predictions to return (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)
        on_unicode_error (str): Unicode error handling (default: 'strict')
        
    Returns:
        tuple: If text is str, returns (labels, probabilities) where labels is 
               a list of predicted labels and probabilities is a numpy array of scores.
               If text is list, returns (all_labels, all_probabilities) where each
               is a list containing results for each input text.
               
    Raises:
        ValueError: If text contains newline characters or model is not supervised
    """

def get_line(text, on_unicode_error='strict'):
    """
    Split text into words and labels for internal processing.
    
    Args:
        text (str or list): Input text or list of texts (must not contain newlines)
        on_unicode_error (str): Unicode error handling (default: 'strict')
        
    Returns:
        tuple or list: If text is str, returns (words, labels) tuple.
                      If text is list, returns list of (words, labels) tuples.
                      words is tokenized text, labels is list of any labels found
        
    Raises:
        ValueError: If text contains newline characters
        
    Note:
        Labels must start with the prefix used to create the model (__label__ by default)
    """

Usage Example

import fasttext

# Load trained classifier
model = fasttext.load_model('classifier.bin')

# Single prediction
text = "This movie is absolutely fantastic!"
labels, probabilities = model.predict(text)
print(f"Predicted: {labels[0]} (confidence: {probabilities[0]:.4f})")

# Top-k predictions
labels, probabilities = model.predict(text, k=3)
print("Top 3 predictions:")
for label, prob in zip(labels, probabilities):
    print(f"  {label}: {prob:.4f}")

# Predictions with threshold
labels, probabilities = model.predict(text, k=5, threshold=0.1)
print(f"Predictions above 0.1 confidence: {len(labels)}")

# Batch predictions: pass a list of texts to classify them in one call
texts = [
    "Great movie, loved it!",
    "Terrible film, waste of time.",
    "It was okay, nothing special."
]

all_labels, all_probs = model.predict(texts)
for text, labels, probs in zip(texts, all_labels, all_probs):
    print(f"'{text}' -> {labels[0]} ({probs[0]:.3f})")

# Handle multilabel predictions
multilabel_text = "This is a great action comedy movie"
labels, probs = model.predict(multilabel_text, k=3, threshold=0.2)
print(f"Multiple labels: {labels}")

Model Evaluation

Evaluate classifier performance on test datasets with precision, recall, and F1-score metrics.

def test(path, k=1, threshold=0.0):
    """
    Evaluate model on test data.
    
    Args:
        path (str): Path to test file in training format
        k (int): Number of predictions to consider (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)
        
    Returns:
        tuple: (sample_count, precision, recall) where sample_count is
               number of test samples, precision is P@k, recall is R@k
    """

def test_label(path, k=1, threshold=0.0):
    """
    Get per-label precision and recall scores.
    
    Args:
        path (str): Path to test file in training format
        k (int): Number of predictions to consider (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)
        
    Returns:
        dict: Dictionary mapping label names to dictionaries with 'precision' and 'recall' keys
              Example: {'__label__positive': {'precision': 0.7, 'recall': 0.74}}
    """

Usage Example

import fasttext

model = fasttext.load_model('classifier.bin')

# Overall evaluation
n_samples, precision, recall = model.test('test.txt')
f1_score = 2 * (precision * recall) / (precision + recall)

print(f"Test Results:")
print(f"  Samples: {n_samples}")
print(f"  Precision@1: {precision:.4f}")
print(f"  Recall@1: {recall:.4f}")
print(f"  F1-Score: {f1_score:.4f}")

# Top-k evaluation
n_samples, precision_k, recall_k = model.test('test.txt', k=3)
print(f"Precision@3: {precision_k:.4f}")
print(f"Recall@3: {recall_k:.4f}")

# Per-label evaluation (each value is a dict with 'precision' and 'recall' keys)
label_scores = model.test_label('test.txt')
print("Per-label scores:")
for label, scores in label_scores.items():
    precision, recall = scores['precision'], scores['recall']
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"  {label}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}")

# Evaluation with threshold
n_samples, precision_t, recall_t = model.test('test.txt', k=1, threshold=0.5)
print(f"With threshold 0.5 - P@1: {precision_t:.4f}, R@1: {recall_t:.4f}")
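To reduce the per-label scores to a single summary number, a macro-averaged F1 can be computed from the dictionary shape that test_label returns. The scores below are made up for illustration; in practice you would pass the dict returned by model.test_label.

```python
# Per-label scores in the shape returned by test_label:
# {label: {'precision': ..., 'recall': ...}}. Values here are illustrative.
label_scores = {
    '__label__positive': {'precision': 0.80, 'recall': 0.70},
    '__label__negative': {'precision': 0.60, 'recall': 0.90},
}

def macro_f1(scores):
    """Average the per-label F1 scores, guarding against zero denominators."""
    f1s = []
    for s in scores.values():
        p, r = s['precision'], s['recall']
        f1s.append(2 * p * r / (p + r) if (p + r) > 0 else 0.0)
    return sum(f1s) / len(f1s)

print(f"Macro-F1: {macro_f1(label_scores):.3f}")  # -> Macro-F1: 0.733
```

Macro-averaging weights every label equally, which is usually what you want when classes are imbalanced; micro-averaged figures are what test itself reports.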

Advanced Metrics

Access detailed evaluation metrics and precision-recall curves for comprehensive model analysis.

def get_meter(path, k=-1):
    """
    Get evaluation meter for detailed metrics.
    
    Args:
        path (str): Path to test file
        k (int): Number of predictions to consider (default: -1 for all)
        
    Returns:
        _Meter: Meter object for detailed evaluation
    """

The _Meter class provides advanced metric analysis:

class _Meter:
    def score_vs_true(self, label):
        """
        Get scores and true labels for a specific label.
        
        Args:
            label (str): Label to analyze
            
        Returns:
            tuple: (scores_array, true_labels_array) for ROC/PR analysis
        """
    
    def precision_recall_curve(self, label=None):
        """
        Get precision-recall curve data.
        
        Args:
            label (str, optional): Specific label or None for micro-average
            
        Returns:
            tuple: (precision_array, recall_array, thresholds_array)
        """
    
    def precision_at_recall(self, recall, label=None):
        """
        Get precision at specific recall level.
        
        Args:
            recall (float): Target recall level (0.0-1.0)
            label (str, optional): Specific label or None for micro-average
            
        Returns:
            float: Precision at the specified recall level
        """
    
    def recall_at_precision(self, precision, label=None):
        """
        Get recall at specific precision level.
        
        Args:
            precision (float): Target precision level (0.0-1.0)  
            label (str, optional): Specific label or None for micro-average
            
        Returns:
            float: Recall at the specified precision level
        """

Usage Example

import fasttext
import matplotlib.pyplot as plt
import numpy as np

model = fasttext.load_model('classifier.bin')

# Get detailed evaluation meter
meter = model.get_meter('test.txt')

# Analyze specific label
label = '__label__positive'
scores, true_labels = meter.score_vs_true(label)

print(f"Analysis for {label}:")
print(f"  Score range: {scores.min():.3f} to {scores.max():.3f}")
print(f"  Positive samples: {true_labels.sum()}")
print(f"  Negative samples: {len(true_labels) - true_labels.sum()}")

# Get precision-recall curve
precision, recall, thresholds = meter.precision_recall_curve(label)

# Plot PR curve
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, 'b-', linewidth=2)
plt.xlabel('Recall')
plt.ylabel('Precision') 
plt.title(f'Precision-Recall Curve for {label}')
plt.grid(True)
plt.show()

# Find optimal threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
f1_scores = np.nan_to_num(f1_scores)  # Handle division by zero
optimal_idx = np.argmax(f1_scores)
optimal_threshold = thresholds[optimal_idx]
optimal_f1 = f1_scores[optimal_idx]

print(f"Optimal threshold: {optimal_threshold:.3f}")
print(f"Optimal F1-score: {optimal_f1:.3f}")

# Precision/recall at specific levels
precision_at_80_recall = meter.precision_at_recall(0.8, label)
recall_at_90_precision = meter.recall_at_precision(0.9, label)

print(f"Precision at 80% recall: {precision_at_80_recall:.3f}")
print(f"Recall at 90% precision: {recall_at_90_precision:.3f}")

# Multi-label analysis
labels = model.get_labels()
for label in labels[:5]:  # Analyze first 5 labels
    pr_at_50 = meter.precision_at_recall(0.5, label)
    re_at_90 = meter.recall_at_precision(0.9, label)
    print(f"{label}: P@50%R={pr_at_50:.3f}, R@90%P={re_at_90:.3f}")

Text Preprocessing

Access FastText's internal text processing for consistency with training.

def tokenize(text):
    """
    Tokenize text using FastText's internal tokenizer.
    
    Args:
        text (str): Input text to tokenize
        
    Returns:
        list: List of tokens
    """

Usage Example

import fasttext

# Tokenize text consistently with training
text = "Hello, world! This is a test."
tokens = fasttext.tokenize(text)
print(f"Tokens: {tokens}")

# Compare with model prediction preprocessing
model = fasttext.load_model('classifier.bin')
words, labels = model.get_line(text)
print(f"Model preprocessing: {words}")

# Ensure consistency
custom_text = "E-mail addresses like user@domain.com are tricky!"
custom_tokens = fasttext.tokenize(custom_text)
print(f"Custom tokenization: {custom_tokens}")

Classification Best Practices

Data Preparation

  • Label Format: Use __label__ prefix for all labels
  • Text Cleaning: FastText handles basic tokenization, but consider domain-specific preprocessing
  • Class Balance: Consider stratified sampling for imbalanced datasets
  • Validation Split: Reserve 10-20% of data for validation/hyperparameter tuning
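The label format above can be produced programmatically. This sketch writes a few training lines in the `__label__` format fastText expects; the file name `train.txt` and the example texts are illustrative.

```python
# Each training line: one or more __label__ tags, then the text, on one line.
examples = [
    (['positive'], "Great movie, loved it!"),
    (['negative'], "Terrible film, waste of time."),
    (['positive', 'action'], "Great action scenes and a fun plot."),  # multi-label line
]

lines = []
for labels, text in examples:
    tags = ' '.join(f'__label__{label}' for label in labels)
    lines.append(f"{tags} {text}")

with open('train.txt', 'w') as f:
    f.write('\n'.join(lines) + '\n')

print(lines[0])  # -> __label__positive Great movie, loved it!
```

A multi-label example simply carries several `__label__` tags on the same line, as in the third entry.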

Model Configuration

  • Loss Functions:

    • softmax: Multi-class classification (default)
    • ns: Negative sampling for large vocabularies
    • hs: Hierarchical softmax for efficient training
    • ova: One-vs-all for multi-label classification
  • Hyperparameters:

    • lr=0.1: Good starting learning rate
    • wordNgrams=2: Include bigrams for better context
    • minn=3, maxn=6: Character n-grams for robustness
    • dim=100-300: Higher dimensions for complex tasks
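The settings above map onto keyword arguments of fasttext.train_supervised. A sketch of a starting configuration follows; the values are the suggested starting points from the list, not tuned for any particular dataset.

```python
# Suggested starting hyperparameters from the list above.
params = dict(
    lr=0.1,          # starting learning rate
    wordNgrams=2,    # include bigrams for better context
    minn=3,          # min character n-gram length
    maxn=6,          # max character n-gram length
    dim=100,         # embedding dimension; raise for complex tasks
    loss='softmax',  # or 'ova' for multi-label classification
)

# Passed through as keyword arguments when training:
#   model = fasttext.train_supervised(input='train.txt', **params)
print(params)
```

Holding the parameters in a dict makes it easy to sweep individual values during hyperparameter tuning.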

Evaluation Strategy

  • Metrics: Use F1-score for imbalanced classes, accuracy for balanced
  • Cross-validation: Use k-fold CV for small datasets
  • Threshold Optimization: Tune prediction thresholds for optimal F1
  • Per-label Analysis: Monitor per-class performance for multi-class problems
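For the cross-validation point: fastText has no built-in CV, so folds have to be cut from the training file itself. A minimal sketch, assuming the labeled lines are already in memory (the sample lines here are synthetic):

```python
import random

# Hypothetical labeled lines in fastText training format.
lines = [f"__label__{i % 2} sample text {i}" for i in range(10)]

def k_fold_splits(lines, k=5, seed=0):
    """Yield (train_lines, valid_lines) pairs for k-fold cross-validation."""
    shuffled = lines[:]
    random.Random(seed).shuffle(shuffled)
    folds = [shuffled[i::k] for i in range(k)]
    for i in range(k):
        valid = folds[i]
        train = [line for j, fold in enumerate(folds) if j != i for line in fold]
        yield train, valid

for fold, (train, valid) in enumerate(k_fold_splits(lines, k=5)):
    # Each fold's train/valid lines would be written to temporary files and
    # passed to fasttext.train_supervised / model.test respectively.
    print(f"fold {fold}: train={len(train)}, valid={len(valid)}")
```

Note this is a plain random split; for the stratified sampling recommended above, group the lines by label before slicing them into folds.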
