FastText library for efficient learning of word representations and sentence classification
—
FastText provides comprehensive text classification capabilities including prediction, evaluation, and detailed performance metrics. Supports multi-class and multi-label classification with confidence thresholds and top-k predictions.
Classify text into predefined categories with confidence scores and threshold filtering.
def predict(text, k=1, threshold=0.0, on_unicode_error='strict'):
    """
    Predict labels for input text.

    Args:
        text (str or list): Input text to classify or list of texts for batch prediction
        k (int): Number of top predictions to return (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        tuple: If text is str, returns (labels, probabilities) where labels is
            a list of predicted labels and probabilities is a numpy array of scores.
            If text is list, returns (all_labels, all_probabilities) where each
            is a list containing results for each input text.

    Raises:
        ValueError: If text contains newline characters or model is not supervised
    """
def get_line(text, on_unicode_error='strict'):
    """
    Split text into words and labels for internal processing.

    Args:
        text (str or list): Input text or list of texts (must not contain newlines)
        on_unicode_error (str): Unicode error handling (default: 'strict')

    Returns:
        tuple or list: If text is str, returns (words, labels) tuple.
            If text is list, returns list of (words, labels) tuples.
            words is tokenized text, labels is list of any labels found

    Raises:
        ValueError: If text contains newline characters

    Note:
        Labels must start with the prefix used to create the model (__label__ by default)
    """


import fasttext
# Load trained classifier
model = fasttext.load_model('classifier.bin')

# Single prediction
text = "This movie is absolutely fantastic!"
labels, probabilities = model.predict(text)
print(f"Predicted: {labels[0]} (confidence: {probabilities[0]:.4f})")

# Top-k predictions
labels, probabilities = model.predict(text, k=3)
print("Top 3 predictions:")
for label, prob in zip(labels, probabilities):
    print(f"  {label}: {prob:.4f}")

# Predictions with threshold
labels, probabilities = model.predict(text, k=5, threshold=0.1)
print(f"Predictions above 0.1 confidence: {len(labels)}")

# Batch predictions: pass the whole list in one call (faster than per-item
# calls; predict returns parallel lists of labels and probabilities)
texts = [
    "Great movie, loved it!",
    "Terrible film, waste of time.",
    "It was okay, nothing special."
]
all_labels, all_probs = model.predict(texts)
for text, labels, probs in zip(texts, all_labels, all_probs):
    print(f"'{text}' -> {labels[0]} ({probs[0]:.3f})")

# Handle multilabel predictions
multilabel_text = "This is a great action comedy movie"
labels, probs = model.predict(multilabel_text, k=3, threshold=0.2)
print(f"Multiple labels: {labels}")

Evaluate classifier performance on test datasets with precision, recall, and F1-score metrics.
def test(path, k=1, threshold=0.0):
    """
    Evaluate model on test data.

    Args:
        path (str): Path to test file in training format
        k (int): Number of predictions to consider (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)

    Returns:
        tuple: (sample_count, precision, recall) where sample_count is
            number of test samples, precision is P@k, recall is R@k
    """
def test_label(path, k=1, threshold=0.0):
    """
    Get per-label precision and recall scores.

    Args:
        path (str): Path to test file in training format
        k (int): Number of predictions to consider (default: 1)
        threshold (float): Minimum prediction confidence (default: 0.0)

    Returns:
        dict: Dictionary mapping label names to dictionaries with 'precision' and 'recall' keys
            Example: {'__label__positive': {'precision': 0.7, 'recall': 0.74}}
    """


import fasttext
model = fasttext.load_model('classifier.bin')

# Overall evaluation
n_samples, precision, recall = model.test('test.txt')
# Guard the harmonic mean against a 0/0 when both metrics are zero
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
print(f"Test Results:")
print(f"  Samples: {n_samples}")
print(f"  Precision@1: {precision:.4f}")
print(f"  Recall@1: {recall:.4f}")
print(f"  F1-Score: {f1_score:.4f}")

# Top-k evaluation
n_samples, precision_k, recall_k = model.test('test.txt', k=3)
print(f"Precision@3: {precision_k:.4f}")
print(f"Recall@3: {recall_k:.4f}")

# Per-label evaluation: test_label returns a dict per label, e.g.
# {'__label__positive': {'precision': 0.7, 'recall': 0.74}}, so read the
# values by key rather than tuple-unpacking the inner dict
label_scores = model.test_label('test.txt')
print("Per-label scores:")
for label, scores in label_scores.items():
    precision = scores['precision']
    recall = scores['recall']
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    print(f"  {label}: P={precision:.3f}, R={recall:.3f}, F1={f1:.3f}")

# Evaluation with threshold
n_samples, precision_t, recall_t = model.test('test.txt', k=1, threshold=0.5)
print(f"With threshold 0.5 - P@1: {precision_t:.4f}, R@1: {recall_t:.4f}")

Access detailed evaluation metrics and precision-recall curves for comprehensive model analysis.
def get_meter(path, k=-1):
    """
    Get evaluation meter for detailed metrics.

    Args:
        path (str): Path to test file
        k (int): Number of predictions to consider (default: -1 for all)

    Returns:
        _Meter: Meter object for detailed evaluation
    """

# The _Meter class provides advanced metric analysis:
class _Meter:
    """Detailed evaluation helper returned by get_meter()."""

    def score_vs_true(self, label):
        """
        Get scores and true labels for a specific label.

        Args:
            label (str): Label to analyze

        Returns:
            tuple: (scores_array, true_labels_array) for ROC/PR analysis
        """

    def precision_recall_curve(self, label=None):
        """
        Get precision-recall curve data.

        Args:
            label (str, optional): Specific label or None for micro-average

        Returns:
            tuple: (precision_array, recall_array, thresholds_array)
        """

    def precision_at_recall(self, recall, label=None):
        """
        Get precision at specific recall level.

        Args:
            recall (float): Target recall level (0.0-1.0)
            label (str, optional): Specific label or None for micro-average

        Returns:
            float: Precision at the specified recall level
        """

    def recall_at_precision(self, precision, label=None):
        """
        Get recall at specific precision level.

        Args:
            precision (float): Target precision level (0.0-1.0)
            label (str, optional): Specific label or None for micro-average

        Returns:
            float: Recall at the specified precision level
        """


import fasttext
import matplotlib.pyplot as plt
import numpy as np

model = fasttext.load_model('classifier.bin')

# Build the detailed evaluation meter from the held-out set.
meter = model.get_meter('test.txt')

# Drill into a single label's score distribution.
label = '__label__positive'
scores, true_labels = meter.score_vs_true(label)
print(f"Analysis for {label}:")
print(f"  Score range: {scores.min():.3f} to {scores.max():.3f}")
print(f"  Positive samples: {true_labels.sum()}")
print(f"  Negative samples: {len(true_labels) - true_labels.sum()}")

# Pull the full precision-recall curve for that label.
precision, recall, thresholds = meter.precision_recall_curve(label)

# Visualize the curve.
plt.figure(figsize=(8, 6))
plt.plot(recall, precision, 'b-', linewidth=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title(f'Precision-Recall Curve for {label}')
plt.grid(True)
plt.show()

# Pick the threshold that maximizes F1 along the curve
# (nan_to_num zeroes the 0/0 points where precision + recall == 0).
f1_scores = np.nan_to_num(2 * (precision * recall) / (precision + recall))
best = np.argmax(f1_scores)
print(f"Optimal threshold: {thresholds[best]:.3f}")
print(f"Optimal F1-score: {f1_scores[best]:.3f}")

# Operating points at fixed recall / precision levels.
p_at_r80 = meter.precision_at_recall(0.8, label)
r_at_p90 = meter.recall_at_precision(0.9, label)
print(f"Precision at 80% recall: {p_at_r80:.3f}")
print(f"Recall at 90% precision: {r_at_p90:.3f}")

# Sweep the first few labels for a multi-label overview.
for lbl in model.get_labels()[:5]:
    pr_at_50 = meter.precision_at_recall(0.5, lbl)
    re_at_90 = meter.recall_at_precision(0.9, lbl)
    print(f"{lbl}: P@50%R={pr_at_50:.3f}, R@90%P={re_at_90:.3f}")
print(f"{label}: P@50%R={pr_at_50:.3f}, R@90%P={re_at_90:.3f}")

Access FastText's internal text processing for consistency with training.
def tokenize(text):
    """
    Tokenize text using FastText's internal tokenizer.

    Args:
        text (str): Input text to tokenize

    Returns:
        list: List of tokens
    """


import fasttext
# Run FastText's own tokenizer so preprocessing matches training exactly
text = "Hello, world! This is a test."
tokens = fasttext.tokenize(text)
print(f"Tokens: {tokens}")
# Cross-check against the tokenization a loaded model applies internally
# (get_line also strips any __label__ entries into the labels list)
model = fasttext.load_model('classifier.bin')
words, labels = model.get_line(text)
print(f"Model preprocessing: {words}")
# Punctuation-heavy input is where tokenizers typically diverge — verify it here
custom_text = "E-mail addresses like user@domain.com are tricky!"
custom_tokens = fasttext.tokenize(custom_text)
print(f"Custom tokenization: {custom_tokens}")

Training data uses the __label__ prefix for all labels.

Loss Functions:
- softmax: Multi-class classification (default)
- ns: Negative sampling for large vocabularies
- hs: Hierarchical softmax for efficient training
- ova: One-vs-all for multi-label classification

Hyperparameters:
- lr=0.1: Good starting learning rate
- wordNgrams=2: Include bigrams for better context
- minn=3, maxn=6: Character n-grams for robustness
- dim=100-300: Higher dimensions for complex tasks

Install with Tessl CLI
npx tessl i tessl/pypi-fasttext