sklearn-crfsuite: a CRFsuite (python-crfsuite) wrapper that provides an interface similar to scikit-learn.
Specialized metrics for sequence labeling evaluation that properly handle the structured nature of CRF predictions. These metrics provide both token-level (flat) and sequence-level accuracy measures essential for evaluating named entity recognition, part-of-speech tagging, and other sequence labeling tasks.
Metrics that evaluate individual token predictions by flattening sequences into a single stream of labels, useful for understanding per-token accuracy patterns.
def flat_accuracy_score(y_true, y_pred):
    """
    Calculate token-level accuracy by flattening sequence predictions.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences

    Returns:
    - float: accuracy score (correct tokens / total tokens);
      0.0 when there are no tokens at all
    """
    correct = 0
    total = 0
    # zip truncates to the shorter of the paired sequences, so stray
    # trailing tokens on one side are not counted.
    for true_seq, pred_seq in zip(y_true, y_pred):
        for true_label, pred_label in zip(true_seq, pred_seq):
            total += 1
            if true_label == pred_label:
                correct += 1
    return correct / total if total else 0.0
def flat_precision_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level precision score.

    Sequences are flattened into a single list of token labels before
    delegating to scikit-learn.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.precision_score
      (e.g. average='weighted', labels=[...])

    Returns:
    - float: precision score
    """
    # Local import keeps sklearn optional until this metric is actually used.
    from sklearn.metrics import precision_score
    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    return precision_score(flat_true, flat_pred, **kwargs)
def flat_recall_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level recall score.

    Sequences are flattened into a single list of token labels before
    delegating to scikit-learn.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.recall_score
      (e.g. average='weighted', labels=[...])

    Returns:
    - float: recall score
    """
    # Local import keeps sklearn optional until this metric is actually used.
    from sklearn.metrics import recall_score
    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    return recall_score(flat_true, flat_pred, **kwargs)
def flat_f1_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level F1 score.

    Sequences are flattened into a single list of token labels before
    delegating to scikit-learn.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.f1_score
      (e.g. average='weighted', labels=[...])

    Returns:
    - float: F1 score
    """
    # Local import keeps sklearn optional until this metric is actually used.
    from sklearn.metrics import f1_score
    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    return f1_score(flat_true, flat_pred, **kwargs)
def flat_fbeta_score(y_true, y_pred, beta, **kwargs):
    """
    Calculate token-level F-beta score.

    Sequences are flattened into a single list of token labels before
    delegating to scikit-learn.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - beta: float, beta parameter for F-beta score (beta < 1 favors
      precision, beta > 1 favors recall)
    - **kwargs: additional parameters passed to sklearn.metrics.fbeta_score

    Returns:
    - float: F-beta score
    """
    # Local import keeps sklearn optional until this metric is actually used.
    from sklearn.metrics import fbeta_score
    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    # sklearn takes beta as a keyword argument.
    return fbeta_score(flat_true, flat_pred, beta=beta, **kwargs)
def flat_classification_report(y_true, y_pred, labels=None, **kwargs):
    """
    Generate detailed classification report for token-level predictions.

    Sequences are flattened into a single list of token labels before
    delegating to scikit-learn.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - labels: List[str], labels to include in report (None means all
      labels present in the data)
    - **kwargs: additional parameters passed to sklearn.metrics.classification_report

    Returns:
    - str: formatted classification report
    """
    # Local import keeps sklearn optional until this metric is actually used.
    from sklearn.metrics import classification_report
    flat_true = [label for seq in y_true for label in seq]
    flat_pred = [label for seq in y_pred for label in seq]
    return classification_report(flat_true, flat_pred, labels=labels, **kwargs)
from sklearn_crfsuite import metrics

# Sample predictions: two BIO-tagged sequences (person, location, organization)
y_true = [['B-PER', 'I-PER', 'O', 'B-LOC'], ['O', 'B-ORG', 'I-ORG']]
y_pred = [['B-PER', 'I-PER', 'O', 'O'], ['O', 'B-ORG', 'B-ORG']]

# Token-level evaluation
accuracy = metrics.flat_accuracy_score(y_true, y_pred)
# 'weighted' averages per-label scores weighted by label frequency
precision = metrics.flat_precision_score(y_true, y_pred, average='weighted')
recall = metrics.flat_recall_score(y_true, y_pred, average='weighted')
f1 = metrics.flat_f1_score(y_true, y_pred, average='weighted')
print(f"Token Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# Detailed classification report (per-label precision/recall/F1)
report = metrics.flat_classification_report(y_true, y_pred)
print("Classification Report:")
print(report)

Metrics that evaluate complete sequence predictions, providing stricter evaluation where a sequence is considered correct only if all tokens match exactly.
def sequence_accuracy_score(y_true, y_pred):
    """
    Calculate sequence-level accuracy where entire sequences must match exactly.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences

    Returns:
    - float: sequence accuracy (correct sequences / total sequences);
      0.0 when there are no sequences
    """
    total = len(y_true)
    if total == 0:
        return 0.0
    # A sequence counts as correct only if every token label matches;
    # list() normalizes tuples vs lists before comparison.
    correct = sum(
        1 for true_seq, pred_seq in zip(y_true, y_pred)
        if list(true_seq) == list(pred_seq)
    )
    return correct / total
# Sequence-level evaluation (stricter): a sequence counts only if every token matches
seq_accuracy = metrics.sequence_accuracy_score(y_true, y_pred)
print(f"Sequence Accuracy: {seq_accuracy:.3f}")

# Compare with token-level
print(f"Token vs Sequence Accuracy: {accuracy:.3f} vs {seq_accuracy:.3f}")

# Perfect prediction case: y_perfect is identical to y_true, so both scores are 1.0
y_perfect = [['B-PER', 'I-PER', 'O', 'B-LOC'], ['O', 'B-ORG', 'I-ORG']]
perfect_seq_acc = metrics.sequence_accuracy_score(y_true, y_perfect)
perfect_tok_acc = metrics.flat_accuracy_score(y_true, y_perfect)
print(f"Perfect scores - Token: {perfect_tok_acc}, Sequence: {perfect_seq_acc}")

Cross-validation with CRF:
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import CRF, metrics

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

# Custom scoring function for cross-validation
def crf_sequence_accuracy(estimator, X, y):
    # scikit-learn scorer-callable signature: (estimator, X, y) -> float
    y_pred = estimator.predict(X)
    return metrics.sequence_accuracy_score(y, y_pred)

# Use in cross-validation (X and y are the full feature/label datasets)
cv_scores = cross_val_score(crf, X, y, cv=5, scoring=crf_sequence_accuracy)
print(f"CV Sequence Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Comprehensive evaluation:
def evaluate_crf(crf, X_test, y_test):
    """Run a comprehensive CRF evaluation and collect multiple metrics.

    Parameters:
    - crf: fitted CRF estimator exposing .predict()
    - X_test: feature sequences for the test set
    - y_test: List[List[str]], true label sequences

    Returns:
    - dict: metric name -> score, covering token/sequence accuracy plus
      macro- and weighted-averaged precision/recall/F1
    """
    y_pred = crf.predict(X_test)
    results = {
        'flat_accuracy': metrics.flat_accuracy_score(y_test, y_pred),
        'sequence_accuracy': metrics.sequence_accuracy_score(y_test, y_pred),
    }
    # Token-level precision/recall/F1 under both averaging schemes.
    for avg in ('macro', 'weighted'):
        results[f'precision_{avg}'] = metrics.flat_precision_score(y_test, y_pred, average=avg)
        results[f'recall_{avg}'] = metrics.flat_recall_score(y_test, y_pred, average=avg)
        results[f'f1_{avg}'] = metrics.flat_f1_score(y_test, y_pred, average=avg)
    return results
# Use the evaluation function
evaluation_results = evaluate_crf(crf, X_test, y_test)
for metric, score in evaluation_results.items():
    print(f"{metric}: {score:.3f}")

Install with Tessl CLI
npx tessl i tessl/pypi-sklearn-crfsuite