CRFsuite (python-crfsuite) wrapper which provides interface similar to scikit-learn
—
Ready-to-use scorer functions and utilities that enable seamless integration of sklearn-crfsuite with scikit-learn's model selection ecosystem, including cross-validation, grid search, pipeline construction, and automated hyperparameter optimization.
Pre-configured sklearn scorer objects that can be used directly with scikit-learn's model selection utilities.
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
flat_accuracy: sklearn.metrics._scorer._BaseScorer
"""Scorer for token-level accuracy using sklearn's make_scorer."""
sequence_accuracy: sklearn.metrics._scorer._BaseScorer
"""Scorer for sequence-level accuracy using sklearn's make_scorer."""

Usage Example:
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
# Use built-in scorers with cross-validation
flat_scores = cross_val_score(crf, X, y, cv=5, scoring=flat_accuracy)
seq_scores = cross_val_score(crf, X, y, cv=5, scoring=sequence_accuracy)
print(f"Flat accuracy: {flat_scores.mean():.3f} (+/- {flat_scores.std() * 2:.3f})")
print(f"Sequence accuracy: {seq_scores.mean():.3f} (+/- {seq_scores.std() * 2:.3f})")

CRF estimators are fully compatible with scikit-learn's hyperparameter optimization tools.
Usage Example:
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy
# Define parameter grid.
# c1 (L1 regularization) is only accepted by the 'lbfgs' trainer; combining it
# with 'l2sgd' makes CRFsuite reject the parameter and the fit fail. Each
# algorithm therefore gets its own sub-grid with only the parameters it supports.
param_grid = [
    {
        'algorithm': ['lbfgs'],
        'c1': [0.01, 0.1, 1.0],
        'c2': [0.01, 0.1, 1.0],
        'max_iterations': [50, 100, 200],
    },
    {
        'algorithm': ['l2sgd'],
        'c2': [0.01, 0.1, 1.0],
        'max_iterations': [50, 100, 200],
    },
]
# Grid search with CRF
crf = CRF()
grid_search = GridSearchCV(
crf,
param_grid,
cv=3,
scoring=flat_accuracy,
n_jobs=-1,
verbose=1
)
grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
# Use best model
best_crf = grid_search.best_estimator_
predictions = best_crf.predict(X_test)

Use CRF models within scikit-learn pipelines for complete ML workflows.
Usage Example:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn_crfsuite import CRF
# Create pipeline with feature extraction and CRF
pipeline = Pipeline([
('vectorizer', DictVectorizer(sparse=False)),
('crf', CRF(algorithm='lbfgs', c1=0.1, c2=0.1))
])
# Note: This is a conceptual example. In practice, CRF expects
# sequences of feature dicts, not flat feature vectors.
# Custom transformers would be needed for real pipeline usage.

Create custom scorers for specific evaluation needs.
Usage Example:
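from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy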
# Create custom scorers
def macro_f1_scorer(y_true, y_pred):
    """Return metrics.flat_f1_score for the given labels with average='macro'."""
    macro_f1_value = metrics.flat_f1_score(y_true, y_pred, average='macro')
    return macro_f1_value
def weighted_precision_scorer(y_true, y_pred):
    """Return metrics.flat_precision_score for the given labels with average='weighted'."""
    weighted_precision_value = metrics.flat_precision_score(
        y_true,
        y_pred,
        average='weighted',
    )
    return weighted_precision_value
# Convert to sklearn scorers
macro_f1 = make_scorer(macro_f1_scorer)
weighted_precision = make_scorer(weighted_precision_scorer)
# Use in grid search
scoring = {
'flat_acc': flat_accuracy,
'seq_acc': sequence_accuracy,
'macro_f1': macro_f1,
'weighted_prec': weighted_precision
}
grid_search = GridSearchCV(
CRF(),
param_grid,
cv=3,
scoring=scoring,
refit='macro_f1' # Use macro F1 to select best model
)

Advanced cross-validation patterns for sequence labeling tasks.
Usage Example:
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
def sequence_stratified_split(X, y, n_splits=5):
    """
    Custom stratification for sequence data based on label distributions.

    Each label sequence in y is reduced to its most frequent label, and those
    per-sequence labels are used as the stratification key for
    StratifiedKFold. This is a conceptual example - a real implementation
    would need to handle sequence-specific stratification.

    Parameters:
    - X: the samples, passed through unchanged to StratifiedKFold.split
    - y: iterable of label sequences, one per sample in X
    - n_splits: number of folds (default 5)

    Returns:
    - the result of StratifiedKFold.split(X, seq_labels)

    Raises:
    - ValueError: if any label sequence in y is empty
    """
    from collections import Counter

    seq_labels = []
    for position, seq in enumerate(y):
        if len(seq) == 0:
            raise ValueError(
                f"label sequence at index {position} is empty; "
                "cannot derive a stratification key"
            )
        # Counter tallies the labels in a single pass. On ties, most_common
        # returns the label encountered first, so the key is reproducible
        # (iterating a set of strings is not stable between interpreter runs).
        seq_labels.append(Counter(seq).most_common(1)[0][0])

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    return skf.split(X, seq_labels)
# Comprehensive cross-validation
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
scoring = {
'flat_accuracy': flat_accuracy,
'sequence_accuracy': sequence_accuracy
}
cv_results = cross_validate(
crf, X, y,
cv=5,
scoring=scoring,
return_train_score=True,
return_estimator=True
)
print("Cross-validation results:")
for metric in ['flat_accuracy', 'sequence_accuracy']:
test_scores = cv_results[f'test_{metric}']
train_scores = cv_results[f'train_{metric}']
print(f"{metric}:")
print(f" Test: {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")
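print(f" Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")

Save and load trained CRF models using joblib or pickle. Only load model files from sources you trust: both joblib and pickle can execute arbitrary code while loading.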
Usage Example:
import joblib
from sklearn_crfsuite import CRF
# Train and save model
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
crf.fit(X_train, y_train)
# Save with joblib (recommended)
joblib.dump(crf, 'crf_model.pkl')
# Load model
loaded_crf = joblib.load('crf_model.pkl')
# Verify model works
predictions = loaded_crf.predict(X_test)
accuracy = loaded_crf.score(X_test, y_test)
print(f"Loaded model accuracy: {accuracy:.3f}")
# Alternative: use pickle
import pickle
with open('crf_model_pickle.pkl', 'wb') as f:
pickle.dump(crf, f)
with open('crf_model_pickle.pkl', 'rb') as f:
loaded_crf_pickle = pickle.load(f)

Additional utilities for working with sequence data in sklearn contexts.
from sklearn_crfsuite.utils import flatten
def flatten(sequences):
"""
Flatten a list of sequences into a single list.
Parameters:
- sequences: List[List[Any]], list of sequences to flatten
Returns:
- List[Any]: flattened list
"""

Usage Example:
from sklearn_crfsuite.utils import flatten
# Flatten sequence data when needed
y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]
y_flat = flatten(y_sequences)
print(y_flat) # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']
# Useful for creating custom metrics or preprocessing
def create_label_encoder(y_sequences):
    """Build an sklearn LabelEncoder fitted on the flattened labels of y_sequences."""
    from sklearn.preprocessing import LabelEncoder

    # LabelEncoder.fit returns the fitted encoder itself, so the calls chain.
    return LabelEncoder().fit(flatten(y_sequences))
encoder = create_label_encoder(y_train)
all_labels = encoder.classes_
print(f"Unique labels: {all_labels}")

Install with Tessl CLI
npx tessl i tessl/pypi-sklearn-crfsuite