
tessl/pypi-sklearn-crfsuite

CRFsuite (python-crfsuite) wrapper which provides an interface similar to scikit-learn's


Scikit-learn Integration

Ready-to-use scorer functions and utilities that enable seamless integration of sklearn-crfsuite with scikit-learn's model selection ecosystem, including cross-validation, grid search, pipeline construction, and automated hyperparameter optimization.
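All of the examples below assume data in the sequence format CRF expects: `X` is a list of sequences, each sequence a list of per-token feature dicts, and `y` is a list of label sequences of matching lengths. A minimal toy dataset (the feature names are illustrative, not required by the library) looks like this:

```python
# Toy data in the format sklearn-crfsuite's CRF expects:
# X is a list of sequences; each sequence is a list of per-token feature dicts,
# and y is a list of label sequences of matching lengths.
X = [
    [  # sentence 1: "John lives here"
        {'word.lower()': 'john', 'word.istitle()': True},
        {'word.lower()': 'lives', 'word.istitle()': False},
        {'word.lower()': 'here', 'word.istitle()': False},
    ],
    [  # sentence 2: "Paris shines"
        {'word.lower()': 'paris', 'word.istitle()': True},
        {'word.lower()': 'shines', 'word.istitle()': False},
    ],
]
y = [
    ['B-PER', 'O', 'O'],
    ['B-LOC', 'O'],
]

# Every sequence in X must align one-to-one with its label sequence in y
assert all(len(xs) == len(ys) for xs, ys in zip(X, y))
```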

Capabilities

Built-in Scorers

Pre-configured sklearn scorer objects that can be used directly with scikit-learn's model selection utilities.

from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

flat_accuracy: scorer object
    """Scorer for token-level accuracy, built with sklearn.metrics.make_scorer."""

sequence_accuracy: scorer object
    """Scorer for sequence-level (exact-match) accuracy, built with sklearn.metrics.make_scorer."""

Usage Example:

from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

# Use built-in scorers with cross-validation
flat_scores = cross_val_score(crf, X, y, cv=5, scoring=flat_accuracy)
seq_scores = cross_val_score(crf, X, y, cv=5, scoring=sequence_accuracy)

print(f"Flat accuracy: {flat_scores.mean():.3f} (+/- {flat_scores.std() * 2:.3f})")
print(f"Sequence accuracy: {seq_scores.mean():.3f} (+/- {seq_scores.std() * 2:.3f})")

Grid Search Integration

Complete compatibility with scikit-learn's hyperparameter optimization tools.

Usage Example:

from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy

# Define parameter grid
# (note: c1, the L1 penalty, applies only to 'lbfgs'; 'l2sgd' supports c2 only)
param_grid = {
    'algorithm': ['lbfgs', 'l2sgd'],
    'c1': [0.01, 0.1, 1.0],
    'c2': [0.01, 0.1, 1.0],
    'max_iterations': [50, 100, 200]
}

# Grid search with CRF
crf = CRF()
grid_search = GridSearchCV(
    crf,
    param_grid,
    cv=3,
    scoring=flat_accuracy,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

# Use best model
best_crf = grid_search.best_estimator_
predictions = best_crf.predict(X_test)

Pipeline Integration

Use CRF models within scikit-learn pipelines for complete ML workflows.

Usage Example:

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn_crfsuite import CRF

# Create pipeline with feature extraction and CRF
pipeline = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('crf', CRF(algorithm='lbfgs', c1=0.1, c2=0.1))
])

# Note: This is a conceptual example. In practice, CRF expects 
# sequences of feature dicts, not flat feature vectors.
# Custom transformers would be needed for real pipeline usage.
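One way to fill that gap is a sequence-aware transformer that turns raw token sequences into the list-of-dict-sequences format CRF expects. The sketch below is a bare duck-typed version (only the `fit`/`transform` methods that `Pipeline` requires); in real code you would subclass `sklearn.base.BaseEstimator` and `TransformerMixin` so cloning and grid search also work. The feature names are illustrative assumptions, not anything the library mandates:

```python
# Minimal sketch of a sequence-aware transformer that could precede CRF in a
# Pipeline. It converts raw token sequences into per-token feature-dict
# sequences. This bare class only implements the fit/transform duck-type;
# subclass BaseEstimator/TransformerMixin for full sklearn compatibility.

class TokenFeaturizer:
    def fit(self, X, y=None):
        # Stateless featurizer: nothing to learn
        return self

    def transform(self, X):
        # X: list of token sequences -> list of per-token feature-dict sequences
        return [[self._token_features(seq, i) for i in range(len(seq))]
                for seq in X]

    @staticmethod
    def _token_features(seq, i):
        token = seq[i]
        return {
            'word.lower()': token.lower(),
            'word.istitle()': token.istitle(),
            'BOS': i == 0,                 # beginning of sequence
            'EOS': i == len(seq) - 1,      # end of sequence
        }

raw = [['John', 'lives', 'here']]
featurizer = TokenFeaturizer()
X_feats = featurizer.fit(raw).transform(raw)
print(X_feats[0][0]['word.lower()'])  # 'john'
```

Such a transformer would replace the `DictVectorizer` step in the pipeline above, e.g. `Pipeline([('features', TokenFeaturizer()), ('crf', CRF())])`.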

Custom Scorer Creation

Create custom scorers for specific evaluation needs.

Usage Example:

from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

# Create custom scorers
def macro_f1_scorer(y_true, y_pred):
    return metrics.flat_f1_score(y_true, y_pred, average='macro')

def weighted_precision_scorer(y_true, y_pred):
    return metrics.flat_precision_score(y_true, y_pred, average='weighted')

# Convert to sklearn scorers
macro_f1 = make_scorer(macro_f1_scorer)
weighted_precision = make_scorer(weighted_precision_scorer)

# Use in grid search
scoring = {
    'flat_acc': flat_accuracy,
    'seq_acc': sequence_accuracy,
    'macro_f1': macro_f1,
    'weighted_prec': weighted_precision
}

# param_grid as defined in the grid search example above
grid_search = GridSearchCV(
    CRF(),
    param_grid,
    cv=3,
    scoring=scoring,
    refit='macro_f1'  # Use macro F1 to select best model
)

Cross-Validation Strategies

Advanced cross-validation patterns for sequence labeling tasks.

Usage Example:

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn_crfsuite import CRF
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

def sequence_stratified_split(X, y, n_splits=5):
    """
    Custom stratification for sequence data based on label distributions.
    This is a conceptual example - a real implementation would need
    to handle sequence-specific stratification.
    """
    # Use the most common label in each sequence as its stratification key
    seq_labels = [max(set(seq), key=seq.count) for seq in y]

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    return skf.split(X, seq_labels)

# Comprehensive cross-validation
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

scoring = {
    'flat_accuracy': flat_accuracy,
    'sequence_accuracy': sequence_accuracy
}

cv_results = cross_validate(
    crf, X, y,
    cv=5,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True
)

print("Cross-validation results:")
for metric in ['flat_accuracy', 'sequence_accuracy']:
    test_scores = cv_results[f'test_{metric}']
    train_scores = cv_results[f'train_{metric}']
    print(f"{metric}:")
    print(f"  Test:  {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")
    print(f"  Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")

Model Persistence

Save and load trained CRF models using joblib or pickle.

Usage Example:

import joblib
from sklearn_crfsuite import CRF

# Train and save model
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
crf.fit(X_train, y_train)

# Save with joblib (recommended)
joblib.dump(crf, 'crf_model.pkl')

# Load model
loaded_crf = joblib.load('crf_model.pkl')

# Verify model works
predictions = loaded_crf.predict(X_test)
accuracy = loaded_crf.score(X_test, y_test)
print(f"Loaded model accuracy: {accuracy:.3f}")

# Alternative: use pickle
import pickle

with open('crf_model_pickle.pkl', 'wb') as f:
    pickle.dump(crf, f)

with open('crf_model_pickle.pkl', 'rb') as f:
    loaded_crf_pickle = pickle.load(f)

Utility Functions

Additional utilities for working with sequence data in sklearn contexts.

from sklearn_crfsuite.utils import flatten

def flatten(sequences):
    """
    Flatten a list of sequences into a single list.
    
    Parameters:
    - sequences: List[List[Any]], list of sequences to flatten
    
    Returns:
    - List[Any]: flattened list
    """

Usage Example:

from sklearn_crfsuite.utils import flatten

# Flatten sequence data when needed
y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]
y_flat = flatten(y_sequences)
print(y_flat)  # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']

# Useful for creating custom metrics or preprocessing
def create_label_encoder(y_sequences):
    """Create sklearn LabelEncoder from sequence data."""
    from sklearn.preprocessing import LabelEncoder
    
    flat_labels = flatten(y_sequences)
    encoder = LabelEncoder()
    encoder.fit(flat_labels)
    return encoder

encoder = create_label_encoder(y_train)
all_labels = encoder.classes_
print(f"Unique labels: {all_labels}")
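For reference, the behavior of `flatten` can be reproduced with the standard library's `itertools.chain`; the stand-in below is an approximation for illustration, not the library's own code:

```python
from itertools import chain

# Stand-in for sklearn_crfsuite.utils.flatten: concatenates a list of
# sequences into one flat list (an approximation, not the library's code).
def flatten(sequences):
    return list(chain.from_iterable(sequences))

y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]
print(flatten(y_sequences))  # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']
```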

Install with Tessl CLI

npx tessl i tessl/pypi-sklearn-crfsuite
