A CRFsuite (python-crfsuite) wrapper that provides a scikit-learn-style interface
—
The main CRF class provides a scikit-learn compatible interface for Conditional Random Field sequence labeling. It wraps the efficient CRFsuite C++ implementation while maintaining full compatibility with sklearn's ecosystem for model selection, cross-validation, and pipeline integration.
Initialize a CRF estimator with algorithm selection and comprehensive hyperparameter configuration.
class CRF:
def __init__(
self,
algorithm='lbfgs',
min_freq=0,
all_possible_states=False,
all_possible_transitions=False,
c1=0,
c2=1.0,
max_iterations=None,
num_memories=6,
epsilon=1e-5,
period=10,
delta=1e-5,
linesearch='MoreThuente',
max_linesearch=20,
calibration_eta=0.1,
calibration_rate=2.0,
calibration_samples=1000,
calibration_candidates=10,
calibration_max_trials=20,
pa_type=1,
c=1,
error_sensitive=True,
averaging=True,
variance=1,
gamma=1,
verbose=False,
model_filename=None,
keep_tempfiles=False,
trainer_cls=None
):
"""
Initialize CRF estimator.
Parameters:
- algorithm: str, training algorithm ('lbfgs', 'l2sgd', 'ap', 'pa', 'arow')
- min_freq: float, feature occurrence frequency cutoff threshold
- all_possible_states: bool, generate state features for all attribute-label combinations
- all_possible_transitions: bool, generate transition features for all label pairs
- c1: float, L1 regularization coefficient (lbfgs only)
- c2: float, L2 regularization coefficient
- max_iterations: int, maximum optimization iterations
- num_memories: int, number of limited memories used to approximate the inverse Hessian (lbfgs)
- epsilon: float, convergence condition parameter
- period: int, iteration period for stopping criterion testing
- delta: float, stopping criterion threshold
- linesearch: str, line search algorithm ('MoreThuente', 'Backtracking', 'StrongBacktracking')
- max_linesearch: int, maximum line search trials
- calibration_eta: float, initial learning rate for calibration (l2sgd)
- calibration_rate: float, learning rate change rate (l2sgd)
- calibration_samples: int, calibration sample count (l2sgd)
- calibration_candidates: int, learning rate candidates (l2sgd)
- calibration_max_trials: int, maximum calibration trials (l2sgd)
- pa_type: int, passive aggressive strategy (0=no slack, 1=PA-I, 2=PA-II)
- c: float, aggressiveness parameter for PA
- error_sensitive: bool, include prediction error count in objective
- averaging: bool, compute averaged feature weights
- variance: float, initial feature weight variance (arow)
- gamma: float, loss vs weight change tradeoff (arow)
- verbose: bool, enable training progress output
- model_filename: str, path to existing model file
- keep_tempfiles: bool, preserve temporary model files
- trainer_cls: class, custom trainer class
"""
Usage Example:
# Basic L-BFGS with regularization
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1, max_iterations=100)
# Stochastic gradient descent setup
crf_sgd = CRF(
algorithm='l2sgd',
c2=1.0,
calibration_eta=0.01,
calibration_samples=500,
verbose=True
)
# Passive Aggressive configuration
crf_pa = CRF(algorithm='pa', pa_type=1, c=0.5, error_sensitive=True)
Train the CRF model on sequential data with an optional development set for validation.
def fit(self, X, y, X_dev=None, y_dev=None):
"""
Train the CRF model.
Parameters:
- X: List[List[Dict]], feature sequences for training documents
- y: List[List[str]], label sequences for training documents
- X_dev: List[List[Dict]], optional development/validation feature sequences
- y_dev: List[List[str]], optional development/validation label sequences
Returns:
- self: fitted CRF instance
"""
Usage Example:
# Basic training
crf.fit(X_train, y_train)
# Training with validation set
crf.fit(X_train, y_train, X_dev=X_val, y_dev=y_val)
Make predictions on new sequences with various output formats.
def predict(self, X):
"""
Predict labels for input sequences.
Parameters:
- X: List[List[Dict]], feature sequences to predict
Returns:
- List[List[str]]: predicted label sequences
"""
def predict_single(self, xseq):
"""
Predict labels for a single sequence.
Parameters:
- xseq: List[Dict], single feature sequence
Returns:
- List[str]: predicted labels for the sequence
"""
def predict_marginals(self, X):
"""
Get marginal probabilities for all labels at each position.
Parameters:
- X: List[List[Dict]], feature sequences
Returns:
- List[List[Dict[str, float]]]: marginal probabilities for each position
"""
def predict_marginals_single(self, xseq):
"""
Get marginal probabilities for a single sequence.
Parameters:
- xseq: List[Dict], single feature sequence
Returns:
- List[Dict[str, float]]: marginal probabilities for each position
"""
Usage Example:
# Basic prediction
predictions = crf.predict(X_test)
# Single sequence prediction
single_pred = crf.predict_single(X_test[0])
# Get prediction confidence
marginals = crf.predict_marginals(X_test)
for seq_marginals in marginals:
    for pos_probs in seq_marginals:
        best_label = max(pos_probs, key=pos_probs.get)
        confidence = pos_probs[best_label]
        print(f"Label: {best_label}, Confidence: {confidence:.3f}")
Evaluate model performance using built-in scoring methods.
def score(self, X, y):
"""
Return token-level accuracy score.
Parameters:
- X: List[List[Dict]], feature sequences
- y: List[List[str]], true label sequences
Returns:
- float: flat accuracy score (token-level accuracy)
"""
Access learned model parameters and feature information.
@property
def classes_(self):
"""List of class labels learned during training."""
@property
def tagger_(self):
"""Underlying pycrfsuite.Tagger instance."""
@property
def size_(self):
"""Model size in bytes."""
@property
def num_attributes_(self):
"""Number of non-zero CRF attributes."""
@property
def attributes_(self):
"""List of learned feature attributes."""
@property
def state_features_(self):
"""
Dict mapping (attribute_name, label) tuples to feature coefficients.
Shows learned weights for state features.
"""
@property
def transition_features_(self):
"""
Dict mapping (label_from, label_to) tuples to transition coefficients.
Shows learned weights for label transitions.
"""
@property
def training_log_(self):
"""Training log parser with iteration details."""
Usage Example:
# Inspect learned model
print(f"Model size: {crf.size_} bytes")
print(f"Number of features: {crf.num_attributes_}")
print(f"Learned labels: {crf.classes_}")
# Examine feature weights
for (attr, label), weight in crf.state_features_.items():
    if abs(weight) > 0.1:  # Show only significant features
        print(f"Feature '{attr}' -> '{label}': {weight:.3f}")
# Check transition patterns
for (from_label, to_label), weight in crf.transition_features_.items():
    if abs(weight) > 0.1:
        print(f"Transition '{from_label}' -> '{to_label}': {weight:.3f}")
Install with Tessl CLI
npx tessl i tessl/pypi-sklearn-crfsuite