CRFsuite (python-crfsuite) wrapper which provides interface similar to scikit-learn
—
Advanced customization options for sklearn-crfsuite including custom trainer classes, specialized training workflows, and detailed training logging. These features are useful for users who need fine-grained control over the training process or want detailed insights into model convergence.
The LinePerIterationTrainer provides enhanced training output with detailed per-iteration statistics, making it easier to monitor training progress and diagnose convergence issues.
class LinePerIterationTrainer:
"""
Enhanced pycrfsuite.Trainer that prints detailed information about each
training iteration on a single line with tabulated final results.
This trainer extends the standard pycrfsuite.Trainer with:
- Per-iteration progress logging
- Detailed performance metrics during training
- Final performance summary table
"""
def on_iteration(self, log, info):
"""Called after each training iteration to display progress."""
def on_optimization_end(self, log):
"""Called when training completes to display final results table."""Usage Example:
from sklearn_crfsuite import CRF
from sklearn_crfsuite.trainer import LinePerIterationTrainer
# Use custom trainer for detailed training logs
crf = CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
max_iterations=100,
verbose=True, # Enable trainer output
trainer_cls=LinePerIterationTrainer # Use enhanced trainer
)
# Training will now show detailed per-iteration progress
crf.fit(X_train, y_train, X_dev=X_val, y_dev=y_val)
# Example output during training:
# Iter 1 time=0.05 loss=45.23 active=1250
# Iter 2 time=0.04 loss=38.17 active=1180
# Iter 3 time=0.04 loss=32.91 active=1150 precision=0.752 recall=0.698 F1=0.724 Acc(item/seq)=0.834 0.567
# ...
# =====================================
# Label Precision Recall F1 Support
# B-PER 0.856 0.792 0.823 125
# I-PER 0.743 0.681 0.711 89
# B-LOC 0.912 0.878 0.895 156
# I-LOC 0.834 0.798 0.816 67
# O 0.945 0.967 0.956 1543
# -------------------------------------

Access detailed training logs for analysis and debugging:
@property
def training_log_(self):
"""
Training log parser containing detailed iteration information.
Available after model training completes.
"""Usage Example:
# Access training log after fitting
crf.fit(X_train, y_train)
training_log = crf.training_log_
# Analyze training progress
if training_log:
last_iteration = training_log.last_iteration
print(f"Final loss: {last_iteration.get('loss', 'N/A')}")
print(f"Training time: {last_iteration.get('time', 'N/A')} seconds")
print(f"Active features: {last_iteration.get('active_features', 'N/A')}")
# Check if validation scores are available
if 'scores' in last_iteration:
print("\nFinal validation scores per label:")
for label, score in last_iteration['scores'].items():
print(f"{label}: P={score.precision:.3f} R={score.recall:.3f} F1={score.f1:.3f}")Create custom trainers for specialized training workflows:
Usage Example:
import pycrfsuite
from sklearn_crfsuite.trainer import LinePerIterationTrainer
class CustomTrainer(LinePerIterationTrainer):
"""Custom trainer with additional logging and early stopping."""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.iteration_losses = []
self.early_stop_patience = 10
self.early_stop_threshold = 0.001
def on_iteration(self, log, info):
# Store loss for early stopping analysis
current_loss = info.get('loss', float('inf'))
self.iteration_losses.append(current_loss)
# Check for early stopping
if len(self.iteration_losses) > self.early_stop_patience:
recent_losses = self.iteration_losses[-self.early_stop_patience:]
if max(recent_losses) - min(recent_losses) < self.early_stop_threshold:
print(f"\\nEarly stopping at iteration {info['num']} - loss converged")
# Call parent method for standard logging
super().on_iteration(log, info)
def on_optimization_end(self, log):
print(f"\\nTraining completed with {len(self.iteration_losses)} iterations")
print(f"Final loss: {self.iteration_losses[-1]:.4f}")
super().on_optimization_end(log)
# Use custom trainer
crf = CRF(
algorithm='lbfgs',
c1=0.1,
c2=0.1,
trainer_cls=CustomTrainer,
verbose=True
)

Advanced diagnostic functions for training analysis:
Usage Example:
def analyze_training_convergence(crf):
"""Analyze training convergence patterns."""
if not crf.training_log_:
print("No training log available")
return
log = crf.training_log_
iterations = []
losses = []
# Extract iteration data (this is conceptual - actual log structure may vary)
for i in range(len(log.iterations)):
iter_info = log.iterations[i]
iterations.append(iter_info.get('num', i))
losses.append(iter_info.get('loss', 0))
# Analyze convergence
if len(losses) > 10:
early_loss = sum(losses[:5]) / 5
late_loss = sum(losses[-5:]) / 5
improvement = (early_loss - late_loss) / early_loss * 100
print(f"Loss improvement: {improvement:.2f}%")
# Check for overfitting indicators
if len(losses) > 20:
mid_loss = sum(losses[10:15]) / 5
if late_loss > mid_loss:
print("Warning: Possible overfitting detected")
# Usage
crf.fit(X_train, y_train, X_dev=X_val, y_dev=y_val)
analyze_training_convergence(crf)

Advanced configuration for different training algorithms:
Usage Example:
# L-BFGS with custom line search
crf_lbfgs = CRF(
algorithm='lbfgs',
linesearch='StrongBacktracking', # More aggressive line search
max_linesearch=50, # More line search attempts
num_memories=10, # More L-BFGS memories
trainer_cls=LinePerIterationTrainer,
verbose=True
)
# Stochastic gradient descent with calibration
crf_sgd = CRF(
algorithm='l2sgd',
calibration_eta=0.01, # Lower initial learning rate
calibration_rate=1.5, # Slower learning rate adjustment
calibration_samples=2000, # More calibration samples
trainer_cls=LinePerIterationTrainer,
verbose=True
)
# Passive Aggressive with detailed monitoring
crf_pa = CRF(
algorithm='pa',
pa_type=2, # PA-II variant
c=0.1, # Lower aggressiveness
error_sensitive=True, # Include error count in objective
trainer_cls=LinePerIterationTrainer,
verbose=True
)

Install with Tessl CLI
npx tessl i tessl/pypi-sklearn-crfsuite