Tessl Tile for pypi/sklearn-crfsuite@0.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

advanced.md crf-estimator.md index.md metrics.md sklearn-integration.md utils.md

sklearn-integration.md (docs/)

0
# Scikit-learn Integration
1

2
Ready-to-use scorer functions and utilities that enable seamless integration of sklearn-crfsuite with scikit-learn's model selection ecosystem, including cross-validation, grid search, pipeline construction, and automated hyperparameter optimization.
3

4
## Capabilities
5

6
### Built-in Scorers
7

8
Pre-configured sklearn scorer objects that can be used directly with scikit-learn's model selection utilities.
9

10
```python { .api }
11
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
12

13
flat_accuracy: sklearn.metrics.scorer._BaseScorer
14
    """Scorer for token-level accuracy using sklearn's make_scorer."""
15

16
sequence_accuracy: sklearn.metrics.scorer._BaseScorer  
17
    """Scorer for sequence-level accuracy using sklearn's make_scorer."""
18
```
19

20
**Usage Example:**
21

22
```python
23
from sklearn.model_selection import cross_val_score
24
from sklearn_crfsuite import CRF
25
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
26

27
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
28

29
# Use built-in scorers with cross-validation
30
flat_scores = cross_val_score(crf, X, y, cv=5, scoring=flat_accuracy)
31
seq_scores = cross_val_score(crf, X, y, cv=5, scoring=sequence_accuracy)
32

33
print(f"Flat accuracy: {flat_scores.mean():.3f} (+/- {flat_scores.std() * 2:.3f})")
34
print(f"Sequence accuracy: {seq_scores.mean():.3f} (+/- {seq_scores.std() * 2:.3f})")
35
```
36

37
### Grid Search Integration
38

39
CRF estimators are fully compatible with scikit-learn's hyperparameter optimization tools, such as `GridSearchCV`.
40

41
**Usage Example:**
42

43
```python
44
from sklearn.model_selection import GridSearchCV
45
from sklearn_crfsuite import CRF
46
from sklearn_crfsuite.scorers import flat_accuracy
47

48
# Define parameter grid
49
param_grid = {
50
    'algorithm': ['lbfgs', 'l2sgd'],
51
    'c1': [0.01, 0.1, 1.0],
52
    'c2': [0.01, 0.1, 1.0],
53
    'max_iterations': [50, 100, 200]
54
}
55

56
# Grid search with CRF
57
crf = CRF()
58
grid_search = GridSearchCV(
59
    crf,
60
    param_grid,
61
    cv=3,
62
    scoring=flat_accuracy,
63
    n_jobs=-1,
64
    verbose=1
65
)
66

67
grid_search.fit(X_train, y_train)
68

69
print(f"Best parameters: {grid_search.best_params_}")
70
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
71

72
# Use best model
73
best_crf = grid_search.best_estimator_
74
predictions = best_crf.predict(X_test)
75
```
76

77
### Pipeline Integration
78

79
Use CRF models within scikit-learn pipelines for complete ML workflows.
80

81
**Usage Example:**
82

83
```python
84
from sklearn.pipeline import Pipeline
85
from sklearn.feature_extraction import DictVectorizer
86
from sklearn_crfsuite import CRF
87

88
# Create pipeline with feature extraction and CRF
89
pipeline = Pipeline([
90
    ('vectorizer', DictVectorizer(sparse=False)),
91
    ('crf', CRF(algorithm='lbfgs', c1=0.1, c2=0.1))
92
])
93

94
# Note: This is a conceptual example. In practice, CRF expects 
95
# sequences of feature dicts, not flat feature vectors.
96
# Custom transformers would be needed for real pipeline usage.
97
```
98

99
### Custom Scorer Creation
100

101
Create custom scorers for specific evaluation needs.
102

103
**Usage Example:**
104

105
```python
106
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import CRF, metrics
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy


# Create custom scorers
def macro_f1_scorer(y_true, y_pred):
    """Score predictions with ``metrics.flat_f1_score`` using macro averaging."""
    return metrics.flat_f1_score(y_true, y_pred, average='macro')


def weighted_precision_scorer(y_true, y_pred):
    """Score predictions with ``metrics.flat_precision_score`` using weighted averaging."""
    return metrics.flat_precision_score(y_true, y_pred, average='weighted')


# Convert to sklearn scorers
macro_f1 = make_scorer(macro_f1_scorer)
weighted_precision = make_scorer(weighted_precision_scorer)

# Hyperparameters to search (illustrative values; adjust for your data)
param_grid = {
    'c1': [0.01, 0.1, 1.0],
    'c2': [0.01, 0.1, 1.0],
}

# Use in grid search: every entry is evaluated and reported per candidate
scoring = {
    'flat_acc': flat_accuracy,
    'seq_acc': sequence_accuracy,
    'macro_f1': macro_f1,
    'weighted_prec': weighted_precision,
}

grid_search = GridSearchCV(
    CRF(),
    param_grid,
    cv=3,
    scoring=scoring,
    refit='macro_f1',  # Use macro F1 to select best model
)
135
```
136

137
### Cross-Validation Strategies
138

139
Advanced cross-validation patterns for sequence labeling tasks.
140

141
**Usage Example:**
142

143
```python
144
from sklearn.model_selection import StratifiedKFold, cross_validate
145
from sklearn_crfsuite import CRF
146
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
147

148
def sequence_stratified_split(X, y, n_splits=5):
    """
    Custom stratification for sequence data based on label distributions.

    This is a conceptual example - real implementation would need
    to handle sequence-specific stratification.

    Each sequence is represented by its most frequent label, and those
    per-sequence labels are used as the stratification key.

    Parameters:
    - X: sequences of features, one entry per sequence
    - y: sequences of labels, aligned with X
    - n_splits: number of folds

    Returns:
    - the result of ``StratifiedKFold.split(X, seq_labels)``

    Raises:
    - ValueError: if any label sequence in ``y`` is empty
    """
    from collections import Counter

    # Use most common label per sequence for stratification key.
    # Counter breaks ties by first occurrence, so the key is reproducible.
    seq_labels = []
    for seq in y:
        if len(seq) == 0:
            raise ValueError("cannot stratify on an empty label sequence")
        seq_labels.append(Counter(seq).most_common(1)[0][0])

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    return skf.split(X, seq_labels)
161

162
# Comprehensive cross-validation
163
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
164

165
scoring = {
166
    'flat_accuracy': flat_accuracy,
167
    'sequence_accuracy': sequence_accuracy
168
}
169

170
cv_results = cross_validate(
171
    crf, X, y,
172
    cv=5,
173
    scoring=scoring,
174
    return_train_score=True,
175
    return_estimator=True
176
)
177

178
print("Cross-validation results:")
179
for metric in ['flat_accuracy', 'sequence_accuracy']:
180
    test_scores = cv_results[f'test_{metric}']
181
    train_scores = cv_results[f'train_{metric}']
182
    print(f"{metric}:")
183
    print(f"  Test:  {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")
184
    print(f"  Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")
185
```
186

187
### Model Persistence
188

189
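Save and load trained CRF models using joblib or pickle. Only load model files from sources you trust: unpickling (which joblib also relies on) can execute arbitrary code.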
190

191
**Usage Example:**
192

193
```python
194
import joblib
195
from sklearn_crfsuite import CRF
196

197
# Train and save model
198
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
199
crf.fit(X_train, y_train)
200

201
# Save with joblib (recommended)
202
joblib.dump(crf, 'crf_model.pkl')
203

204
# Load model
205
loaded_crf = joblib.load('crf_model.pkl')
206

207
# Verify model works
208
predictions = loaded_crf.predict(X_test)
209
accuracy = loaded_crf.score(X_test, y_test)
210
print(f"Loaded model accuracy: {accuracy:.3f}")
211

212
# Alternative: use pickle
213
import pickle
214

215
with open('crf_model_pickle.pkl', 'wb') as f:
216
    pickle.dump(crf, f)
217

218
with open('crf_model_pickle.pkl', 'rb') as f:
219
    loaded_crf_pickle = pickle.load(f)
220
```
221

222
### Utility Functions
223

224
Additional utilities for working with sequence data in sklearn contexts.
225

226
```python { .api }
227
from sklearn_crfsuite.utils import flatten
228

229
def flatten(sequences):
    """
    Flatten a list of sequences into a single list.

    Parameters:
    - sequences: List[List[Any]], list of sequences to flatten

    Returns:
    - List[Any]: flattened list, with items in their original order
    """
    from itertools import chain

    # Concatenate the inner sequences one level deep.
    return list(chain.from_iterable(sequences))
239
```
240

241
**Usage Example:**
242

243
```python
244
from sklearn_crfsuite.utils import flatten
245

246
# Flatten sequence data when needed
247
y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]
248
y_flat = flatten(y_sequences)
249
print(y_flat)  # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']
250

251
# Useful for creating custom metrics or preprocessing
252
def create_label_encoder(y_sequences):
    """Fit a sklearn LabelEncoder on every label found in ``y_sequences`` and return it."""
    from sklearn.preprocessing import LabelEncoder

    # fit() returns the encoder itself, so it can be built and fitted in one expression.
    return LabelEncoder().fit(flatten(y_sequences))
260

261
encoder = create_label_encoder(y_train)
262
all_labels = encoder.classes_
263
print(f"Unique labels: {all_labels}")
264
```

Version

Tile

Files

sklearn-integration.md (docs/)

sklearn-integration.md (docs/)