0
# Scikit-learn Integration
1
2
Ready-to-use scorer functions and utilities that enable seamless integration of sklearn-crfsuite with scikit-learn's model selection ecosystem, including cross-validation, grid search, pipeline construction, and automated hyperparameter optimization.
3
4
## Capabilities
5
6
### Built-in Scorers
7
8
Pre-configured sklearn scorer objects that can be used directly with scikit-learn's model selection utilities.
9
10
```python { .api }
11
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
12
13
flat_accuracy: sklearn.metrics._scorer._BaseScorer
14
"""Scorer for token-level accuracy using sklearn's make_scorer."""
15
16
sequence_accuracy: sklearn.metrics._scorer._BaseScorer
17
"""Scorer for sequence-level accuracy using sklearn's make_scorer."""
18
```
19
20
**Usage Example:**
21
22
```python
23
from sklearn.model_selection import cross_val_score
24
from sklearn_crfsuite import CRF
25
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
26
27
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
28
29
# Use built-in scorers with cross-validation
30
flat_scores = cross_val_score(crf, X, y, cv=5, scoring=flat_accuracy)
31
seq_scores = cross_val_score(crf, X, y, cv=5, scoring=sequence_accuracy)
32
33
print(f"Flat accuracy: {flat_scores.mean():.3f} (+/- {flat_scores.std() * 2:.3f})")
34
print(f"Sequence accuracy: {seq_scores.mean():.3f} (+/- {seq_scores.std() * 2:.3f})")
35
```
36
37
### Grid Search Integration
38
39
`CRF` estimators are compatible with scikit-learn's hyperparameter optimization tools, such as `GridSearchCV`.
40
41
**Usage Example:**
42
43
```python
44
from sklearn.model_selection import GridSearchCV
45
from sklearn_crfsuite import CRF
46
from sklearn_crfsuite.scorers import flat_accuracy
47
48
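# Define parameter grid
# NOTE: sklearn-crfsuite documents c1 as supported only by 'lbfgs'; verify the
# 'l2sgd' combinations (or use a separate grid per algorithm) before running.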
49
param_grid = {
50
'algorithm': ['lbfgs', 'l2sgd'],
51
'c1': [0.01, 0.1, 1.0],
52
'c2': [0.01, 0.1, 1.0],
53
'max_iterations': [50, 100, 200]
54
}
55
56
# Grid search with CRF
57
crf = CRF()
58
grid_search = GridSearchCV(
59
crf,
60
param_grid,
61
cv=3,
62
scoring=flat_accuracy,
63
n_jobs=-1,
64
verbose=1
65
)
66
67
grid_search.fit(X_train, y_train)
68
69
print(f"Best parameters: {grid_search.best_params_}")
70
print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
71
72
# Use best model
73
best_crf = grid_search.best_estimator_
74
predictions = best_crf.predict(X_test)
75
```
76
77
### Pipeline Integration
78
79
Use CRF models within scikit-learn pipelines for complete ML workflows.
80
81
**Usage Example:**
82
83
```python
84
from sklearn.pipeline import Pipeline
85
from sklearn.feature_extraction import DictVectorizer
86
from sklearn_crfsuite import CRF
87
88
# Create pipeline with feature extraction and CRF
89
pipeline = Pipeline([
90
('vectorizer', DictVectorizer(sparse=False)),
91
('crf', CRF(algorithm='lbfgs', c1=0.1, c2=0.1))
92
])
93
94
# Note: This is a conceptual example and will not run as written. In practice, CRF expects
95
# sequences of feature dicts, not flat feature vectors.
96
# Custom transformers would be needed for real pipeline usage.
97
```
98
99
### Custom Scorer Creation
100
101
Create custom scorers for specific evaluation needs.
102
103
**Usage Example:**
104
105
```python
106
from sklearn.metrics import make_scorer
107
from sklearn_crfsuite import metrics
108
109
# Create custom scorers
110
def macro_f1_scorer(y_true, y_pred):
111
return metrics.flat_f1_score(y_true, y_pred, average='macro')
112
113
def weighted_precision_scorer(y_true, y_pred):
114
return metrics.flat_precision_score(y_true, y_pred, average='weighted')
115
116
# Convert to sklearn scorers
117
macro_f1 = make_scorer(macro_f1_scorer)
118
weighted_precision = make_scorer(weighted_precision_scorer)
119
120
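# Use in grid search (reuses GridSearchCV, CRF, param_grid, flat_accuracy and
# sequence_accuracy from the examples above)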
121
scoring = {
122
'flat_acc': flat_accuracy,
123
'seq_acc': sequence_accuracy,
124
'macro_f1': macro_f1,
125
'weighted_prec': weighted_precision
126
}
127
128
grid_search = GridSearchCV(
129
CRF(),
130
param_grid,
131
cv=3,
132
scoring=scoring,
133
refit='macro_f1' # Use macro F1 to select best model
134
)
135
```
136
137
### Cross-Validation Strategies
138
139
Advanced cross-validation patterns for sequence labeling tasks.
140
141
**Usage Example:**
142
143
```python
144
from sklearn.model_selection import StratifiedKFold, cross_validate
145
from sklearn_crfsuite import CRF
146
from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy
147
148
def sequence_stratified_split(X, y, n_splits=5):
149
"""
150
Custom stratification for sequence data based on label distributions.
151
This is a conceptual example - real implementation would need
152
to handle sequence-specific stratification.
153
"""
154
# Flatten labels (illustrative only; not used for the split below)
155
flat_labels = [label for seq in y for label in seq]
156
# Use most common label per sequence for stratification key
157
seq_labels = [max(set(seq), key=seq.count) for seq in y]
158
159
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
160
return skf.split(X, seq_labels)
161
162
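# Comprehensive cross-validation
# Note: cv=5 below uses scikit-learn's default splitter; pass
# cv=sequence_stratified_split(X, y) to use the custom splits defined above.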
163
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
164
165
scoring = {
166
'flat_accuracy': flat_accuracy,
167
'sequence_accuracy': sequence_accuracy
168
}
169
170
cv_results = cross_validate(
171
crf, X, y,
172
cv=5,
173
scoring=scoring,
174
return_train_score=True,
175
return_estimator=True
176
)
177
178
print("Cross-validation results:")
179
for metric in ['flat_accuracy', 'sequence_accuracy']:
180
test_scores = cv_results[f'test_{metric}']
181
train_scores = cv_results[f'train_{metric}']
182
print(f"{metric}:")
183
print(f" Test: {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")
184
print(f" Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")
185
```
186
187
### Model Persistence
188
189
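Save and load trained CRF models using joblib or pickle. Only load model files from sources you trust: both formats can execute arbitrary code when a file is loaded.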
190
191
**Usage Example:**
192
193
```python
194
import joblib
195
from sklearn_crfsuite import CRF
196
197
# Train and save model
198
crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)
199
crf.fit(X_train, y_train)
200
201
# Save with joblib (recommended)
202
joblib.dump(crf, 'crf_model.pkl')
203
204
# Load model
205
loaded_crf = joblib.load('crf_model.pkl')
206
207
# Verify model works
208
predictions = loaded_crf.predict(X_test)
209
accuracy = loaded_crf.score(X_test, y_test)
210
print(f"Loaded model accuracy: {accuracy:.3f}")
211
212
# Alternative: use pickle
213
import pickle
214
215
with open('crf_model_pickle.pkl', 'wb') as f:
216
pickle.dump(crf, f)
217
218
with open('crf_model_pickle.pkl', 'rb') as f:
219
loaded_crf_pickle = pickle.load(f)
220
```
221
222
### Utility Functions
223
224
Additional utilities for working with sequence data in sklearn contexts.
225
226
```python { .api }
227
from sklearn_crfsuite.utils import flatten
228
229
def flatten(sequences):
230
"""
231
Flatten a list of sequences into a single list.
232
233
Parameters:
234
- sequences: List[List[Any]], list of sequences to flatten
235
236
Returns:
237
- List[Any]: flattened list
238
"""
239
```
240
241
**Usage Example:**
242
243
```python
244
from sklearn_crfsuite.utils import flatten
245
246
# Flatten sequence data when needed
247
y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]
248
y_flat = flatten(y_sequences)
249
print(y_flat) # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']
250
251
# Useful for creating custom metrics or preprocessing
252
def create_label_encoder(y_sequences):
253
"""Create sklearn LabelEncoder from sequence data."""
254
from sklearn.preprocessing import LabelEncoder
255
256
flat_labels = flatten(y_sequences)
257
encoder = LabelEncoder()
258
encoder.fit(flat_labels)
259
return encoder
260
261
encoder = create_label_encoder(y_train)
262
all_labels = encoder.classes_
263
print(f"Unique labels: {all_labels}")
264
```