or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced.md · crf-estimator.md · index.md · metrics.md · sklearn-integration.md · utils.md

docs/sklearn-integration.md

0

# Scikit-learn Integration

1

2

Ready-to-use scorer functions and utilities that enable seamless integration of sklearn-crfsuite with scikit-learn's model selection ecosystem, including cross-validation, grid search, pipeline construction, and automated hyperparameter optimization.

3

4

## Capabilities

5

6

### Built-in Scorers

7

8

Pre-configured sklearn scorer objects that can be used directly with scikit-learn's model selection utilities.

9

10

```python { .api }

11

from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

12

13

flat_accuracy: sklearn.metrics.scorer._BaseScorer

14

"""Scorer for token-level accuracy using sklearn's make_scorer."""

15

16

sequence_accuracy: sklearn.metrics.scorer._BaseScorer

17

"""Scorer for sequence-level accuracy using sklearn's make_scorer."""

18

```

19

20

**Usage Example:**

21

22

```python

23

from sklearn.model_selection import cross_val_score

24

from sklearn_crfsuite import CRF

25

from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

26

27

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

28

29

# Use built-in scorers with cross-validation

30

flat_scores = cross_val_score(crf, X, y, cv=5, scoring=flat_accuracy)

31

seq_scores = cross_val_score(crf, X, y, cv=5, scoring=sequence_accuracy)

32

33

print(f"Flat accuracy: {flat_scores.mean():.3f} (+/- {flat_scores.std() * 2:.3f})")

34

print(f"Sequence accuracy: {seq_scores.mean():.3f} (+/- {seq_scores.std() * 2:.3f})")

35

```

36

37

### Grid Search Integration

38

39

The `CRF` estimator is fully compatible with scikit-learn's hyperparameter optimization tools such as `GridSearchCV`.

40

41

**Usage Example:**

42

43

```python

44

from sklearn.model_selection import GridSearchCV

45

from sklearn_crfsuite import CRF

46

from sklearn_crfsuite.scorers import flat_accuracy

47

48

# Define parameter grid

49

param_grid = {

50

'algorithm': ['lbfgs', 'l2sgd'],

51

'c1': [0.01, 0.1, 1.0],

52

'c2': [0.01, 0.1, 1.0],

53

'max_iterations': [50, 100, 200]

54

}

55

56

# Grid search with CRF

57

crf = CRF()

58

grid_search = GridSearchCV(

59

crf,

60

param_grid,

61

cv=3,

62

scoring=flat_accuracy,

63

n_jobs=-1,

64

verbose=1

65

)

66

67

grid_search.fit(X_train, y_train)

68

69

print(f"Best parameters: {grid_search.best_params_}")

70

print(f"Best cross-validation score: {grid_search.best_score_:.3f}")

71

72

# Use best model

73

best_crf = grid_search.best_estimator_

74

predictions = best_crf.predict(X_test)

75

```

76

77

### Pipeline Integration

78

79

Use CRF models within scikit-learn pipelines for complete ML workflows.

80

81

**Usage Example:**

82

83

```python

84

from sklearn.pipeline import Pipeline

85

from sklearn.feature_extraction import DictVectorizer

86

from sklearn_crfsuite import CRF

87

88

# Create pipeline with feature extraction and CRF

89

pipeline = Pipeline([

90

('vectorizer', DictVectorizer(sparse=False)),

91

('crf', CRF(algorithm='lbfgs', c1=0.1, c2=0.1))

92

])

93

94

# Note: This is a conceptual example. In practice, CRF expects

95

# sequences of feature dicts, not flat feature vectors.

96

# Custom transformers would be needed for real pipeline usage.

97

```

98

99

### Custom Scorer Creation

100

101

Create custom scorers for specific evaluation needs.

102

103

**Usage Example:**

104

105

```python

106

from sklearn.metrics import make_scorer

107

from sklearn_crfsuite import metrics

108

109

# Create custom scorers

110

def macro_f1_scorer(y_true, y_pred):

111

return metrics.flat_f1_score(y_true, y_pred, average='macro')

112

113

def weighted_precision_scorer(y_true, y_pred):

114

return metrics.flat_precision_score(y_true, y_pred, average='weighted')

115

116

# Convert to sklearn scorers

117

macro_f1 = make_scorer(macro_f1_scorer)

118

weighted_precision = make_scorer(weighted_precision_scorer)

119

120

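# Use in grid search (GridSearchCV, CRF, param_grid, flat_accuracy and sequence_accuracy are not imported or defined in this snippet; they must already be in scope)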

121

scoring = {

122

'flat_acc': flat_accuracy,

123

'seq_acc': sequence_accuracy,

124

'macro_f1': macro_f1,

125

'weighted_prec': weighted_precision

126

}

127

128

grid_search = GridSearchCV(

129

CRF(),

130

param_grid,

131

cv=3,

132

scoring=scoring,

133

refit='macro_f1' # Use macro F1 to select best model

134

)

135

```

136

137

### Cross-Validation Strategies

138

139

Advanced cross-validation patterns for sequence labeling tasks.

140

141

**Usage Example:**

142

143

```python

144

from sklearn.model_selection import StratifiedKFold, cross_validate

145

from sklearn_crfsuite import CRF

146

from sklearn_crfsuite.scorers import flat_accuracy, sequence_accuracy

147

148

def sequence_stratified_split(X, y, n_splits=5):

149

"""

150

Custom stratification for sequence data based on label distributions.

151

This is a conceptual example - real implementation would need

152

to handle sequence-specific stratification.

153

"""

154

# Flatten labels (illustrative only: flat_labels is not used below; the split is keyed on seq_labels)

155

flat_labels = [label for seq in y for label in seq]

156

# Use most common label per sequence for stratification key

157

seq_labels = [max(set(seq), key=seq.count) for seq in y]

158

159

skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

160

return skf.split(X, seq_labels)

161

162

# Comprehensive cross-validation

163

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

164

165

scoring = {

166

'flat_accuracy': flat_accuracy,

167

'sequence_accuracy': sequence_accuracy

168

}

169

170

cv_results = cross_validate(

171

crf, X, y,

172

cv=5,

173

scoring=scoring,

174

return_train_score=True,

175

return_estimator=True

176

)

177

178

print("Cross-validation results:")

179

for metric in ['flat_accuracy', 'sequence_accuracy']:

180

test_scores = cv_results[f'test_{metric}']

181

train_scores = cv_results[f'train_{metric}']

182

print(f"{metric}:")

183

print(f" Test: {test_scores.mean():.3f} (+/- {test_scores.std() * 2:.3f})")

184

print(f" Train: {train_scores.mean():.3f} (+/- {train_scores.std() * 2:.3f})")

185

```

186

187

### Model Persistence

188

189

Save and load trained CRF models using joblib or pickle.

190

191

**Usage Example:**

192

193

```python

194

import joblib

195

from sklearn_crfsuite import CRF

196

197

# Train and save model

198

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

199

crf.fit(X_train, y_train)

200

201

# Save with joblib (recommended)

202

joblib.dump(crf, 'crf_model.pkl')

203

204

# Load model

205

loaded_crf = joblib.load('crf_model.pkl')

206

207

# Verify model works

208

predictions = loaded_crf.predict(X_test)

209

accuracy = loaded_crf.score(X_test, y_test)

210

print(f"Loaded model accuracy: {accuracy:.3f}")

211

212

# Alternative: use pickle

213

import pickle

214

215

with open('crf_model_pickle.pkl', 'wb') as f:

216

pickle.dump(crf, f)

217

218

with open('crf_model_pickle.pkl', 'rb') as f:

219

loaded_crf_pickle = pickle.load(f)

220

```

221

222

### Utility Functions

223

224

Additional utilities for working with sequence data in sklearn contexts.

225

226

```python { .api }

227

from sklearn_crfsuite.utils import flatten

228

229

def flatten(sequences):

230

"""

231

Flatten a list of sequences into a single list.

232

233

Parameters:

234

- sequences: List[List[Any]], list of sequences to flatten

235

236

Returns:

237

- List[Any]: flattened list

238

"""

239

```

240

241

**Usage Example:**

242

243

```python

244

from sklearn_crfsuite.utils import flatten

245

246

# Flatten sequence data when needed

247

y_sequences = [['B-PER', 'I-PER', 'O'], ['O', 'B-LOC']]

248

y_flat = flatten(y_sequences)

249

print(y_flat) # ['B-PER', 'I-PER', 'O', 'O', 'B-LOC']

250

251

# Useful for creating custom metrics or preprocessing

252

def create_label_encoder(y_sequences):

253

"""Create sklearn LabelEncoder from sequence data."""

254

from sklearn.preprocessing import LabelEncoder

255

256

flat_labels = flatten(y_sequences)

257

encoder = LabelEncoder()

258

encoder.fit(flat_labels)

259

return encoder

260

261

encoder = create_label_encoder(y_train)

262

all_labels = encoder.classes_

263

print(f"Unique labels: {all_labels}")

264

```