<!-- source: docs/metrics.md -->

# Evaluation Metrics

Specialized metrics for sequence labeling evaluation that properly handle the structured nature of CRF predictions. These metrics provide both token-level (flat) and sequence-level accuracy measures essential for evaluating named entity recognition, part-of-speech tagging, and other sequence labeling tasks.

## Capabilities

### Token-Level Metrics

Metrics that evaluate individual token predictions by flattening sequences into individual predictions, useful for understanding per-token accuracy patterns.

```python { .api }
def flat_accuracy_score(y_true, y_pred):
    """
    Calculate token-level accuracy by flattening sequence predictions.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences

    Returns:
    - float: accuracy score (correct tokens / total tokens)
    """

def flat_precision_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level precision score.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.precision_score

    Returns:
    - float: precision score
    """

def flat_recall_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level recall score.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.recall_score

    Returns:
    - float: recall score
    """

def flat_f1_score(y_true, y_pred, **kwargs):
    """
    Calculate token-level F1 score.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - **kwargs: additional parameters passed to sklearn.metrics.f1_score

    Returns:
    - float: F1 score
    """

def flat_fbeta_score(y_true, y_pred, beta, **kwargs):
    """
    Calculate token-level F-beta score.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - beta: float, beta parameter for F-beta score
    - **kwargs: additional parameters passed to sklearn.metrics.fbeta_score

    Returns:
    - float: F-beta score
    """

def flat_classification_report(y_true, y_pred, labels=None, **kwargs):
    """
    Generate detailed classification report for token-level predictions.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences
    - labels: List[str], labels to include in report
    - **kwargs: additional parameters passed to sklearn.metrics.classification_report

    Returns:
    - str: formatted classification report
    """
```

**Usage Example:**

```python
from sklearn_crfsuite import metrics

# Sample predictions
y_true = [['B-PER', 'I-PER', 'O', 'B-LOC'], ['O', 'B-ORG', 'I-ORG']]
y_pred = [['B-PER', 'I-PER', 'O', 'O'], ['O', 'B-ORG', 'B-ORG']]

# Token-level evaluation
accuracy = metrics.flat_accuracy_score(y_true, y_pred)
precision = metrics.flat_precision_score(y_true, y_pred, average='weighted')
recall = metrics.flat_recall_score(y_true, y_pred, average='weighted')
f1 = metrics.flat_f1_score(y_true, y_pred, average='weighted')

print(f"Token Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# Detailed classification report
report = metrics.flat_classification_report(y_true, y_pred)
print("Classification Report:")
print(report)
```

### Sequence-Level Metrics

Metrics that evaluate complete sequence predictions, providing stricter evaluation where a sequence is considered correct only if all tokens match exactly.

```python { .api }
def sequence_accuracy_score(y_true, y_pred):
    """
    Calculate sequence-level accuracy where entire sequences must match exactly.

    Parameters:
    - y_true: List[List[str]], true label sequences
    - y_pred: List[List[str]], predicted label sequences

    Returns:
    - float: sequence accuracy (correct sequences / total sequences)
    """
```

**Usage Example:**

```python
# Sequence-level evaluation (stricter)
seq_accuracy = metrics.sequence_accuracy_score(y_true, y_pred)
print(f"Sequence Accuracy: {seq_accuracy:.3f}")

# Compare with token-level
print(f"Token vs Sequence Accuracy: {accuracy:.3f} vs {seq_accuracy:.3f}")

# Perfect prediction case
y_perfect = [['B-PER', 'I-PER', 'O', 'B-LOC'], ['O', 'B-ORG', 'I-ORG']]
perfect_seq_acc = metrics.sequence_accuracy_score(y_true, y_perfect)
perfect_tok_acc = metrics.flat_accuracy_score(y_true, y_perfect)
print(f"Perfect scores - Token: {perfect_tok_acc}, Sequence: {perfect_seq_acc}")
```

### Metric Usage Patterns

**Cross-validation with CRF:**

```python
from sklearn.model_selection import cross_val_score
from sklearn_crfsuite import CRF, metrics

crf = CRF(algorithm='lbfgs', c1=0.1, c2=0.1)

# Custom scoring function for cross-validation
def crf_sequence_accuracy(estimator, X, y):
    y_pred = estimator.predict(X)
    return metrics.sequence_accuracy_score(y, y_pred)

# Use in cross-validation
cv_scores = cross_val_score(crf, X, y, cv=5, scoring=crf_sequence_accuracy)
print(f"CV Sequence Accuracy: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")
```

**Comprehensive evaluation:**

```python
def evaluate_crf(crf, X_test, y_test):
    """Comprehensive CRF evaluation with multiple metrics."""
    y_pred = crf.predict(X_test)

    results = {
        'flat_accuracy': metrics.flat_accuracy_score(y_test, y_pred),
        'sequence_accuracy': metrics.sequence_accuracy_score(y_test, y_pred),
        'precision_macro': metrics.flat_precision_score(y_test, y_pred, average='macro'),
        'recall_macro': metrics.flat_recall_score(y_test, y_pred, average='macro'),
        'f1_macro': metrics.flat_f1_score(y_test, y_pred, average='macro'),
        'precision_weighted': metrics.flat_precision_score(y_test, y_pred, average='weighted'),
        'recall_weighted': metrics.flat_recall_score(y_test, y_pred, average='weighted'),
        'f1_weighted': metrics.flat_f1_score(y_test, y_pred, average='weighted')
    }

    return results

# Use the evaluation function
evaluation_results = evaluate_crf(crf, X_test, y_test)
for metric, score in evaluation_results.items():
    print(f"{metric}: {score:.3f}")
```