0
# Text Metrics
1
2
Natural language processing metrics for evaluating translation, summarization, and text generation. They include BLEU, ROUGE, error rates, edit distances, and semantic similarity measures for comprehensive text quality assessment.
3
4
## Capabilities
5
6
### Machine Translation Metrics
7
8
Metrics for evaluating machine translation quality and n-gram overlap.
9
10
```python { .api }
11
class BLEUScore(Metric):
12
def __init__(
13
self,
14
n_gram: int = 4,
15
smooth: bool = False,
16
weights: Optional[Sequence[float]] = None,
17
**kwargs
18
): ...
19
20
class SacreBLEUScore(Metric):
21
def __init__(
22
self,
23
n_gram: int = 4,
24
smooth: bool = False,
25
tokenize: Optional[str] = None,
26
lowercase: bool = False,
27
**kwargs
28
): ...
29
30
class CHRFScore(Metric):
31
def __init__(
32
self,
33
n_char_order: int = 6,
34
n_word_order: int = 2,
35
beta: float = 2.0,
36
lowercase: bool = False,
37
whitespace: bool = False,
38
**kwargs
39
): ...
40
```
41
42
### Summarization Metrics
43
44
Metrics for evaluating automatic summarization quality.
45
46
```python { .api }
47
class ROUGEScore(Metric):
48
def __init__(
49
self,
50
rouge_keys: Union[str, Tuple[str, ...]] = ("rouge1", "rouge2", "rougeL"),
51
use_stemmer: bool = False,
52
normalizer: Optional[Callable[[str], str]] = None,
53
tokenizer: Optional[Callable[[str], Sequence[str]]] = None,
54
accumulate: str = "best",
55
**kwargs
56
): ...
57
```
58
59
### Error Rate Metrics
60
61
Character- and word-level error rate measurements for ASR and text processing.
62
63
```python { .api }
64
class CharErrorRate(Metric):
65
def __init__(
66
self,
67
**kwargs
68
): ...
69
70
class WordErrorRate(Metric):
71
def __init__(
72
self,
73
**kwargs
74
): ...
75
76
class MatchErrorRate(Metric):
77
def __init__(
78
self,
79
**kwargs
80
): ...
81
82
class TranslationEditRate(Metric):
83
def __init__(
84
self,
85
normalize: bool = False,
86
no_punctuation: bool = False,
87
lowercase: bool = True,
88
asian_support: bool = False,
89
**kwargs
90
): ...
91
```
92
93
### Edit Distance Metrics
94
95
String similarity and distance measures for sequence comparison.
96
97
```python { .api }
98
class EditDistance(Metric):
99
def __init__(
100
self,
101
substitution_cost: int = 1,
102
reduction: Optional[str] = "mean",
103
**kwargs
104
): ...
105
106
class ExtendedEditDistance(Metric):
107
def __init__(
108
self,
109
language: str = "en",
110
return_sentence_level_score: bool = False,
111
alpha: float = 2.0,
112
rho: float = 0.3,
113
deletion: float = 0.2,
114
insertion: float = 1.0,
115
substitution: float = 1.0,
116
**kwargs
117
): ...
118
```
119
120
### Information Metrics
121
122
Information-theoretic measures for text quality assessment.
123
124
```python { .api }
125
class WordInfoLost(Metric):
126
def __init__(
127
self,
128
**kwargs
129
): ...
130
131
class WordInfoPreserved(Metric):
132
def __init__(
133
self,
134
**kwargs
135
): ...
136
137
class Perplexity(Metric):
138
def __init__(
139
self,
140
ignore_index: int = -100,
141
**kwargs
142
): ...
143
```
144
145
### Question Answering Metrics
146
147
Specialized metrics for question answering task evaluation.
148
149
```python { .api }
150
class SQuAD(Metric):
151
def __init__(
152
self,
153
**kwargs
154
): ...
155
```
156
157
### Semantic Similarity Metrics
158
159
Deep learning-based semantic similarity measures (these require optional dependencies such as the `transformers` package).
160
161
```python { .api }
162
class BERTScore(Metric):
163
def __init__(
164
self,
165
model_name_or_path: str = "distilbert-base-uncased",
166
num_layers: Optional[int] = None,
167
all_layers: bool = False,
168
model_type: Optional[str] = None,
169
user_forward_fn: Optional[Callable[[Any, Tensor], Tensor]] = None,
170
user_tokenizer: Optional[Any] = None,
171
verbose: bool = False,
172
idf: bool = False,
173
device: Optional[Union[str, torch.device]] = None,
174
max_length: int = 512,
175
batch_size: int = 64,
176
num_threads: int = 4,
177
return_hash: bool = False,
178
lang: str = "en",
179
rescale_with_baseline: bool = False,
180
baseline_path: Optional[str] = None,
181
use_fast_tokenizer: bool = False,
182
**kwargs
183
): ...
184
185
class InfoLM(Metric):
186
def __init__(
187
self,
188
model_name_or_path: str = "google/bert_uncased_L-2_H-128_A-2",
189
temperature: float = 0.25,
190
measure_to_use: str = "fisher_rao",
191
max_length: Optional[int] = None,
192
device: Optional[Union[str, torch.device]] = None,
193
batch_size: int = 64,
194
num_threads: int = 4,
195
verbose: bool = False,
196
return_sentence_level_score: bool = False,
197
**kwargs
198
): ...
199
```
200
201
## Usage Examples
202
203
### BLEU Score for Translation
204
205
```python
206
import torch
207
from torchmetrics.text import BLEUScore
208
209
# Initialize BLEU metric
210
bleu = BLEUScore()
211
212
# Sample predictions and references
213
preds = ["the cat is on the mat"]
214
target = [["there is a cat on the mat", "a cat is on the mat"]]
215
216
# Compute BLEU score
217
bleu_score = bleu(preds, target)
218
print(f"BLEU Score: {bleu_score:.4f}")
219
220
# 4-gram BLEU with smoothing
221
bleu_smooth = BLEUScore(n_gram=4, smooth=True)
222
bleu_smooth_score = bleu_smooth(preds, target)
223
print(f"Smoothed BLEU: {bleu_smooth_score:.4f}")
224
```
225
226
### ROUGE for Summarization
227
228
```python
229
from torchmetrics.text import ROUGEScore
230
231
# Initialize ROUGE metric
232
rouge = ROUGEScore()
233
234
# Sample summaries and references
235
preds = ["the quick brown fox jumps over the lazy dog"]
236
target = ["a quick brown fox jumps over a lazy dog"]
237
238
# Compute ROUGE scores
239
rouge_scores = rouge(preds, target)
240
print(f"ROUGE-1: {rouge_scores['rouge1_fmeasure']:.4f}")
241
print(f"ROUGE-2: {rouge_scores['rouge2_fmeasure']:.4f}")
242
print(f"ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")
243
244
# Custom ROUGE configuration
245
rouge_custom = ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL", "rougeLsum"))
246
rouge_custom_scores = rouge_custom(preds, target)
247
```
248
249
### Word Error Rate for ASR
250
251
```python
252
from torchmetrics.text import WordErrorRate, CharErrorRate
253
254
# Initialize error rate metrics
255
wer = WordErrorRate()
256
cer = CharErrorRate()
257
258
# ASR outputs vs ground truth
259
preds = ["this is a test"]
260
target = ["this is the test"]
261
262
# Compute error rates
263
wer_score = wer(preds, target)
264
cer_score = cer(preds, target)
265
266
print(f"Word Error Rate: {wer_score:.4f}")
267
print(f"Character Error Rate: {cer_score:.4f}")
268
```
269
270
### BERTScore for Semantic Similarity
271
272
```python
273
from torchmetrics.text import BERTScore
274
275
# Initialize BERTScore (requires transformers)
276
try:
277
bertscore = BERTScore(model_name_or_path="distilbert-base-uncased")
278
279
# Sample texts
280
preds = ["the cat sat on the mat"]
281
target = ["a cat was sitting on the mat"]
282
283
# Compute BERTScore
284
bert_scores = bertscore(preds, target)
285
print(f"BERTScore F1: {bert_scores['f1']:.4f}")
286
print(f"BERTScore Precision: {bert_scores['precision']:.4f}")
287
print(f"BERTScore Recall: {bert_scores['recall']:.4f}")
288
289
except ImportError:
290
print("BERTScore requires the 'transformers' package")
291
```
292
293
### Edit Distance
294
295
```python
296
from torchmetrics.text import EditDistance
297
298
# Initialize edit distance
299
edit_dist = EditDistance()
300
301
# Sample strings
302
preds = ["kitten"]
303
target = ["sitting"]
304
305
# Compute edit distance
306
distance = edit_dist(preds, target)
307
print(f"Edit Distance: {distance:.0f}")
308
309
# Explicit reduction: "mean" (the default) averages the distance over all samples; it does not normalize by string length
310
edit_dist_mean = EditDistance(reduction="mean")
311
mean_distance = edit_dist_mean(preds, target)
312
print(f"Mean Edit Distance: {mean_distance:.4f}")
313
```
314
315
### Perplexity for Language Models
316
317
```python
318
from torchmetrics.text import Perplexity
319
import torch
320
321
# Initialize perplexity metric
322
perplexity = Perplexity()
323
324
# Language model predictions: raw, unnormalized scores (logits), not log-probabilities
325
# Shape: (batch_size, sequence_length, vocab_size)
326
preds = torch.randn(2, 8, 1000) # 2 sequences, 8 tokens, vocab size 1000 (random logits)
327
target = torch.randint(0, 1000, (2, 8)) # target token ids
328
329
# Compute perplexity
330
ppl_score = perplexity(preds, target)
331
print(f"Perplexity: {ppl_score:.4f}")
332
```
333
334
### Translation Edit Rate (TER)
335
336
```python
337
from torchmetrics.text import TranslationEditRate
338
339
# Initialize TER metric
340
ter = TranslationEditRate(normalize=True, lowercase=True)
341
342
# Translation examples
343
preds = ["The cat is on the mat"]
344
target = ["There is a cat on the mat"]
345
346
# Compute TER
347
ter_score = ter(preds, target)
348
print(f"Translation Edit Rate: {ter_score:.4f}")
349
```
350
351
### SQuAD Metric for QA
352
353
```python
354
from torchmetrics.text import SQuAD
355
356
# Initialize SQuAD metric
357
squad = SQuAD()
358
359
# QA predictions and references
360
preds = [{"prediction_text": "Denver Broncos", "id": "56be4db0acb8001400a502ec"}]
361
target = [{"answers": {"answer_start": [177], "text": ["Denver Broncos"]},
362
"id": "56be4db0acb8001400a502ec"}]
363
364
# Compute SQuAD scores
365
squad_scores = squad(preds, target)
366
print(f"Exact Match: {squad_scores['exact_match']:.4f}")
367
print(f"F1 Score: {squad_scores['f1']:.4f}")
368
```
369
370
### Multi-Reference Evaluation
371
372
```python
373
from torchmetrics.text import BLEUScore, ROUGEScore
374
375
# Multiple reference translations/summaries
376
preds = ["the cat is on the mat"]
377
target = [["there is a cat on the mat",
378
"a cat is on the mat",
379
"the cat sits on the mat"]]
380
381
# BLEU with multiple references
382
bleu_multi = BLEUScore()
383
bleu_score = bleu_multi(preds, target)
384
print(f"Multi-reference BLEU: {bleu_score:.4f}")
385
386
# ROUGE with multiple references
387
rouge_multi = ROUGEScore()
388
rouge_scores = rouge_multi(preds, target)
389
print(f"Multi-reference ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")
390
```
391
392
## Types
393
394
```python { .api }
395
from typing import Union, Optional, Sequence, List, Dict, Any, Callable, Tuple, Literal
396
import torch
397
from torch import Tensor
398
399
TextInput = Union[str, List[str]]
400
TextTarget = Union[str, List[str], List[List[str]]] # Multiple references supported
401
402
ROUGEKeys = Union[str, Tuple[str, ...]]
403
# Closed sets of string options use Literal; Union["a", "b"] would treat the strings as forward references to types
AccumulateType = Literal["avg", "best"]
404
MeasureType = Literal["fisher_rao", "kl_divergence", "js_divergence"]
405
```