Tessl Tile for pypi/torchmetrics@1.8.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

audio.md classification.md clustering.md detection.md functional.md image.md index.md multimodal.md nominal.md regression.md retrieval.md segmentation.md shape.md text.md utilities.md video.md

text.mddocs/

0
# Text Metrics
1

2
Natural language processing metrics for evaluating translation, summarization, and text generation quality, including BLEU, ROUGE, error rates, edit distances, and semantic similarity measures.
3

4
## Capabilities
5

6
### Machine Translation Metrics
7

8
Metrics for evaluating machine translation quality and n-gram overlap.
9

10
```python { .api }
11
class BLEUScore(Metric):
12
    def __init__(
13
        self,
14
        n_gram: int = 4,
15
        smooth: bool = False,
16
        weights: Optional[Sequence[float]] = None,
17
        **kwargs
18
    ): ...
19

20
class SacreBLEUScore(Metric):
21
    def __init__(
22
        self,
23
        n_gram: int = 4,
24
        smooth: bool = False,
25
        tokenize: Optional[str] = None,
26
        lowercase: bool = False,
27
        **kwargs
28
    ): ...
29

30
class CHRFScore(Metric):
31
    def __init__(
32
        self,
33
        n_char_order: int = 6,
34
        n_word_order: int = 2,
35
        beta: float = 2.0,
36
        lowercase: bool = False,
37
        whitespace: bool = False,
38
        **kwargs
39
    ): ...
40
```
41

42
### Summarization Metrics
43

44
Metrics for evaluating automatic summarization quality.
45

46
```python { .api }
47
class ROUGEScore(Metric):
48
    def __init__(
49
        self,
50
        rouge_keys: Union[str, Tuple[str, ...]] = ("rouge1", "rouge2", "rougeL"),
51
        use_stemmer: bool = False,
52
        normalizer: Optional[Callable[[str], str]] = None,
53
        tokenizer: Optional[Callable[[str], Sequence[str]]] = None,
54
        accumulate: str = "best",
55
        **kwargs
56
    ): ...
57
```
58

59
### Error Rate Metrics
60

61
Character- and word-level error rates, plus match error rate and translation edit rate, for ASR and text processing.
62

63
```python { .api }
64
class CharErrorRate(Metric):
65
    def __init__(
66
        self,
67
        **kwargs
68
    ): ...
69

70
class WordErrorRate(Metric):
71
    def __init__(
72
        self,
73
        **kwargs
74
    ): ...
75

76
class MatchErrorRate(Metric):
77
    def __init__(
78
        self,
79
        **kwargs
80
    ): ...
81

82
class TranslationEditRate(Metric):
83
    def __init__(
84
        self,
85
        normalize: bool = False,
86
        no_punctuation: bool = False,
87
        lowercase: bool = True,
88
        asian_support: bool = False,
89
        **kwargs
90
    ): ...
91
```
92

93
### Edit Distance Metrics
94

95
String similarity and distance measures for sequence comparison.
96

97
```python { .api }
98
class EditDistance(Metric):
99
    def __init__(
100
        self,
101
        substitution_cost: int = 1,
102
        reduction: Optional[str] = "mean",
103
        **kwargs
104
    ): ...
105

106
class ExtendedEditDistance(Metric):
107
    def __init__(
108
        self,
109
        language: str = "en",
110
        return_sentence_level_score: bool = False,
111
        alpha: float = 2.0,
112
        rho: float = 0.3,
113
        deletion: float = 0.2,
114
        insertion: float = 1.0,
115
        substitution: float = 1.0,
116
        **kwargs
117
    ): ...
118
```
119

120
### Information Metrics
121

122
Information-theoretic measures for text quality assessment.
123

124
```python { .api }
125
class WordInfoLost(Metric):
126
    def __init__(
127
        self,
128
        **kwargs
129
    ): ...
130

131
class WordInfoPreserved(Metric):
132
    def __init__(
133
        self,
134
        **kwargs
135
    ): ...
136

137
class Perplexity(Metric):
138
    def __init__(
139
        self,
140
        ignore_index: int = -100,
141
        **kwargs
142
    ): ...
143
```
144

145
### Question Answering Metrics
146

147
Specialized metrics for question answering task evaluation.
148

149
```python { .api }
150
class SQuAD(Metric):
151
    def __init__(
152
        self,
153
        **kwargs
154
    ): ...
155
```
156

157
### Semantic Similarity Metrics
158

159
Deep-learning-based semantic similarity measures (these require the optional `transformers` dependency).
160

161
```python { .api }
162
class BERTScore(Metric):
163
    def __init__(
164
        self,
165
        model_name_or_path: str = "distilbert-base-uncased",
166
        num_layers: Optional[int] = None,
167
        all_layers: bool = False,
168
        model_type: Optional[str] = None,
169
        user_forward_fn: Optional[Callable[[Any, Tensor], Tensor]] = None,
170
        user_tokenizer: Optional[Any] = None,
171
        verbose: bool = False,
172
        idf: bool = False,
173
        device: Optional[Union[str, torch.device]] = None,
174
        max_length: int = 512,
175
        batch_size: int = 64,
176
        num_threads: int = 4,
177
        return_hash: bool = False,
178
        lang: str = "en",
179
        rescale_with_baseline: bool = False,
180
        baseline_path: Optional[str] = None,
181
        use_fast_tokenizer: bool = False,
182
        **kwargs
183
    ): ...
184

185
class InfoLM(Metric):
186
    def __init__(
187
        self,
188
        model_name_or_path: str = "google/bert_uncased_L-2_H-128_A-2",
189
        temperature: float = 0.25,
190
        measure_to_use: str = "fisher_rao",
191
        max_length: Optional[int] = None,
192
        device: Optional[Union[str, torch.device]] = None,
193
        batch_size: int = 64,
194
        num_threads: int = 4,
195
        verbose: bool = False,
196
        return_sentence_level_score: bool = False,
197
        **kwargs
198
    ): ...
199
```
200

201
## Usage Examples
202

203
### BLEU Score for Translation
204

205
```python
206
import torch
207
from torchmetrics.text import BLEUScore
208

209
# Initialize BLEU metric
210
bleu = BLEUScore()
211

212
# Sample predictions and references
213
preds = ["the cat is on the mat"]
214
target = [["there is a cat on the mat", "a cat is on the mat"]]
215

216
# Compute BLEU score
217
bleu_score = bleu(preds, target)
218
print(f"BLEU Score: {bleu_score:.4f}")
219

220
# 4-gram BLEU with smoothing
221
bleu_smooth = BLEUScore(n_gram=4, smooth=True)
222
bleu_smooth_score = bleu_smooth(preds, target)
223
print(f"Smoothed BLEU: {bleu_smooth_score:.4f}")
224
```
225

226
### ROUGE for Summarization
227

228
```python
229
from torchmetrics.text import ROUGEScore
230

231
# Initialize ROUGE metric
232
rouge = ROUGEScore()
233

234
# Sample summaries and references
235
preds = ["the quick brown fox jumps over the lazy dog"]
236
target = ["a quick brown fox jumps over a lazy dog"]
237

238
# Compute ROUGE scores
239
rouge_scores = rouge(preds, target)
240
print(f"ROUGE-1: {rouge_scores['rouge1_fmeasure']:.4f}")
241
print(f"ROUGE-2: {rouge_scores['rouge2_fmeasure']:.4f}")
242
print(f"ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")
243

244
# Custom ROUGE configuration
245
rouge_custom = ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL", "rougeLsum"))
246
rouge_custom_scores = rouge_custom(preds, target)
247
```
248

249
### Word Error Rate for ASR
250

251
```python
252
from torchmetrics.text import WordErrorRate, CharErrorRate
253

254
# Initialize error rate metrics
255
wer = WordErrorRate()
256
cer = CharErrorRate()
257

258
# ASR outputs vs ground truth
259
preds = ["this is a test"]
260
target = ["this is the test"]
261

262
# Compute error rates
263
wer_score = wer(preds, target)
264
cer_score = cer(preds, target)
265

266
print(f"Word Error Rate: {wer_score:.4f}")
267
print(f"Character Error Rate: {cer_score:.4f}")
268
```
269

270
### BERTScore for Semantic Similarity
271

272
```python
273
from torchmetrics.text import BERTScore
274

275
# Initialize BERTScore (requires transformers)
276
try:
277
    bertscore = BERTScore(model_name_or_path="distilbert-base-uncased")
278
    
279
    # Sample texts
280
    preds = ["the cat sat on the mat"]
281
    target = ["a cat was sitting on the mat"]
282
    
283
    # Compute BERTScore
284
    bert_scores = bertscore(preds, target)
285
    print(f"BERTScore F1: {bert_scores['f1']:.4f}")
286
    print(f"BERTScore Precision: {bert_scores['precision']:.4f}")
287
    print(f"BERTScore Recall: {bert_scores['recall']:.4f}")
288
    
289
except ImportError:
290
    print("BERTScore requires the 'transformers' package")
291
```
292

293
### Edit Distance
294

295
```python
296
from torchmetrics.text import EditDistance
297

298
# Initialize edit distance
299
edit_dist = EditDistance()
300

301
# Sample strings
302
preds = ["kitten"]
303
target = ["sitting"]
304

305
# Compute edit distance
306
distance = edit_dist(preds, target)
307
print(f"Edit Distance: {distance:.0f}")
308

309
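# Mean edit distance across samples (reduction="mean" is already the default, so this matches the result above; it is not length-normalized)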
310
edit_dist_norm = EditDistance(reduction="mean")
311
norm_distance = edit_dist_norm(preds, target)
312
print(f"Normalized Edit Distance: {norm_distance:.4f}")
313
```
314

315
### Perplexity for Language Models
316

317
```python
318
from torchmetrics.text import Perplexity
319
import torch
320

321
# Initialize perplexity metric
322
perplexity = Perplexity()
323

324
# Language model predictions (unnormalized logits, one score per vocabulary entry)
325
# Shape: (batch_size, sequence_length, vocab_size)
326
preds = torch.randn(2, 8, 1000)  # 2 sequences, 8 tokens, vocab size 1000
327
target = torch.randint(0, 1000, (2, 8))  # target token ids
328

329
# Compute perplexity
330
ppl_score = perplexity(preds, target)
331
print(f"Perplexity: {ppl_score:.4f}")
332
```
333

334
### Translation Edit Rate (TER)
335

336
```python
337
from torchmetrics.text import TranslationEditRate
338

339
# Initialize TER metric
340
ter = TranslationEditRate(normalize=True, lowercase=True)
341

342
# Translation examples
343
preds = ["The cat is on the mat"]
344
target = ["There is a cat on the mat"]
345

346
# Compute TER
347
ter_score = ter(preds, target)
348
print(f"Translation Edit Rate: {ter_score:.4f}")
349
```
350

351
### SQuAD Metric for QA
352

353
```python
354
from torchmetrics.text import SQuAD
355

356
# Initialize SQuAD metric
357
squad = SQuAD()
358

359
# QA predictions and references
360
preds = [{"prediction_text": "Denver Broncos", "id": "56be4db0acb8001400a502ec"}]
361
target = [{"answers": {"answer_start": [177], "text": ["Denver Broncos"]}, 
362
          "id": "56be4db0acb8001400a502ec"}]
363

364
# Compute SQuAD scores
365
squad_scores = squad(preds, target)
366
print(f"Exact Match: {squad_scores['exact_match']:.4f}")
367
print(f"F1 Score: {squad_scores['f1']:.4f}")
368
```
369

370
### Multi-Reference Evaluation
371

372
```python
373
from torchmetrics.text import BLEUScore, ROUGEScore
374

375
# Multiple reference translations/summaries
376
preds = ["the cat is on the mat"]
377
target = [["there is a cat on the mat", 
378
          "a cat is on the mat", 
379
          "the cat sits on the mat"]]
380

381
# BLEU with multiple references
382
bleu_multi = BLEUScore()
383
bleu_score = bleu_multi(preds, target)
384
print(f"Multi-reference BLEU: {bleu_score:.4f}")
385

386
# ROUGE with multiple references
387
rouge_multi = ROUGEScore()
388
rouge_scores = rouge_multi(preds, target)
389
print(f"Multi-reference ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")
390
```
391

392
## Types
393

394
```python { .api }
395
from typing import Union, Optional, Sequence, List, Dict, Any, Callable, Tuple, Literal
396
import torch
397
from torch import Tensor
398

399
TextInput = Union[str, List[str]]
400
TextTarget = Union[str, List[str], List[List[str]]]  # Multiple references supported
401

402
ROUGEKeys = Union[str, Tuple[str, ...]]
403
AccumulateType = Literal["avg", "best"]
404
MeasureType = Literal["fisher_rao", "kl_divergence", "js_divergence"]
405
```

Version

Tile

Files

text.md docs/

text.mddocs/