or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

audio.md classification.md clustering.md detection.md functional.md image.md index.md multimodal.md nominal.md regression.md retrieval.md segmentation.md shape.md text.md utilities.md video.md

docs/text.md

0

# Text Metrics

1

2

Natural language processing metrics for translation, summarization, and text generation evaluation, including BLEU, ROUGE, and semantic similarity measures, for comprehensive text quality assessment.

3

4

## Capabilities

5

6

### Machine Translation Metrics

7

8

Metrics for evaluating machine translation quality and n-gram overlap.

9

10

```python { .api }

11

class BLEUScore(Metric):

12

def __init__(

13

self,

14

n_gram: int = 4,

15

smooth: bool = False,

16

weights: Optional[Sequence[float]] = None,

17

**kwargs

18

): ...

19

20

class SacreBLEUScore(Metric):

21

def __init__(

22

self,

23

n_gram: int = 4,

24

smooth: bool = False,

25

tokenize: Optional[str] = None,

26

lowercase: bool = False,

27

**kwargs

28

): ...

29

30

class CHRFScore(Metric):

31

def __init__(

32

self,

33

n_char_order: int = 6,

34

n_word_order: int = 2,

35

beta: float = 2.0,

36

lowercase: bool = False,

37

whitespace: bool = False,

38

**kwargs

39

): ...

40

```

41

42

### Summarization Metrics

43

44

Metrics for evaluating automatic summarization quality.

45

46

```python { .api }

47

class ROUGEScore(Metric):

48

def __init__(

49

self,

50

rouge_keys: Union[str, Tuple[str, ...]] = ("rouge1", "rouge2", "rougeL"),

51

use_stemmer: bool = False,

52

normalizer: Optional[Callable[[str], str]] = None,

53

tokenizer: Optional[Callable[[str], Sequence[str]]] = None,

54

accumulate: str = "best",

55

**kwargs

56

): ...

57

```

58

59

### Error Rate Metrics

60

61

Character- and word-level error rate measurements for automatic speech recognition (ASR) and text processing.

62

63

```python { .api }

64

class CharErrorRate(Metric):

65

def __init__(

66

self,

67

**kwargs

68

): ...

69

70

class WordErrorRate(Metric):

71

def __init__(

72

self,

73

**kwargs

74

): ...

75

76

class MatchErrorRate(Metric):

77

def __init__(

78

self,

79

**kwargs

80

): ...

81

82

class TranslationEditRate(Metric):

83

def __init__(

84

self,

85

normalize: bool = False,

86

no_punctuation: bool = False,

87

lowercase: bool = True,

88

asian_support: bool = False,

89

**kwargs

90

): ...

91

```

92

93

### Edit Distance Metrics

94

95

String similarity and distance measures for sequence comparison.

96

97

```python { .api }

98

class EditDistance(Metric):

99

def __init__(

100

self,

101

substitution_cost: int = 1,

102

reduction: Optional[str] = "mean",

103

**kwargs

104

): ...

105

106

class ExtendedEditDistance(Metric):

107

def __init__(

108

self,

109

language: str = "en",

110

return_sentence_level_score: bool = False,

111

alpha: float = 2.0,

112

rho: float = 0.3,

113

deletion: float = 0.2,

114

insertion: float = 1.0,

115

substitution: float = 1.0,

116

**kwargs

117

): ...

118

```

119

120

### Information Metrics

121

122

Information-theoretic measures for text quality assessment.

123

124

```python { .api }

125

class WordInfoLost(Metric):

126

def __init__(

127

self,

128

**kwargs

129

): ...

130

131

class WordInfoPreserved(Metric):

132

def __init__(

133

self,

134

**kwargs

135

): ...

136

137

class Perplexity(Metric):

138

def __init__(

139

self,

140

ignore_index: int = -100,

141

**kwargs

142

): ...

143

```

144

145

### Question Answering Metrics

146

147

Specialized metrics for question answering task evaluation.

148

149

```python { .api }

150

class SQuAD(Metric):

151

def __init__(

152

self,

153

**kwargs

154

): ...

155

```

156

157

### Semantic Similarity Metrics

158

159

Deep learning-based semantic similarity measures (require optional dependencies).

160

161

```python { .api }

162

class BERTScore(Metric):

163

def __init__(

164

self,

165

model_name_or_path: str = "distilbert-base-uncased",

166

num_layers: Optional[int] = None,

167

all_layers: bool = False,

168

model_type: Optional[str] = None,

169

user_forward_fn: Optional[Callable[[Any, Tensor], Tensor]] = None,

170

user_tokenizer: Optional[Any] = None,

171

verbose: bool = False,

172

idf: bool = False,

173

device: Optional[Union[str, torch.device]] = None,

174

max_length: int = 512,

175

batch_size: int = 64,

176

num_threads: int = 4,

177

return_hash: bool = False,

178

lang: str = "en",

179

rescale_with_baseline: bool = False,

180

baseline_path: Optional[str] = None,

181

use_fast_tokenizer: bool = False,

182

**kwargs

183

): ...

184

185

class InfoLM(Metric):

186

def __init__(

187

self,

188

model_name_or_path: str = "google/bert_uncased_L-2_H-128_A-2",

189

temperature: float = 0.25,

190

measure_to_use: str = "fisher_rao",

191

max_length: Optional[int] = None,

192

device: Optional[Union[str, torch.device]] = None,

193

batch_size: int = 64,

194

num_threads: int = 4,

195

verbose: bool = False,

196

return_sentence_level_score: bool = False,

197

**kwargs

198

): ...

199

```

200

201

## Usage Examples

202

203

### BLEU Score for Translation

204

205

```python

206

import torch

207

from torchmetrics.text import BLEUScore

208

209

# Initialize BLEU metric

210

bleu = BLEUScore()

211

212

# Sample predictions and references

213

preds = ["the cat is on the mat"]

214

target = [["there is a cat on the mat", "a cat is on the mat"]]

215

216

# Compute BLEU score

217

bleu_score = bleu(preds, target)

218

print(f"BLEU Score: {bleu_score:.4f}")

219

220

# 4-gram BLEU with smoothing

221

bleu_smooth = BLEUScore(n_gram=4, smooth=True)

222

bleu_smooth_score = bleu_smooth(preds, target)

223

print(f"Smoothed BLEU: {bleu_smooth_score:.4f}")

224

```

225

226

### ROUGE for Summarization

227

228

```python

229

from torchmetrics.text import ROUGEScore

230

231

# Initialize ROUGE metric

232

rouge = ROUGEScore()

233

234

# Sample summaries and references

235

preds = ["the quick brown fox jumps over the lazy dog"]

236

target = ["a quick brown fox jumps over a lazy dog"]

237

238

# Compute ROUGE scores

239

rouge_scores = rouge(preds, target)

240

print(f"ROUGE-1: {rouge_scores['rouge1_fmeasure']:.4f}")

241

print(f"ROUGE-2: {rouge_scores['rouge2_fmeasure']:.4f}")

242

print(f"ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")

243

244

# Custom ROUGE configuration

245

rouge_custom = ROUGEScore(rouge_keys=("rouge1", "rouge2", "rougeL", "rougeLsum"))

246

rouge_custom_scores = rouge_custom(preds, target)

247

```

248

249

### Word Error Rate for ASR

250

251

```python

252

from torchmetrics.text import WordErrorRate, CharErrorRate

253

254

# Initialize error rate metrics

255

wer = WordErrorRate()

256

cer = CharErrorRate()

257

258

# ASR outputs vs ground truth

259

preds = ["this is a test"]

260

target = ["this is the test"]

261

262

# Compute error rates

263

wer_score = wer(preds, target)

264

cer_score = cer(preds, target)

265

266

print(f"Word Error Rate: {wer_score:.4f}")

267

print(f"Character Error Rate: {cer_score:.4f}")

268

```

269

270

### BERTScore for Semantic Similarity

271

272

```python

273

from torchmetrics.text import BERTScore

274

275

# Initialize BERTScore (requires transformers)

276

try:

277

bertscore = BERTScore(model_name_or_path="distilbert-base-uncased")

278

279

# Sample texts

280

preds = ["the cat sat on the mat"]

281

target = ["a cat was sitting on the mat"]

282

283

# Compute BERTScore

284

bert_scores = bertscore(preds, target)

285

print(f"BERTScore F1: {bert_scores['f1']:.4f}")

286

print(f"BERTScore Precision: {bert_scores['precision']:.4f}")

287

print(f"BERTScore Recall: {bert_scores['recall']:.4f}")

288

289

except ImportError:

290

print("BERTScore requires the 'transformers' package")

291

```

292

293

### Edit Distance

294

295

```python

296

from torchmetrics.text import EditDistance

297

298

# Initialize edit distance

299

edit_dist = EditDistance()

300

301

# Sample strings

302

preds = ["kitten"]

303

target = ["sitting"]

304

305

# Compute edit distance

306

distance = edit_dist(preds, target)

307

print(f"Edit Distance: {distance:.0f}")

308

309

# Mean edit distance over all samples (reduction="mean" is the default)

310

edit_dist_norm = EditDistance(reduction="mean")

311

norm_distance = edit_dist_norm(preds, target)

312

print(f"Normalized Edit Distance: {norm_distance:.4f}")

313

```

314

315

### Perplexity for Language Models

316

317

```python

318

from torchmetrics.text import Perplexity

319

import torch

320

321

# Initialize perplexity metric

322

perplexity = Perplexity()

323

324

# Language model predictions (unnormalized logits; normalized internally with softmax)

325

# Shape: (batch_size, sequence_length, vocab_size)

326

preds = torch.randn(2, 8, 1000) # 2 sequences, 8 tokens, vocab size 1000

327

target = torch.randint(0, 1000, (2, 8)) # target token ids

328

329

# Compute perplexity

330

ppl_score = perplexity(preds, target)

331

print(f"Perplexity: {ppl_score:.4f}")

332

```

333

334

### Translation Edit Rate (TER)

335

336

```python

337

from torchmetrics.text import TranslationEditRate

338

339

# Initialize TER metric

340

ter = TranslationEditRate(normalize=True, lowercase=True)

341

342

# Translation examples

343

preds = ["The cat is on the mat"]

344

target = ["There is a cat on the mat"]

345

346

# Compute TER

347

ter_score = ter(preds, target)

348

print(f"Translation Edit Rate: {ter_score:.4f}")

349

```

350

351

### SQuAD Metric for QA

352

353

```python

354

from torchmetrics.text import SQuAD

355

356

# Initialize SQuAD metric

357

squad = SQuAD()

358

359

# QA predictions and references

360

preds = [{"prediction_text": "Denver Broncos", "id": "56be4db0acb8001400a502ec"}]

361

target = [{"answers": {"answer_start": [177], "text": ["Denver Broncos"]},

362

"id": "56be4db0acb8001400a502ec"}]

363

364

# Compute SQuAD scores

365

squad_scores = squad(preds, target)

366

print(f"Exact Match: {squad_scores['exact_match']:.4f}")

367

print(f"F1 Score: {squad_scores['f1']:.4f}")

368

```

369

370

### Multi-Reference Evaluation

371

372

```python

373

from torchmetrics.text import BLEUScore, ROUGEScore

374

375

# Multiple reference translations/summaries

376

preds = ["the cat is on the mat"]

377

target = [["there is a cat on the mat",

378

"a cat is on the mat",

379

"the cat sits on the mat"]]

380

381

# BLEU with multiple references

382

bleu_multi = BLEUScore()

383

bleu_score = bleu_multi(preds, target)

384

print(f"Multi-reference BLEU: {bleu_score:.4f}")

385

386

# ROUGE with multiple references

387

rouge_multi = ROUGEScore()

388

rouge_scores = rouge_multi(preds, target)

389

print(f"Multi-reference ROUGE-L: {rouge_scores['rougeL_fmeasure']:.4f}")

390

```

391

392

## Types

393

394

```python { .api }

395

from typing import Union, Optional, Sequence, List, Dict, Any, Callable, Tuple

396

import torch

397

from torch import Tensor

398

399

TextInput = Union[str, List[str]]

400

TextTarget = Union[str, List[str], List[List[str]]] # Multiple references supported

401

402

ROUGEKeys = Union[str, Tuple[str, ...]]

403

AccumulateType = Union["avg", "best"]

404

MeasureType = Union["fisher_rao", "kl_divergence", "js_divergence"]

405

```