or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-objects.md, index.md, language-models.md, pattern-matching.md, pipeline-components.md, training.md, visualization.md

docs/training.md

0

# Training and Model Building

1

2

Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications. spaCy provides a complete training framework with support for multiple architectures and optimization strategies.

3

4

## Capabilities

5

6

### Training Functions

7

8

Core functions for training and evaluating spaCy models.

9

10

```python { .api }

11

def train(nlp: Language, examples: List[Example], sgd: Optimizer = None,

12

losses: dict = None, component_cfg: dict = None,

13

exclude: List[str] = None) -> dict:

14

"""

15

Train a spaCy model on examples.

16

17

Args:

18

nlp: Language object with pipeline components

19

examples: Training examples

20

sgd: Optimizer (created automatically if None)

21

losses: Dictionary to track losses

22

component_cfg: Component-specific config

23

exclude: Components to exclude from training

24

25

Returns:

26

Dictionary of losses by component

27

"""

28

29

def evaluate(nlp: Language, examples: List[Example],

30

verbose: bool = False, **kwargs) -> dict:

31

"""

32

Evaluate a spaCy model on examples.

33

34

Args:

35

nlp: Language object to evaluate

36

examples: Evaluation examples

37

verbose: Print detailed results

38

39

Returns:

40

Dictionary of evaluation metrics

41

"""

42

```

43

44

### Training Data Classes

45

46

Classes for representing and managing training data.

47

48

```python { .api }

49

class Example:

50

"""Training example with reference and predicted annotations."""

51

52

def __init__(self, predicted: Doc, reference: Doc) -> None:

53

"""Create an Example from predicted and reference docs."""

54

55

@classmethod

56

def from_dict(cls, predicted: Doc, example_dict: dict) -> 'Example':

57

"""Create Example from a dictionary of annotations."""

58

59

@property

60

def predicted(self) -> Doc:

61

"""The predicted Doc object."""

62

63

@property

64

def reference(self) -> Doc:

65

"""The reference Doc object with gold annotations."""

66

67

def get_aligned_parse(self, projectivize: bool = True) -> List[dict]:

68

"""Get aligned dependency parse."""

69

70

def get_aligned_ner(self) -> List[tuple]:

71

"""Get aligned named entity annotations."""

72

73

def get_aligned_spans(self, spans_key: str) -> List[tuple]:

74

"""Get aligned spans for a given key."""

75

76

def to_dict(self) -> dict:

77

"""Convert Example to dictionary format."""

78

```

79

80

### Training Utilities

81

82

Utility classes for training configuration and data management.

83

84

```python { .api }

85

class Config:

86

"""Configuration object for training."""

87

88

def __init__(self, data: dict = None) -> None:

89

"""Initialize config from dictionary."""

90

91

@classmethod

92

def from_str(cls, text: str) -> 'Config':

93

"""Create config from string."""

94

95

@classmethod

96

def from_disk(cls, path: str) -> 'Config':

97

"""Load config from disk."""

98

99

def to_disk(self, path: str) -> None:

100

"""Save config to disk."""

101

102

def interpolate(self) -> 'Config':

103

"""Resolve variable interpolations."""

104

105

class Corpus:

106

"""Training corpus with data loading utilities."""

107

108

def __init__(self, train_path: str, dev_path: str, **kwargs) -> None:

109

"""Initialize corpus with data paths."""

110

111

def train_dataset(self, nlp: Language) -> Iterator[Example]:

112

"""Get training examples."""

113

114

def dev_dataset(self, nlp: Language) -> Iterator[Example]:

115

"""Get development examples."""

116

```

117

118

### Model Architecture Components

119

120

Neural network components for building custom models.

121

122

```python { .api }

123

class Tok2Vec:

124

"""Token-to-vector encoder component."""

125

126

def __init__(self, vocab: Vocab, model: Model, **cfg) -> None:

127

"""Initialize tok2vec component."""

128

129

def __call__(self, doc: Doc) -> Doc:

130

"""Add token vectors to doc."""

131

132

def predict(self, docs: List[Doc]) -> List[numpy.ndarray]:

133

"""Predict token vectors."""

134

135

def set_annotations(self, docs: List[Doc],

136

predictions: List[numpy.ndarray]) -> None:

137

"""Set token vector annotations."""

138

139

def build_tok2vec_model(embed: Model, encode: Model) -> Model:

140

"""

141

Build a tok2vec model from embedding and encoding layers.

142

143

Args:

144

embed: Embedding layer (HashEmbed, CharacterEmbed, etc.)

145

encode: Encoding layer (MaxoutWindowEncoder, etc.)

146

147

Returns:

148

Complete tok2vec model

149

"""

150

151

def build_hash_embed_cnn_tok2vec(width: int, depth: int,

152

embed_size: int, **kwargs) -> Model:

153

"""Build CNN-based tok2vec with hash embedding."""

154

155

def build_transformer_model(name: str, **kwargs) -> Model:

156

"""Build transformer-based model."""

157

```

158

159

### Evaluation and Scoring

160

161

Classes for computing evaluation metrics and scores.

162

163

```python { .api }

164

class Scorer:

165

"""Evaluation scorer for spaCy models."""

166

167

def __init__(self, nlp: Language = None, **kwargs) -> None:

168

"""Initialize scorer."""

169

170

def score(self, examples: List[Example]) -> dict:

171

"""Score examples and return metrics."""

172

173

def score_tokenization(self, examples: List[Example]) -> dict:

174

"""Score tokenization accuracy."""

175

176

def score_token_attr(self, examples: List[Example],

177

attr: str, **kwargs) -> dict:

178

"""Score token-level attribute accuracy."""

179

180

def score_spans(self, examples: List[Example],

181

attr: str, **kwargs) -> dict:

182

"""Score span-level predictions."""

183

184

def score_cats(self, examples: List[Example], **kwargs) -> dict:

185

"""Score text classification."""

186

187

class PRFScore:

188

"""Precision, recall, and F-score container."""

189

190

def __init__(self) -> None:

191

"""Initialize score tracking."""

192

193

@property

194

def precision(self) -> float:

195

"""Precision score."""

196

197

@property

198

def recall(self) -> float:

199

"""Recall score."""

200

201

@property

202

def fscore(self) -> float:

203

"""F1 score."""

204

```

205

206

## Training Workflows

207

208

### Basic Training Example

209

210

```python

211

import spacy

212

from spacy.training import Example

213

from spacy.util import minibatch

214

import random

215

216

# Create blank model

217

nlp = spacy.blank("en")

218

219

# Add components

220

ner = nlp.add_pipe("ner")

221

ner.add_label("COMPANY")

222

ner.add_label("PERSON")

223

224

# Training data

225

TRAINING_DATA = [

226

("Apple Inc. was founded by Steve Jobs.", {

227

"entities": [(0, 10, "COMPANY"), (26, 36, "PERSON")]

228

}),

229

("Google hired Larry Page as CEO.", {

230

"entities": [(0, 6, "COMPANY"), (13, 23, "PERSON")]

231

}),

232

("Microsoft CEO is Satya Nadella.", {

233

"entities": [(0, 9, "COMPANY"), (17, 30, "PERSON")]

234

})

235

]

236

237

# Convert to Example objects

238

examples = []

239

for text, annotations in TRAINING_DATA:

240

doc = nlp.make_doc(text)

241

example = Example.from_dict(doc, annotations)

242

examples.append(example)

243

244

# Initialize training

245

nlp.initialize(lambda: examples)

246

247

# Training loop

248

for epoch in range(10):

249

random.shuffle(examples)

250

losses = {}

251

252

# Batch training

253

batches = minibatch(examples, size=2)

254

for batch in batches:

255

nlp.update(batch, losses=losses)

256

257

print(f"Epoch {epoch}, Losses: {losses}")

258

259

# Save trained model

260

nlp.to_disk("./custom_ner_model")

261

```

262

263

### Training with Configuration Files

264

265

```python

266

import spacy

267

from spacy.training import Example
from spacy.training.initialize import init_nlp

268

from spacy.util import load_config, minibatch

269

270

# Load configuration

271

config = load_config("./config.cfg")

272

273

# Initialize model from config

274

nlp = init_nlp(config)

275

276

# Load training data

277

def load_data(path):

278

"""Load training data from file."""

279

examples = []

280

# Load and convert your data format to Example objects

281

return examples

282

283

train_examples = load_data("train.json")

284

dev_examples = load_data("dev.json")

285

286

# Initialize training

287

nlp.initialize(lambda: train_examples)

288

289

# Training with config settings

290

for epoch in range(config["training"]["max_epochs"]):

291

losses = {}

292

batches = minibatch(train_examples, size=config["training"]["batch_size"])

293

294

for batch in batches:

295

nlp.update(batch, losses=losses)

296

297

# Evaluate

298

scores = nlp.evaluate(dev_examples)

299

print(f"Epoch {epoch}: {scores}")

300

```

301

302

### Fine-tuning Existing Models

303

304

```python

305

import spacy

306

from spacy.training import Example

307

308

# Load existing model

309

nlp = spacy.load("en_core_web_sm")

310

311

# Get NER component

312

ner = nlp.get_pipe("ner")

313

314

# Add new labels

315

ner.add_label("PRODUCT")

316

ner.add_label("BRAND")

317

318

# Domain-specific training data

319

DOMAIN_DATA = [

320

("iPhone 12 is Apple's latest smartphone.", {

321

"entities": [(0, 9, "PRODUCT"), (13, 18, "BRAND")]

322

}),

323

("Samsung Galaxy S21 features 5G connectivity.", {

324

"entities": [(0, 7, "BRAND"), (8, 18, "PRODUCT")]

325

})

326

]

327

328

# Convert to examples

329

examples = []

330

for text, annotations in DOMAIN_DATA:

331

doc = nlp.make_doc(text)

332

example = Example.from_dict(doc, annotations)

333

examples.append(example)

334

335

# Fine-tune starting from the loaded weights (this snippet does not change the learning rate)

336

optimizer = nlp.resume_training()

337

for i in range(20):

338

losses = {}

339

nlp.update(examples, losses=losses, sgd=optimizer)

340

print(f"Iteration {i}, Losses: {losses}")

341

342

# Save fine-tuned model

343

nlp.to_disk("./fine_tuned_model")

344

```

345

346

### Custom Pipeline Component Training

347

348

```python

349

import spacy

350

from spacy import Language

351

from spacy.training import Example

352

353

@Language.factory("custom_classifier")

354

class CustomClassifier:

355

"""Custom text classifier component."""

356

357

def __init__(self, nlp, name):

358

self.name = name

359

self.labels = set()

360

# Initialize your model here

361

362

def __call__(self, doc):

363

# Apply classification

364

doc.cats = self.predict(doc)

365

return doc

366

367

def predict(self, doc):

368

# Your prediction logic

369

return {"POSITIVE": 0.8, "NEGATIVE": 0.2}

370

371

def update(self, examples, losses=None, sgd=None):

372

# Training logic

373

pass

374

375

def add_label(self, label):

376

self.labels.add(label)

377

378

# Create model with custom component

379

nlp = spacy.blank("en")

380

classifier = nlp.add_pipe("custom_classifier")

381

classifier.add_label("POSITIVE")

382

classifier.add_label("NEGATIVE")

383

384

# Training data for classification

385

TRAINING_DATA = [

386

("This movie is great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),

387

("I hate this product.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}})

388

]

389

390

examples = []

391

for text, annotations in TRAINING_DATA:

392

doc = nlp.make_doc(text)

393

example = Example.from_dict(doc, annotations)

394

examples.append(example)

395

396

# Train custom component

397

nlp.initialize()

398

for i in range(10):

399

losses = {}

400

nlp.update(examples, losses=losses)

401

print(f"Losses: {losses}")

402

```

403

404

### Multi-task Training

405

406

```python

407

import spacy

408

from spacy.training import Example

409

410

# Create model with multiple components

411

nlp = spacy.blank("en")

412

nlp.add_pipe("tagger")

413

nlp.add_pipe("ner")

414

nlp.add_pipe("textcat")

415

416

# Add labels

417

ner = nlp.get_pipe("ner")

418

ner.add_label("PERSON")

419

ner.add_label("ORG")

420

421

textcat = nlp.get_pipe("textcat")

422

textcat.add_label("POSITIVE")

423

textcat.add_label("NEGATIVE")

424

425

# Multi-task training data

426

TRAINING_DATA = [

427

("Apple Inc. makes great products!", {

428

"entities": [(0, 10, "ORG")],

429

"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}

430

}),

431

("John Smith dislikes Microsoft.", {

432

"entities": [(0, 10, "PERSON"), (20, 29, "ORG")],

433

"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}

434

})

435

]

436

437

examples = []

438

for text, annotations in TRAINING_DATA:

439

doc = nlp.make_doc(text)

440

example = Example.from_dict(doc, annotations)

441

examples.append(example)

442

443

# Joint training

444

nlp.initialize()

445

for epoch in range(20):

446

losses = {}

447

nlp.update(examples, losses=losses)

448

print(f"Epoch {epoch}, Losses: {losses}")

449

```

450

451

### Evaluation and Model Selection

452

453

```python

454

import spacy

455

from spacy.training import Example

456

from spacy.scorer import Scorer

457

458

# Load model and test data

459

nlp = spacy.load("./trained_model")

460

test_examples = load_test_data() # Your test data loading function

461

462

# Evaluate model

463

scorer = Scorer()

464

scores = nlp.evaluate(test_examples)  # runs the pipeline on each example, then scores the predictions

465

466

print("Evaluation Results:")

467

print(f"Token accuracy: {scores['token_acc']:.3f}")

468

print(f"POS accuracy: {scores['tag_acc']:.3f}")

469

print(f"NER precision: {scores['ents_p']:.3f}")

470

print(f"NER recall: {scores['ents_r']:.3f}")

471

print(f"NER F1: {scores['ents_f']:.3f}")

472

473

# Component-specific evaluation

474

ner_scores = scorer.score_spans(test_examples, "ents")

475

print(f"NER scores by label: {ner_scores['ents_per_type']}")

476

477

# Detailed error analysis

478

for example in test_examples[:5]:

479

pred_ents = [(ent.start, ent.end, ent.label_) for ent in example.predicted.ents]

480

ref_ents = [(ent.start, ent.end, ent.label_) for ent in example.reference.ents]

481

482

print(f"Text: {example.predicted.text}")

483

print(f"Predicted: {pred_ents}")

484

print(f"Reference: {ref_ents}")

485

print("---")

486

```

487

488

### Advanced Training with Callbacks

489

490

```python

491

import spacy

492

from spacy.training import Example

493

from spacy.util import minibatch

494

495

# Training with callbacks

496

def create_evaluation_callback(nlp, dev_examples):

497

"""Create callback for evaluation during training."""

498

def evaluate_model():

499

scores = nlp.evaluate(dev_examples)

500

print(f"Dev scores: {scores}")

501

return scores

502

return evaluate_model

503

504

def create_save_callback(nlp, save_path):

505

"""Create callback to save best model."""

506

best_score = 0.0

507

def save_if_better(scores):

508

nonlocal best_score

509

current_score = scores.get("ents_f", 0.0)

510

if current_score > best_score:

511

best_score = current_score

512

nlp.to_disk(save_path)

513

print(f"Saved new best model with F1: {current_score:.3f}")

514

return save_if_better

515

516

# Training with callbacks

517

nlp = spacy.blank("en")

518

nlp.add_pipe("ner")

519

520

train_examples = load_training_data()

521

dev_examples = load_dev_data()

522

523

eval_callback = create_evaluation_callback(nlp, dev_examples)

524

save_callback = create_save_callback(nlp, "./best_model")

525

526

nlp.initialize()

527

528

for epoch in range(50):

529

losses = {}

530

batches = minibatch(train_examples, size=8)

531

532

for batch in batches:

533

nlp.update(batch, losses=losses)

534

535

# Evaluate every 10 epochs

536

if epoch % 10 == 0:

537

scores = eval_callback()

538

save_callback(scores)

539

```