0
# Training and Model Building
1
2
Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications. spaCy provides a complete training framework with support for multiple architectures and optimization strategies.
3
4
## Capabilities
5
6
### Training Functions
7
8
Core functions for training and evaluating spaCy models.
9
10
```python { .api }
11
def train(nlp: Language, examples: List[Example], sgd: Optimizer = None,
12
losses: dict = None, component_cfg: dict = None,
13
exclude: List[str] = None) -> dict:
14
"""
15
Train a spaCy model on examples.
16
17
Args:
18
nlp: Language object with pipeline components
19
examples: Training examples
20
sgd: Optimizer (created automatically if None)
21
losses: Dictionary to track losses
22
component_cfg: Component-specific config
23
exclude: Components to exclude from training
24
25
Returns:
26
Dictionary of losses by component
27
"""
28
29
def evaluate(nlp: Language, examples: List[Example],
30
verbose: bool = False, **kwargs) -> dict:
31
"""
32
Evaluate a spaCy model on examples.
33
34
Args:
35
nlp: Language object to evaluate
36
examples: Evaluation examples
37
verbose: Print detailed results
38
39
Returns:
40
Dictionary of evaluation metrics
41
"""
42
```
43
44
### Training Data Classes
45
46
Classes for representing and managing training data.
47
48
```python { .api }
49
class Example:
50
"""Training example with reference and predicted annotations."""
51
52
def __init__(self, predicted: Doc, reference: Doc) -> None:
53
"""Create an Example from predicted and reference docs."""
54
55
@classmethod
56
def from_dict(cls, predicted: Doc, example_dict: dict) -> 'Example':
57
"""Create Example from a dictionary of annotations."""
58
59
@property
60
def predicted(self) -> Doc:
61
"""The predicted Doc object."""
62
63
@property
64
def reference(self) -> Doc:
65
"""The reference Doc object with gold annotations."""
66
67
def get_aligned_parse(self, projectivize: bool = True) -> List[dict]:
68
"""Get aligned dependency parse."""
69
70
def get_aligned_ner(self) -> List[tuple]:
71
"""Get aligned named entity annotations."""
72
73
def get_aligned_spans(self, spans_key: str) -> List[tuple]:
74
"""Get aligned spans for a given key."""
75
76
def to_dict(self) -> dict:
77
"""Convert Example to dictionary format."""
78
```
79
80
### Training Utilities
81
82
Utility classes for training configuration and data management.
83
84
```python { .api }
85
class Config:
86
"""Configuration object for training."""
87
88
def __init__(self, data: dict = None) -> None:
89
"""Initialize config from dictionary."""
90
91
@classmethod
92
def from_str(cls, text: str) -> 'Config':
93
"""Create config from string."""
94
95
@classmethod
96
def from_disk(cls, path: str) -> 'Config':
97
"""Load config from disk."""
98
99
def to_disk(self, path: str) -> None:
100
"""Save config to disk."""
101
102
def interpolate(self) -> 'Config':
103
"""Resolve variable interpolations."""
104
105
class Corpus:
106
"""Training corpus with data loading utilities."""
107
108
def __init__(self, train_path: str, dev_path: str, **kwargs) -> None:
109
"""Initialize corpus with data paths."""
110
111
def train_dataset(self, nlp: Language) -> Iterator[Example]:
112
"""Get training examples."""
113
114
def dev_dataset(self, nlp: Language) -> Iterator[Example]:
115
"""Get development examples."""
116
```
117
118
### Model Architecture Components
119
120
Neural network components for building custom models.
121
122
```python { .api }
123
class Tok2Vec:
124
"""Token-to-vector encoder component."""
125
126
def __init__(self, vocab: Vocab, model: Model, **cfg) -> None:
127
"""Initialize tok2vec component."""
128
129
def __call__(self, doc: Doc) -> Doc:
130
"""Add token vectors to doc."""
131
132
def predict(self, docs: List[Doc]) -> List[numpy.ndarray]:
133
"""Predict token vectors."""
134
135
def set_annotations(self, docs: List[Doc],
136
predictions: List[numpy.ndarray]) -> None:
137
"""Set token vector annotations."""
138
139
def build_tok2vec_model(embed: Model, encode: Model) -> Model:
140
"""
141
Build a tok2vec model from embedding and encoding layers.
142
143
Args:
144
embed: Embedding layer (HashEmbed, CharacterEmbed, etc.)
145
encode: Encoding layer (MaxoutWindowEncoder, etc.)
146
147
Returns:
148
Complete tok2vec model
149
"""
150
151
def build_hash_embed_cnn_tok2vec(width: int, depth: int,
152
embed_size: int, **kwargs) -> Model:
153
"""Build CNN-based tok2vec with hash embedding."""
154
155
def build_transformer_model(name: str, **kwargs) -> Model:
156
"""Build transformer-based model."""
157
```
158
159
### Evaluation and Scoring
160
161
Classes for computing evaluation metrics and scores.
162
163
```python { .api }
164
class Scorer:
165
"""Evaluation scorer for spaCy models."""
166
167
def __init__(self, nlp: Language = None, **kwargs) -> None:
168
"""Initialize scorer."""
169
170
def score(self, examples: List[Example]) -> dict:
171
"""Score examples and return metrics."""
172
173
def score_tokenization(self, examples: List[Example]) -> dict:
174
"""Score tokenization accuracy."""
175
176
def score_token_attr(self, examples: List[Example],
177
attr: str, **kwargs) -> dict:
178
"""Score token-level attribute accuracy."""
179
180
def score_spans(self, examples: List[Example],
181
attr: str, **kwargs) -> dict:
182
"""Score span-level predictions."""
183
184
def score_cats(self, examples: List[Example], **kwargs) -> dict:
185
"""Score text classification."""
186
187
class PRFScore:
188
"""Precision, recall, and F-score container."""
189
190
def __init__(self) -> None:
191
"""Initialize score tracking."""
192
193
@property
194
def precision(self) -> float:
195
"""Precision score."""
196
197
@property
198
def recall(self) -> float:
199
"""Recall score."""
200
201
@property
202
def fscore(self) -> float:
203
"""F1 score."""
204
```
205
206
## Training Workflows
207
208
### Basic Training Example
209
210
```python
211
import random

import spacy
from spacy.training import Example
from spacy.util import minibatch

# Create blank model
nlp = spacy.blank("en")

# Add components
ner = nlp.add_pipe("ner")
ner.add_label("COMPANY")
ner.add_label("PERSON")

# Training data.
# Entity spans are (start_char, end_char, label) with an exclusive end offset,
# and must line up with token boundaries (so trailing punctuation is excluded).
TRAINING_DATA = [
    ("Apple Inc. was founded by Steve Jobs.", {
        "entities": [(0, 10, "COMPANY"), (26, 36, "PERSON")],
    }),
    ("Google hired Larry Page as CEO.", {
        "entities": [(0, 6, "COMPANY"), (13, 23, "PERSON")],
    }),
    ("Microsoft CEO is Satya Nadella.", {
        # "Satya Nadella" ends at offset 30; offset 31 would swallow the final "."
        "entities": [(0, 9, "COMPANY"), (17, 30, "PERSON")],
    }),
]

# Convert to Example objects
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAINING_DATA
]

# Initialize training.
# `initialize` replaces the deprecated `begin_training` and returns the
# optimizer, which is reused for every update so its state is preserved.
optimizer = nlp.initialize(lambda: examples)

# Training loop
for epoch in range(10):
    random.shuffle(examples)
    losses = {}

    # Batch training
    for batch in minibatch(examples, size=2):
        nlp.update(batch, sgd=optimizer, losses=losses)

    print(f"Epoch {epoch}, Losses: {losses}")

# Save trained model
nlp.to_disk("./custom_ner_model")
261
```
262
263
### Training with Configuration Files
264
265
```python
266
import spacy
from spacy.training import Example
from spacy.training.initialize import init_nlp
from spacy.util import load_config, minibatch

# Load configuration
config = load_config("./config.cfg")

# Initialize model from config
nlp = init_nlp(config)


# Load training data
def load_data(path):
    """Load training data from file."""
    examples = []
    # Load and convert your data format to Example objects
    return examples


train_examples = load_data("train.json")
dev_examples = load_data("dev.json")

# Initialize training.
# Keep the returned optimizer and reuse it for every batch; creating a new
# optimizer per update would discard its accumulated state on each step.
optimizer = nlp.initialize(lambda: train_examples)

# Training with config settings
# NOTE(review): assumes config.cfg defines [training] max_epochs and a custom
# batch_size key — confirm against your config file.
max_epochs = config["training"]["max_epochs"]
batch_size = config["training"]["batch_size"]

for epoch in range(max_epochs):
    losses = {}

    for batch in minibatch(train_examples, size=batch_size):
        nlp.update(batch, losses=losses, sgd=optimizer)

    # Evaluate
    scores = nlp.evaluate(dev_examples)
    print(f"Epoch {epoch}: {scores}")
300
```
301
302
### Fine-tuning Existing Models
303
304
```python
305
import spacy
from spacy.training import Example

# Load existing model
nlp = spacy.load("en_core_web_sm")

# Get NER component
ner = nlp.get_pipe("ner")

# Add new labels
ner.add_label("PRODUCT")
ner.add_label("BRAND")

# Domain-specific training data
# Entity spans are (start_char, end_char, label) with an exclusive end offset.
DOMAIN_DATA = [
    ("iPhone 12 is Apple's latest smartphone.", {
        "entities": [(0, 9, "PRODUCT"), (13, 18, "BRAND")],
    }),
    ("Samsung Galaxy S21 features 5G connectivity.", {
        "entities": [(0, 7, "BRAND"), (8, 18, "PRODUCT")],
    }),
]

# Convert to examples
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in DOMAIN_DATA
]

# Fine-tune.
# `resume_training` returns an optimizer while keeping the existing weights
# (nothing is re-initialized). It does not lower the learning rate by itself;
# adjust the optimizer explicitly if a smaller step size is required.
optimizer = nlp.resume_training()

# Only the NER component is updated; the other components of the loaded
# pipeline are temporarily disabled and restored when the block exits.
with nlp.select_pipes(enable="ner"):
    for i in range(20):
        losses = {}
        nlp.update(examples, losses=losses, sgd=optimizer)
        print(f"Iteration {i}, Losses: {losses}")

# Save fine-tuned model (outside the block, so the full pipeline is saved)
nlp.to_disk("./fine_tuned_model")
344
```
345
346
### Custom Pipeline Component Training
347
348
```python
349
import spacy
from spacy.language import Language
from spacy.training import Example


# Stateful (class-based) components are registered with `Language.factory`:
# `nlp.add_pipe` then calls the class with (nlp, name) and returns the
# instance. `Language.component` is only meant for plain functions.
@Language.factory("custom_classifier")
class CustomClassifier:
    """Custom text classifier component."""

    def __init__(self, nlp, name):
        self.name = name
        self.labels = set()
        # Initialize your model here

    def __call__(self, doc):
        # Apply classification
        doc.cats = self.predict(doc)
        return doc

    def predict(self, doc):
        # Your prediction logic
        return {"POSITIVE": 0.8, "NEGATIVE": 0.2}

    def update(self, examples, losses=None, sgd=None, drop=0.0):
        """Update the component from a batch of examples.

        `drop` is accepted because `nlp.update` may forward a dropout rate
        to each component as a keyword argument.
        """
        if losses is None:
            losses = {}
        # Register this component in the loss dict so it shows up in logs.
        losses.setdefault(self.name, 0.0)
        # Training logic: compute the loss, add it to losses[self.name]
        return losses

    def add_label(self, label):
        self.labels.add(label)


# Create model with custom component
nlp = spacy.blank("en")
classifier = nlp.add_pipe("custom_classifier")
classifier.add_label("POSITIVE")
classifier.add_label("NEGATIVE")

# Training data for classification
TRAINING_DATA = [
    ("This movie is great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I hate this product.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]

examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAINING_DATA
]

# Train custom component
nlp.initialize()
for i in range(10):
    losses = {}
    nlp.update(examples, losses=losses)
    print(f"Losses: {losses}")
402
```
403
404
### Multi-task Training
405
406
```python
407
import spacy
from spacy.training import Example

# Create model with multiple components
nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("ner")
nlp.add_pipe("textcat")

# Add labels
ner = nlp.get_pipe("ner")
ner.add_label("PERSON")
ner.add_label("ORG")

textcat = nlp.get_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Multi-task training data.
# Every component needs gold annotations to learn from: "tags" for the tagger
# (one tag per token), "entities" for NER, and "cats" for the text classifier.
TRAINING_DATA = [
    ("Apple Inc. makes great products!", {
        "tags": ["NNP", "NNP", "VBZ", "JJ", "NNS", "."],
        "entities": [(0, 10, "ORG")],
        "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0},
    }),
    ("John Smith dislikes Microsoft.", {
        "tags": ["NNP", "NNP", "VBZ", "NNP", "."],
        "entities": [(0, 10, "PERSON"), (20, 29, "ORG")],
        "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0},
    }),
]

examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAINING_DATA
]

# Joint training.
# Passing the examples to `initialize` lets components derive their label
# sets from the data (the tagger has no labels added by hand above).
optimizer = nlp.initialize(lambda: examples)
for epoch in range(20):
    losses = {}
    nlp.update(examples, sgd=optimizer, losses=losses)
    print(f"Epoch {epoch}, Losses: {losses}")
449
```
450
451
### Evaluation and Model Selection
452
453
```python
454
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

# Load model and test data
nlp = spacy.load("./trained_model")
test_examples = load_test_data()  # Your test data loading function (returns a list of Example)

# Evaluate model.
# `nlp.evaluate` runs the loaded pipeline over the examples and then scores
# the fresh predictions. A bare `Scorer().score(...)` would only score
# whatever is already stored in `example.predicted`, without using `nlp`.
scores = nlp.evaluate(test_examples)

print("Evaluation Results:")
print(f"Token accuracy: {scores['token_acc']:.3f}")
print(f"POS accuracy: {scores['tag_acc']:.3f}")
print(f"NER precision: {scores['ents_p']:.3f}")
print(f"NER recall: {scores['ents_r']:.3f}")
print(f"NER F1: {scores['ents_f']:.3f}")

# Component-specific evaluation
ner_scores = Scorer.score_spans(test_examples, "ents")
print(f"NER scores by label: {ner_scores['ents_per_type']}")

# Detailed error analysis
for example in test_examples[:5]:
    pred_ents = [(ent.start, ent.end, ent.label_) for ent in example.predicted.ents]
    ref_ents = [(ent.start, ent.end, ent.label_) for ent in example.reference.ents]

    print(f"Text: {example.predicted.text}")
    print(f"Predicted: {pred_ents}")
    print(f"Reference: {ref_ents}")
    print("---")
486
```
487
488
### Advanced Training with Callbacks
489
490
```python
491
import spacy
from spacy.training import Example
from spacy.util import minibatch


# Training with callbacks
def create_evaluation_callback(nlp, dev_examples):
    """Create callback for evaluation during training."""
    def evaluate_model():
        scores = nlp.evaluate(dev_examples)
        print(f"Dev scores: {scores}")
        return scores
    return evaluate_model


def create_save_callback(nlp, save_path):
    """Create callback to save best model."""
    best_score = 0.0

    def save_if_better(scores):
        nonlocal best_score
        # The key may be present with a None value when no score could be
        # computed; treat that like a missing key instead of comparing None.
        current_score = scores.get("ents_f") or 0.0
        if current_score > best_score:
            best_score = current_score
            nlp.to_disk(save_path)
            print(f"Saved new best model with F1: {current_score:.3f}")
    return save_if_better


# Training with callbacks
nlp = spacy.blank("en")
nlp.add_pipe("ner")

train_examples = load_training_data()
dev_examples = load_dev_data()

eval_callback = create_evaluation_callback(nlp, dev_examples)
save_callback = create_save_callback(nlp, "./best_model")

# Pass the training examples so the NER component can derive its labels from
# the data (none are added by hand above), and keep the returned optimizer.
optimizer = nlp.initialize(lambda: train_examples)

for epoch in range(50):
    losses = {}

    for batch in minibatch(train_examples, size=8):
        nlp.update(batch, sgd=optimizer, losses=losses)

    # Evaluate every 10 epochs
    if epoch % 10 == 0:
        scores = eval_callback()
        save_callback(scores)
539
```