Tessl Tile for pypi/spacy@2.3.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

core-objects.md index.md language-models.md pattern-matching.md pipeline-components.md training.md visualization.md

training.mddocs/

0
# Training and Model Building
1

2
Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications. spaCy provides a complete training framework with support for multiple architectures and optimization strategies.
3

4
## Capabilities
5

6
### Training Functions
7

8
Core functions for training and evaluating spaCy models.
9

10
```python { .api }
11
def train(nlp: Language, examples: List[Example], sgd: Optimizer = None,
12
          losses: dict = None, component_cfg: dict = None,
13
          exclude: List[str] = None) -> dict:
14
    """
15
    Train a spaCy model on examples.
16
    
17
    Args:
18
        nlp: Language object with pipeline components
19
        examples: Training examples
20
        sgd: Optimizer (created automatically if None)
21
        losses: Dictionary to track losses
22
        component_cfg: Component-specific config
23
        exclude: Components to exclude from training
24
        
25
    Returns:
26
        Dictionary of losses by component
27
    """
28

29
def evaluate(nlp: Language, examples: List[Example], 
30
            verbose: bool = False, **kwargs) -> dict:
31
    """
32
    Evaluate a spaCy model on examples.
33
    
34
    Args:
35
        nlp: Language object to evaluate
36
        examples: Evaluation examples
37
        verbose: Print detailed results
38
        
39
    Returns:
40
        Dictionary of evaluation metrics
41
    """
42
```
43

44
### Training Data Classes
45

46
Classes for representing and managing training data.
47

48
```python { .api }
49
class Example:
50
    """Training example with reference and predicted annotations."""
51
    
52
    def __init__(self, predicted: Doc, reference: Doc) -> None:
53
        """Create an Example from predicted and reference docs."""
54
    
55
    @classmethod
56
    def from_dict(cls, predicted: Doc, example_dict: dict) -> 'Example':
57
        """Create Example from a dictionary of annotations."""
58
    
59
    @property
60
    def predicted(self) -> Doc:
61
        """The predicted Doc object."""
62
    
63
    @property  
64
    def reference(self) -> Doc:
65
        """The reference Doc object with gold annotations."""
66
    
67
    def get_aligned_parse(self, projectivize: bool = True) -> List[dict]:
68
        """Get aligned dependency parse."""
69
    
70
    def get_aligned_ner(self) -> List[tuple]:
71
        """Get aligned named entity annotations."""
72
    
73
    def get_aligned_spans(self, spans_key: str) -> List[tuple]:
74
        """Get aligned spans for a given key."""
75
    
76
    def to_dict(self) -> dict:
77
        """Convert Example to dictionary format."""
78
```
79

80
### Training Utilities
81

82
Utility classes for training configuration and data management.
83

84
```python { .api }
85
class Config:
86
    """Configuration object for training."""
87
    
88
    def __init__(self, data: dict = None) -> None:
89
        """Initialize config from dictionary."""
90
    
91
    @classmethod
92
    def from_str(cls, text: str) -> 'Config':
93
        """Create config from string."""
94
    
95
    @classmethod
96
    def from_disk(cls, path: str) -> 'Config':
97
        """Load config from disk."""
98
    
99
    def to_disk(self, path: str) -> None:
100
        """Save config to disk."""
101
    
102
    def interpolate(self) -> 'Config':
103
        """Resolve variable interpolations."""
104

105
class Corpus:
106
    """Training corpus with data loading utilities."""
107
    
108
    def __init__(self, train_path: str, dev_path: str, **kwargs) -> None:
109
        """Initialize corpus with data paths."""
110
    
111
    def train_dataset(self, nlp: Language) -> Iterator[Example]:
112
        """Get training examples."""
113
    
114
    def dev_dataset(self, nlp: Language) -> Iterator[Example]:
115
        """Get development examples."""
116
```
117

118
### Model Architecture Components
119

120
Neural network components for building custom models.
121

122
```python { .api }
123
class Tok2Vec:
124
    """Token-to-vector encoder component."""
125
    
126
    def __init__(self, vocab: Vocab, model: Model, **cfg) -> None:
127
        """Initialize tok2vec component."""
128
    
129
    def __call__(self, doc: Doc) -> Doc:
130
        """Add token vectors to doc."""
131
    
132
    def predict(self, docs: List[Doc]) -> List[numpy.ndarray]:
133
        """Predict token vectors."""
134
    
135
    def set_annotations(self, docs: List[Doc], 
136
                       predictions: List[numpy.ndarray]) -> None:
137
        """Set token vector annotations."""
138

139
def build_tok2vec_model(embed: Model, encode: Model) -> Model:
140
    """
141
    Build a tok2vec model from embedding and encoding layers.
142
    
143
    Args:
144
        embed: Embedding layer (HashEmbed, CharacterEmbed, etc.)
145
        encode: Encoding layer (MaxoutWindowEncoder, etc.)
146
        
147
    Returns:
148
        Complete tok2vec model
149
    """
150

151
def build_hash_embed_cnn_tok2vec(width: int, depth: int, 
152
                                embed_size: int, **kwargs) -> Model:
153
    """Build CNN-based tok2vec with hash embedding."""
154

155
def build_transformer_model(name: str, **kwargs) -> Model:
156
    """Build transformer-based model."""
157
```
158

159
### Evaluation and Scoring
160

161
Classes for computing evaluation metrics and scores.
162

163
```python { .api }
164
class Scorer:
165
    """Evaluation scorer for spaCy models."""
166
    
167
    def __init__(self, nlp: Language = None, **kwargs) -> None:
168
        """Initialize scorer."""
169
    
170
    def score(self, examples: List[Example]) -> dict:
171
        """Score examples and return metrics."""
172
    
173
    def score_tokenization(self, examples: List[Example]) -> dict:
174
        """Score tokenization accuracy."""
175
    
176
    def score_token_attr(self, examples: List[Example], 
177
                        attr: str, **kwargs) -> dict:
178
        """Score token-level attribute accuracy."""
179
    
180
    def score_spans(self, examples: List[Example], 
181
                   attr: str, **kwargs) -> dict:
182
        """Score span-level predictions."""
183
    
184
    def score_cats(self, examples: List[Example], **kwargs) -> dict:
185
        """Score text classification."""
186

187
class PRFScore:
188
    """Precision, recall, and F-score container."""
189
    
190
    def __init__(self) -> None:
191
        """Initialize score tracking."""
192
    
193
    @property
194
    def precision(self) -> float:
195
        """Precision score."""
196
    
197
    @property  
198
    def recall(self) -> float:
199
        """Recall score."""
200
    
201
    @property
202
    def fscore(self) -> float:
203
        """F1 score."""
204
```
205

206
## Training Workflows
207

208
### Basic Training Example
209

210
```python
211
import spacy
212
from spacy.training import Example
213
from spacy.util import minibatch
214
import random
215

216
# Create blank model
217
nlp = spacy.blank("en")
218

219
# Add components
220
ner = nlp.add_pipe("ner")
221
ner.add_label("COMPANY")
222
ner.add_label("PERSON")
223

224
# Training data
225
TRAINING_DATA = [
226
    ("Apple Inc. was founded by Steve Jobs.", {
227
        "entities": [(0, 10, "COMPANY"), (26, 36, "PERSON")]
228
    }),
229
    ("Google hired Larry Page as CEO.", {
230
        "entities": [(0, 6, "COMPANY"), (13, 23, "PERSON")]
231
    }),
232
    ("Microsoft CEO is Satya Nadella.", {
233
        "entities": [(0, 9, "COMPANY"), (17, 30, "PERSON")]
234
    })
235
]
236

237
# Convert to Example objects
238
examples = []
239
for text, annotations in TRAINING_DATA:
240
    doc = nlp.make_doc(text)
241
    example = Example.from_dict(doc, annotations)
242
    examples.append(example)
243

244
# Initialize training
245
nlp.initialize()
246

247
# Training loop
248
for epoch in range(10):
249
    random.shuffle(examples)
250
    losses = {}
251
    
252
    # Batch training
253
    batches = minibatch(examples, size=2)
254
    for batch in batches:
255
        nlp.update(batch, losses=losses)
256
    
257
    print(f"Epoch {epoch}, Losses: {losses}")
258

259
# Save trained model
260
nlp.to_disk("./custom_ner_model")
261
```
262

263
### Training with Configuration Files
264

265
```python
266
import spacy
267
from spacy.training import Example, init_nlp
268
from spacy.util import load_config, minibatch
269

270
# Load configuration
271
config = load_config("./config.cfg")
272

273
# Initialize model from config
274
nlp = init_nlp(config)
275

276
# Load training data
277
def load_data(path):
278
    """Load training data from file."""
279
    examples = []
280
    # Load and convert your data format to Example objects
281
    return examples
282

283
train_examples = load_data("train.json")
284
dev_examples = load_data("dev.json")
285

286
# Initialize training
287
nlp.initialize(lambda: train_examples)
288

289
# Training with config settings
290
for epoch in range(config["training"]["max_epochs"]):
291
    losses = {}
292
    batches = minibatch(train_examples, size=config["training"]["batch_size"])
293
    
294
    for batch in batches:
295
        nlp.update(batch, losses=losses)  # reuses the optimizer set up by nlp.initialize()
296
    
297
    # Evaluate
298
    scores = nlp.evaluate(dev_examples)
299
    print(f"Epoch {epoch}: {scores}")
300
```
301

302
### Fine-tuning Existing Models
303

304
```python
305
import spacy
306
from spacy.training import Example
307

308
# Load existing model
309
nlp = spacy.load("en_core_web_sm")
310

311
# Get NER component
312
ner = nlp.get_pipe("ner")
313

314
# Add new labels
315
ner.add_label("PRODUCT")
316
ner.add_label("BRAND")
317

318
# Domain-specific training data
319
DOMAIN_DATA = [
320
    ("iPhone 12 is Apple's latest smartphone.", {
321
        "entities": [(0, 9, "PRODUCT"), (13, 18, "BRAND")]
322
    }),
323
    ("Samsung Galaxy S21 features 5G connectivity.", {
324
        "entities": [(0, 7, "BRAND"), (8, 18, "PRODUCT")]
325
    })
326
]
327

328
# Convert to examples
329
examples = []
330
for text, annotations in DOMAIN_DATA:
331
    doc = nlp.make_doc(text)
332
    example = Example.from_dict(doc, annotations)
333
    examples.append(example)
334

335
# Fine-tune: resume_training() keeps the loaded weights and returns an optimizer
336
optimizer = nlp.resume_training()
337
for i in range(20):
338
    losses = {}
339
    nlp.update(examples, losses=losses, sgd=optimizer)
340
    print(f"Iteration {i}, Losses: {losses}")
341

342
# Save fine-tuned model
343
nlp.to_disk("./fine_tuned_model")
344
```
345

346
### Custom Pipeline Component Training
347

348
```python
349
import spacy
350
from spacy import Language
351
from spacy.training import Example
352

353
@Language.factory("custom_classifier")
354
class CustomClassifier:
355
    """Custom text classifier component."""
356
    
357
    def __init__(self, nlp, name):
358
        self.name = name
359
        self.labels = set()
360
        # Initialize your model here
361
    
362
    def __call__(self, doc):
363
        # Apply classification
364
        doc.cats = self.predict(doc)
365
        return doc
366
    
367
    def predict(self, doc):
368
        # Your prediction logic
369
        return {"POSITIVE": 0.8, "NEGATIVE": 0.2}
370
    
371
    def update(self, examples, losses=None, sgd=None):
372
        # Training logic
373
        pass
374
    
375
    def add_label(self, label):
376
        self.labels.add(label)
377

378
# Create model with custom component
379
nlp = spacy.blank("en")
380
classifier = nlp.add_pipe("custom_classifier")
381
classifier.add_label("POSITIVE")
382
classifier.add_label("NEGATIVE")
383

384
# Training data for classification
385
TRAINING_DATA = [
386
    ("This movie is great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
387
    ("I hate this product.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}})
388
]
389

390
examples = []
391
for text, annotations in TRAINING_DATA:
392
    doc = nlp.make_doc(text)
393
    example = Example.from_dict(doc, annotations)
394
    examples.append(example)
395

396
# Train custom component
397
nlp.initialize()
398
for i in range(10):
399
    losses = {}
400
    nlp.update(examples, losses=losses)
401
    print(f"Losses: {losses}")
402
```
403

404
### Multi-task Training
405

406
```python
407
import spacy
408
from spacy.training import Example
409

410
# Create model with multiple components
411
nlp = spacy.blank("en")
412
nlp.add_pipe("tagger")
413
nlp.add_pipe("ner")
414
nlp.add_pipe("textcat")
415

416
# Add labels
417
ner = nlp.get_pipe("ner")
418
ner.add_label("PERSON")
419
ner.add_label("ORG")
420

421
textcat = nlp.get_pipe("textcat")
422
textcat.add_label("POSITIVE")
423
textcat.add_label("NEGATIVE")
424

425
# Multi-task training data
426
TRAINING_DATA = [
427
    ("Apple Inc. makes great products!", {
428
        "entities": [(0, 10, "ORG")],
429
        "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}
430
    }),
431
    ("John Smith dislikes Microsoft.", {
432
        "entities": [(0, 10, "PERSON"), (20, 29, "ORG")],
433
        "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}
434
    })
435
]
436

437
examples = []
438
for text, annotations in TRAINING_DATA:
439
    doc = nlp.make_doc(text)
440
    example = Example.from_dict(doc, annotations)
441
    examples.append(example)
442

443
# Joint training
444
nlp.initialize()
445
for epoch in range(20):
446
    losses = {}
447
    nlp.update(examples, losses=losses)
448
    print(f"Epoch {epoch}, Losses: {losses}")
449
```
450

451
### Evaluation and Model Selection
452

453
```python
454
import spacy
455
from spacy.training import Example
456
from spacy.scorer import Scorer
457

458
# Load model and test data
459
nlp = spacy.load("./trained_model")
460
test_examples = load_test_data()  # Your test data loading function
461

462
# Evaluate model
463
scorer = Scorer()
464
scores = nlp.evaluate(test_examples)  # runs the pipeline on the examples, then scores the predictions
465

466
print("Evaluation Results:")
467
print(f"Token accuracy: {scores['token_acc']:.3f}")
468
print(f"POS accuracy: {scores['tag_acc']:.3f}")
469
print(f"NER precision: {scores['ents_p']:.3f}")
470
print(f"NER recall: {scores['ents_r']:.3f}")
471
print(f"NER F1: {scores['ents_f']:.3f}")
472

473
# Component-specific evaluation
474
ner_scores = scorer.score_spans(test_examples, "ents")
475
print(f"NER scores by label: {ner_scores['ents_per_type']}")
476

477
# Detailed error analysis
478
for example in test_examples[:5]:
479
    pred_ents = [(ent.start, ent.end, ent.label_) for ent in example.predicted.ents]
480
    ref_ents = [(ent.start, ent.end, ent.label_) for ent in example.reference.ents]
481
    
482
    print(f"Text: {example.predicted.text}")
483
    print(f"Predicted: {pred_ents}")
484
    print(f"Reference: {ref_ents}")
485
    print("---")
486
```
487

488
### Advanced Training with Callbacks
489

490
```python
491
import spacy
492
from spacy.training import Example
493
from spacy.util import minibatch
494

495
# Training with callbacks
496
def create_evaluation_callback(nlp, dev_examples):
497
    """Create callback for evaluation during training."""
498
    def evaluate_model():
499
        scores = nlp.evaluate(dev_examples)
500
        print(f"Dev scores: {scores}")
501
        return scores
502
    return evaluate_model
503

504
def create_save_callback(nlp, save_path):
505
    """Create callback to save best model."""
506
    best_score = 0.0
507
    def save_if_better(scores):
508
        nonlocal best_score
509
        current_score = scores.get("ents_f", 0.0)
510
        if current_score > best_score:
511
            best_score = current_score
512
            nlp.to_disk(save_path)
513
            print(f"Saved new best model with F1: {current_score:.3f}")
514
    return save_if_better
515

516
# Training with callbacks
517
nlp = spacy.blank("en")
518
nlp.add_pipe("ner")
519

520
train_examples = load_training_data()
521
dev_examples = load_dev_data()
522

523
eval_callback = create_evaluation_callback(nlp, dev_examples)
524
save_callback = create_save_callback(nlp, "./best_model")
525

526
nlp.initialize()
527

528
for epoch in range(50):
529
    losses = {}
530
    batches = minibatch(train_examples, size=8)
531
    
532
    for batch in batches:
533
        nlp.update(batch, losses=losses)
534
    
535
    # Evaluate every 10 epochs
536
    if epoch % 10 == 0:
537
        scores = eval_callback()
538
        save_callback(scores)
539
```

Version

Tile

Files

training.md docs/

training.mddocs/