Industrial-strength Natural Language Processing (NLP) in Python
—
Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications. spaCy provides a complete training framework with support for multiple architectures and optimization strategies.
Core functions for training and evaluating spaCy models.
def train(nlp: Language, examples: List[Example], sgd: Optimizer = None,
          losses: dict = None, component_cfg: dict = None,
          exclude: List[str] = None) -> dict:
    """
    Train a spaCy model on examples.

    Args:
        nlp: Language object with pipeline components to update.
        examples: Training examples (predicted/reference doc pairs).
        sgd: Optimizer; created automatically when None.
        losses: Dictionary used to track losses per component.
        component_cfg: Component-specific configuration overrides.
        exclude: Names of pipeline components to exclude from training.

    Returns:
        Dictionary of losses keyed by component name.
    """
def evaluate(nlp: Language, examples: List[Example],
             verbose: bool = False, **kwargs) -> dict:
    """
    Evaluate a spaCy model on examples.

    Args:
        nlp: Language object to evaluate.
        examples: Evaluation examples.
        verbose: Print detailed results when True.
        **kwargs: Additional evaluation options (semantics not shown here).

    Returns:
        Dictionary of evaluation metrics.
    """


# Classes for representing and managing training data.
class Example:
    """Training example pairing a predicted Doc with a reference (gold) Doc."""

    def __init__(self, predicted: Doc, reference: Doc) -> None:
        """Create an Example from predicted and reference docs."""

    @classmethod
    def from_dict(cls, predicted: Doc, example_dict: dict) -> 'Example':
        """Create an Example from a dictionary of annotations."""

    @property
    def predicted(self) -> Doc:
        """The predicted Doc object."""

    @property
    def reference(self) -> Doc:
        """The reference Doc object with gold annotations."""

    def get_aligned_parse(self, projectivize: bool = True) -> List[dict]:
        """Get the aligned dependency parse."""

    def get_aligned_ner(self) -> List[tuple]:
        """Get aligned named-entity annotations."""

    def get_aligned_spans(self, spans_key: str) -> List[tuple]:
        """Get aligned spans stored under the given key."""

    def to_dict(self) -> dict:
        """Convert the Example to dictionary format."""


# Utility classes for training configuration and data management.
class Config:
    """Configuration object for training."""

    def __init__(self, data: dict = None) -> None:
        """Initialize the config from a dictionary."""

    @classmethod
    def from_str(cls, text: str) -> 'Config':
        """Create a config from a string."""

    @classmethod
    def from_disk(cls, path: str) -> 'Config':
        """Load a config from disk."""

    def to_disk(self, path: str) -> None:
        """Save the config to disk."""

    def interpolate(self) -> 'Config':
        """Resolve variable interpolations and return the resolved config."""
class Corpus:
    """Training corpus with data-loading utilities."""

    def __init__(self, train_path: str, dev_path: str, **kwargs) -> None:
        """Initialize the corpus with training and development data paths."""

    def train_dataset(self, nlp: Language) -> Iterator[Example]:
        """Yield training examples."""

    def dev_dataset(self, nlp: Language) -> Iterator[Example]:
        """Yield development examples."""


# Neural network components for building custom models.
class Tok2Vec:
    """Token-to-vector encoder pipeline component."""

    def __init__(self, vocab: Vocab, model: Model, **cfg) -> None:
        """Initialize the tok2vec component."""

    def __call__(self, doc: Doc) -> Doc:
        """Add token vectors to the doc and return it."""

    def predict(self, docs: List[Doc]) -> List[numpy.ndarray]:
        """Predict token vectors for a batch of docs."""

    def set_annotations(self, docs: List[Doc],
                        predictions: List[numpy.ndarray]) -> None:
        """Set token-vector annotations on the docs from predictions."""
def build_tok2vec_model(embed: Model, encode: Model) -> Model:
    """
    Build a tok2vec model from embedding and encoding layers.

    Args:
        embed: Embedding layer (HashEmbed, CharacterEmbed, etc.).
        encode: Encoding layer (MaxoutWindowEncoder, etc.).

    Returns:
        Complete tok2vec model combining both layers.
    """
def build_hash_embed_cnn_tok2vec(width: int, depth: int,
                                 embed_size: int, **kwargs) -> Model:
    """Build a CNN-based tok2vec model with hash embedding."""
def build_transformer_model(name: str, **kwargs) -> Model:
    """Build a transformer-based model identified by *name*."""


# Classes for computing evaluation metrics and scores.
class Scorer:
    """Evaluation scorer for spaCy models."""

    def __init__(self, nlp: Language = None, **kwargs) -> None:
        """Initialize the scorer."""

    def score(self, examples: List[Example]) -> dict:
        """Score examples and return a dictionary of metrics."""

    def score_tokenization(self, examples: List[Example]) -> dict:
        """Score tokenization accuracy."""

    def score_token_attr(self, examples: List[Example],
                         attr: str, **kwargs) -> dict:
        """Score token-level accuracy for the given attribute."""

    def score_spans(self, examples: List[Example],
                    attr: str, **kwargs) -> dict:
        """Score span-level predictions stored under the given attribute."""

    def score_cats(self, examples: List[Example], **kwargs) -> dict:
        """Score text classification."""
class PRFScore:
    """Precision, recall, and F-score container."""

    def __init__(self) -> None:
        """Initialize score tracking."""

    @property
    def precision(self) -> float:
        """Precision score."""

    @property
    def recall(self) -> float:
        """Recall score."""

    @property
    def fscore(self) -> float:
        """F1 score."""


import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

# --- Example: train a blank NER model from scratch ---

# Create a blank English pipeline.
nlp = spacy.blank("en")

# Add an NER component and register the labels used in the training data.
ner = nlp.add_pipe("ner")
ner.add_label("COMPANY")
ner.add_label("PERSON")

# Training data: (text, {"entities": [(start_char, end_char, label), ...]}).
TRAINING_DATA = [
    ("Apple Inc. was founded by Steve Jobs.", {
        "entities": [(0, 10, "COMPANY"), (26, 36, "PERSON")]
    }),
    ("Google hired Larry Page as CEO.", {
        "entities": [(0, 6, "COMPANY"), (13, 23, "PERSON")]
    }),
    ("Microsoft CEO is Satya Nadella.", {
        # (17, 30) covers exactly "Satya Nadella"; the previous (17, 31)
        # also swallowed the trailing period into the PERSON span.
        "entities": [(0, 9, "COMPANY"), (17, 30, "PERSON")]
    })
]

# Convert raw annotations to Example objects.
examples = []
for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Initialize training. `begin_training()` is deprecated in spaCy v3;
# `initialize()` is the supported replacement (and what the other
# examples in this file use).
nlp.initialize(lambda: examples)

# Training loop: shuffle each epoch, update in small batches.
for epoch in range(10):
    random.shuffle(examples)
    losses = {}
    for batch in minibatch(examples, size=2):
        nlp.update(batch, losses=losses)
    print(f"Epoch {epoch}, Losses: {losses}")

# Save the trained model.
nlp.to_disk("./custom_ner_model")
import spacy
from spacy.training import Example, init_nlp
from spacy.util import load_config, minibatch

# --- Example: config-driven training ---
# Fix: `minibatch` was used below but never imported.

# Load the training configuration.
config = load_config("./config.cfg")

# Build a pipeline from the config.
nlp = init_nlp(config)


def load_data(path):
    """Load training data from *path* and return a list of Example objects.

    Placeholder: convert your own data format to Example objects here.
    """
    examples = []
    # Load and convert your data format to Example objects
    return examples


train_examples = load_data("train.json")
dev_examples = load_data("dev.json")

# Initialize the pipeline from the training examples.
nlp.initialize(lambda: train_examples)

# Create the optimizer ONCE. The previous version called
# nlp.resume_training() inside the batch loop, which built a fresh
# optimizer for every batch and threw away its state (momentum,
# learning-rate schedule) each time.
optimizer = nlp.resume_training()

# Training loop driven by config settings.
for epoch in range(config["training"]["max_epochs"]):
    losses = {}
    batches = minibatch(train_examples, size=config["training"]["batch_size"])
    for batch in batches:
        nlp.update(batch, losses=losses, sgd=optimizer)
    # Evaluate on the dev set after each epoch.
    scores = nlp.evaluate(dev_examples)
    print(f"Epoch {epoch}: {scores}")
import spacy
from spacy.training import Example

# --- Example: fine-tune an existing pretrained model on domain data ---

# Start from a pretrained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Grab the NER component and register the new domain labels.
ner = nlp.get_pipe("ner")
for new_label in ("PRODUCT", "BRAND"):
    ner.add_label(new_label)

# Domain-specific training data: (text, {"entities": [...]}) pairs.
DOMAIN_DATA = [
    ("iPhone 12 is Apple's latest smartphone.", {
        "entities": [(0, 9, "PRODUCT"), (13, 18, "BRAND")]
    }),
    ("Samsung Galaxy S21 features 5G connectivity.", {
        "entities": [(0, 7, "BRAND"), (8, 18, "PRODUCT")]
    })
]

# Build Example objects from the raw annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in DOMAIN_DATA
]

# Resume training so the existing weights are kept (fine-tuning),
# reusing one optimizer across all iterations.
optimizer = nlp.resume_training()
for i in range(20):
    losses = {}
    nlp.update(examples, losses=losses, sgd=optimizer)
    print(f"Iteration {i}, Losses: {losses}")

# Persist the fine-tuned pipeline.
nlp.to_disk("./fine_tuned_model")
import spacy
from spacy import Language
from spacy.training import Example

# --- Example: custom stateful pipeline component ---


class CustomClassifier:
    """Custom stateful text-classifier pipeline component."""

    def __init__(self, nlp, name):
        self.name = name
        self.labels = set()
        # Initialize your model here

    def __call__(self, doc):
        # Apply classification and return the annotated doc.
        doc.cats = self.predict(doc)
        return doc

    def predict(self, doc):
        # Your prediction logic
        return {"POSITIVE": 0.8, "NEGATIVE": 0.2}

    def update(self, examples, losses=None, sgd=None):
        # Training logic
        pass

    def add_label(self, label):
        self.labels.add(label)


# Stateful class-based components must be registered via a factory.
# The previous version put @Language.component directly on the class,
# but that decorator only accepts plain functions and spaCy rejects it.
@Language.factory("custom_classifier")
def create_custom_classifier(nlp, name):
    """Factory that builds a CustomClassifier instance for the pipeline."""
    return CustomClassifier(nlp, name)


# Create model with custom component.
nlp = spacy.blank("en")
classifier = nlp.add_pipe("custom_classifier")
classifier.add_label("POSITIVE")
classifier.add_label("NEGATIVE")

# Training data for classification.
TRAINING_DATA = [
    ("This movie is great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I hate this product.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}})
]

examples = []
for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Train the custom component.
nlp.initialize()
for i in range(10):
    losses = {}
    nlp.update(examples, losses=losses)
    print(f"Losses: {losses}")
import spacy
from spacy.training import Example

# --- Example: joint multi-task training (tagger + NER + textcat) ---

# Build a blank pipeline with several components.
nlp = spacy.blank("en")
for component_name in ("tagger", "ner", "textcat"):
    nlp.add_pipe(component_name)

# Register the labels each component should predict.
ner = nlp.get_pipe("ner")
ner.add_label("PERSON")
ner.add_label("ORG")
textcat = nlp.get_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Each example carries both entity and category annotations.
TRAINING_DATA = [
    ("Apple Inc. makes great products!", {
        "entities": [(0, 10, "ORG")],
        "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    }),
    ("John Smith dislikes Microsoft.", {
        "entities": [(0, 10, "PERSON"), (20, 29, "ORG")],
        "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}
    })
]

# Build Example objects from the raw annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAINING_DATA
]

# All components are updated jointly in a single training loop.
nlp.initialize()
for epoch in range(20):
    losses = {}
    nlp.update(examples, losses=losses)
    print(f"Epoch {epoch}, Losses: {losses}")
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

# --- Example: evaluate a trained model and inspect its errors ---

# Load the trained pipeline and the held-out test set.
nlp = spacy.load("./trained_model")
test_examples = load_test_data()  # Your test data loading function

# Run the scorer over the test examples.
scorer = Scorer()
scores = scorer.score(test_examples)

# Headline metrics.
print("Evaluation Results:")
print(f"Token accuracy: {scores['token_acc']:.3f}")
print(f"POS accuracy: {scores['tag_acc']:.3f}")
print(f"NER precision: {scores['ents_p']:.3f}")
print(f"NER recall: {scores['ents_r']:.3f}")
print(f"NER F1: {scores['ents_f']:.3f}")

# Per-label NER breakdown.
ner_scores = scorer.score_spans(test_examples, "ents")
print(f"NER scores by label: {ner_scores['ents_per_type']}")

# Detailed error analysis on a handful of examples.
for example in test_examples[:5]:
    pred_ents = [(e.start, e.end, e.label_) for e in example.predicted.ents]
    ref_ents = [(e.start, e.end, e.label_) for e in example.reference.ents]
    print(f"Text: {example.predicted.text}")
    print(f"Predicted: {pred_ents}")
    print(f"Reference: {ref_ents}")
    print("---")
import spacy
from spacy.training import Example
from spacy.util import minibatch

# --- Example: training loop with evaluation and checkpoint callbacks ---


def create_evaluation_callback(nlp, dev_examples):
    """Build a zero-argument callback that scores the dev set and prints it."""
    def evaluate_model():
        scores = nlp.evaluate(dev_examples)
        print(f"Dev scores: {scores}")
        return scores
    return evaluate_model


def create_save_callback(nlp, save_path):
    """Build a callback that saves the model whenever the NER F1 improves."""
    best_score = 0.0

    def save_if_better(scores):
        nonlocal best_score
        current_score = scores.get("ents_f", 0.0)
        if current_score > best_score:
            best_score = current_score
            nlp.to_disk(save_path)
            print(f"Saved new best model with F1: {current_score:.3f}")

    return save_if_better


# Assemble the pipeline and data.
nlp = spacy.blank("en")
nlp.add_pipe("ner")
train_examples = load_training_data()
dev_examples = load_dev_data()

eval_callback = create_evaluation_callback(nlp, dev_examples)
save_callback = create_save_callback(nlp, "./best_model")

nlp.initialize()
for epoch in range(50):
    losses = {}
    for batch in minibatch(train_examples, size=8):
        nlp.update(batch, losses=losses)
    # Evaluate (and possibly checkpoint) every 10 epochs.
    if epoch % 10 == 0:
        scores = eval_callback()
        save_callback(scores)

# Install with Tessl CLI
Run `npx tessl i tessl/pypi-spacy` to install this package with the Tessl CLI.