Industrial-strength Natural Language Processing (NLP) in Python
—
Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications. spaCy provides a complete training framework with support for multiple architectures and optimization strategies.
Core functions for training and evaluating spaCy models.
def train(nlp: Language, examples: List[Example], sgd: Optimizer = None,
          losses: dict = None, component_cfg: dict = None,
          exclude: List[str] = None) -> dict:
    """
    Train a spaCy model on examples.

    Args:
        nlp: Language object with pipeline components to update.
        examples: Training examples (predicted/reference doc pairs).
        sgd: Optimizer; created automatically when None.
        losses: Dictionary used to track losses per component.
        component_cfg: Component-specific configuration overrides.
        exclude: Names of pipeline components to exclude from training.

    Returns:
        Dictionary of losses keyed by component name.
    """
def evaluate(nlp: Language, examples: List[Example],
             verbose: bool = False, **kwargs) -> dict:
    """
    Evaluate a spaCy model on examples.

    Args:
        nlp: Language object to evaluate.
        examples: Evaluation examples.
        verbose: Print detailed results when True.
        **kwargs: Additional evaluation options (semantics not shown here).

    Returns:
        Dictionary of evaluation metrics.
    """


# Classes for representing and managing training data.
class Example:
    """Training example pairing a predicted Doc with a reference (gold) Doc."""

    def __init__(self, predicted: Doc, reference: Doc) -> None:
        """Create an Example from predicted and reference docs."""

    @classmethod
    def from_dict(cls, predicted: Doc, example_dict: dict) -> 'Example':
        """Create an Example from a dictionary of annotations."""

    @property
    def predicted(self) -> Doc:
        """The predicted Doc object."""

    @property
    def reference(self) -> Doc:
        """The reference Doc object with gold annotations."""

    def get_aligned_parse(self, projectivize: bool = True) -> List[dict]:
        """Get the aligned dependency parse."""

    def get_aligned_ner(self) -> List[tuple]:
        """Get aligned named-entity annotations."""

    def get_aligned_spans(self, spans_key: str) -> List[tuple]:
        """Get aligned spans stored under the given key."""

    def to_dict(self) -> dict:
        """Convert the Example to dictionary format."""


# Utility classes for training configuration and data management.
class Config:
    """Configuration object for training."""

    def __init__(self, data: dict = None) -> None:
        """Initialize the config from a dictionary."""

    @classmethod
    def from_str(cls, text: str) -> 'Config':
        """Create a config from a string."""

    @classmethod
    def from_disk(cls, path: str) -> 'Config':
        """Load a config from disk."""

    def to_disk(self, path: str) -> None:
        """Save the config to disk."""

    def interpolate(self) -> 'Config':
        """Resolve variable interpolations and return the resolved config."""
class Corpus:
    """Training corpus with data-loading utilities."""

    def __init__(self, train_path: str, dev_path: str, **kwargs) -> None:
        """Initialize the corpus with training and development data paths."""

    def train_dataset(self, nlp: Language) -> Iterator[Example]:
        """Yield training examples."""

    def dev_dataset(self, nlp: Language) -> Iterator[Example]:
        """Yield development examples."""


# Neural network components for building custom models.
class Tok2Vec:
    """Token-to-vector encoder pipeline component."""

    def __init__(self, vocab: Vocab, model: Model, **cfg) -> None:
        """Initialize the tok2vec component."""

    def __call__(self, doc: Doc) -> Doc:
        """Add token vectors to the doc and return it."""

    def predict(self, docs: List[Doc]) -> List[numpy.ndarray]:
        """Predict token vectors for a batch of docs."""

    def set_annotations(self, docs: List[Doc],
                        predictions: List[numpy.ndarray]) -> None:
        """Set token-vector annotations on the docs from predictions."""
def build_tok2vec_model(embed: Model, encode: Model) -> Model:
    """
    Build a tok2vec model from embedding and encoding layers.

    Args:
        embed: Embedding layer (HashEmbed, CharacterEmbed, etc.).
        encode: Encoding layer (MaxoutWindowEncoder, etc.).

    Returns:
        Complete tok2vec model combining both layers.
    """
def build_hash_embed_cnn_tok2vec(width: int, depth: int,
                                 embed_size: int, **kwargs) -> Model:
    """Build a CNN-based tok2vec model with hash embedding."""
def build_transformer_model(name: str, **kwargs) -> Model:
    """Build a transformer-based model identified by *name*."""


# Classes for computing evaluation metrics and scores.
class Scorer:
    """Evaluation scorer for spaCy models."""

    def __init__(self, nlp: Language = None, **kwargs) -> None:
        """Initialize the scorer."""

    def score(self, examples: List[Example]) -> dict:
        """Score examples and return a dictionary of metrics."""

    def score_tokenization(self, examples: List[Example]) -> dict:
        """Score tokenization accuracy."""

    def score_token_attr(self, examples: List[Example],
                         attr: str, **kwargs) -> dict:
        """Score token-level accuracy for the given attribute."""

    def score_spans(self, examples: List[Example],
                    attr: str, **kwargs) -> dict:
        """Score span-level predictions stored under the given attribute."""

    def score_cats(self, examples: List[Example], **kwargs) -> dict:
        """Score text classification."""
class PRFScore:
    """Precision, recall, and F-score container."""

    def __init__(self) -> None:
        """Initialize score tracking."""

    @property
    def precision(self) -> float:
        """Precision score."""

    @property
    def recall(self) -> float:
        """Recall score."""

    @property
    def fscore(self) -> float:
        """F1 score."""


import spacy
from spacy.training import Example
from spacy.util import minibatch
import random

# --- Example: train a blank NER model from scratch ---

# Create a blank English pipeline.
nlp = spacy.blank("en")

# Add an NER component and register the labels used in the training data.
ner = nlp.add_pipe("ner")
ner.add_label("COMPANY")
ner.add_label("PERSON")

# Training data: (text, {"entities": [(start_char, end_char, label), ...]}).
TRAINING_DATA = [
    ("Apple Inc. was founded by Steve Jobs.", {
        "entities": [(0, 10, "COMPANY"), (26, 36, "PERSON")]
    }),
    ("Google hired Larry Page as CEO.", {
        "entities": [(0, 6, "COMPANY"), (13, 23, "PERSON")]
    }),
    ("Microsoft CEO is Satya Nadella.", {
        # (17, 30) covers exactly "Satya Nadella"; the previous (17, 31)
        # also swallowed the trailing period into the PERSON span.
        "entities": [(0, 9, "COMPANY"), (17, 30, "PERSON")]
    })
]

# Convert raw annotations to Example objects.
examples = []
for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Initialize training. `begin_training()` is deprecated in spaCy v3;
# `initialize()` is the supported replacement (and what the other
# examples in this file use).
nlp.initialize(lambda: examples)

# Training loop: shuffle each epoch, update in small batches.
for epoch in range(10):
    random.shuffle(examples)
    losses = {}
    for batch in minibatch(examples, size=2):
        nlp.update(batch, losses=losses)
    print(f"Epoch {epoch}, Losses: {losses}")

# Save the trained model.
nlp.to_disk("./custom_ner_model")
import spacy
from spacy.training import Example, init_nlp
from spacy.util import load_config, minibatch

# --- Example: config-driven training ---
# Fix: `minibatch` was used below but never imported.

# Load the training configuration.
config = load_config("./config.cfg")

# Build a pipeline from the config.
nlp = init_nlp(config)


def load_data(path):
    """Load training data from *path* and return a list of Example objects.

    Placeholder: convert your own data format to Example objects here.
    """
    examples = []
    # Load and convert your data format to Example objects
    return examples


train_examples = load_data("train.json")
dev_examples = load_data("dev.json")

# Initialize the pipeline from the training examples.
nlp.initialize(lambda: train_examples)

# Create the optimizer ONCE. The previous version called
# nlp.resume_training() inside the batch loop, which built a fresh
# optimizer for every batch and threw away its state (momentum,
# learning-rate schedule) each time.
optimizer = nlp.resume_training()

# Training loop driven by config settings.
for epoch in range(config["training"]["max_epochs"]):
    losses = {}
    batches = minibatch(train_examples, size=config["training"]["batch_size"])
    for batch in batches:
        nlp.update(batch, losses=losses, sgd=optimizer)
    # Evaluate on the dev set after each epoch.
    scores = nlp.evaluate(dev_examples)
    print(f"Epoch {epoch}: {scores}")
import spacy
from spacy.training import Example

# --- Example: fine-tune an existing pretrained model on domain data ---

# Start from a pretrained English pipeline.
nlp = spacy.load("en_core_web_sm")

# Grab the NER component and register the new domain labels.
ner = nlp.get_pipe("ner")
for new_label in ("PRODUCT", "BRAND"):
    ner.add_label(new_label)

# Domain-specific training data: (text, {"entities": [...]}) pairs.
DOMAIN_DATA = [
    ("iPhone 12 is Apple's latest smartphone.", {
        "entities": [(0, 9, "PRODUCT"), (13, 18, "BRAND")]
    }),
    ("Samsung Galaxy S21 features 5G connectivity.", {
        "entities": [(0, 7, "BRAND"), (8, 18, "PRODUCT")]
    })
]

# Build Example objects from the raw annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in DOMAIN_DATA
]

# Resume training so the existing weights are kept (fine-tuning),
# reusing one optimizer across all iterations.
optimizer = nlp.resume_training()
for i in range(20):
    losses = {}
    nlp.update(examples, losses=losses, sgd=optimizer)
    print(f"Iteration {i}, Losses: {losses}")

# Persist the fine-tuned pipeline.
nlp.to_disk("./fine_tuned_model")
import spacy
from spacy import Language
from spacy.training import Example

# --- Example: custom stateful pipeline component ---


class CustomClassifier:
    """Custom stateful text-classifier pipeline component."""

    def __init__(self, nlp, name):
        self.name = name
        self.labels = set()
        # Initialize your model here

    def __call__(self, doc):
        # Apply classification and return the annotated doc.
        doc.cats = self.predict(doc)
        return doc

    def predict(self, doc):
        # Your prediction logic
        return {"POSITIVE": 0.8, "NEGATIVE": 0.2}

    def update(self, examples, losses=None, sgd=None):
        # Training logic
        pass

    def add_label(self, label):
        self.labels.add(label)


# Stateful class-based components must be registered via a factory.
# The previous version put @Language.component directly on the class,
# but that decorator only accepts plain functions and spaCy rejects it.
@Language.factory("custom_classifier")
def create_custom_classifier(nlp, name):
    """Factory that builds a CustomClassifier instance for the pipeline."""
    return CustomClassifier(nlp, name)


# Create model with custom component.
nlp = spacy.blank("en")
classifier = nlp.add_pipe("custom_classifier")
classifier.add_label("POSITIVE")
classifier.add_label("NEGATIVE")

# Training data for classification.
TRAINING_DATA = [
    ("This movie is great!", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("I hate this product.", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}})
]

examples = []
for text, annotations in TRAINING_DATA:
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

# Train the custom component.
nlp.initialize()
for i in range(10):
    losses = {}
    nlp.update(examples, losses=losses)
    print(f"Losses: {losses}")
import spacy
from spacy.training import Example

# --- Example: joint multi-task training (tagger + NER + textcat) ---

# Build a blank pipeline with several components.
nlp = spacy.blank("en")
for component_name in ("tagger", "ner", "textcat"):
    nlp.add_pipe(component_name)

# Register the labels each component should predict.
ner = nlp.get_pipe("ner")
ner.add_label("PERSON")
ner.add_label("ORG")
textcat = nlp.get_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# Each example carries both entity and category annotations.
TRAINING_DATA = [
    ("Apple Inc. makes great products!", {
        "entities": [(0, 10, "ORG")],
        "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}
    }),
    ("John Smith dislikes Microsoft.", {
        "entities": [(0, 10, "PERSON"), (20, 29, "ORG")],
        "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}
    })
]

# Build Example objects from the raw annotations.
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAINING_DATA
]

# All components are updated jointly in a single training loop.
nlp.initialize()
for epoch in range(20):
    losses = {}
    nlp.update(examples, losses=losses)
    print(f"Epoch {epoch}, Losses: {losses}")
import spacy
from spacy.training import Example
from spacy.scorer import Scorer

# --- Example: evaluate a trained model and inspect its errors ---

# Load the trained pipeline and the held-out test set.
nlp = spacy.load("./trained_model")
test_examples = load_test_data()  # Your test data loading function

# Run the scorer over the test examples.
scorer = Scorer()
scores = scorer.score(test_examples)

# Headline metrics.
print("Evaluation Results:")
print(f"Token accuracy: {scores['token_acc']:.3f}")
print(f"POS accuracy: {scores['tag_acc']:.3f}")
print(f"NER precision: {scores['ents_p']:.3f}")
print(f"NER recall: {scores['ents_r']:.3f}")
print(f"NER F1: {scores['ents_f']:.3f}")

# Per-label NER breakdown.
ner_scores = scorer.score_spans(test_examples, "ents")
print(f"NER scores by label: {ner_scores['ents_per_type']}")

# Detailed error analysis on a handful of examples.
for example in test_examples[:5]:
    pred_ents = [(e.start, e.end, e.label_) for e in example.predicted.ents]
    ref_ents = [(e.start, e.end, e.label_) for e in example.reference.ents]
    print(f"Text: {example.predicted.text}")
    print(f"Predicted: {pred_ents}")
    print(f"Reference: {ref_ents}")
    print("---")
import spacy
from spacy.training import Example
from spacy.util import minibatch

# --- Example: training loop with evaluation and checkpoint callbacks ---


def create_evaluation_callback(nlp, dev_examples):
    """Build a zero-argument callback that scores the dev set and prints it."""
    def evaluate_model():
        scores = nlp.evaluate(dev_examples)
        print(f"Dev scores: {scores}")
        return scores
    return evaluate_model


def create_save_callback(nlp, save_path):
    """Build a callback that saves the model whenever the NER F1 improves."""
    best_score = 0.0

    def save_if_better(scores):
        nonlocal best_score
        current_score = scores.get("ents_f", 0.0)
        if current_score > best_score:
            best_score = current_score
            nlp.to_disk(save_path)
            print(f"Saved new best model with F1: {current_score:.3f}")

    return save_if_better


# Assemble the pipeline and data.
nlp = spacy.blank("en")
nlp.add_pipe("ner")
train_examples = load_training_data()
dev_examples = load_dev_data()

eval_callback = create_evaluation_callback(nlp, dev_examples)
save_callback = create_save_callback(nlp, "./best_model")

nlp.initialize()
for epoch in range(50):
    losses = {}
    for batch in minibatch(train_examples, size=8):
        nlp.update(batch, losses=losses)
    # Evaluate (and possibly checkpoint) every 10 epochs.
    if epoch % 10 == 0:
        scores = eval_callback()
        save_callback(scores)

# Install with Tessl CLI
Run `npx tessl i tessl/pypi-spacy` to install this package with the Tessl CLI.