Industrial-strength Natural Language Processing (NLP) in Python
—
Built-in pipeline components that perform linguistic analysis on documents. These components can be combined in customizable processing pipelines to add part-of-speech tags, dependency parsing, named entity recognition, text classification, and more.
Statistical models that assign part-of-speech tags and morphological features to tokens based on context and linguistic patterns.
class Tagger:
    """Part-of-speech tagger pipeline component.

    Assigns part-of-speech tags (and related morphological features) to
    tokens based on context, using a statistical model.
    """

    name: str = "tagger"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the tagger."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the tagger to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict part-of-speech tags for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set part-of-speech annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""

    def add_label(self, label: str) -> int:
        """Add a label to the component."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the component to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Tagger':
        """Load the component from disk."""


# Statistical parser that predicts syntactic dependencies between tokens,
# creating a dependency tree structure.
class DependencyParser:
    """Dependency parser pipeline component.

    Predicts syntactic head/dependent relations between tokens, producing
    a dependency tree for each sentence.
    """

    name: str = "parser"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the parser."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the parser to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict dependency relations for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set dependency annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a dependency label."""

    # Serialization methods similar to Tagger


# Statistical model that identifies and classifies named entities
# (people, organizations, locations, etc.) in text.
class EntityRecognizer:
    """Named entity recognition pipeline component.

    Identifies and classifies named entities (people, organizations,
    locations, etc.) in text using a statistical model.
    """

    name: str = "ner"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the NER component."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply NER to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict named entities for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set named entity annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity label."""

    # Serialization methods similar to Tagger


# Multi-label text classifier that assigns category scores to documents
# based on their content.
class TextCategorizer:
    """Text classification pipeline component.

    Multi-label classifier that assigns category scores to whole
    documents based on their content.
    """

    name: str = "textcat"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the text categorizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply text categorization to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict category scores for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set category annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a category label."""

    @property
    def labels(self) -> tuple:
        """Get category labels."""


# Component that links named entities to entries in a knowledge base
# using entity embeddings and candidate ranking.
class EntityLinker:
    """Entity linking pipeline component.

    Resolves recognized entity mentions to entries in a knowledge base
    using entity embeddings and candidate ranking.
    """

    name: str = "entity_linker"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, **cfg) -> None:
        """Initialize the entity linker."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity linking to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict entity links for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set entity linking annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity type label."""

    def get_candidates(self, mention: Span) -> List:
        """Get knowledge base candidates for a mention."""


# Component that analyzes word morphology and assigns detailed
# morphological features to tokens.
class Morphologizer:
    """Morphological analysis pipeline component.

    Predicts detailed morphological features (e.g. number, tense, case)
    for each token.
    """

    name: str = "morphologizer"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the morphologizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply morphological analysis to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict morphological features for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set morphological annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""


# Rule-based component for pattern-based named entity recognition using
# token patterns or phrase matching.
class EntityRuler:
    """Rule-based named entity recognition component.

    Matches token patterns or phrases against the text and labels the
    matched spans as entities; can coexist with or overwrite statistical
    NER results.
    """

    name: str = "entity_ruler"  # component name used in the pipeline

    def __init__(self, nlp: Language, patterns: List[dict] = None,
                 overwrite_ents: bool = False, **cfg) -> None:
        """Initialize the entity ruler."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity rules to a Doc object."""

    def add_patterns(self, patterns: List[dict]) -> None:
        """Add patterns to the entity ruler."""

    @property
    def patterns(self) -> List[dict]:
        """Get all patterns."""

    @property
    def labels(self) -> set:
        """Get entity labels."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save patterns to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'EntityRuler':
        """Load patterns from disk."""


# Fast, rule-based sentence boundary detection for most languages.
class Sentencizer:
    """Rule-based sentence boundary detection component.

    Splits text into sentences using punctuation rules only — no
    statistical model required.
    """

    name: str = "sentencizer"  # component name used in the pipeline

    def __init__(self, punct_chars: Set[str] = None, **cfg) -> None:
        """Initialize the sentencizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply sentence boundary detection to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""


# Functions for merging tokens based on linguistic analysis.
def merge_entities(doc: Doc) -> Doc:
    """
    Merge named entity tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """
def merge_noun_chunks(doc: Doc) -> Doc:
    """
    Merge noun chunk tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """
def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
    """
    Merge subtokens into single tokens.

    Args:
        doc: The Doc object to modify
        label: Label for merged tokens

    Returns:
        The modified Doc object
    """


# Abstract base class for creating custom pipeline components.
class Pipe:
    """Base class for pipeline components.

    Subclasses must implement __call__; pipe/update/begin_training
    provide overridable defaults.
    """

    name: str  # component name; set by subclasses

    def __call__(self, doc: Doc) -> Doc:
        """Apply the component to a Doc object."""
        raise NotImplementedError

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches by applying self to each one."""
        for docs in util.minibatch(stream, size=batch_size):
            for doc in docs:
                yield self(doc)

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the component with training examples (no-op by default)."""
        pass

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training (no-op by default)."""
        pass


import spacy
# Load model with multiple components
nlp = spacy.load("en_core_web_sm")
print("Pipeline components:", nlp.pipe_names)
# Output: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Process text through all components
doc = nlp("Apple Inc. is looking at buying U.K. startup for $1 billion")

# Access tagger results
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Access parser results
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Access NER results
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


import spacy
from spacy.pipeline import EntityRuler

# Create blank language model
nlp = spacy.blank("en")

# Add components to pipeline
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
nlp.add_pipe("ner")

# Add custom rule-based component
# NOTE(review): EntityRuler(nlp, ...) plus nlp.add_pipe(ruler, ...) is the
# spaCy v2 API; in v3, use nlp.add_pipe("entity_ruler", before="ner") and
# ruler.add_patterns(...) instead — confirm the targeted spaCy version.
ruler = EntityRuler(nlp, patterns=[
    {"label": "COMPANY", "pattern": "Apple Inc."},
    {"label": "COMPANY", "pattern": "Microsoft Corp."}
])
nlp.add_pipe(ruler, before="ner")

# Process text
doc = nlp("Apple Inc. and Microsoft Corp. are tech companies")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


import spacy
nlp = spacy.load("en_core_web_sm")

# Disable specific components for faster processing
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("This will only run tokenizer and tagger")

# Process multiple documents with disabled components
texts = ["Text one", "Text two", "Text three"]
with nlp.disable_pipes("parser"):
    docs = list(nlp.pipe(texts))


from spacy.pipeline import Pipe
from spacy.tokens import Doc


class CustomComponent(Pipe):
    """Custom pipeline component example."""

    name = "custom_component"

    def __call__(self, doc):
        # Add custom processing logic: flag email-like tokens via a
        # custom extension attribute.
        for token in doc:
            if token.like_email:
                token._.is_email = True
        return doc


# Register and add to pipeline
# NOTE(review): spaCy v3 registers factories with @Language.component /
# @Language.factory; confirm `spacy.component` exists in the targeted version.
@spacy.component("custom_component")
def create_custom_component(nlp, name):
    return CustomComponent()


nlp = spacy.blank("en")
nlp.add_pipe("custom_component")


import spacy
# Load model with text classifier
nlp = spacy.load("en_core_web_sm")

# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# After training...
doc = nlp("This movie is great!")
print("Categories:", doc.cats)
# Output: {'POSITIVE': 0.9, 'NEGATIVE': 0.1}

# Install with Tessl CLI
npx tessl i tessl/pypi-spacy