Industrial-strength Natural Language Processing (NLP) in Python
—
Built-in pipeline components that perform linguistic analysis on documents. These components can be combined in customizable processing pipelines to add part-of-speech tags, dependency parsing, named entity recognition, text classification, and more.
Statistical models that assign part-of-speech tags and morphological features to tokens based on context and linguistic patterns.
class Tagger:
    """Part-of-speech tagger pipeline component.

    Assigns part-of-speech tags (and related morphological features) to
    tokens based on context, using a statistical model.
    """

    name: str = "tagger"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the tagger."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the tagger to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict part-of-speech tags for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set part-of-speech annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training."""

    def add_label(self, label: str) -> int:
        """Add a label to the component."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save the component to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'Tagger':
        """Load the component from disk."""


# Statistical parser that predicts syntactic dependencies between tokens,
# creating a dependency tree structure.
class DependencyParser:
    """Dependency parser pipeline component.

    Predicts syntactic head/dependent relations between tokens, producing
    a dependency tree for each sentence.
    """

    name: str = "parser"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the parser."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply the parser to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict dependency relations for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set dependency annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a dependency label."""

    # Serialization methods similar to Tagger


# Statistical model that identifies and classifies named entities
# (people, organizations, locations, etc.) in text.
class EntityRecognizer:
    """Named entity recognition pipeline component.

    Identifies and classifies named entities (people, organizations,
    locations, etc.) in text using a statistical model.
    """

    name: str = "ner"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the NER component."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply NER to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict named entities for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set named entity annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity label."""

    # Serialization methods similar to Tagger


# Multi-label text classifier that assigns category scores to documents
# based on their content.
class TextCategorizer:
    """Text classification pipeline component.

    Multi-label classifier that assigns category scores to whole
    documents based on their content.
    """

    name: str = "textcat"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the text categorizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply text categorization to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict category scores for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set category annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add a category label."""

    @property
    def labels(self) -> tuple:
        """Get category labels."""


# Component that links named entities to entries in a knowledge base
# using entity embeddings and candidate ranking.
class EntityLinker:
    """Entity linking pipeline component.

    Resolves recognized entity mentions to entries in a knowledge base
    using entity embeddings and candidate ranking.
    """

    name: str = "entity_linker"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, **cfg) -> None:
        """Initialize the entity linker."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity linking to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict entity links for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set entity linking annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""

    def add_label(self, label: str) -> int:
        """Add an entity type label."""

    def get_candidates(self, mention: Span) -> List:
        """Get knowledge base candidates for a mention."""


# Component that analyzes word morphology and assigns detailed
# morphological features to tokens.
class Morphologizer:
    """Morphological analysis pipeline component.

    Predicts detailed morphological features (e.g. number, tense, case)
    for each token.
    """

    name: str = "morphologizer"  # component name used in the pipeline

    def __init__(self, vocab: Vocab, model: Model = None, **cfg) -> None:
        """Initialize the morphologizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply morphological analysis to a Doc object."""

    def predict(self, docs: List[Doc]) -> Scores:
        """Predict morphological features for documents."""

    def set_annotations(self, docs: List[Doc], scores: Scores) -> None:
        """Set morphological annotations on documents."""

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the model with training examples."""


# Rule-based component for pattern-based named entity recognition using
# token patterns or phrase matching.
class EntityRuler:
    """Rule-based named entity recognition component.

    Matches token patterns or phrases against the text and labels the
    matched spans as entities; can coexist with or overwrite statistical
    NER results.
    """

    name: str = "entity_ruler"  # component name used in the pipeline

    def __init__(self, nlp: Language, patterns: List[dict] = None,
                 overwrite_ents: bool = False, **cfg) -> None:
        """Initialize the entity ruler."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply entity rules to a Doc object."""

    def add_patterns(self, patterns: List[dict]) -> None:
        """Add patterns to the entity ruler."""

    @property
    def patterns(self) -> List[dict]:
        """Get all patterns."""

    @property
    def labels(self) -> set:
        """Get entity labels."""

    # Serialization
    def to_disk(self, path: str, exclude: List[str] = None) -> None:
        """Save patterns to disk."""

    def from_disk(self, path: str, exclude: List[str] = None) -> 'EntityRuler':
        """Load patterns from disk."""


# Fast, rule-based sentence boundary detection for most languages.
class Sentencizer:
    """Rule-based sentence boundary detection component.

    Splits text into sentences using punctuation rules only — no
    statistical model required.
    """

    name: str = "sentencizer"  # component name used in the pipeline

    def __init__(self, punct_chars: Set[str] = None, **cfg) -> None:
        """Initialize the sentencizer."""

    def __call__(self, doc: Doc) -> Doc:
        """Apply sentence boundary detection to a Doc object."""

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches."""


# Functions for merging tokens based on linguistic analysis.
def merge_entities(doc: Doc) -> Doc:
    """
    Merge named entity tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """
def merge_noun_chunks(doc: Doc) -> Doc:
    """
    Merge noun chunk tokens into single tokens.

    Args:
        doc: The Doc object to modify

    Returns:
        The modified Doc object
    """
def merge_subtokens(doc: Doc, label: str = "subtok") -> Doc:
    """
    Merge subtokens into single tokens.

    Args:
        doc: The Doc object to modify
        label: Label for merged tokens

    Returns:
        The modified Doc object
    """


# Abstract base class for creating custom pipeline components.
class Pipe:
    """Base class for pipeline components.

    Subclasses must implement __call__; pipe/update/begin_training
    provide overridable defaults.
    """

    name: str  # component name; set by subclasses

    def __call__(self, doc: Doc) -> Doc:
        """Apply the component to a Doc object."""
        raise NotImplementedError

    def pipe(self, stream: Iterable[Doc], batch_size: int = 128) -> Iterator[Doc]:
        """Process documents in batches by applying self to each one."""
        for docs in util.minibatch(stream, size=batch_size):
            for doc in docs:
                yield self(doc)

    def update(self, examples: List, sgd: Optimizer = None, **kwargs) -> dict:
        """Update the component with training examples (no-op by default)."""
        pass

    def begin_training(self, get_examples: callable = None, **kwargs) -> Optimizer:
        """Initialize training (no-op by default)."""
        pass


import spacy
# Load model with multiple components
nlp = spacy.load("en_core_web_sm")
print("Pipeline components:", nlp.pipe_names)
# Output: ['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

# Process text through all components
doc = nlp("Apple Inc. is looking at buying U.K. startup for $1 billion")

# Access tagger results
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.tag_})")

# Access parser results
for token in doc:
    print(f"{token.text} -> {token.head.text} ({token.dep_})")

# Access NER results
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


import spacy
from spacy.pipeline import EntityRuler

# Create blank language model
nlp = spacy.blank("en")

# Add components to pipeline
nlp.add_pipe("tagger")
nlp.add_pipe("parser")
nlp.add_pipe("ner")

# Add custom rule-based component
# NOTE(review): EntityRuler(nlp, ...) plus nlp.add_pipe(ruler, ...) is the
# spaCy v2 API; in v3, use nlp.add_pipe("entity_ruler", before="ner") and
# ruler.add_patterns(...) instead — confirm the targeted spaCy version.
ruler = EntityRuler(nlp, patterns=[
    {"label": "COMPANY", "pattern": "Apple Inc."},
    {"label": "COMPANY", "pattern": "Microsoft Corp."}
])
nlp.add_pipe(ruler, before="ner")

# Process text
doc = nlp("Apple Inc. and Microsoft Corp. are tech companies")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


import spacy
nlp = spacy.load("en_core_web_sm")

# Disable specific components for faster processing
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("This will only run tokenizer and tagger")

# Process multiple documents with disabled components
texts = ["Text one", "Text two", "Text three"]
with nlp.disable_pipes("parser"):
    docs = list(nlp.pipe(texts))


from spacy.pipeline import Pipe
from spacy.tokens import Doc


class CustomComponent(Pipe):
    """Custom pipeline component example."""

    name = "custom_component"

    def __call__(self, doc):
        # Add custom processing logic: flag email-like tokens via a
        # custom extension attribute.
        for token in doc:
            if token.like_email:
                token._.is_email = True
        return doc


# Register and add to pipeline
# NOTE(review): spaCy v3 registers factories with @Language.component /
# @Language.factory; confirm `spacy.component` exists in the targeted version.
@spacy.component("custom_component")
def create_custom_component(nlp, name):
    return CustomComponent()


nlp = spacy.blank("en")
nlp.add_pipe("custom_component")


import spacy
# Load model with text classifier
nlp = spacy.load("en_core_web_sm")

# Add text categorizer
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

# After training...
doc = nlp("This movie is great!")
print("Categories:", doc.cats)
# Output: {'POSITIVE': 0.9, 'NEGATIVE': 0.1}

# Install with Tessl CLI
npx tessl i tessl/pypi-spacy