# Industrial-strength Natural Language Processing (NLP) in Python
```bash
npx @tessl/cli install tessl/pypi-spacy@2.3.0
```

spaCy is designed for production use and provides fast, accurate processing for 70+ languages, with state-of-the-art neural network models for tokenization, part-of-speech tagging, dependency parsing, named entity recognition, and text classification.
```bash
pip install spacy
python -m spacy download en_core_web_sm
```

```python
import spacy

# Load a language model
nlp = spacy.load("en_core_web_sm")
```

Most common imports:
```python
from spacy import displacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Token, Span
```
```python
import spacy

# Load a language model
nlp = spacy.load("en_core_web_sm")

# Process text
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access linguistic annotations
for token in doc:
    print(token.text, token.pos_, token.dep_, token.lemma_)

# Access named entities
for ent in doc.ents:
    print(ent.text, ent.label_)

# Process multiple texts efficiently
texts = ["First text", "Second text", "Third text"]
docs = list(nlp.pipe(texts))
```

spaCy's processing pipeline is built around a Language object that chains together multiple pipeline components. Each document first passes through the tokenizer, then through the pipeline components (tagger, parser, NER, etc.) in sequence. This design allows for efficient batch processing with nlp.pipe(), as sketched below.
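For example, you can inspect which components a loaded model runs and temporarily disable the ones a task doesn't need. A minimal sketch, assuming the en_core_web_sm model from above and the v2-style disable_pipes API:

```python
import spacy

nlp = spacy.load("en_core_web_sm")

# Components run in order after tokenization
print(nlp.pipe_names)  # e.g. ['tagger', 'parser', 'ner']

# Stream texts through the pipeline in batches
for doc in nlp.pipe(["First text", "Second text"], batch_size=50):
    print(doc[0].pos_)

# Temporarily disable components a task doesn't need;
# tokenization always runs
with nlp.disable_pipes("parser", "ner"):
    doc = nlp("Only the tagger runs on this text")
```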
The fundamental objects for text processing, including documents, tokens, spans, and vocabulary management. These form the foundation of all spaCy operations.
```python
class Language:
    def __call__(self, text: str) -> Doc: ...
    def pipe(self, texts: Iterable[str]) -> Iterator[Doc]: ...

class Doc:
    text: str
    ents: tuple
    sents: Iterator

class Token:
    text: str
    pos_: str
    lemma_: str

class Span:
    text: str
    label_: str
```
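A short sketch of how these objects relate, reusing the pipeline loaded above (the example sentence is illustrative):

```python
doc = nlp("San Francisco considers banning sidewalk delivery robots")

# A Span is a view onto a slice of the Doc; a Token is a single element
span = doc[0:2]   # "San Francisco"
token = doc[0]    # "San"
print(span.text, token.text)

# Sentence boundaries are set by the dependency parser
for sent in doc.sents:
    print(sent.text)
```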
Built-in pipeline components for linguistic analysis, including part-of-speech tagging, dependency parsing, named entity recognition, and text classification.

```python
class Tagger: ...
class DependencyParser: ...
class EntityRecognizer: ...
class TextCategorizer: ...
```
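Components can be fetched by name, and plain functions can be added as custom components. A v2-style sketch (spaCy v3 registers components by string name instead); the doc_length_logger function is a made-up example component:

```python
# Fetch a built-in component from the loaded pipeline
ner = nlp.get_pipe("ner")
print(ner.labels)  # entity types this model predicts

def doc_length_logger(doc):
    # Custom components receive and return the Doc
    print("Doc length:", len(doc))
    return doc

nlp.add_pipe(doc_length_logger, last=True)
```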
Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text.

```python
class Matcher:
    def add(self, key: str, patterns: List[dict]) -> None: ...
    def __call__(self, doc: Doc) -> List[tuple]: ...

class PhraseMatcher:
    def add(self, key: str, docs: List[Doc]) -> None: ...
```
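A minimal sketch of both matchers. Note the list-of-patterns call signature shown here is the newer one; earlier v2 releases used matcher.add(key, None, pattern):

```python
from spacy.matcher import Matcher, PhraseMatcher

matcher = Matcher(nlp.vocab)
# A pattern is a list of per-token attribute dicts
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HELLO_WORLD", [pattern])

doc = nlp("Hello, world! Hello world!")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)

# PhraseMatcher matches exact token sequences taken from example Docs
phrase_matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel"]
phrase_matcher.add("LEADERS", [nlp.make_doc(term) for term in terms])
```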
Access to 70+ language-specific models and tokenizers, each optimized for specific linguistic characteristics and writing systems.

```python
def load(name: str, **overrides) -> Language: ...
def blank(name: str, **kwargs) -> Language: ...
```
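For example, load a packaged model (optionally skipping components), or create a bare tokenizer-only pipeline for any supported language:

```python
import spacy

# Load a packaged model, disabling components you don't need
nlp = spacy.load("en_core_web_sm", disable=["parser"])

# blank() gives a pipeline with only the language's tokenizer
nlp_de = spacy.blank("de")
doc = nlp_de("Ich mag Kaffee")  # tokenized with German rules
print([t.text for t in doc])
```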
Interactive visualization tools for displaying linguistic analysis, including dependency trees, named entities, and custom visualizations.

```python
def render(docs, style: str = "dep", **options) -> str: ...
def serve(docs, style: str = "dep", port: int = 5000, **options) -> None: ...
```
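render() returns markup you can embed, while serve() spins up a local preview. A minimal sketch reusing the English model:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Highlight named entities as HTML markup
html = displacy.render(doc, style="ent", page=True)

# Or serve an interactive dependency visualization locally
# displacy.serve(doc, style="dep", port=5000)
```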
Tools for training custom models, fine-tuning existing models, and creating specialized NLP pipelines for domain-specific applications.

```python
def train(nlp: Language, examples: List, **kwargs) -> dict: ...
def evaluate(nlp: Language, examples: List, **kwargs) -> dict: ...
```
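The train/evaluate helpers above are summarized; under spaCy v2 a typical training loop is written against nlp.update directly. A minimal sketch with toy data (the TRAIN_DATA examples are made up):

```python
import random
import spacy

# Toy data: (text, annotations) pairs with character-offset entities
TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]

nlp = spacy.blank("en")
ner = nlp.create_pipe("ner")   # v2-style component creation
nlp.add_pipe(ner)
for _, ann in TRAIN_DATA:
    for _, _, label in ann["entities"]:
        ner.add_label(label)

optimizer = nlp.begin_training()
for epoch in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, ann in TRAIN_DATA:
        nlp.update([text], [ann], sgd=optimizer, losses=losses)
    print(epoch, losses)
```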
"""Main NLP pipeline class."""
vocab: Vocab
pipeline: List[tuple]
pipe_names: List[str]
def __call__(self, text: str) -> Doc: ...
def pipe(self, texts: Iterable[str], batch_size: int = 1000) -> Iterator[Doc]: ...
def add_pipe(self, component, name: str = None, **kwargs) -> callable: ...
class Doc:
"""Container for accessing linguistic annotations."""
text: str
text_with_ws: str
ents: tuple
noun_chunks: Iterator
sents: Iterator
vector: numpy.ndarray
def similarity(self, other) -> float: ...
def to_json(self) -> dict: ...
class Token:
"""Individual token with linguistic annotations."""
text: str
lemma_: str
pos_: str
tag_: str
dep_: str
ent_type_: str
head: 'Token'
children: Iterator
is_alpha: bool
is_digit: bool
is_punct: bool
like_num: bool
class Span:
"""Slice of a document."""
text: str
label_: str
kb_id_: str
vector: numpy.ndarray
def similarity(self, other) -> float: ...
def as_doc(self) -> Doc: ...
class Vocab:
"""Vocabulary store."""
strings: StringStore
vectors: Vectors
def __getitem__(self, string: str) -> Lexeme: ...
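A short sketch tying the reference together. Note that similarity from a *_sm model is approximate, since the small models ship without word vectors:

```python
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("I like coffee")

# Strings are interned once in the shared StringStore
coffee_hash = nlp.vocab.strings["coffee"]
print(coffee_hash, nlp.vocab.strings[coffee_hash])  # hash <-> "coffee"

# Lexemes are context-independent vocabulary entries
lexeme = nlp.vocab["coffee"]
print(lexeme.text, lexeme.is_alpha)

# Doc-level conveniences
print(doc.to_json()["tokens"][0])
print(doc.similarity(nlp("I like tea")))
```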