Industrial-strength Natural Language Processing (NLP) in Python
—
The fundamental objects for text processing in spaCy. These classes form the foundation of all NLP operations and provide access to linguistic annotations, document structure, and vocabulary management.
The `Language` class is the main entry point for NLP processing. It manages the processing pipeline and provides methods for processing single texts or batches of texts efficiently.
class Language:
    """Main NLP pipeline class that processes text through pipeline components."""

    vocab: Vocab            # shared vocabulary (strings, vectors, lexical entries)
    pipeline: List[tuple]   # ordered (name, component) pairs
    pipe_names: List[str]   # component names, in pipeline order
    meta: dict              # model meta data

    def __call__(self, text: str) -> Doc:
        """Process a single text and return a Doc object."""

    def pipe(self, texts: Iterable[str],
             batch_size: int = 1000,
             disable: Optional[List[str]] = None,
             component_cfg: Optional[dict] = None,
             n_process: int = 1) -> Iterator[Doc]:
        """Process multiple texts efficiently."""

    # Training
    def update(self, examples: List, sgd=None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples=None, **kwargs) -> Optimizer:
        """Initialize training and return optimizer."""

    def evaluate(self, examples: List, **kwargs) -> dict:
        """Evaluate the model on examples."""

    # Pipeline management
    def add_pipe(self, component, name: Optional[str] = None,
                 before: Optional[str] = None, after: Optional[str] = None,
                 first: bool = False, last: bool = False) -> callable:
        """Add a component to the processing pipeline."""

    def remove_pipe(self, name: str) -> tuple:
        """Remove a component from the pipeline."""

    def get_pipe(self, name: str) -> callable:
        """Get a pipeline component by name."""

    def has_pipe(self, name: str) -> bool:
        """Check if pipeline has a component."""

    def disable_pipes(self, *names) -> ContextManager:
        """Temporarily disable pipeline components."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save the model to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Language':
        """Load the model from disk."""

    def to_bytes(self, exclude: Optional[List[str]] = None) -> bytes:
        """Serialize the model to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: Optional[List[str]] = None) -> 'Language':
        """Load the model from bytes."""


# The Doc class represents a document with token-level and document-level
# annotations. It provides access to the parsed text structure and
# linguistic analysis.
class Doc:
    """Container for accessing linguistic annotations on a document."""

    text: str               # verbatim document text
    text_with_ws: str       # text including trailing whitespace
    ents: tuple             # named entity spans
    noun_chunks: Iterator   # base noun-phrase spans
    sents: Iterator         # sentence spans
    vector: numpy.ndarray   # document vector representation
    lang_: str              # document language (string form)
    is_parsed: bool         # whether a dependency parse has been applied
    is_tagged: bool         # whether part-of-speech tags have been applied
    is_sentenced: bool      # whether sentence boundaries have been set

    def __init__(self, vocab: Vocab, words: Optional[List[str]] = None,
                 spaces: Optional[List[bool]] = None) -> None:
        """Create a Doc object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, Span]:
        """Get a token or span."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens."""

    def similarity(self, other: Union['Doc', 'Span', 'Token']) -> float:
        """Compute semantic similarity."""

    def char_span(self, start: int, end: int,
                  label: Optional[str] = None, kb_id: Optional[str] = None) -> Span:
        """Create a Span from character positions."""

    def count_by(self, attr: int, exclude: Optional[Set] = None) -> dict:
        """Count tokens by attribute."""

    def to_json(self, underscore: Optional[List[str]] = None) -> dict:
        """Export to JSON format."""

    def retokenize(self) -> ContextManager:
        """Context manager for merging/splitting tokens."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save the doc to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Doc':
        """Load the doc from disk."""

    def to_bytes(self, exclude: Optional[List[str]] = None) -> bytes:
        """Serialize the doc to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: Optional[List[str]] = None) -> 'Doc':
        """Load the doc from bytes."""


# Individual tokens with comprehensive linguistic annotations including
# morphology, syntax, and semantic properties.
class Token:
    """Individual token with linguistic annotations."""

    # Text properties
    text: str
    text_with_ws: str
    whitespace_: str
    orth: int
    orth_: str
    # Linguistic annotations — bare names hold integer IDs, the
    # trailing-underscore variants hold the corresponding strings.
    lemma: int
    lemma_: str
    pos: int
    pos_: str
    tag: int
    tag_: str
    dep: int
    dep_: str
    # Morphological features
    morph: MorphAnalysis
    # Named entity information
    ent_type: int
    ent_type_: str
    ent_iob: int
    ent_iob_: str
    ent_kb_id: int
    ent_kb_id_: str
    ent_id: int
    ent_id_: str
    # Syntactic relationships
    head: 'Token'
    children: Iterator['Token']
    ancestors: Iterator['Token']
    subtree: Iterator['Token']
    lefts: Iterator['Token']
    rights: Iterator['Token']
    n_lefts: int
    n_rights: int
    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool
    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    def similarity(self, other: Union['Token', 'Span', 'Doc']) -> float:
        """Compute semantic similarity."""

    def nbor(self, i: int = 1) -> 'Token':
        """Get neighboring token."""

    def is_ancestor(self, descendant: 'Token') -> bool:
        """Check if token is ancestor of another."""


# Spans represent slices of documents, typically used for named entities,
# noun chunks, or custom text segments.
class Span:
    """Slice of a document with optional label and attributes."""

    text: str
    text_with_ws: str
    label: int          # label ID; label_ is the string form
    label_: str
    kb_id: int
    kb_id_: str
    ent_id: int
    ent_id_: str
    start: int          # token offsets into the parent Doc
    end: int
    start_char: int     # character offsets into the parent Doc
    end_char: int
    vector: numpy.ndarray
    doc: Doc            # parent document
    sent: 'Span'        # sentence containing the span
    root: Token         # syntactic head token of the span
    ents: tuple

    def __init__(self, doc: Doc, start: int, end: int,
                 label: int = 0, kb_id: int = 0) -> None:
        """Create a Span object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, 'Span']:
        """Get token or subspan."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens in span."""

    def similarity(self, other: Union['Span', 'Doc', 'Token']) -> float:
        """Compute semantic similarity."""

    def as_doc(self) -> Doc:
        """Create a new Doc object from the span."""

    def char_span(self, start: int, end: int,
                  label: Optional[str] = None, kb_id: Optional[str] = None) -> 'Span':
        """Create a subspan from character positions."""

    def conjuncts(self) -> List['Span']:
        """Get conjunct spans."""


# The vocabulary stores all strings, word vectors, and lexical entries
# used by the language model.
class Vocab:
    """Vocabulary store for strings, vectors, and lexical entries."""

    strings: StringStore   # string <-> integer-ID mapping
    vectors: Vectors       # word-vector table
    lookups: Lookups       # lookup tables
    writing_system: dict

    def __init__(self, lex_attr_getters: Optional[dict] = None,
                 strings: Optional[StringStore] = None,
                 lookups: Optional[Lookups] = None,
                 oov_prob: float = -20.0) -> None:
        """Create a vocabulary."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Lexeme:
        """Get a lexeme."""

    def __iter__(self) -> Iterator[Lexeme]:
        """Iterate over lexemes."""

    def __len__(self) -> int:
        """Number of lexemes."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in vocabulary."""

    def add_flag(self, flag_getter: callable, flag_id: Optional[int] = None) -> int:
        """Add a boolean flag attribute."""

    def get_vector(self, orth: Union[int, str]) -> numpy.ndarray:
        """Get word vector."""

    def set_vector(self, orth: Union[int, str], vector: numpy.ndarray) -> None:
        """Set word vector."""

    def has_vector(self, orth: Union[int, str]) -> bool:
        """Check if word has vector."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save vocabulary to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Vocab':
        """Load vocabulary from disk."""


# Lexemes store word-type information in the vocabulary, independent of context.
class Lexeme:
    """Word type stored in vocabulary."""

    # Text properties — bare names hold integer IDs, the
    # trailing-underscore variants hold the corresponding strings.
    orth: int
    orth_: str
    text: str
    lower: int
    lower_: str
    norm: int
    norm_: str
    shape: int
    shape_: str
    prefix: int
    prefix_: str
    suffix: int
    suffix_: str
    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool
    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float
    # Probability and sentiment
    prob: float
    sentiment: float

    def similarity(self, other: Union['Lexeme', 'Token']) -> float:
        """Compute semantic similarity."""


# Efficient storage and serialization for multiple documents.
class DocBin:
    """Efficient storage for multiple Doc objects."""

    def __init__(self, attrs: Optional[List[str]] = None, store_user_data: bool = False) -> None:
        """Create a DocBin for storing multiple documents."""

    def __len__(self) -> int:
        """Number of documents in the collection."""

    def add(self, doc: Doc) -> None:
        """Add a Doc object to the collection."""

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Retrieve Doc objects from the collection."""

    def merge(self, other: 'DocBin') -> None:
        """Merge another DocBin into this one."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save the DocBin to disk."""

    def from_disk(self, path: str) -> 'DocBin':
        """Load the DocBin from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'DocBin':
        """Deserialize from bytes."""


# Tools for modifying document tokenization after initial processing.
class Retokenizer:
    """Context manager for modifying document tokenization."""

    def merge(self, span: Span, attrs: Optional[dict] = None) -> None:
        """
        Merge a span into a single token.

        Args:
            span: The span to merge
            attrs: Optional token attributes for merged token
        """

    def split(self, token: Token, orths: List[str],
              heads: Optional[List[tuple]] = None, attrs: Optional[dict] = None) -> None:
        """
        Split a token into multiple tokens.

        Args:
            token: The token to split
            orths: List of orthographic forms for new tokens
            heads: List of (head_index, dep_label) tuples
            attrs: Optional token attributes
        """


# Container for morphological feature analysis.
class MorphAnalysis:
    """Morphological analysis container."""

    def __init__(self, vocab: Vocab, features: Optional[dict] = None) -> None:
        """Create morphological analysis."""

    def __str__(self) -> str:
        """String representation of morphological features."""

    def get(self, field: str) -> List[str]:
        """Get values for a morphological field."""

    def to_dict(self) -> dict:
        """Convert to dictionary format."""

    @classmethod
    def from_id(cls, vocab: Vocab, key: int) -> 'MorphAnalysis':
        """Create from vocabulary ID."""


# Management system for linguistic lookup tables and data.
class Lookups:
    """Lookup table management system."""

    def __init__(self) -> None:
        """Create empty lookup tables."""

    def add_table(self, name: str, data: Optional[dict] = None) -> dict:
        """Add a lookup table."""

    def get_table(self, name: str, default: Optional[dict] = None) -> dict:
        """Get a lookup table by name."""

    def has_table(self, name: str) -> bool:
        """Check if table exists."""

    def remove_table(self, name: str) -> dict:
        """Remove and return a table."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save lookup tables to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Lookups':
        """Load lookup tables from disk."""


# System for reducing words to their lemmatized forms.
class Lemmatizer:
    """Lemmatization component."""

    def __init__(self, lookups: Optional[Lookups] = None, rules: Optional[dict] = None) -> None:
        """Initialize lemmatizer."""

    def lookup(self, string: str, pos: Optional[str] = None, morphs: Optional[dict] = None) -> List[str]:
        """Look up lemma in tables."""

    def rule_lookup(self, string: str, pos: str) -> List[str]:
        """Apply lemmatization rules."""

    def lookup_table(self, string: str, table: str) -> List[str]:
        """Look up in specific table."""

    def is_base_form(self, univ_pos: str, morphs: Optional[dict] = None) -> bool:
        """Check if token is in base form."""


# Efficient bidirectional mapping between strings and integer IDs.
class StringStore:
    """Bidirectional map between strings and integer IDs."""

    def __init__(self, strings: Optional[Iterable[str]] = None) -> None:
        """Create a string store."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Union[str, int]:
        """Get string by ID or ID by string."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in store."""

    def __iter__(self) -> Iterator[str]:
        """Iterate over strings."""

    def __len__(self) -> int:
        """Number of strings."""

    def add(self, string: str) -> int:
        """Add string and return ID."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save string store to disk."""

    def from_disk(self, path: str) -> 'StringStore':
        """Load string store from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'StringStore':
        """Deserialize from bytes."""


# Usage example: basic document processing.
import spacy
# Load a trained English pipeline (must be installed separately).
nlp = spacy.load("en_core_web_sm")

# Process single document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access document properties
print(f"Text: {doc.text}")
print(f"Number of tokens: {len(doc)}")
print(f"Number of sentences: {len(list(doc.sents))}")

# Iterate over tokens
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.lemma_})")

# Process multiple documents efficiently
texts = ["First document", "Second document", "Third document"]
for doc in nlp.pipe(texts):
    print(f"Processed: {doc.text}")

# Create custom spans
doc = nlp("Apple is looking at buying U.K. startup")
company_span = doc[0:1]  # "Apple"
target_span = doc[4:7]   # "U.K. startup"

# Named entity spans
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")
    print(f"Start: {ent.start}, End: {ent.end}")

# Create span from character positions; char_span is None when the
# offsets do not align with token boundaries, hence the guard below.
char_span = doc.char_span(0, 5, label="ORG")  # "Apple"
if char_span:
    print(f"Character span: {char_span.text}")

# Access vocabulary
vocab = nlp.vocab

# Get lexeme (context-independent word type)
apple_lexeme = vocab["apple"]
print(f"Is alpha: {apple_lexeme.is_alpha}")
print(f"Is stop word: {apple_lexeme.is_stop}")

# String store operations: map a string to its ID and back
string_id = vocab.strings.add("custom_token")
retrieved_string = vocab.strings[string_id]
print(f"String ID: {string_id}, Retrieved: {retrieved_string}")

# Install with Tessl CLI
To install this package with the Tessl CLI, run: `npx tessl i tessl/pypi-spacy`