Industrial-strength Natural Language Processing (NLP) in Python
—
The fundamental objects for text processing in spaCy. These classes form the foundation of all NLP operations and provide access to linguistic annotations, document structure, and vocabulary management.
The `Language` class is the main entry point for NLP processing. It manages the processing pipeline and provides methods for processing single texts or batches of texts efficiently.
class Language:
    """Main NLP pipeline class that processes text through pipeline components."""

    vocab: Vocab            # shared vocabulary (strings, vectors, lexical entries)
    pipeline: List[tuple]   # ordered (name, component) pairs
    pipe_names: List[str]   # component names, in pipeline order
    meta: dict              # model meta data

    def __call__(self, text: str) -> Doc:
        """Process a single text and return a Doc object."""

    def pipe(self, texts: Iterable[str],
             batch_size: int = 1000,
             disable: Optional[List[str]] = None,
             component_cfg: Optional[dict] = None,
             n_process: int = 1) -> Iterator[Doc]:
        """Process multiple texts efficiently."""

    # Training
    def update(self, examples: List, sgd=None, **kwargs) -> dict:
        """Update the model with training examples."""

    def begin_training(self, get_examples=None, **kwargs) -> Optimizer:
        """Initialize training and return optimizer."""

    def evaluate(self, examples: List, **kwargs) -> dict:
        """Evaluate the model on examples."""

    # Pipeline management
    def add_pipe(self, component, name: Optional[str] = None,
                 before: Optional[str] = None, after: Optional[str] = None,
                 first: bool = False, last: bool = False) -> callable:
        """Add a component to the processing pipeline."""

    def remove_pipe(self, name: str) -> tuple:
        """Remove a component from the pipeline."""

    def get_pipe(self, name: str) -> callable:
        """Get a pipeline component by name."""

    def has_pipe(self, name: str) -> bool:
        """Check if pipeline has a component."""

    def disable_pipes(self, *names) -> ContextManager:
        """Temporarily disable pipeline components."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save the model to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Language':
        """Load the model from disk."""

    def to_bytes(self, exclude: Optional[List[str]] = None) -> bytes:
        """Serialize the model to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: Optional[List[str]] = None) -> 'Language':
        """Load the model from bytes."""


# The Doc class represents a document with token-level and document-level
# annotations. It provides access to the parsed text structure and
# linguistic analysis.
class Doc:
    """Container for accessing linguistic annotations on a document."""

    text: str               # verbatim document text
    text_with_ws: str       # text including trailing whitespace
    ents: tuple             # named entity spans
    noun_chunks: Iterator   # base noun-phrase spans
    sents: Iterator         # sentence spans
    vector: numpy.ndarray   # document vector representation
    lang_: str              # document language (string form)
    is_parsed: bool         # whether a dependency parse has been applied
    is_tagged: bool         # whether part-of-speech tags have been applied
    is_sentenced: bool      # whether sentence boundaries have been set

    def __init__(self, vocab: Vocab, words: Optional[List[str]] = None,
                 spaces: Optional[List[bool]] = None) -> None:
        """Create a Doc object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, Span]:
        """Get a token or span."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens."""

    def similarity(self, other: Union['Doc', 'Span', 'Token']) -> float:
        """Compute semantic similarity."""

    def char_span(self, start: int, end: int,
                  label: Optional[str] = None, kb_id: Optional[str] = None) -> Span:
        """Create a Span from character positions."""

    def count_by(self, attr: int, exclude: Optional[Set] = None) -> dict:
        """Count tokens by attribute."""

    def to_json(self, underscore: Optional[List[str]] = None) -> dict:
        """Export to JSON format."""

    def retokenize(self) -> ContextManager:
        """Context manager for merging/splitting tokens."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save the doc to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Doc':
        """Load the doc from disk."""

    def to_bytes(self, exclude: Optional[List[str]] = None) -> bytes:
        """Serialize the doc to bytes."""

    def from_bytes(self, bytes_data: bytes, exclude: Optional[List[str]] = None) -> 'Doc':
        """Load the doc from bytes."""


# Individual tokens with comprehensive linguistic annotations including
# morphology, syntax, and semantic properties.
class Token:
    """Individual token with linguistic annotations."""

    # Text properties
    text: str
    text_with_ws: str
    whitespace_: str
    orth: int
    orth_: str
    # Linguistic annotations — bare names hold integer IDs, the
    # trailing-underscore variants hold the corresponding strings.
    lemma: int
    lemma_: str
    pos: int
    pos_: str
    tag: int
    tag_: str
    dep: int
    dep_: str
    # Morphological features
    morph: MorphAnalysis
    # Named entity information
    ent_type: int
    ent_type_: str
    ent_iob: int
    ent_iob_: str
    ent_kb_id: int
    ent_kb_id_: str
    ent_id: int
    ent_id_: str
    # Syntactic relationships
    head: 'Token'
    children: Iterator['Token']
    ancestors: Iterator['Token']
    subtree: Iterator['Token']
    lefts: Iterator['Token']
    rights: Iterator['Token']
    n_lefts: int
    n_rights: int
    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool
    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float

    def similarity(self, other: Union['Token', 'Span', 'Doc']) -> float:
        """Compute semantic similarity."""

    def nbor(self, i: int = 1) -> 'Token':
        """Get neighboring token."""

    def is_ancestor(self, descendant: 'Token') -> bool:
        """Check if token is ancestor of another."""


# Spans represent slices of documents, typically used for named entities,
# noun chunks, or custom text segments.
class Span:
    """Slice of a document with optional label and attributes."""

    text: str
    text_with_ws: str
    label: int          # label ID; label_ is the string form
    label_: str
    kb_id: int
    kb_id_: str
    ent_id: int
    ent_id_: str
    start: int          # token offsets into the parent Doc
    end: int
    start_char: int     # character offsets into the parent Doc
    end_char: int
    vector: numpy.ndarray
    doc: Doc            # parent document
    sent: 'Span'        # sentence containing the span
    root: Token         # syntactic head token of the span
    ents: tuple

    def __init__(self, doc: Doc, start: int, end: int,
                 label: int = 0, kb_id: int = 0) -> None:
        """Create a Span object."""

    def __getitem__(self, i: Union[int, slice]) -> Union[Token, 'Span']:
        """Get token or subspan."""

    def __iter__(self) -> Iterator[Token]:
        """Iterate over tokens."""

    def __len__(self) -> int:
        """Number of tokens in span."""

    def similarity(self, other: Union['Span', 'Doc', 'Token']) -> float:
        """Compute semantic similarity."""

    def as_doc(self) -> Doc:
        """Create a new Doc object from the span."""

    def char_span(self, start: int, end: int,
                  label: Optional[str] = None, kb_id: Optional[str] = None) -> 'Span':
        """Create a subspan from character positions."""

    def conjuncts(self) -> List['Span']:
        """Get conjunct spans."""


# The vocabulary stores all strings, word vectors, and lexical entries
# used by the language model.
class Vocab:
    """Vocabulary store for strings, vectors, and lexical entries."""

    strings: StringStore   # string <-> integer-ID mapping
    vectors: Vectors       # word-vector table
    lookups: Lookups       # lookup tables
    writing_system: dict

    def __init__(self, lex_attr_getters: Optional[dict] = None,
                 strings: Optional[StringStore] = None,
                 lookups: Optional[Lookups] = None,
                 oov_prob: float = -20.0) -> None:
        """Create a vocabulary."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Lexeme:
        """Get a lexeme."""

    def __iter__(self) -> Iterator[Lexeme]:
        """Iterate over lexemes."""

    def __len__(self) -> int:
        """Number of lexemes."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in vocabulary."""

    def add_flag(self, flag_getter: callable, flag_id: Optional[int] = None) -> int:
        """Add a boolean flag attribute."""

    def get_vector(self, orth: Union[int, str]) -> numpy.ndarray:
        """Get word vector."""

    def set_vector(self, orth: Union[int, str], vector: numpy.ndarray) -> None:
        """Set word vector."""

    def has_vector(self, orth: Union[int, str]) -> bool:
        """Check if word has vector."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save vocabulary to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Vocab':
        """Load vocabulary from disk."""


# Lexemes store word-type information in the vocabulary, independent of context.
class Lexeme:
    """Word type stored in vocabulary."""

    # Text properties — bare names hold integer IDs, the
    # trailing-underscore variants hold the corresponding strings.
    orth: int
    orth_: str
    text: str
    lower: int
    lower_: str
    norm: int
    norm_: str
    shape: int
    shape_: str
    prefix: int
    prefix_: str
    suffix: int
    suffix_: str
    # Boolean flags
    is_alpha: bool
    is_ascii: bool
    is_digit: bool
    is_lower: bool
    is_upper: bool
    is_title: bool
    is_punct: bool
    is_space: bool
    is_bracket: bool
    is_quote: bool
    is_stop: bool
    like_url: bool
    like_num: bool
    like_email: bool
    # Vector representation
    vector: numpy.ndarray
    has_vector: bool
    vector_norm: float
    # Probability and sentiment
    prob: float
    sentiment: float

    def similarity(self, other: Union['Lexeme', 'Token']) -> float:
        """Compute semantic similarity."""


# Efficient storage and serialization for multiple documents.
class DocBin:
    """Efficient storage for multiple Doc objects."""

    def __init__(self, attrs: Optional[List[str]] = None, store_user_data: bool = False) -> None:
        """Create a DocBin for storing multiple documents."""

    def __len__(self) -> int:
        """Number of documents in the collection."""

    def add(self, doc: Doc) -> None:
        """Add a Doc object to the collection."""

    def get_docs(self, vocab: Vocab) -> Iterator[Doc]:
        """Retrieve Doc objects from the collection."""

    def merge(self, other: 'DocBin') -> None:
        """Merge another DocBin into this one."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save the DocBin to disk."""

    def from_disk(self, path: str) -> 'DocBin':
        """Load the DocBin from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'DocBin':
        """Deserialize from bytes."""


# Tools for modifying document tokenization after initial processing.
class Retokenizer:
    """Context manager for modifying document tokenization."""

    def merge(self, span: Span, attrs: Optional[dict] = None) -> None:
        """
        Merge a span into a single token.

        Args:
            span: The span to merge
            attrs: Optional token attributes for merged token
        """

    def split(self, token: Token, orths: List[str],
              heads: Optional[List[tuple]] = None, attrs: Optional[dict] = None) -> None:
        """
        Split a token into multiple tokens.

        Args:
            token: The token to split
            orths: List of orthographic forms for new tokens
            heads: List of (head_index, dep_label) tuples
            attrs: Optional token attributes
        """


# Container for morphological feature analysis.
class MorphAnalysis:
    """Morphological analysis container."""

    def __init__(self, vocab: Vocab, features: Optional[dict] = None) -> None:
        """Create morphological analysis."""

    def __str__(self) -> str:
        """String representation of morphological features."""

    def get(self, field: str) -> List[str]:
        """Get values for a morphological field."""

    def to_dict(self) -> dict:
        """Convert to dictionary format."""

    @classmethod
    def from_id(cls, vocab: Vocab, key: int) -> 'MorphAnalysis':
        """Create from vocabulary ID."""


# Management system for linguistic lookup tables and data.
class Lookups:
    """Lookup table management system."""

    def __init__(self) -> None:
        """Create empty lookup tables."""

    def add_table(self, name: str, data: Optional[dict] = None) -> dict:
        """Add a lookup table."""

    def get_table(self, name: str, default: Optional[dict] = None) -> dict:
        """Get a lookup table by name."""

    def has_table(self, name: str) -> bool:
        """Check if table exists."""

    def remove_table(self, name: str) -> dict:
        """Remove and return a table."""

    # Serialization
    def to_disk(self, path: str, exclude: Optional[List[str]] = None) -> None:
        """Save lookup tables to disk."""

    def from_disk(self, path: str, exclude: Optional[List[str]] = None) -> 'Lookups':
        """Load lookup tables from disk."""


# System for reducing words to their lemmatized forms.
class Lemmatizer:
    """Lemmatization component."""

    def __init__(self, lookups: Optional[Lookups] = None, rules: Optional[dict] = None) -> None:
        """Initialize lemmatizer."""

    def lookup(self, string: str, pos: Optional[str] = None, morphs: Optional[dict] = None) -> List[str]:
        """Look up lemma in tables."""

    def rule_lookup(self, string: str, pos: str) -> List[str]:
        """Apply lemmatization rules."""

    def lookup_table(self, string: str, table: str) -> List[str]:
        """Look up in specific table."""

    def is_base_form(self, univ_pos: str, morphs: Optional[dict] = None) -> bool:
        """Check if token is in base form."""


# Efficient bidirectional mapping between strings and integer IDs.
class StringStore:
    """Bidirectional map between strings and integer IDs."""

    def __init__(self, strings: Optional[Iterable[str]] = None) -> None:
        """Create a string store."""

    def __getitem__(self, id_or_string: Union[int, str]) -> Union[str, int]:
        """Get string by ID or ID by string."""

    def __contains__(self, string: str) -> bool:
        """Check if string is in store."""

    def __iter__(self) -> Iterator[str]:
        """Iterate over strings."""

    def __len__(self) -> int:
        """Number of strings."""

    def add(self, string: str) -> int:
        """Add string and return ID."""

    # Serialization
    def to_disk(self, path: str) -> None:
        """Save string store to disk."""

    def from_disk(self, path: str) -> 'StringStore':
        """Load string store from disk."""

    def to_bytes(self) -> bytes:
        """Serialize to bytes."""

    def from_bytes(self, bytes_data: bytes) -> 'StringStore':
        """Deserialize from bytes."""


# Usage example: basic document processing.
import spacy
# Load a trained English pipeline (must be installed separately).
nlp = spacy.load("en_core_web_sm")

# Process single document
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Access document properties
print(f"Text: {doc.text}")
print(f"Number of tokens: {len(doc)}")
print(f"Number of sentences: {len(list(doc.sents))}")

# Iterate over tokens
for token in doc:
    print(f"{token.text}: {token.pos_} ({token.lemma_})")

# Process multiple documents efficiently
texts = ["First document", "Second document", "Third document"]
for doc in nlp.pipe(texts):
    print(f"Processed: {doc.text}")

# Create custom spans
doc = nlp("Apple is looking at buying U.K. startup")
company_span = doc[0:1]  # "Apple"
target_span = doc[4:7]   # "U.K. startup"

# Named entity spans
for ent in doc.ents:
    print(f"Entity: {ent.text} ({ent.label_})")
    print(f"Start: {ent.start}, End: {ent.end}")

# Create span from character positions; char_span is None when the
# offsets do not align with token boundaries, hence the guard below.
char_span = doc.char_span(0, 5, label="ORG")  # "Apple"
if char_span:
    print(f"Character span: {char_span.text}")

# Access vocabulary
vocab = nlp.vocab

# Get lexeme (context-independent word type)
apple_lexeme = vocab["apple"]
print(f"Is alpha: {apple_lexeme.is_alpha}")
print(f"Is stop word: {apple_lexeme.is_stop}")

# String store operations: map a string to its ID and back
string_id = vocab.strings.add("custom_token")
retrieved_string = vocab.strings[string_id]
print(f"String ID: {string_id}, Retrieved: {retrieved_string}")

# Install with Tessl CLI
To install this package with the Tessl CLI, run: `npx tessl i tessl/pypi-spacy`