Industrial-strength Natural Language Processing (NLP) in Python.
Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text. spaCy provides three different matchers optimized for different use cases.
Rule-based matching system that finds sequences of tokens based on their linguistic attributes. Supports complex patterns with wildcards, operators, and constraints.
class Matcher:
    """Rule-based token pattern matcher.

    Finds sequences of tokens based on their linguistic attributes.
    Supports complex patterns with wildcards, operators, and constraints.
    """

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the Matcher.

        Args:
            vocab: The shared vocabulary.
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """
        ...

    def __len__(self) -> int:
        """Number of patterns in the matcher."""
        ...

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""
        ...

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """Add patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of token patterns
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""
        ...

    def get(self, key: str, default=None) -> List[List[dict]]:
        """Get patterns for a key."""
        ...

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
        ...


# PhraseMatcher (next section): efficient exact-phrase matching using bloom
# filters and hash-based lookups. Optimized for matching large lists of
# multi-token phrases.
class PhraseMatcher:
    """Efficient phrase matching for exact multi-token phrases."""

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, attr: str = "ORTH",
                 validate: bool = False) -> None:
        """Initialize the PhraseMatcher.

        Args:
            vocab: The shared vocabulary.
            attr: Token attribute to match on (e.g. "ORTH", "LOWER").
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find phrase matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """
        ...

    def __len__(self) -> int:
        """Number of phrase patterns in the matcher."""
        ...

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""
        ...

    def add(self, key: str, docs: List[Doc],
            on_match: callable = None) -> None:
        """Add phrase patterns to the matcher.

        Args:
            key: String ID for the phrases
            docs: List of Doc objects representing phrases
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove phrases by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a phrase key."""
        ...

    def get(self, key: str, default=None) -> List[Doc]:
        """Get phrase patterns for a key."""
        ...

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
        ...


# DependencyMatcher (next section): advanced pattern matching based on
# syntactic dependency relationships between tokens. Useful for extracting
# complex grammatical constructions.
class DependencyMatcher:
    """Pattern matching based on dependency parse trees."""

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the DependencyMatcher.

        Args:
            vocab: The shared vocabulary.
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find dependency matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, matches) tuples where matches are token indices
        """
        ...

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """Add dependency patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of dependency patterns
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""
        ...

    def get(self, key: str) -> List[List[dict]]:
        """Get patterns for a key."""
        ...


# Token patterns are lists of dictionaries describing token attributes to
# match:
# Token-pattern reference: each pattern is a list of dicts, one per token.

# Basic patterns
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],  # "hello world"
    [{"POS": "NOUN", "OP": "+"}],  # One or more nouns
    [{"LIKE_EMAIL": True}],  # Email addresses
]

# Pattern operators (illustrative only: a real token dict carries a single
# "OP" key; the duplicate keys in this literal collapse to the last one)
{
    "OP": "!",  # Negation: not this token
    "OP": "?",  # Optional: zero or one
    "OP": "*",  # Kleene star: zero or more
    "OP": "+",  # Plus: one or more
}

# Attribute matching (illustrative reference of supported attributes)
{
    "ORTH": "Apple",  # Exact text match
    "LOWER": "apple",  # Lowercase match
    "LEMMA": "be",  # Lemma match
    "POS": "NOUN",  # Part-of-speech
    "TAG": "NNP",  # Fine-grained POS tag
    "DEP": "nsubj",  # Dependency relation
    "SHAPE": "Xxxx",  # Word shape
    "IS_ALPHA": True,  # Boolean flags
    "LIKE_NUM": True,  # Number-like
    "ENT_TYPE": "PERSON",  # Entity type
}

# Dependency patterns specify relationships between tokens in the parse tree:
# Dependency pattern structure
pattern = [
    {
        "RIGHT_ID": "anchor",  # Node identifier
        "RIGHT_ATTRS": {"ORTH": "loves"}  # Token attributes
    },
    {
        "LEFT_ID": "anchor",  # Reference to existing node
        "REL_OP": ">",  # Relation operator
        "RIGHT_ID": "subject",  # New node identifier
        "RIGHT_ATTRS": {"DEP": "nsubj"}  # Token attributes
    }
]

# Relation operators (illustrative only: a real pattern entry carries a
# single "REL_OP" key; duplicate keys in this literal collapse to the last)
{
    "REL_OP": ">",  # Right token is a direct child of left token
    "REL_OP": "<",  # Right token is the direct head of left token
    "REL_OP": ">>",  # Right token is a descendant of left token
    "REL_OP": "<<",  # Right token is an ancestor of left token
    "REL_OP": ".",  # Right token is immediately after left token
    "REL_OP": ";",  # Right token is immediately before left token
}

import spacy
from spacy.matcher import Matcher

# Example: basic token-pattern matching with Matcher.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add patterns
patterns = [
    [{"LOWER": "apple"}, {"LOWER": "inc"}],
    [{"ORTH": "iPhone"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("TECH_TERMS", patterns)

# Find matches
doc = nlp("Apple Inc. released the iPhone. Contact us at info@apple.com")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Match: {span.text}")

import spacy
from spacy.matcher import Matcher

# Example: patterns combining operators ("+", "?", "*") and nested
# attribute constraints ({"IN": [...]}).
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Complex patterns with operators
patterns = [
    # One or more adjectives followed by a noun
    [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}],
    # Optional determiner, adjectives, noun
    [{"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "*"}, {"POS": "NOUN"}],
    # Currency amounts
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["dollar", "dollars", "usd", "$"]}}],
    # Negation patterns
    [{"LOWER": "not"}, {"POS": "ADV", "OP": "?"}, {"POS": "ADJ"}],
]
matcher.add("COMPLEX_PATTERNS", patterns)

doc = nlp("The big red car costs fifty dollars")
matches = matcher(doc)
for match_id, start, end in matches:
    print(f"Match: {doc[start:end].text}")

import spacy
from spacy.matcher import PhraseMatcher

# Example: exact multi-token phrase matching, case-insensitive via
# attr="LOWER".
nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create phrase patterns from strings
terms = ["machine learning", "artificial intelligence", "deep learning", "neural network"]
patterns = [nlp.make_doc(text) for text in terms]
phrase_matcher.add("AI_TERMS", patterns)

# Find phrase matches
doc = nlp("Machine learning and artificial intelligence are transforming technology.")
matches = phrase_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Found: {span.text}")

import spacy
from spacy.matcher import DependencyMatcher

# Example: extracting subject-verb-object triples from the dependency parse.
nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# Pattern: subject-verb-object relationships
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }
]
dep_matcher.add("SVO", [pattern])

doc = nlp("The company acquired the startup.")
matches = dep_matcher(doc)
for match_id, token_ids in matches:
    tokens = [doc[i] for i in token_ids]
    print(f"SVO: {' '.join([t.text for t in tokens])}")

import spacy
from spacy.matcher import Matcher

# Example: running custom code when a pattern matches, via on_match.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, id, matches):
    """Custom callback function for matches."""
    match_id, start, end = matches[0]  # First match
    span = doc[start:end]
    print(f"Callback triggered for: {span.text}")
    # Add custom processing
    # NOTE(review): writing span._.is_company presumably requires a
    # registered Span extension (Span.set_extension) — not shown here.
    span._.is_company = True

# Add pattern with callback
patterns = [[{"ORTH": "Apple"}, {"ORTH": "Inc."}]]
matcher.add("COMPANY", patterns, on_match=on_match)

doc = nlp("Apple Inc. is a technology company.")
matches = matcher(doc)

import spacy
from spacy.matcher import Matcher

# Example: batch-matching a stream of documents with Matcher.pipe.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [
    [{"ENT_TYPE": "PERSON"}],
    [{"ENT_TYPE": "ORG"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("ENTITIES", patterns)

# Process multiple documents
texts = [
    "John Smith works at Apple Inc.",
    "Contact jane@company.com for details.",
    "Microsoft hired Sarah Johnson."
]

# Use pipe for efficient processing; return_matches=True yields
# (doc, matches) pairs instead of bare docs.
docs = nlp.pipe(texts)
for doc in matcher.pipe(docs, return_matches=True, as_tuples=True):
    doc_obj, matches = doc
    print(f"Text: {doc_obj.text}")
    for match_id, start, end in matches:
        print(f" Match: {doc_obj[start:end].text}")

import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Example: running a token-based Matcher and a PhraseMatcher over the
# same document.
nlp = spacy.load("en_core_web_sm")

# Token-based matcher for patterns
token_matcher = Matcher(nlp.vocab)
token_patterns = [
    [{"LIKE_EMAIL": True}],
    [{"LIKE_URL": True}]
]
token_matcher.add("CONTACT_INFO", token_patterns)

# Phrase matcher for exact terms
phrase_matcher = PhraseMatcher(nlp.vocab)
companies = ["Apple Inc.", "Microsoft Corporation", "Google LLC"]
phrase_patterns = [nlp.make_doc(text) for text in companies]
phrase_matcher.add("COMPANIES", phrase_patterns)

# Process text with both matchers
doc = nlp("Contact Apple Inc. at info@apple.com or visit https://apple.com")
token_matches = token_matcher(doc)
phrase_matches = phrase_matcher(doc)

print("Token matches:")
for match_id, start, end in token_matches:
    print(f" {doc[start:end].text}")

print("Phrase matches:")
for match_id, start, end in phrase_matches:
    print(f" {doc[start:end].text}")

# Install with Tessl CLI:
npx tessl i tessl/pypi-spacy