Industrial-strength Natural Language Processing (NLP) in Python.
Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text. spaCy provides three different matchers optimized for different use cases.
Rule-based matching system that finds sequences of tokens based on their linguistic attributes. Supports complex patterns with wildcards, operators, and constraints.
class Matcher:
    """Rule-based token pattern matcher.

    Finds sequences of tokens based on their linguistic attributes.
    Supports complex patterns with wildcards, operators, and constraints.
    """

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the Matcher.

        Args:
            vocab: The shared vocabulary.
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """
        ...

    def __len__(self) -> int:
        """Number of patterns in the matcher."""
        ...

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""
        ...

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """Add patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of token patterns
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""
        ...

    def get(self, key: str, default=None) -> List[List[dict]]:
        """Get patterns for a key."""
        ...

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
        ...


# PhraseMatcher (next section): efficient exact-phrase matching using bloom
# filters and hash-based lookups. Optimized for matching large lists of
# multi-token phrases.
class PhraseMatcher:
    """Efficient phrase matching for exact multi-token phrases."""

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, attr: str = "ORTH",
                 validate: bool = False) -> None:
        """Initialize the PhraseMatcher.

        Args:
            vocab: The shared vocabulary.
            attr: Token attribute to match on (e.g. "ORTH", "LOWER").
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find phrase matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """
        ...

    def __len__(self) -> int:
        """Number of phrase patterns in the matcher."""
        ...

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""
        ...

    def add(self, key: str, docs: List[Doc],
            on_match: callable = None) -> None:
        """Add phrase patterns to the matcher.

        Args:
            key: String ID for the phrases
            docs: List of Doc objects representing phrases
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove phrases by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a phrase key."""
        ...

    def get(self, key: str, default=None) -> List[Doc]:
        """Get phrase patterns for a key."""
        ...

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
        ...


# DependencyMatcher (next section): advanced pattern matching based on
# syntactic dependency relationships between tokens. Useful for extracting
# complex grammatical constructions.
class DependencyMatcher:
    """Pattern matching based on dependency parse trees."""

    # Shared vocabulary the matcher operates on.
    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the DependencyMatcher.

        Args:
            vocab: The shared vocabulary.
            validate: Whether to validate added patterns.
        """
        ...

    def __call__(self, doc: Doc) -> List[tuple]:
        """Find dependency matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, matches) tuples where matches are token indices
        """
        ...

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """Add dependency patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of dependency patterns
            on_match: Optional callback function
        """
        ...

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""
        ...

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""
        ...

    def get(self, key: str) -> List[List[dict]]:
        """Get patterns for a key."""
        ...


# Token patterns are lists of dictionaries describing token attributes to
# match:
# Token-pattern reference: each pattern is a list of dicts, one per token.

# Basic patterns
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],  # "hello world"
    [{"POS": "NOUN", "OP": "+"}],  # One or more nouns
    [{"LIKE_EMAIL": True}],  # Email addresses
]

# Pattern operators (illustrative only: a real token dict carries a single
# "OP" key; the duplicate keys in this literal collapse to the last one)
{
    "OP": "!",  # Negation: not this token
    "OP": "?",  # Optional: zero or one
    "OP": "*",  # Kleene star: zero or more
    "OP": "+",  # Plus: one or more
}

# Attribute matching (illustrative reference of supported attributes)
{
    "ORTH": "Apple",  # Exact text match
    "LOWER": "apple",  # Lowercase match
    "LEMMA": "be",  # Lemma match
    "POS": "NOUN",  # Part-of-speech
    "TAG": "NNP",  # Fine-grained POS tag
    "DEP": "nsubj",  # Dependency relation
    "SHAPE": "Xxxx",  # Word shape
    "IS_ALPHA": True,  # Boolean flags
    "LIKE_NUM": True,  # Number-like
    "ENT_TYPE": "PERSON",  # Entity type
}

# Dependency patterns specify relationships between tokens in the parse tree:
# Dependency pattern structure
pattern = [
    {
        "RIGHT_ID": "anchor",  # Node identifier
        "RIGHT_ATTRS": {"ORTH": "loves"}  # Token attributes
    },
    {
        "LEFT_ID": "anchor",  # Reference to existing node
        "REL_OP": ">",  # Relation operator
        "RIGHT_ID": "subject",  # New node identifier
        "RIGHT_ATTRS": {"DEP": "nsubj"}  # Token attributes
    }
]

# Relation operators (illustrative only: a real pattern entry carries a
# single "REL_OP" key; duplicate keys in this literal collapse to the last)
{
    "REL_OP": ">",  # Right token is a direct child of left token
    "REL_OP": "<",  # Right token is the direct head of left token
    "REL_OP": ">>",  # Right token is a descendant of left token
    "REL_OP": "<<",  # Right token is an ancestor of left token
    "REL_OP": ".",  # Right token is immediately after left token
    "REL_OP": ";",  # Right token is immediately before left token
}

import spacy
from spacy.matcher import Matcher

# Example: basic token-pattern matching with Matcher.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add patterns
patterns = [
    [{"LOWER": "apple"}, {"LOWER": "inc"}],
    [{"ORTH": "iPhone"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("TECH_TERMS", patterns)

# Find matches
doc = nlp("Apple Inc. released the iPhone. Contact us at info@apple.com")
matches = matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Match: {span.text}")

import spacy
from spacy.matcher import Matcher

# Example: patterns combining operators ("+", "?", "*") and nested
# attribute constraints ({"IN": [...]}).
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Complex patterns with operators
patterns = [
    # One or more adjectives followed by a noun
    [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}],
    # Optional determiner, adjectives, noun
    [{"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "*"}, {"POS": "NOUN"}],
    # Currency amounts
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["dollar", "dollars", "usd", "$"]}}],
    # Negation patterns
    [{"LOWER": "not"}, {"POS": "ADV", "OP": "?"}, {"POS": "ADJ"}],
]
matcher.add("COMPLEX_PATTERNS", patterns)

doc = nlp("The big red car costs fifty dollars")
matches = matcher(doc)
for match_id, start, end in matches:
    print(f"Match: {doc[start:end].text}")

import spacy
from spacy.matcher import PhraseMatcher

# Example: exact multi-token phrase matching, case-insensitive via
# attr="LOWER".
nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create phrase patterns from strings
terms = ["machine learning", "artificial intelligence", "deep learning", "neural network"]
patterns = [nlp.make_doc(text) for text in terms]
phrase_matcher.add("AI_TERMS", patterns)

# Find phrase matches
doc = nlp("Machine learning and artificial intelligence are transforming technology.")
matches = phrase_matcher(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Found: {span.text}")

import spacy
from spacy.matcher import DependencyMatcher

# Example: extracting subject-verb-object triples from the dependency parse.
nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# Pattern: subject-verb-object relationships
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }
]
dep_matcher.add("SVO", [pattern])

doc = nlp("The company acquired the startup.")
matches = dep_matcher(doc)
for match_id, token_ids in matches:
    tokens = [doc[i] for i in token_ids]
    print(f"SVO: {' '.join([t.text for t in tokens])}")

import spacy
from spacy.matcher import Matcher

# Example: running custom code when a pattern matches, via on_match.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, id, matches):
    """Custom callback function for matches."""
    match_id, start, end = matches[0]  # First match
    span = doc[start:end]
    print(f"Callback triggered for: {span.text}")
    # Add custom processing
    # NOTE(review): writing span._.is_company presumably requires a
    # registered Span extension (Span.set_extension) — not shown here.
    span._.is_company = True

# Add pattern with callback
patterns = [[{"ORTH": "Apple"}, {"ORTH": "Inc."}]]
matcher.add("COMPANY", patterns, on_match=on_match)

doc = nlp("Apple Inc. is a technology company.")
matches = matcher(doc)

import spacy
from spacy.matcher import Matcher

# Example: batch-matching a stream of documents with Matcher.pipe.
nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [
    [{"ENT_TYPE": "PERSON"}],
    [{"ENT_TYPE": "ORG"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("ENTITIES", patterns)

# Process multiple documents
texts = [
    "John Smith works at Apple Inc.",
    "Contact jane@company.com for details.",
    "Microsoft hired Sarah Johnson."
]

# Use pipe for efficient processing; return_matches=True yields
# (doc, matches) pairs instead of bare docs.
docs = nlp.pipe(texts)
for doc in matcher.pipe(docs, return_matches=True, as_tuples=True):
    doc_obj, matches = doc
    print(f"Text: {doc_obj.text}")
    for match_id, start, end in matches:
        print(f" Match: {doc_obj[start:end].text}")

import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Example: running a token-based Matcher and a PhraseMatcher over the
# same document.
nlp = spacy.load("en_core_web_sm")

# Token-based matcher for patterns
token_matcher = Matcher(nlp.vocab)
token_patterns = [
    [{"LIKE_EMAIL": True}],
    [{"LIKE_URL": True}]
]
token_matcher.add("CONTACT_INFO", token_patterns)

# Phrase matcher for exact terms
phrase_matcher = PhraseMatcher(nlp.vocab)
companies = ["Apple Inc.", "Microsoft Corporation", "Google LLC"]
phrase_patterns = [nlp.make_doc(text) for text in companies]
phrase_matcher.add("COMPANIES", phrase_patterns)

# Process text with both matchers
doc = nlp("Contact Apple Inc. at info@apple.com or visit https://apple.com")
token_matches = token_matcher(doc)
phrase_matches = phrase_matcher(doc)

print("Token matches:")
for match_id, start, end in token_matches:
    print(f" {doc[start:end].text}")

print("Phrase matches:")
for match_id, start, end in phrase_matches:
    print(f" {doc[start:end].text}")

# Install with Tessl CLI:
npx tessl i tessl/pypi-spacy