# Pattern Matching

Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text. spaCy provides three different matchers optimized for different use cases.

## Capabilities

### Token Pattern Matching

Rule-based matching system that finds sequences of tokens based on their linguistic attributes. Supports complex patterns with wildcards, operators, and constraints.

```python { .api }
class Matcher:
    """Rule-based token pattern matcher."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the Matcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of token patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str, default=None) -> List[List[dict]]:
        """Get patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Phrase Matching

Efficient exact-phrase matching using bloom filters and hash-based lookups. Optimized for matching large lists of multi-token phrases.

```python { .api }
class PhraseMatcher:
    """Efficient phrase matching for exact multi-token phrases."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, attr: str = "ORTH",
                 validate: bool = False) -> None:
        """Initialize the PhraseMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find phrase matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of phrase patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, docs: List[Doc],
            on_match: callable = None) -> None:
        """
        Add phrase patterns to the matcher.

        Args:
            key: String ID for the phrases
            docs: List of Doc objects representing phrases
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove phrases by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a phrase key."""

    def get(self, key: str, default=None) -> List[Doc]:
        """Get phrase patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Dependency Pattern Matching

Advanced pattern matching based on syntactic dependency relationships between tokens. Useful for extracting complex grammatical constructions.

```python { .api }
class DependencyMatcher:
    """Pattern matching based on dependency parse trees."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the DependencyMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find dependency matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, matches) tuples where matches are token indices
        """

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add dependency patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of dependency patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str) -> List[List[dict]]:
        """Get patterns for a key."""
```

## Pattern Specifications

### Token Pattern Format

Token patterns are lists of dictionaries describing token attributes to match:

```python
# Basic patterns
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],  # "hello world"
    [{"POS": "NOUN", "OP": "+"}],              # One or more nouns
    [{"LIKE_EMAIL": True}],                    # Email addresses
]

# Pattern operators
{
    "OP": "!",  # Negation: not this token
    "OP": "?",  # Optional: zero or one
    "OP": "*",  # Kleene star: zero or more
    "OP": "+",  # Plus: one or more
}

# Attribute matching
{
    "ORTH": "Apple",        # Exact text match
    "LOWER": "apple",       # Lowercase match
    "LEMMA": "be",          # Lemma match
    "POS": "NOUN",          # Part-of-speech
    "TAG": "NNP",           # Fine-grained POS tag
    "DEP": "nsubj",         # Dependency relation
    "SHAPE": "Xxxx",        # Word shape
    "IS_ALPHA": True,       # Boolean flags
    "LIKE_NUM": True,       # Number-like
    "ENT_TYPE": "PERSON",   # Entity type
}
```

### Dependency Pattern Format

Dependency patterns specify relationships between tokens in the parse tree:

```python
# Dependency pattern structure
pattern = [
    {
        "RIGHT_ID": "anchor",             # Node identifier
        "RIGHT_ATTRS": {"ORTH": "loves"}  # Token attributes
    },
    {
        "LEFT_ID": "anchor",              # Reference to existing node
        "REL_OP": ">",                    # Relation operator
        "RIGHT_ID": "subject",            # New node identifier
        "RIGHT_ATTRS": {"DEP": "nsubj"}   # Token attributes
    }
]

# Relation operators
{
    "REL_OP": ">",   # Right token is a direct child of left token
    "REL_OP": "<",   # Right token is the direct head of left token
    "REL_OP": ">>",  # Right token is a descendant of left token
    "REL_OP": "<<",  # Right token is an ancestor of left token
    "REL_OP": ".",   # Right token is immediately after left token
    "REL_OP": ";",   # Right token is immediately before left token
}
```

## Usage Examples

### Basic Token Matching

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add patterns
patterns = [
    [{"LOWER": "apple"}, {"LOWER": "inc"}],
    [{"ORTH": "iPhone"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("TECH_TERMS", patterns)

# Find matches
doc = nlp("Apple Inc. released the iPhone. Contact us at info@apple.com")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Match: {span.text}")
```

### Advanced Token Patterns

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Complex patterns with operators
patterns = [
    # One or more adjectives followed by a noun
    [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}],

    # Optional determiner, adjectives, noun
    [{"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "*"}, {"POS": "NOUN"}],

    # Currency amounts
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["dollar", "dollars", "usd", "$"]}}],

    # Negation patterns
    [{"LOWER": "not"}, {"POS": "ADV", "OP": "?"}, {"POS": "ADJ"}],
]

matcher.add("COMPLEX_PATTERNS", patterns)

doc = nlp("The big red car costs fifty dollars")
matches = matcher(doc)

for match_id, start, end in matches:
    print(f"Match: {doc[start:end].text}")
```

### Phrase Matching

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create phrase patterns from strings
terms = ["machine learning", "artificial intelligence", "deep learning", "neural network"]
patterns = [nlp.make_doc(text) for text in terms]
phrase_matcher.add("AI_TERMS", patterns)

# Find phrase matches
doc = nlp("Machine learning and artificial intelligence are transforming technology.")
matches = phrase_matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Found: {span.text}")
```

### Dependency Matching

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# Pattern: subject-verb-object relationships
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }
]

dep_matcher.add("SVO", [pattern])

doc = nlp("The company acquired the startup.")
matches = dep_matcher(doc)

for match_id, token_ids in matches:
    tokens = [doc[i] for i in token_ids]
    print(f"SVO: {' '.join([t.text for t in tokens])}")
```

### Custom Match Callbacks

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Custom attributes must be registered before a callback can set them;
# without this, `span._.is_company = True` raises an AttributeError.
Span.set_extension("is_company", default=False)

def on_match(matcher, doc, i, matches):
    """Custom callback function for matches."""
    match_id, start, end = matches[i]  # The match that triggered this call
    span = doc[start:end]
    print(f"Callback triggered for: {span.text}")

    # Add custom processing
    span._.is_company = True

# Add pattern with callback
patterns = [[{"ORTH": "Apple"}, {"ORTH": "Inc."}]]
matcher.add("COMPANY", patterns, on_match=on_match)

doc = nlp("Apple Inc. is a technology company.")
matches = matcher(doc)
```

### Batch Processing with Matchers

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [
    [{"ENT_TYPE": "PERSON"}],
    [{"ENT_TYPE": "ORG"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("ENTITIES", patterns)

# Process multiple documents
texts = [
    "John Smith works at Apple Inc.",
    "Contact jane@company.com for details.",
    "Microsoft hired Sarah Johnson."
]

# Use pipe for efficient processing. With return_matches=True the
# matcher yields (doc, matches) pairs. (as_tuples is only for streams
# of (doc, context) tuples, which this plain Doc stream is not.)
docs = nlp.pipe(texts)
for doc_obj, matches in matcher.pipe(docs, return_matches=True):
    print(f"Text: {doc_obj.text}")
    for match_id, start, end in matches:
        print(f"  Match: {doc_obj[start:end].text}")
```

### Combining Multiple Matchers

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.load("en_core_web_sm")

# Token-based matcher for patterns
token_matcher = Matcher(nlp.vocab)
token_patterns = [
    [{"LIKE_EMAIL": True}],
    [{"LIKE_URL": True}]
]
token_matcher.add("CONTACT_INFO", token_patterns)

# Phrase matcher for exact terms
phrase_matcher = PhraseMatcher(nlp.vocab)
companies = ["Apple Inc.", "Microsoft Corporation", "Google LLC"]
phrase_patterns = [nlp.make_doc(text) for text in companies]
phrase_matcher.add("COMPANIES", phrase_patterns)

# Process text with both matchers
doc = nlp("Contact Apple Inc. at info@apple.com or visit https://apple.com")

token_matches = token_matcher(doc)
phrase_matches = phrase_matcher(doc)

print("Token matches:")
for match_id, start, end in token_matches:
    print(f"  {doc[start:end].text}")

print("Phrase matches:")
for match_id, start, end in phrase_matches:
    print(f"  {doc[start:end].text}")
```