# Pattern Matching

Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text. spaCy provides three different matchers optimized for different use cases.

## Capabilities

### Token Pattern Matching

Rule-based matching system that finds sequences of tokens based on their linguistic attributes. Supports complex patterns with wildcards, operators, and constraints.

```python { .api }
class Matcher:
    """Rule-based token pattern matcher."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the Matcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of token patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str, default=None) -> List[List[dict]]:
        """Get patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Phrase Matching

Efficient exact-phrase matching using bloom filters and hash-based lookups. Optimized for matching large lists of multi-token phrases.

```python { .api }
class PhraseMatcher:
    """Efficient phrase matching for exact multi-token phrases."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, attr: str = "ORTH",
                 validate: bool = False) -> None:
        """Initialize the PhraseMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find phrase matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of phrase patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, docs: List[Doc],
            on_match: callable = None) -> None:
        """
        Add phrase patterns to the matcher.

        Args:
            key: String ID for the phrases
            docs: List of Doc objects representing phrases
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove phrases by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a phrase key."""

    def get(self, key: str, default=None) -> List[Doc]:
        """Get phrase patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Dependency Pattern Matching

Advanced pattern matching based on syntactic dependency relationships between tokens. Useful for extracting complex grammatical constructions.

```python { .api }
class DependencyMatcher:
    """Pattern matching based on dependency parse trees."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the DependencyMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find dependency matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, matches) tuples where matches are token indices
        """

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add dependency patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of dependency patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str) -> List[List[dict]]:
        """Get patterns for a key."""
```

## Pattern Specifications

### Token Pattern Format

Token patterns are lists of dictionaries describing token attributes to match:

```python
# Basic patterns
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],  # "hello world"
    [{"POS": "NOUN", "OP": "+"}],              # One or more nouns
    [{"LIKE_EMAIL": True}],                    # Email addresses
]

# Pattern operators
{
    "OP": "!",  # Negation: not this token
    "OP": "?",  # Optional: zero or one
    "OP": "*",  # Kleene star: zero or more
    "OP": "+",  # Plus: one or more
}

# Attribute matching
{
    "ORTH": "Apple",        # Exact text match
    "LOWER": "apple",       # Lowercase match
    "LEMMA": "be",          # Lemma match
    "POS": "NOUN",          # Part-of-speech
    "TAG": "NNP",           # Fine-grained POS tag
    "DEP": "nsubj",         # Dependency relation
    "SHAPE": "Xxxx",        # Word shape
    "IS_ALPHA": True,       # Boolean flags
    "LIKE_NUM": True,       # Number-like
    "ENT_TYPE": "PERSON",   # Entity type
}
```

### Dependency Pattern Format

Dependency patterns specify relationships between tokens in the parse tree:

```python
# Dependency pattern structure
pattern = [
    {
        "RIGHT_ID": "anchor",             # Node identifier
        "RIGHT_ATTRS": {"ORTH": "loves"}  # Token attributes
    },
    {
        "LEFT_ID": "anchor",              # Reference to existing node
        "REL_OP": ">",                    # Relation operator
        "RIGHT_ID": "subject",            # New node identifier
        "RIGHT_ATTRS": {"DEP": "nsubj"}   # Token attributes
    }
]

# Relation operators
{
    "REL_OP": ">",   # Right token is a direct child of left token
    "REL_OP": "<",   # Right token is the direct head of left token
    "REL_OP": ">>",  # Right token is a descendant of left token
    "REL_OP": "<<",  # Right token is an ancestor of left token
    "REL_OP": ".",   # Right token is immediately after left token
    "REL_OP": ";",   # Right token is immediately before left token
}
```

## Usage Examples

### Basic Token Matching

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add patterns
patterns = [
    [{"LOWER": "apple"}, {"LOWER": "inc"}],
    [{"ORTH": "iPhone"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("TECH_TERMS", patterns)

# Find matches
doc = nlp("Apple Inc. released the iPhone. Contact us at info@apple.com")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Match: {span.text}")
```

### Advanced Token Patterns

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Complex patterns with operators
patterns = [
    # One or more adjectives followed by a noun
    [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}],

    # Optional determiner, adjectives, noun
    [{"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "*"}, {"POS": "NOUN"}],

    # Currency amounts
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["dollar", "dollars", "usd", "$"]}}],

    # Negation patterns
    [{"LOWER": "not"}, {"POS": "ADV", "OP": "?"}, {"POS": "ADJ"}],
]

matcher.add("COMPLEX_PATTERNS", patterns)

doc = nlp("The big red car costs fifty dollars")
matches = matcher(doc)

for match_id, start, end in matches:
    print(f"Match: {doc[start:end].text}")
```

### Phrase Matching

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create phrase patterns from strings
terms = ["machine learning", "artificial intelligence", "deep learning", "neural network"]
patterns = [nlp.make_doc(text) for text in terms]
phrase_matcher.add("AI_TERMS", patterns)

# Find phrase matches
doc = nlp("Machine learning and artificial intelligence are transforming technology.")
matches = phrase_matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Found: {span.text}")
```

### Dependency Matching

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# Pattern: subject-verb-object relationships
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }
]

dep_matcher.add("SVO", [pattern])

doc = nlp("The company acquired the startup.")
matches = dep_matcher(doc)

for match_id, token_ids in matches:
    tokens = [doc[i] for i in token_ids]
    print(f"SVO: {' '.join([t.text for t in tokens])}")
```

### Custom Match Callbacks

```python
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Custom attributes must be registered before a callback can set them;
# without this, `span._.is_company = True` raises an AttributeError.
Span.set_extension("is_company", default=False)

def on_match(matcher, doc, i, matches):
    """Custom callback function for matches."""
    match_id, start, end = matches[i]  # The match that triggered this call
    span = doc[start:end]
    print(f"Callback triggered for: {span.text}")

    # Add custom processing
    span._.is_company = True

# Add pattern with callback
patterns = [[{"ORTH": "Apple"}, {"ORTH": "Inc."}]]
matcher.add("COMPANY", patterns, on_match=on_match)

doc = nlp("Apple Inc. is a technology company.")
matches = matcher(doc)
```

### Batch Processing with Matchers

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [
    [{"ENT_TYPE": "PERSON"}],
    [{"ENT_TYPE": "ORG"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("ENTITIES", patterns)

# Process multiple documents
texts = [
    "John Smith works at Apple Inc.",
    "Contact jane@company.com for details.",
    "Microsoft hired Sarah Johnson."
]

# Use pipe for efficient processing. With return_matches=True the
# matcher yields (doc, matches) pairs. (as_tuples is only for streams
# of (doc, context) tuples, which this plain Doc stream is not.)
docs = nlp.pipe(texts)
for doc_obj, matches in matcher.pipe(docs, return_matches=True):
    print(f"Text: {doc_obj.text}")
    for match_id, start, end in matches:
        print(f"  Match: {doc_obj[start:end].text}")
```

### Combining Multiple Matchers

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.load("en_core_web_sm")

# Token-based matcher for patterns
token_matcher = Matcher(nlp.vocab)
token_patterns = [
    [{"LIKE_EMAIL": True}],
    [{"LIKE_URL": True}]
]
token_matcher.add("CONTACT_INFO", token_patterns)

# Phrase matcher for exact terms
phrase_matcher = PhraseMatcher(nlp.vocab)
companies = ["Apple Inc.", "Microsoft Corporation", "Google LLC"]
phrase_patterns = [nlp.make_doc(text) for text in companies]
phrase_matcher.add("COMPANIES", phrase_patterns)

# Process text with both matchers
doc = nlp("Contact Apple Inc. at info@apple.com or visit https://apple.com")

token_matches = token_matcher(doc)
phrase_matches = phrase_matcher(doc)

print("Token matches:")
for match_id, start, end in token_matches:
    print(f"  {doc[start:end].text}")

print("Phrase matches:")
for match_id, start, end in phrase_matches:
    print(f"  {doc[start:end].text}")
```