
# Pattern Matching

Powerful pattern matching systems for finding and extracting specific linguistic patterns, phrases, and dependency structures from text. spaCy provides three different matchers optimized for different use cases.

## Capabilities

### Token Pattern Matching

Rule-based matching system that finds sequences of tokens based on their linguistic attributes. Supports complex patterns with wildcards, operators, and constraints.

```python { .api }
class Matcher:
    """Rule-based token pattern matcher."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the Matcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of token patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str, default=None) -> List[List[dict]]:
        """Get patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Phrase Matching

Efficient exact-phrase matching using bloom filters and hash-based lookups. Optimized for matching large lists of multi-token phrases.

```python { .api }
class PhraseMatcher:
    """Efficient phrase matching for exact multi-token phrases."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, attr: str = "ORTH",
                 validate: bool = False) -> None:
        """Initialize the PhraseMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find phrase matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, start, end) tuples
        """

    def __len__(self) -> int:
        """Number of phrase patterns in the matcher."""

    def __contains__(self, key: str) -> bool:
        """Check if key exists in matcher."""

    def add(self, key: str, docs: List[Doc],
            on_match: callable = None) -> None:
        """
        Add phrase patterns to the matcher.

        Args:
            key: String ID for the phrases
            docs: List of Doc objects representing phrases
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove phrases by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a phrase key."""

    def get(self, key: str, default=None) -> List[Doc]:
        """Get phrase patterns for a key."""

    def pipe(self, stream: Iterable[Doc],
             batch_size: int = 1000,
             return_matches: bool = False,
             as_tuples: bool = False) -> Iterator:
        """Process multiple documents."""
```

### Dependency Pattern Matching

Advanced pattern matching based on syntactic dependency relationships between tokens. Useful for extracting complex grammatical constructions.

```python { .api }
class DependencyMatcher:
    """Pattern matching based on dependency parse trees."""

    vocab: Vocab

    def __init__(self, vocab: Vocab, validate: bool = False) -> None:
        """Initialize the DependencyMatcher."""

    def __call__(self, doc: Doc) -> List[tuple]:
        """
        Find dependency matches in a Doc object.

        Args:
            doc: The Doc object to search

        Returns:
            List of (match_id, matches) tuples where matches are token indices
        """

    def add(self, key: str, patterns: List[List[dict]],
            on_match: callable = None) -> None:
        """
        Add dependency patterns to the matcher.

        Args:
            key: String ID for the pattern
            patterns: List of dependency patterns
            on_match: Optional callback function
        """

    def remove(self, key: str) -> None:
        """Remove a pattern by key."""

    def has_key(self, key: str) -> bool:
        """Check if matcher has a pattern key."""

    def get(self, key: str) -> List[List[dict]]:
        """Get patterns for a key."""
```

## Pattern Specifications

### Token Pattern Format

Token patterns are lists of dictionaries describing token attributes to match:

```python
# Basic patterns
patterns = [
    [{"LOWER": "hello"}, {"LOWER": "world"}],  # "hello world"
    [{"POS": "NOUN", "OP": "+"}],              # One or more nouns
    [{"LIKE_EMAIL": True}],                    # Email addresses
]

# Pattern operators
{
    "OP": "!",  # Negation: not this token
    "OP": "?",  # Optional: zero or one
    "OP": "*",  # Kleene star: zero or more
    "OP": "+",  # Plus: one or more
}

# Attribute matching
{
    "ORTH": "Apple",       # Exact text match
    "LOWER": "apple",      # Lowercase match
    "LEMMA": "be",         # Lemma match
    "POS": "NOUN",         # Part-of-speech
    "TAG": "NNP",          # Fine-grained POS tag
    "DEP": "nsubj",        # Dependency relation
    "SHAPE": "Xxxx",       # Word shape
    "IS_ALPHA": True,      # Boolean flags
    "LIKE_NUM": True,      # Number-like
    "ENT_TYPE": "PERSON",  # Entity type
}
```

### Dependency Pattern Format

Dependency patterns specify relationships between tokens in the parse tree:

```python
# Dependency pattern structure
pattern = [
    {
        "RIGHT_ID": "anchor",             # Node identifier
        "RIGHT_ATTRS": {"ORTH": "loves"}  # Token attributes
    },
    {
        "LEFT_ID": "anchor",              # Reference to existing node
        "REL_OP": ">",                    # Relation operator
        "RIGHT_ID": "subject",            # New node identifier
        "RIGHT_ATTRS": {"DEP": "nsubj"}   # Token attributes
    }
]

# Relation operators
{
    "REL_OP": ">",   # Right token is a direct child of left token
    "REL_OP": "<",   # Right token is the direct head of left token
    "REL_OP": ">>",  # Right token is a descendant of left token
    "REL_OP": "<<",  # Right token is an ancestor of left token
    "REL_OP": ".",   # Right token is immediately after left token
    "REL_OP": ";",   # Right token is immediately before left token
}
```

## Usage Examples

### Basic Token Matching

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Add patterns
patterns = [
    [{"LOWER": "apple"}, {"LOWER": "inc"}],
    [{"ORTH": "iPhone"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("TECH_TERMS", patterns)

# Find matches
doc = nlp("Apple Inc. released the iPhone. Contact us at info@apple.com")
matches = matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Match: {span.text}")
```

### Advanced Token Patterns

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

# Complex patterns with operators
patterns = [
    # One or more adjectives followed by a noun
    [{"POS": "ADJ", "OP": "+"}, {"POS": "NOUN"}],

    # Optional determiner, adjectives, noun
    [{"POS": "DET", "OP": "?"}, {"POS": "ADJ", "OP": "*"}, {"POS": "NOUN"}],

    # Currency amounts
    [{"LIKE_NUM": True}, {"LOWER": {"IN": ["dollar", "dollars", "usd", "$"]}}],

    # Negation patterns
    [{"LOWER": "not"}, {"POS": "ADV", "OP": "?"}, {"POS": "ADJ"}],
]

matcher.add("COMPLEX_PATTERNS", patterns)

doc = nlp("The big red car costs fifty dollars")
matches = matcher(doc)

for match_id, start, end in matches:
    print(f"Match: {doc[start:end].text}")
```

### Phrase Matching

```python
import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# Create phrase patterns from strings
terms = ["machine learning", "artificial intelligence", "deep learning", "neural network"]
patterns = [nlp.make_doc(text) for text in terms]
phrase_matcher.add("AI_TERMS", patterns)

# Find phrase matches
doc = nlp("Machine learning and artificial intelligence are transforming technology.")
matches = phrase_matcher(doc)

for match_id, start, end in matches:
    span = doc[start:end]
    print(f"Found: {span.text}")
```

### Dependency Matching

```python
import spacy
from spacy.matcher import DependencyMatcher

nlp = spacy.load("en_core_web_sm")
dep_matcher = DependencyMatcher(nlp.vocab)

# Pattern: subject-verb-object relationships
pattern = [
    {
        "RIGHT_ID": "verb",
        "RIGHT_ATTRS": {"POS": "VERB"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "subject",
        "RIGHT_ATTRS": {"DEP": "nsubj"}
    },
    {
        "LEFT_ID": "verb",
        "REL_OP": ">",
        "RIGHT_ID": "object",
        "RIGHT_ATTRS": {"DEP": "dobj"}
    }
]

dep_matcher.add("SVO", [pattern])

doc = nlp("The company acquired the startup.")
matches = dep_matcher(doc)

for match_id, token_ids in matches:
    tokens = [doc[i] for i in token_ids]
    print(f"SVO: {' '.join([t.text for t in tokens])}")
```

### Custom Match Callbacks

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, id, matches):
    """Custom callback function for matches."""
    match_id, start, end = matches[0]  # First match
    span = doc[start:end]
    print(f"Callback triggered for: {span.text}")

    # Add custom processing
    span._.is_company = True

# Add pattern with callback
patterns = [[{"ORTH": "Apple"}, {"ORTH": "Inc."}]]
matcher.add("COMPANY", patterns, on_match=on_match)

doc = nlp("Apple Inc. is a technology company.")
matches = matcher(doc)
```

### Batch Processing with Matchers

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [
    [{"ENT_TYPE": "PERSON"}],
    [{"ENT_TYPE": "ORG"}],
    [{"LIKE_EMAIL": True}]
]
matcher.add("ENTITIES", patterns)

# Process multiple documents
texts = [
    "John Smith works at Apple Inc.",
    "Contact jane@company.com for details.",
    "Microsoft hired Sarah Johnson."
]

# Use pipe for efficient processing
docs = nlp.pipe(texts)
for doc in matcher.pipe(docs, return_matches=True, as_tuples=True):
    doc_obj, matches = doc
    print(f"Text: {doc_obj.text}")
    for match_id, start, end in matches:
        print(f"  Match: {doc_obj[start:end].text}")
```

### Combining Multiple Matchers

```python
import spacy
from spacy.matcher import Matcher, PhraseMatcher

nlp = spacy.load("en_core_web_sm")

# Token-based matcher for patterns
token_matcher = Matcher(nlp.vocab)
token_patterns = [
    [{"LIKE_EMAIL": True}],
    [{"LIKE_URL": True}]
]
token_matcher.add("CONTACT_INFO", token_patterns)

# Phrase matcher for exact terms
phrase_matcher = PhraseMatcher(nlp.vocab)
companies = ["Apple Inc.", "Microsoft Corporation", "Google LLC"]
phrase_patterns = [nlp.make_doc(text) for text in companies]
phrase_matcher.add("COMPANIES", phrase_patterns)

# Process text with both matchers
doc = nlp("Contact Apple Inc. at info@apple.com or visit https://apple.com")

token_matches = token_matcher(doc)
phrase_matches = phrase_matcher(doc)

print("Token matches:")
for match_id, start, end in token_matches:
    print(f"  {doc[start:end].text}")

print("Phrase matches:")
for match_id, start, end in phrase_matches:
    print(f"  {doc[start:end].text}")
```