or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md core-matching.md index.md options-configuration.md pattern-compilation.md text-processing.md

docs/advanced-features.md

0

# Advanced Features

1

2

Specialized functionality for high-performance scenarios, including pattern sets for matching multiple patterns simultaneously and filtered matching for optimized multi-pattern operations. These features are designed for applications that need to match text against many patterns efficiently.

3

4

## Capabilities

5

6

### Pattern Sets

7

8

Pattern sets allow efficient matching of text against multiple regular expressions simultaneously, returning which patterns matched.

9

10

```python { .api }

11

class Set:

12

"""Collection of patterns that can be matched simultaneously."""

13

14

def __init__(self, anchor, options=None):

15

"""

16

Create a new pattern set.

17

18

Args:

19

anchor: Anchoring mode for matches

20

options (Options, optional): Compilation options

21

"""

22

23

def Add(self, pattern):

24

"""

25

Add a pattern to the set.

26

27

Args:

28

pattern (str): Regular expression pattern to add

29

30

Returns:

31

int: Pattern index in the set

32

33

Raises:

34

error: If pattern is invalid or set is already compiled

35

"""

36

37

def Compile(self):

38

"""

39

Compile all patterns in the set for matching.

40

41

Returns:

42

bool: True if compilation successful

43

44

Raises:

45

error: If compilation fails

46

"""

47

48

def Match(self, text):

49

"""

50

Match text against all patterns in the set.

51

52

Args:

53

text (str): Text to match against patterns

54

55

Returns:

56

list: List of pattern indices that matched

57

"""

58

59

@classmethod

60

def SearchSet(options=None):

61

"""

62

Create a set for searching (unanchored matching).

63

64

Args:

65

options (Options, optional): Compilation options

66

67

Returns:

68

Set: New set configured for searching

69

"""

70

71

@classmethod

72

def MatchSet(options=None):

73

"""

74

Create a set for matching at start of text.

75

76

Args:

77

options (Options, optional): Compilation options

78

79

Returns:

80

Set: New set configured for start matching

81

"""

82

83

@classmethod

84

def FullMatchSet(options=None):

85

"""

86

Create a set for full text matching.

87

88

Args:

89

options (Options, optional): Compilation options

90

91

Returns:

92

Set: New set configured for full matching

93

"""

94

```

95

96

Example usage:

97

98

```python

99

import re2

100

101

# Create a search set

102

pattern_set = re2.Set.SearchSet()

103

104

# Add multiple patterns

105

email_idx = pattern_set.Add(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b')

106

phone_idx = pattern_set.Add(r'\b\d{3}-\d{3}-\d{4}\b')

107

url_idx = pattern_set.Add(r'https?://[^\s]+')

108

109

# Compile the set

110

pattern_set.Compile()

111

112

# Match against text

113

text = "Contact: john@example.com or call 555-123-4567"

114

matches = pattern_set.Match(text)

115

116

if email_idx in matches:

117

print("Found email address")

118

if phone_idx in matches:

119

print("Found phone number")

120

if url_idx in matches:

121

print("Found URL")

122

```

123

124

### Filtered Matching

125

126

Filtered matching provides optimized multi-pattern matching with prefiltering for high-performance scenarios.

127

128

```python { .api }

129

class Filter:

130

"""Optimized multi-pattern matcher with prefiltering."""

131

132

def __init__(self):

133

"""Create a new filtered matcher."""

134

135

def Add(self, pattern, options=None):

136

"""

137

Add a pattern to the filter.

138

139

Args:

140

pattern (str): Regular expression pattern

141

options (Options, optional): Compilation options for this pattern

142

143

Returns:

144

int: Pattern index in the filter

145

146

Raises:

147

error: If pattern is invalid

148

"""

149

150

def Compile(self):

151

"""

152

Compile all patterns for filtered matching.

153

154

This prepares the filter for high-performance matching

155

by analyzing patterns and building prefilter structures.

156

157

Returns:

158

bool: True if compilation successful

159

"""

160

161

def Match(self, text, potential=False):

162

"""

163

Match text against all patterns.

164

165

Args:

166

text (str): Text to match

167

potential (bool): If True, return potential matches for two-phase matching

168

169

Returns:

170

list: List of pattern indices that matched

171

"""

172

173

def re(self, index):

174

"""

175

Get the compiled RE2 object for a specific pattern.

176

177

Args:

178

index (int): Pattern index

179

180

Returns:

181

_Regexp: Compiled pattern object

182

"""

183

```

184

185

Example usage:

186

187

```python

188

import re2

189

190

# Create filtered matcher

191

filter_matcher = re2.Filter()

192

193

# Add patterns for different data types

194

patterns = [

195

r'\b\d{3}-\d{2}-\d{4}\b', # SSN

196

r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # Email

197

r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', # Credit card

198

r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b', # IP address

199

]

200

201

pattern_indices = []

202

for pattern in patterns:

203

idx = filter_matcher.Add(pattern)

204

pattern_indices.append(idx)

205

206

# Compile for optimized matching

207

filter_matcher.Compile()

208

209

# Match large text efficiently

210

large_text = """

211

John's email is john@example.com and his SSN is 123-45-6789.

212

The server IP is 192.168.1.100 and payment was made with card 1234-5678-9012-3456.

213

"""

214

215

matches = filter_matcher.Match(large_text)

216

data_types = ['SSN', 'Email', 'Credit Card', 'IP Address']

217

218

for i, match_idx in enumerate(matches):

219

if match_idx in pattern_indices:

220

idx_pos = pattern_indices.index(match_idx)

221

print(f"Found {data_types[idx_pos]}")

222

223

# Get specific pattern for detailed matching

224

specific_pattern = filter_matcher.re(match_idx)

225

match_obj = specific_pattern.search(large_text)

226

if match_obj:

227

print(f" Value: {match_obj.group()}")

228

```

229

230

### Two-Phase Matching

231

232

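For extremely high-performance scenarios, use two-phase matching: first call `Match` with `potential=True` to get the candidate patterns from the fast prefilter, then confirm each candidate by searching with its compiled pattern: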

233

234

```python

235

import re2

236

237

# Set up filter for two-phase matching

238

filter_matcher = re2.Filter()

239

240

# Add many patterns

241

sensitive_patterns = [

242

r'\b\d{3}-\d{2}-\d{4}\b', # SSN

243

r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', # Credit card

244

r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', # Email

245

# ... many more patterns

246

]

247

248

for pattern in sensitive_patterns:

249

filter_matcher.Add(pattern)

250

251

filter_matcher.Compile()

252

253

def scan_text_efficiently(text):

254

# Phase 1: Fast prefiltering to get potential matches

255

potential_matches = filter_matcher.Match(text, potential=True)

256

257

if not potential_matches:

258

return [] # No potential matches, skip phase 2

259

260

# Phase 2: Detailed matching only for potential patterns

261

actual_matches = []

262

for pattern_idx in potential_matches:

263

pattern = filter_matcher.re(pattern_idx)

264

if pattern.search(text):

265

actual_matches.append(pattern_idx)

266

267

return actual_matches

268

269

# Scan large volumes of text efficiently

270

texts_to_scan = [

271

"Document 1 with email@example.com",

272

"Document 2 with SSN 123-45-6789",

273

"Document 3 with no sensitive data",

274

# ... thousands of documents

275

]

276

277

for i, text in enumerate(texts_to_scan):

278

matches = scan_text_efficiently(text)

279

if matches:

280

print(f"Document {i+1} contains sensitive data (patterns: {matches})")

281

```

282

283

### Anchor Modes for Sets

284

285

```python

286

import re2

287

288

# Different anchor modes for pattern sets

289

text = "email@example.com is my address"

290

291

# Search set (unanchored) - finds patterns anywhere

292

search_set = re2.Set.SearchSet()

293

search_set.Add(r'email@\w+\.com')

294

search_set.Compile()

295

matches = search_set.Match(text) # Will find the email

296

297

# Match set (anchored at start) - requires pattern at beginning

298

match_set = re2.Set.MatchSet()

299

match_set.Add(r'email@\w+\.com')

300

match_set.Compile()

301

matches = match_set.Match(text) # Will find the email (it's at start)

302

303

# Full match set - requires pattern to match entire text

304

full_set = re2.Set.FullMatchSet()

305

full_set.Add(r'email@\w+\.com is my address')

306

full_set.Compile()

307

matches = full_set.Match(text) # Will match (pattern matches entire text)

308

```

309

310

### Performance Considerations

311

312

```python

313

import re2

314

315

# For maximum performance with many patterns:

316

317

# 1. Use Filter for better prefiltering

318

filter_matcher = re2.Filter()

319

320

# 2. Use performance-optimized options

321

options = re2.Options()

322

options.never_capture = True # Disable capturing if not needed

323

options.never_nl = True # Optimize newline handling

324

options.max_mem = 67108864 # Increase memory limit if needed

325

326

# 3. Add patterns with optimized options

327

for pattern in large_pattern_list:

328

filter_matcher.Add(pattern, options)

329

330

filter_matcher.Compile()

331

332

# 4. Use two-phase matching for large texts

333

def efficient_scan(text):

334

potentials = filter_matcher.Match(text, potential=True)

335

if not potentials:

336

return []

337

338

# Only do expensive full matching on potential matches

339

return [idx for idx in potentials

340

if filter_matcher.re(idx).search(text)]

341

```

342

343

### Error Handling for Advanced Features

344

345

```python

346

import re2

347

348

# Handle compilation errors

349

try:

350

pattern_set = re2.Set.SearchSet()

351

pattern_set.Add(r'[invalid') # Invalid pattern

352

pattern_set.Compile()

353

except re2.error as e:

354

print(f"Set compilation failed: {e}")

355

356

# Handle filter errors

357

try:

358

filter_matcher = re2.Filter()

359

filter_matcher.Add(r'(?P<invalid') # Invalid named group

360

filter_matcher.Compile()

361

except re2.error as e:

362

print(f"Filter compilation failed: {e}")

363

364

# Safe pattern addition

365

def safe_add_to_set(pattern_set, pattern):

366

try:

367

return pattern_set.Add(pattern)

368

except re2.error:

369

print(f"Skipping invalid pattern: {pattern}")

370

return None

371

```