# Advanced Classes and Types

Pattern and Match objects providing compiled pattern functionality and match result access, plus Scanner for tokenization and RegexFlag enumeration for proper flag handling. These classes form the core object-oriented interface for advanced regex operations.

## Capabilities

### Pattern Class

Compiled regular expression pattern object that provides all matching methods with enhanced performance and additional functionality beyond module-level functions.

```python { .api }
class Pattern:
    """Compiled regular expression pattern object with matching methods."""

    def match(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern at start of string, returning Match object or None."""

    def fullmatch(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern against entire string, returning Match object or None."""

    def search(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Search through string for pattern match, returning Match object or None."""

    def findall(self, string, pos=None, endpos=None, overlapped=False, concurrent=None, timeout=None):
        """Return list of all matches in string."""

    def finditer(self, string, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, timeout=None):
        """Return iterator over all matches in string."""

    def sub(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences with replacement string."""

    def subf(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences using format string."""

    def subn(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (new_string, number_of_substitutions_made) tuple."""

    def subfn(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (formatted_string, number_of_substitutions_made) tuple."""

    def split(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Split string by pattern occurrences, returning list of substrings."""

    def splititer(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Return iterator yielding split string parts."""

    # Pattern properties
    pattern: str      # Original pattern string
    flags: int        # Compilation flags
    groups: int       # Number of capturing groups
    groupindex: dict  # Mapping of group names to numbers
```

**Usage Examples:**

```python
import regex

# Compile and use pattern object
email_pattern = regex.compile(r'\b([\w.-]+)@([\w.-]+\.\w+)\b')

# Use pattern methods
text = "Contact: john@example.com or admin@site.org"
matches = email_pattern.findall(text)
print(matches)  # [('john', 'example.com'), ('admin', 'site.org')]

# Pattern properties
print(f"Pattern: {email_pattern.pattern}")
print(f"Groups: {email_pattern.groups}")
print(f"Flags: {email_pattern.flags}")

# Multiple operations on same pattern
def analyze_email_text(text, pattern):
    # Count emails
    all_emails = pattern.findall(text)

    # Find first email
    first_match = pattern.search(text)

    # Replace emails with placeholder
    anonymized = pattern.sub('[EMAIL]', text)

    return {
        'count': len(all_emails),
        'first': first_match.group() if first_match else None,
        'anonymized': anonymized
    }

# Advanced pattern usage with concurrent execution
large_text = open('large_file.txt').read()
results = email_pattern.findall(large_text, concurrent=True)

# Pattern with timeout: regex raises TimeoutError when the limit is exceeded
try:
    complex_pattern = regex.compile(r'(a+)+b')
    result = complex_pattern.search('a' * 30, timeout=1.0)
except TimeoutError as e:
    print(f"Pattern timed out: {e}")
```

### Match Class

Match object containing information about a successful pattern match, providing access to matched text, groups, and position information.

```python { .api }
class Match:
    """Match object containing match information and results."""

    def group(self, *groups):
        """Return one or more subgroups of the match."""

    def groups(self, default=None):
        """Return tuple of all subgroups of the match."""

    def groupdict(self, default=None):
        """Return dictionary of all named subgroups."""

    def start(self, group=0):
        """Return start position of substring matched by group."""

    def end(self, group=0):
        """Return end position of substring matched by group."""

    def span(self, group=0):
        """Return (start, end) positions of substring matched by group."""

    def expand(self, template):
        """Return string obtained by template substitution."""

    def expandf(self, format):
        """Return string obtained by format substitution."""

    # Match properties
    string: str     # String passed to match function
    pos: int        # Start position for search
    endpos: int     # End position for search
    lastindex: int  # Index of last matched capturing group
    lastgroup: str  # Name of last matched capturing group
    re: Pattern     # Pattern object that produced this match
```

**Usage Examples:**

```python
import regex

# Basic match operations
pattern = regex.compile(r'(\w+)@(\w+\.\w+)')
match = pattern.search('Email: john@example.com is valid')

if match:
    print(f"Full match: {match.group()}")   # 'john@example.com'
    print(f"Username: {match.group(1)}")    # 'john'
    print(f"Domain: {match.group(2)}")      # 'example.com'
    print(f"All groups: {match.groups()}")  # ('john', 'example.com')
    print(f"Match span: {match.span()}")    # (7, 23)

# Named groups
pattern = regex.compile(r'(?P<user>\w+)@(?P<domain>\w+\.\w+)')
match = pattern.search('Contact: admin@site.org')

if match:
    print(f"User: {match.group('user')}")      # 'admin'
    print(f"Domain: {match.group('domain')}")  # 'site.org'
    print(f"Group dict: {match.groupdict()}")  # {'user': 'admin', 'domain': 'site.org'}

# Multiple group access
match = regex.search(r'(\d{4})-(\d{2})-(\d{2})', 'Date: 2023-12-25')
if match:
    year, month, day = match.groups()
    print(f"Date parts: {year}, {month}, {day}")  # '2023', '12', '25'

    # Individual positions
    print(f"Year at: {match.span(1)}")   # (6, 10)
    print(f"Month at: {match.span(2)}")  # (11, 13)
    print(f"Day at: {match.span(3)}")    # (14, 16)

# Template expansion
match = regex.search(r'(\w+)\s+(\w+)', 'John Doe')
if match:
    # Traditional template
    formatted = match.expand(r'\2, \1')
    print(formatted)  # 'Doe, John'

    # Format-style template (field 0 is the whole match, fields 1+ are groups)
    formatted = match.expandf('{2}, {1}')
    print(formatted)  # 'Doe, John'

    # Match object properties
    print(f"Original string: {match.string}")
    print(f"Search bounds: {match.pos}-{match.endpos}")
    print(f"Last group index: {match.lastindex}")
    print(f"Pattern object: {match.re}")
```

### Scanner Class

Tokenizing scanner that processes strings using a list of pattern-action pairs, providing a powerful tool for lexical analysis and text processing.

```python { .api }
class Scanner:
    """Scanner for tokenizing strings using pattern-action pairs."""

    def __init__(self, lexicon, flags=0):
        """
        Initialize scanner with lexicon of pattern-action pairs.

        Args:
            lexicon (list): List of (pattern, action) tuples
            flags (int, optional): Regex flags for all patterns
        """

    def scan(self, string):
        """
        Scan string and return list of action results.

        Args:
            string (str): String to scan

        Returns:
            tuple: (results_list, remaining_string)
        """
```

**Usage Examples:**

```python
import regex

# Basic tokenizer
def make_number(scanner, token):
    return ('NUMBER', int(token))

def make_word(scanner, token):
    return ('WORD', token)

def make_operator(scanner, token):
    return ('OP', token)

# Define lexicon (pattern, action) pairs
lexicon = [
    (r'\d+', make_number),
    (r'\w+', make_word),
    (r'[+\-*/]', make_operator),
    (r'\s+', None),  # Skip whitespace
]

scanner = regex.Scanner(lexicon)
tokens, remainder = scanner.scan('age + 25 * factor')
print(tokens)  # [('WORD', 'age'), ('OP', '+'), ('NUMBER', 25), ('OP', '*'), ('WORD', 'factor')]
print(f"Remainder: '{remainder}'")  # Should be empty

# Advanced tokenizer with state
class StatefulScanner:
    def __init__(self):
        self.in_string = False

    def string_start(self, scanner, token):
        self.in_string = True
        return ('STRING_START', token)

    def string_content(self, scanner, token):
        return ('STRING_CONTENT', token)

    def string_end(self, scanner, token):
        self.in_string = False
        return ('STRING_END', token)

# HTML/XML tokenizer
def make_tag_open(scanner, token):
    return ('TAG_OPEN', token)

def make_tag_close(scanner, token):
    return ('TAG_CLOSE', token)

def make_text(scanner, token):
    return ('TEXT', token.strip())

html_lexicon = [
    (r'<(/?\w+)[^>]*>', make_tag_open),
    (r'[^<]+', make_text),
]

html_scanner = regex.Scanner(html_lexicon)
tokens, remainder = html_scanner.scan('<div>Hello <span>world</span></div>')
print(tokens)

# Programming language tokenizer
def tokenize_code(code):
    lexicon = [
        (r'#.*$', lambda s, t: ('COMMENT', t)),                               # Comments
        (r'\b(if|else|while|for|def|class)\b', lambda s, t: ('KEYWORD', t)),  # Keywords
        (r'\b[a-zA-Z_]\w*\b', lambda s, t: ('IDENTIFIER', t)),                # Identifiers
        (r'\b\d+\.\d+\b', lambda s, t: ('FLOAT', float(t))),                  # Float numbers
        (r'\b\d+\b', lambda s, t: ('INTEGER', int(t))),                       # Integers
        (r'[+\-*/=<>!]+', lambda s, t: ('OPERATOR', t)),                      # Operators
        (r'[(){}[\];,.]', lambda s, t: ('DELIMITER', t)),                     # Delimiters
        (r'"[^"]*"', lambda s, t: ('STRING', t[1:-1])),                       # String literals
        (r'\s+', None),                                                       # Skip whitespace
    ]

    scanner = regex.Scanner(lexicon, regex.MULTILINE)
    tokens, remainder = scanner.scan(code)

    if remainder:
        print(f"Warning: Could not tokenize: '{remainder}'")

    return tokens

# Example usage
code = '''
def hello(name):
    # Print greeting
    print("Hello, " + name)
    return 42
'''

tokens = tokenize_code(code)
for token in tokens:
    print(token)
```

### RegexFlag Enumeration

Enumeration of regex flags with proper flag combination support, providing a type-safe way to work with regex flags.

```python { .api }
class RegexFlag(enum.IntFlag):
    """Enumeration of regex flags with proper combination support."""

    # Standard flags
    ASCII = A = 0x80
    IGNORECASE = I = 0x2
    LOCALE = L = 0x4
    MULTILINE = M = 0x8
    DOTALL = S = 0x10
    VERBOSE = X = 0x40
    UNICODE = U = 0x20

    # Enhanced flags
    BESTMATCH = B = 0x1000
    DEBUG = D = 0x200
    ENHANCEMATCH = E = 0x8000
    FULLCASE = F = 0x4000
    POSIX = P = 0x10000
    REVERSE = R = 0x400
    TEMPLATE = T = 0x1
    WORD = W = 0x800

    # Version flags
    VERSION0 = V0 = 0x2000
    VERSION1 = V1 = 0x100
```

**Usage Examples:**

```python
import regex
from regex import RegexFlag

# Using flag enumeration
flags = RegexFlag.IGNORECASE | RegexFlag.MULTILINE
pattern = regex.compile(r'^hello.*world$', flags)

# Check flag combinations
combined_flags = RegexFlag.IGNORECASE | RegexFlag.DOTALL | RegexFlag.VERBOSE
print(f"Combined flags value: {combined_flags}")

# Test flag presence
if RegexFlag.IGNORECASE in combined_flags:
    print("Case-insensitive matching enabled")

# Enhanced flags
fuzzy_flags = RegexFlag.BESTMATCH | RegexFlag.ENHANCEMATCH
pattern = regex.compile(r'(?e)(search){e<=2}', fuzzy_flags)

# Version-specific flags
v1_flags = RegexFlag.VERSION1 | RegexFlag.IGNORECASE | RegexFlag.FULLCASE
pattern = regex.compile(r'unicode', v1_flags)

# All flag names and values
print("Available flags:")
for flag in RegexFlag:
    print(f"{flag.name}: {flag.value} (0x{flag.value:x})")
```

## Advanced Usage Patterns

### Pattern Object Reuse

```python
# Efficient pattern reuse
class TextProcessor:
    def __init__(self):
        # Pre-compile frequently used patterns
        self.email_pattern = regex.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')
        self.phone_pattern = regex.compile(r'\b\d{3}-\d{3}-\d{4}\b')
        self.url_pattern = regex.compile(r'https?://[^\s]+')

    def extract_contacts(self, text):
        return {
            'emails': self.email_pattern.findall(text),
            'phones': self.phone_pattern.findall(text),
            'urls': self.url_pattern.findall(text)
        }
```

### Match Object Chaining

```python
def process_structured_data(text):
    # Chain match operations
    date_pattern = regex.compile(r'(\d{4})-(\d{2})-(\d{2})')

    results = []
    for match in date_pattern.finditer(text):
        # Extract date components
        year, month, day = match.groups()

        # Use match position to get context
        start, end = match.span()
        context_start = max(0, start - 20)
        context_end = min(len(text), end + 20)
        context = text[context_start:context_end]

        results.append({
            'date': f"{year}-{month}-{day}",
            'position': (start, end),
            'context': context.strip()
        })

    return results
```

### Scanner State Management

```python
class AdvancedScanner:
    def __init__(self):
        self.context_stack = []
        self.current_context = 'normal'

    def enter_context(self, scanner, token):
        self.context_stack.append(self.current_context)
        self.current_context = 'special'
        return ('CONTEXT_ENTER', token)

    def exit_context(self, scanner, token):
        if self.context_stack:
            self.current_context = self.context_stack.pop()
        return ('CONTEXT_EXIT', token)

    def process_token(self, scanner, token):
        return (f'{self.current_context.upper()}_TOKEN', token)
```