or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

classes-types.mdcompilation-utilities.mdflags-constants.mdindex.mdpattern-matching.mdsplitting.mdsubstitution.md

classes-types.mddocs/

0

# Advanced Classes and Types

1

2

Pattern and Match objects providing compiled pattern functionality and match result access, plus Scanner for tokenization and RegexFlag enumeration for proper flag handling. These classes form the core object-oriented interface for advanced regex operations.

3

4

## Capabilities

5

6

### Pattern Class

7

8

Compiled regular expression pattern object that provides all matching methods with enhanced performance and additional functionality beyond module-level functions.

9

10

```python { .api }

11

class Pattern:

12

"""Compiled regular expression pattern object with matching methods."""

13

14

def match(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):

15

"""Try to apply pattern at start of string, returning Match object or None."""

16

17

def fullmatch(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):

18

"""Try to apply pattern against entire string, returning Match object or None."""

19

20

def search(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):

21

"""Search through string for pattern match, returning Match object or None."""

22

23

def findall(self, string, pos=None, endpos=None, overlapped=False, concurrent=None, timeout=None):

24

"""Return list of all matches in string."""

25

26

def finditer(self, string, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, timeout=None):

27

"""Return iterator over all matches in string."""

28

29

def sub(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):

30

"""Replace pattern occurrences with replacement string."""

31

32

def subf(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):

33

"""Replace pattern occurrences using format string."""

34

35

def subn(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):

36

"""Return (new_string, number_of_substitutions_made) tuple."""

37

38

def subfn(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):

39

"""Return (formatted_string, number_of_substitutions_made) tuple."""

40

41

def split(self, string, maxsplit=0, concurrent=None, timeout=None):

42

"""Split string by pattern occurrences, returning list of substrings."""

43

44

def splititer(self, string, maxsplit=0, concurrent=None, timeout=None):

45

"""Return iterator yielding split string parts."""

46

47

# Pattern properties

48

pattern: str # Original pattern string

49

flags: int # Compilation flags

50

groups: int # Number of capturing groups

51

groupindex: dict # Mapping of group names to numbers

52

```

53

54

**Usage Examples:**

55

56

```python

57

import regex

58

59

# Compile and use pattern object

60

email_pattern = regex.compile(r'\b([\w.-]+)@([\w.-]+\.\w+)\b')

61

62

# Use pattern methods

63

text = "Contact: john@example.com or admin@site.org"

64

matches = email_pattern.findall(text)

65

print(matches) # [('john', 'example.com'), ('admin', 'site.org')]

66

67

# Pattern properties

68

print(f"Pattern: {email_pattern.pattern}")

69

print(f"Groups: {email_pattern.groups}")

70

print(f"Flags: {email_pattern.flags}")

71

72

# Multiple operations on same pattern

73

def analyze_email_text(text, pattern):

74

# Count emails

75

all_emails = pattern.findall(text)

76

77

# Find first email

78

first_match = pattern.search(text)

79

80

# Replace emails with placeholder

81

anonymized = pattern.sub('[EMAIL]', text)

82

83

return {

84

'count': len(all_emails),

85

'first': first_match.group() if first_match else None,

86

'anonymized': anonymized

87

}

88

89

# Advanced pattern usage with concurrent execution

90

with open('large_file.txt') as f:
    large_text = f.read()

91

results = email_pattern.findall(large_text, concurrent=True)

92

93

# Pattern with timeout

94

try:

95

complex_pattern = regex.compile(r'(a+)+b')

96

result = complex_pattern.search('a' * 30, timeout=1.0)

97

except TimeoutError as e:

98

print(f"Pattern timed out: {e}")

99

```

100

101

### Match Class

102

103

Match object containing information about a successful pattern match, providing access to matched text, groups, and position information.

104

105

```python { .api }

106

class Match:

107

"""Match object containing match information and results."""

108

109

def group(self, *groups):

110

"""Return one or more subgroups of the match."""

111

112

def groups(self, default=None):

113

"""Return tuple of all subgroups of the match."""

114

115

def groupdict(self, default=None):

116

"""Return dictionary of all named subgroups."""

117

118

def start(self, group=0):

119

"""Return start position of substring matched by group."""

120

121

def end(self, group=0):

122

"""Return end position of substring matched by group."""

123

124

def span(self, group=0):

125

"""Return (start, end) positions of substring matched by group."""

126

127

def expand(self, template):

128

"""Return string obtained by template substitution."""

129

130

def expandf(self, format):

131

"""Return string obtained by format substitution."""

132

133

# Match properties

134

string: str # String passed to match function

135

pos: int # Start position for search

136

endpos: int # End position for search

137

lastindex: int # Index of last matched capturing group

138

lastgroup: str # Name of last matched capturing group

139

re: Pattern # Pattern object that produced this match

140

```

141

142

**Usage Examples:**

143

144

```python

145

import regex

146

147

# Basic match operations

148

pattern = regex.compile(r'(\w+)@(\w+\.\w+)')

149

match = pattern.search('Email: john@example.com is valid')

150

151

if match:

152

print(f"Full match: {match.group()}") # 'john@example.com'

153

print(f"Username: {match.group(1)}") # 'john'

154

print(f"Domain: {match.group(2)}") # 'example.com'

155

print(f"All groups: {match.groups()}") # ('john', 'example.com')

156

print(f"Match span: {match.span()}") # (7, 23)

157

158

# Named groups

159

pattern = regex.compile(r'(?P<user>\w+)@(?P<domain>\w+\.\w+)')

160

match = pattern.search('Contact: admin@site.org')

161

162

if match:

163

print(f"User: {match.group('user')}") # 'admin'

164

print(f"Domain: {match.group('domain')}") # 'site.org'

165

print(f"Group dict: {match.groupdict()}") # {'user': 'admin', 'domain': 'site.org'}

166

167

# Multiple group access

168

match = regex.search(r'(\d{4})-(\d{2})-(\d{2})', 'Date: 2023-12-25')

169

if match:

170

year, month, day = match.groups()

171

print(f"Date parts: {year}, {month}, {day}") # '2023', '12', '25'

172

173

# Individual positions

174

print(f"Year at: {match.span(1)}") # (6, 10)

175

print(f"Month at: {match.span(2)}") # (11, 13)

176

print(f"Day at: {match.span(3)}") # (14, 16)

177

178

# Template expansion

179

match = regex.search(r'(\w+)\s+(\w+)', 'John Doe')

180

if match:

181

# Traditional template

182

formatted = match.expand(r'\2, \1')

183

print(formatted) # 'Doe, John'

184

185

# Format-style template

186

formatted = match.expandf('{2}, {1}')

187

print(formatted) # 'Doe, John'

188

189

# Match object properties

190

print(f"Original string: {match.string}")

191

print(f"Search bounds: {match.pos}-{match.endpos}")

192

print(f"Last group index: {match.lastindex}")

193

print(f"Pattern object: {match.re}")

194

```

195

196

### Scanner Class

197

198

Tokenizing scanner that processes strings using a list of pattern-action pairs, providing a powerful tool for lexical analysis and text processing.

199

200

```python { .api }

201

class Scanner:

202

"""Scanner for tokenizing strings using pattern-action pairs."""

203

204

def __init__(self, lexicon, flags=0):

205

"""

206

Initialize scanner with lexicon of pattern-action pairs.

207

208

Args:

209

lexicon (list): List of (pattern, action) tuples

210

flags (int, optional): Regex flags for all patterns

211

"""

212

213

def scan(self, string):

214

"""

215

Scan string and return list of action results.

216

217

Args:

218

string (str): String to scan

219

220

Returns:

221

tuple: (results_list, remaining_string)

222

"""

223

```

224

225

**Usage Examples:**

226

227

```python

228

import regex

229

230

# Basic tokenizer

231

def make_number(scanner, token):

232

return ('NUMBER', int(token))

233

234

def make_word(scanner, token):

235

return ('WORD', token)

236

237

def make_operator(scanner, token):

238

return ('OP', token)

239

240

# Define lexicon (pattern, action) pairs

241

lexicon = [

242

(r'\d+', make_number),

243

(r'\w+', make_word),

244

(r'[+\-*/]', make_operator),

245

(r'\s+', None), # Skip whitespace

246

]

247

248

scanner = regex.Scanner(lexicon)

249

tokens, remainder = scanner.scan('age + 25 * factor')

250

print(tokens) # [('WORD', 'age'), ('OP', '+'), ('NUMBER', 25), ('OP', '*'), ('WORD', 'factor')]

251

print(f"Remainder: '{remainder}'") # Should be empty

252

253

# Advanced tokenizer with state

254

class StatefulScanner:

255

def __init__(self):

256

self.in_string = False

257

258

def string_start(self, scanner, token):

259

self.in_string = True

260

return ('STRING_START', token)

261

262

def string_content(self, scanner, token):

263

return ('STRING_CONTENT', token)

264

265

def string_end(self, scanner, token):

266

self.in_string = False

267

return ('STRING_END', token)

268

269

# HTML/XML tokenizer

270

def make_tag_open(scanner, token):

271

return ('TAG_OPEN', token)

272

273

def make_tag_close(scanner, token):

274

return ('TAG_CLOSE', token)

275

276

def make_text(scanner, token):

277

return ('TEXT', token.strip())

278

279

html_lexicon = [

280

(r'</\w+\s*>', make_tag_close),
(r'<\w+[^>]*>', make_tag_open),

281

(r'[^<]+', make_text),

282

]

283

284

html_scanner = regex.Scanner(html_lexicon)

285

tokens, remainder = html_scanner.scan('<div>Hello <span>world</span></div>')

286

print(tokens)

287

288

# Programming language tokenizer

289

def tokenize_code(code):

290

lexicon = [

291

(r'#.*$', lambda s, t: ('COMMENT', t)), # Comments

292

(r'\b(if|else|while|for|def|class)\b', lambda s, t: ('KEYWORD', t)), # Keywords

293

(r'\b[a-zA-Z_]\w*\b', lambda s, t: ('IDENTIFIER', t)), # Identifiers

294

(r'\b\d+\.\d+\b', lambda s, t: ('FLOAT', float(t))), # Float numbers

295

(r'\b\d+\b', lambda s, t: ('INTEGER', int(t))), # Integers

296

(r'[+\-*/=<>!]+', lambda s, t: ('OPERATOR', t)), # Operators

297

(r'[(){}[\];,.]', lambda s, t: ('DELIMITER', t)), # Delimiters

298

(r'"[^"]*"', lambda s, t: ('STRING', t[1:-1])), # String literals

299

(r'\s+', None), # Skip whitespace

300

]

301

302

scanner = regex.Scanner(lexicon, regex.MULTILINE)

303

tokens, remainder = scanner.scan(code)

304

305

if remainder:

306

print(f"Warning: Could not tokenize: '{remainder}'")

307

308

return tokens

309

310

# Example usage

311

code = '''

312

def hello(name):

313

# Print greeting

314

print("Hello, " + name)

315

return 42

316

'''

317

318

tokens = tokenize_code(code)

319

for token in tokens:

320

print(token)

321

```

322

323

### RegexFlag Enumeration

324

325

Enumeration of regex flags with proper flag combination support, providing a type-safe way to work with regex flags.

326

327

```python { .api }

328

class RegexFlag(enum.IntFlag):

329

"""Enumeration of regex flags with proper combination support."""

330

331

# Standard flags

332

ASCII = A = 0x80

333

IGNORECASE = I = 0x2

334

LOCALE = L = 0x4

335

MULTILINE = M = 0x8

336

DOTALL = S = 0x10

337

VERBOSE = X = 0x40

338

UNICODE = U = 0x20

339

340

# Enhanced flags

341

BESTMATCH = B = 0x1000

342

DEBUG = D = 0x200

343

ENHANCEMATCH = E = 0x8000

344

FULLCASE = F = 0x4000

345

POSIX = P = 0x10000

346

REVERSE = R = 0x400

347

TEMPLATE = T = 0x1

348

WORD = W = 0x800

349

350

# Version flags

351

VERSION0 = V0 = 0x2000

352

VERSION1 = V1 = 0x100

353

```

354

355

**Usage Examples:**

356

357

```python

358

import regex

359

from regex import RegexFlag

360

361

# Using flag enumeration

362

flags = RegexFlag.IGNORECASE | RegexFlag.MULTILINE

363

pattern = regex.compile(r'^hello.*world$', flags)

364

365

# Check flag combinations

366

combined_flags = RegexFlag.IGNORECASE | RegexFlag.DOTALL | RegexFlag.VERBOSE

367

print(f"Combined flags value: {combined_flags}")

368

369

# Test flag presence

370

if RegexFlag.IGNORECASE in combined_flags:

371

print("Case-insensitive matching enabled")

372

373

# Enhanced flags

374

fuzzy_flags = RegexFlag.BESTMATCH | RegexFlag.ENHANCEMATCH

375

pattern = regex.compile(r'(?e)(search){e<=2}', fuzzy_flags)

376

377

# Version-specific flags

378

v1_flags = RegexFlag.VERSION1 | RegexFlag.IGNORECASE | RegexFlag.FULLCASE

379

pattern = regex.compile(r'unicode', v1_flags)

380

381

# All flag names and values

382

print("Available flags:")

383

for flag in RegexFlag:

384

print(f"{flag.name}: {flag.value} (0x{flag.value:x})")

385

```

386

387

## Advanced Usage Patterns

388

389

### Pattern Object Reuse

390

391

```python

392

# Efficient pattern reuse

393

class TextProcessor:

394

def __init__(self):

395

# Pre-compile frequently used patterns

396

self.email_pattern = regex.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')

397

self.phone_pattern = regex.compile(r'\b\d{3}-\d{3}-\d{4}\b')

398

self.url_pattern = regex.compile(r'https?://[^\s]+')

399

400

def extract_contacts(self, text):

401

return {

402

'emails': self.email_pattern.findall(text),

403

'phones': self.phone_pattern.findall(text),

404

'urls': self.url_pattern.findall(text)

405

}

406

```

407

408

### Match Object Chaining

409

410

```python

411

def process_structured_data(text):

412

# Chain match operations

413

date_pattern = regex.compile(r'(\d{4})-(\d{2})-(\d{2})')

414

415

results = []

416

for match in date_pattern.finditer(text):

417

# Extract date components

418

year, month, day = match.groups()

419

420

# Use match position to get context

421

start, end = match.span()

422

context_start = max(0, start - 20)

423

context_end = min(len(text), end + 20)

424

context = text[context_start:context_end]

425

426

results.append({

427

'date': f"{year}-{month}-{day}",

428

'position': (start, end),

429

'context': context.strip()

430

})

431

432

return results

433

```

434

435

### Scanner State Management

436

437

```python

438

class AdvancedScanner:

439

def __init__(self):

440

self.context_stack = []

441

self.current_context = 'normal'

442

443

def enter_context(self, scanner, token):

444

self.context_stack.append(self.current_context)

445

self.current_context = 'special'

446

return ('CONTEXT_ENTER', token)

447

448

def exit_context(self, scanner, token):

449

if self.context_stack:

450

self.current_context = self.context_stack.pop()

451

return ('CONTEXT_EXIT', token)

452

453

def process_token(self, scanner, token):

454

return (f'{self.current_context.upper()}_TOKEN', token)

455

```