Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.
—
Pattern and Match objects providing compiled pattern functionality and match result access, plus Scanner for tokenization and RegexFlag enumeration for proper flag handling. These classes form the core object-oriented interface for advanced regex operations.
Compiled regular expression pattern object that provides all matching methods with enhanced performance and additional functionality beyond module-level functions.
class Pattern:
    """Compiled regular expression pattern object with matching methods."""

    def match(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern at start of string, returning Match object or None."""

    def fullmatch(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern against entire string, returning Match object or None."""

    def search(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Search through string for pattern match, returning Match object or None."""

    def findall(self, string, pos=None, endpos=None, overlapped=False, concurrent=None, timeout=None):
        """Return list of all matches in string."""

    def finditer(self, string, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, timeout=None):
        """Return iterator over all matches in string."""

    def sub(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences with replacement string."""

    def subf(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences using format string."""

    def subn(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (new_string, number_of_substitutions_made) tuple."""

    def subfn(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (formatted_string, number_of_substitutions_made) tuple."""

    def split(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Split string by pattern occurrences, returning list of substrings."""

    def splititer(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Return iterator yielding split string parts."""

    # Pattern properties
    pattern: str  # Original pattern string
    flags: int    # Compilation flags
    groups: int   # Number of capturing groups
groupindex: dict # Mapping of group names to numbers

Usage Examples:
import regex

# Compile and use pattern object
email_pattern = regex.compile(r'\b([\w.-]+)@([\w.-]+\.\w+)\b')

# Use pattern methods
text = "Contact: john@example.com or admin@site.org"
matches = email_pattern.findall(text)
print(matches)  # [('john', 'example.com'), ('admin', 'site.org')]

# Pattern properties
print(f"Pattern: {email_pattern.pattern}")
print(f"Groups: {email_pattern.groups}")
print(f"Flags: {email_pattern.flags}")

# Multiple operations on same pattern
def analyze_email_text(text, pattern):
    """Return email count, first email, and anonymized copy of *text*."""
    # Count emails
    all_emails = pattern.findall(text)
    # Find first email
    first_match = pattern.search(text)
    # Replace emails with placeholder
    anonymized = pattern.sub('[EMAIL]', text)
    return {
        'count': len(all_emails),
        'first': first_match.group() if first_match else None,
        'anonymized': anonymized
    }

# Advanced pattern usage with concurrent execution
with open('large_file.txt') as f:  # context manager: don't leak the file handle
    large_text = f.read()
results = email_pattern.findall(large_text, concurrent=True)

# Pattern with timeout
try:
    complex_pattern = regex.compile(r'(a+)+b')  # classic catastrophic-backtracking pattern
    result = complex_pattern.search('a' * 30, timeout=1.0)
except TimeoutError as e:  # regex raises built-in TimeoutError on timeout, not regex.error
    print(f"Pattern timed out: {e}")
print(f"Pattern timed out: {e}")

Match object containing information about a successful pattern match, providing access to matched text, groups, and position information.
class Match:
    """Match object containing match information and results."""

    def group(self, *groups):
        """Return one or more subgroups of the match."""

    def groups(self, default=None):
        """Return tuple of all subgroups of the match."""

    def groupdict(self, default=None):
        """Return dictionary of all named subgroups."""

    def start(self, group=0):
        """Return start position of substring matched by group."""

    def end(self, group=0):
        """Return end position of substring matched by group."""

    def span(self, group=0):
        """Return (start, end) positions of substring matched by group."""

    def expand(self, template):
        """Return string obtained by template substitution."""

    def expandf(self, format):
        """Return string obtained by format substitution."""

    # Match properties
    string: str     # String passed to match function
    pos: int        # Start position for search
    endpos: int     # End position for search
    lastindex: int  # Index of last matched capturing group
    lastgroup: str  # Name of last matched capturing group
re: Pattern # Pattern object that produced this match

Usage Examples:
import regex

# Basic match operations
pattern = regex.compile(r'(\w+)@(\w+\.\w+)')
match = pattern.search('Email: john@example.com is valid')
if match:
    print(f"Full match: {match.group()}")   # 'john@example.com'
    print(f"Username: {match.group(1)}")    # 'john'
    print(f"Domain: {match.group(2)}")      # 'example.com'
    print(f"All groups: {match.groups()}")  # ('john', 'example.com')
    print(f"Match span: {match.span()}")    # (7, 23)

# Named groups
pattern = regex.compile(r'(?P<user>\w+)@(?P<domain>\w+\.\w+)')
match = pattern.search('Contact: admin@site.org')
if match:
    print(f"User: {match.group('user')}")      # 'admin'
    print(f"Domain: {match.group('domain')}")  # 'site.org'
    print(f"Group dict: {match.groupdict()}")  # {'user': 'admin', 'domain': 'site.org'}

# Multiple group access
match = regex.search(r'(\d{4})-(\d{2})-(\d{2})', 'Date: 2023-12-25')
if match:
    year, month, day = match.groups()
    print(f"Date parts: {year}, {month}, {day}")  # '2023', '12', '25'
    # Individual positions
    print(f"Year at: {match.span(1)}")   # (6, 10)
    print(f"Month at: {match.span(2)}")  # (11, 13)
    print(f"Day at: {match.span(3)}")    # (14, 16)

# Template expansion
match = regex.search(r'(\w+)\s+(\w+)', 'John Doe')
if match:
    # Traditional template
    formatted = match.expand(r'\2, \1')
    print(formatted)  # 'Doe, John'
    # Format-style template: field 0 is the WHOLE match, so groups are 1-based
    formatted = match.expandf('{2}, {1}')
    print(formatted)  # 'Doe, John'

# Match object properties
print(f"Original string: {match.string}")
print(f"Search bounds: {match.pos}-{match.endpos}")
print(f"Last group index: {match.lastindex}")
print(f"Pattern object: {match.re}")
print(f"Pattern object: {match.re}")

Tokenizing scanner that processes strings using a list of pattern-action pairs, providing a powerful tool for lexical analysis and text processing.
class Scanner:
    """Scanner for tokenizing strings using pattern-action pairs."""

    def __init__(self, lexicon, flags=0):
        """
        Initialize scanner with lexicon of pattern-action pairs.

        Args:
            lexicon (list): List of (pattern, action) tuples
            flags (int, optional): Regex flags for all patterns
        """

    def scan(self, string):
        """
        Scan string and return list of action results.

        Args:
            string (str): String to scan

        Returns:
            tuple: (results_list, remaining_string)
        """
Usage Examples:
import regex

# Basic tokenizer: each action receives the scanner and the matched token text.
def make_number(scanner, token):
    return ('NUMBER', int(token))

def make_word(scanner, token):
    return ('WORD', token)

def make_operator(scanner, token):
    return ('OP', token)

# Define lexicon (pattern, action) pairs; a None action discards the token.
lexicon = [
    (r'\d+', make_number),
    (r'\w+', make_word),
    (r'[+\-*/]', make_operator),
    (r'\s+', None),  # Skip whitespace
]

scanner = regex.Scanner(lexicon)
tokens, remainder = scanner.scan('age + 25 * factor')
print(tokens)  # [('WORD', 'age'), ('OP', '+'), ('NUMBER', 25), ('OP', '*'), ('WORD', 'factor')]
print(f"Remainder: '{remainder}'")  # Should be empty
# Advanced tokenizer with state
class StatefulScanner:
    """Token actions that remember whether scanning is inside a string literal."""

    def __init__(self):
        self.in_string = False

    def string_start(self, scanner, token):
        # Opening delimiter: flip into string mode.
        self.in_string = True
        return ('STRING_START', token)

    def string_content(self, scanner, token):
        return ('STRING_CONTENT', token)

    def string_end(self, scanner, token):
        # Closing delimiter: back to normal mode.
        self.in_string = False
        return ('STRING_END', token)
# HTML/XML tokenizer
def make_tag_open(scanner, token):
    return ('TAG_OPEN', token)

def make_tag_close(scanner, token):
    return ('TAG_CLOSE', token)

def make_text(scanner, token):
    return ('TEXT', token.strip())

html_lexicon = [
    # Closing tags must come first so make_tag_close is actually reachable;
    # previously the open-tag rule swallowed '</...>' and make_tag_close was dead code.
    (r'</\w+\s*>', make_tag_close),
    (r'<\w+[^>]*>', make_tag_open),
    (r'[^<]+', make_text),
]

html_scanner = regex.Scanner(html_lexicon)
tokens, remainder = html_scanner.scan('<div>Hello <span>world</span></div>')
print(tokens)
# Programming language tokenizer
def tokenize_code(code):
    """Tokenize *code* into a list of (TYPE, value) tuples, warning on leftovers."""
    lexicon = [
        (r'#.*$', lambda s, t: ('COMMENT', t)),                               # Comments
        (r'\b(if|else|while|for|def|class)\b', lambda s, t: ('KEYWORD', t)),  # Keywords
        (r'\b[a-zA-Z_]\w*\b', lambda s, t: ('IDENTIFIER', t)),                # Identifiers
        (r'\b\d+\.\d+\b', lambda s, t: ('FLOAT', float(t))),                  # Float numbers
        (r'\b\d+\b', lambda s, t: ('INTEGER', int(t))),                       # Integers
        (r'[+\-*/=<>!]+', lambda s, t: ('OPERATOR', t)),                      # Operators
        (r'[(){}[\];,.]', lambda s, t: ('DELIMITER', t)),                     # Delimiters
        (r'"[^"]*"', lambda s, t: ('STRING', t[1:-1])),                       # String literals
        (r'\s+', None),                                                       # Skip whitespace
    ]
    # MULTILINE so the '#.*$' comment rule stops at each end-of-line.
    scanner = regex.Scanner(lexicon, regex.MULTILINE)
    tokens, remainder = scanner.scan(code)
    if remainder:
        print(f"Warning: Could not tokenize: '{remainder}'")
    return tokens
# Example usage: the sample snippet keeps its own indentation so it reads as real Python.
code = '''
def hello(name):
    # Print greeting
    print("Hello, " + name)
    return 42
'''
tokens = tokenize_code(code)
for token in tokens:
    print(token)
print(token)

Enumeration of regex flags with proper flag combination support, providing a type-safe way to work with regex flags.
class RegexFlag(enum.IntFlag):
    """Enumeration of regex flags with proper combination support."""

    # Standard flags (single-letter names are aliases of the long names)
    ASCII = A = 0x80
    IGNORECASE = I = 0x2
    LOCALE = L = 0x4
    MULTILINE = M = 0x8
    DOTALL = S = 0x10
    VERBOSE = X = 0x40
    UNICODE = U = 0x20

    # Enhanced flags
    BESTMATCH = B = 0x1000
    DEBUG = D = 0x200
    ENHANCEMATCH = E = 0x8000
    FULLCASE = F = 0x4000
    POSIX = P = 0x10000
    REVERSE = R = 0x400
    TEMPLATE = T = 0x1
    WORD = W = 0x800

    # Version flags
    VERSION0 = V0 = 0x2000
VERSION1 = V1 = 0x100

Usage Examples:
import regex
from regex import RegexFlag

# Using flag enumeration
flags = RegexFlag.IGNORECASE | RegexFlag.MULTILINE
pattern = regex.compile(r'^hello.*world$', flags)

# Check flag combinations
combined_flags = RegexFlag.IGNORECASE | RegexFlag.DOTALL | RegexFlag.VERBOSE
print(f"Combined flags value: {combined_flags}")

# Test flag presence
if RegexFlag.IGNORECASE in combined_flags:
    print("Case-insensitive matching enabled")

# Enhanced flags
fuzzy_flags = RegexFlag.BESTMATCH | RegexFlag.ENHANCEMATCH
pattern = regex.compile(r'(?e)(search){e<=2}', fuzzy_flags)

# Version-specific flags
v1_flags = RegexFlag.VERSION1 | RegexFlag.IGNORECASE | RegexFlag.FULLCASE
pattern = regex.compile(r'unicode', v1_flags)

# All flag names and values
print("Available flags:")
for flag in RegexFlag:
    print(f"{flag.name}: {flag.value} (0x{flag.value:x})")
print(f"{flag.name}: {flag.value} (0x{flag.value:x})")

# Efficient pattern reuse
class TextProcessor:
    """Extracts contact details using patterns compiled once at construction."""

    def __init__(self):
        # Pre-compile frequently used patterns so repeated calls skip recompilation
        self.email_pattern = regex.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')
        self.phone_pattern = regex.compile(r'\b\d{3}-\d{3}-\d{4}\b')
        self.url_pattern = regex.compile(r'https?://[^\s]+')

    def extract_contacts(self, text):
        """Return all emails, phone numbers, and URLs found in *text*."""
        return {
            'emails': self.email_pattern.findall(text),
            'phones': self.phone_pattern.findall(text),
            'urls': self.url_pattern.findall(text)
        }


def process_structured_data(text):
    """Find ISO-style dates in *text* and return each with position and context."""
    # Chain match operations
    date_pattern = regex.compile(r'(\d{4})-(\d{2})-(\d{2})')
    results = []
    for match in date_pattern.finditer(text):
        # Extract date components
        year, month, day = match.groups()
        # Use match position to grab up to 20 chars of surrounding context
        start, end = match.span()
        context_start = max(0, start - 20)
        context_end = min(len(text), end + 20)
        context = text[context_start:context_end]
        results.append({
            'date': f"{year}-{month}-{day}",
            'position': (start, end),
            'context': context.strip()
        })
    return results


class AdvancedScanner:
    """Scanner actions with a context stack for nested scanning states."""

    def __init__(self):
        self.context_stack = []
        self.current_context = 'normal'

    def enter_context(self, scanner, token):
        self.context_stack.append(self.current_context)
        self.current_context = 'special'
        return ('CONTEXT_ENTER', token)

    def exit_context(self, scanner, token):
        # Popping an empty stack is tolerated: the context simply stays put.
        if self.context_stack:
            self.current_context = self.context_stack.pop()
        return ('CONTEXT_EXIT', token)

    def process_token(self, scanner, token):
        # Tag the token with the active context, e.g. NORMAL_TOKEN / SPECIAL_TOKEN.
        return (f'{self.current_context.upper()}_TOKEN', token)

# Install with Tessl CLI
npx tessl i tessl/pypi-regex