Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.
—
Comprehensive flag system including standard regex flags, enhanced flags for fuzzy matching and Unicode handling, version control flags, and global constants for controlling library behavior. These flags provide fine-grained control over pattern matching behavior and enable advanced regex features.
Traditional regex flags that control basic matching behavior, compatible with Python's standard re module while providing enhanced functionality.
# Case and Character Class Flags
ASCII = A = 0x80 # ASCII-only character class matching
IGNORECASE = I = 0x2 # Case-insensitive matching
LOCALE = L = 0x4 # Locale-dependent character classes
UNICODE = U = 0x20 # Unicode-dependent character classes
# Pattern Behavior Flags
MULTILINE = M = 0x8 # Multi-line mode for ^ and $
DOTALL = S = 0x10 # Make . match any character including newline
VERBOSE = X = 0x40 # Verbose mode allowing comments and whitespace
TEMPLATE = T = 0x1 # Template mode (compatibility with re module)

Usage Examples:
import regex
# Case-insensitive matching
result = regex.search(r'hello', 'HELLO WORLD', regex.IGNORECASE)
print(result.group()) # 'HELLO'
# Multi-line mode - ^ and $ match line boundaries
text = 'line1\nline2\nline3'
matches = regex.findall(r'^line\d$', text, regex.MULTILINE)
print(matches) # ['line1', 'line2', 'line3']
# Dot matches newlines
result = regex.search(r'start.*end', 'start\nmiddle\nend', regex.DOTALL)
print(result.group()) # 'start\nmiddle\nend'
# Verbose mode with comments
pattern = regex.compile(r'''
\b # Word boundary
(\w+) # Username (group 1)
@ # Literal @
([\w.-]+) # Domain name (group 2)
\. # Literal dot
(\w+) # TLD (group 3)
\b # Word boundary
''', regex.VERBOSE)
# Combining flags
combined = regex.IGNORECASE | regex.MULTILINE | regex.DOTALL
result = regex.search(r'^hello.*world$', 'HELLO\nBEAUTIFUL\nWORLD', combined)
# ASCII vs Unicode character classes
text = 'café naïve résumé'
# Unicode mode (default for str patterns)
unicode_words = regex.findall(r'\w+', text, regex.UNICODE)
print(unicode_words) # ['café', 'naïve', 'résumé']
# ASCII mode
ascii_words = regex.findall(r'\w+', text, regex.ASCII)
print(ascii_words) # ['caf', 'na', 've', 'r', 'sum']

Advanced flags unique to the regex module that enable fuzzy matching, improved Unicode support, and specialized matching behaviors.
# Fuzzy Matching Flags
BESTMATCH = B = 0x1000 # Find best fuzzy match instead of first
ENHANCEMATCH = E = 0x8000 # Improve fuzzy match fit after finding first
# Unicode Enhancement Flags
FULLCASE = F = 0x4000 # Full case-folding for Unicode case-insensitive matching
WORD = W = 0x800 # Unicode word boundaries and line breaks
# Matching Behavior Flags
POSIX = P = 0x10000 # POSIX-standard leftmost longest matching
REVERSE = R = 0x400 # Search backwards through string
DEBUG = D = 0x200 # Print parsed pattern for debugging

Usage Examples:
import regex
# Fuzzy matching with best match
pattern = r'(?b)(python){e<=2}' # Allow up to 2 errors, find best match
text = 'pyton pythom python pyth'
result = regex.search(pattern, text, regex.BESTMATCH)
print(result.group()) # 'python' (exact match is best)
# Enhanced fuzzy matching
pattern = r'(?e)(search){e<=1}'
result = regex.search(pattern, 'serch found', regex.ENHANCEMATCH)
print(result.group()) # 'serch' with improved fit
# Full case-folding for Unicode
pattern = r'STRASSE'
text = 'Hauptstraße in München' # German ß should match SS
result = regex.search(pattern, text, regex.IGNORECASE | regex.FULLCASE)
print(result.group()) # 'straße'
# Word boundaries with Unicode
text = 'hello мир world'
words = regex.findall(r'\b\w+\b', text, regex.WORD)
print(words) # ['hello', 'мир', 'world'] - properly handles Unicode
# POSIX leftmost-longest matching
pattern = r'a|ab'
text = 'ab'
# Normal (first match)
result1 = regex.search(pattern, text)
print(result1.group()) # 'a'
# POSIX (longest match)
result2 = regex.search(pattern, text, regex.POSIX)
print(result2.group()) # 'ab'
# Reverse searching
text = 'first second third'
result = regex.search(r'\w+', text, regex.REVERSE)
print(result.group()) # 'third' (last word when searching backwards)
# Debug mode - prints parsed pattern
pattern = regex.compile(r'(a+)(b+)', regex.DEBUG)
# Prints internal pattern structure to stdout

Flags that select the regex behavior version, allowing a choice between legacy re-compatible behavior and enhanced regex features.
# Version Control Flags
VERSION0 = V0 = 0x2000 # Legacy re-compatible behavior
VERSION1 = V1 = 0x100 # Enhanced behavior mode (includes FULLCASE)
# Global Version Setting
DEFAULT_VERSION # Current default version setting (VERSION0)

Usage Examples:
import regex
# Version 0 (legacy re-compatible behavior)
pattern_v0 = regex.compile(r'(?V0)\w+', regex.IGNORECASE)
# Version 1 (enhanced behavior with full case-folding)
pattern_v1 = regex.compile(r'(?V1)\w+', regex.IGNORECASE)
# Compare behavior with Unicode case-folding
text = 'Straße' # German word with ß
# Version 0 - basic case folding
result_v0 = regex.search(r'(?V0)STRASSE', text, regex.IGNORECASE)
print(f"V0 result: {result_v0}") # May not match
# Version 1 - full case folding (automatic with IGNORECASE)
result_v1 = regex.search(r'(?V1)STRASSE', text, regex.IGNORECASE)
print(f"V1 result: {result_v1.group() if result_v1 else None}") # 'Straße'
# Global default version setting
print(f"Current default: {regex.DEFAULT_VERSION}")
# Set global default (affects patterns without explicit version)
# regex.DEFAULT_VERSION = regex.VERSION1 # Would change global default
# Inline version specification in patterns
pattern = r'(?V1)case insensitive with full folding'
result = regex.search(pattern, 'CASE INSENSITIVE', regex.IGNORECASE)
# Mixed version usage
def compare_versions(pattern_str, text, flags=0):
v0_result = regex.search(f'(?V0){pattern_str}', text, flags)
v1_result = regex.search(f'(?V1){pattern_str}', text, flags)
return {
'v0': v0_result.group() if v0_result else None,
'v1': v1_result.group() if v1_result else None
}Global constants and version information for the regex module.
# Module Information
__version__ = "2.5.161" # Module version string
__doc__ # Module documentation string
# Function Aliases
Regex # Alias for compile function (for pattern repr)
# Exception Class
error # Exception class for regex errors

Usage Examples:
import regex
# Check module version
print(f"regex module version: {regex.__version__}")
# Read module documentation
print(f"Module doc length: {len(regex.__doc__)} characters")
# Using Regex alias (mainly for internal use)
pattern = regex.Regex(r'\d+') # Same as regex.compile(r'\d+')
# Exception handling
try:
    bad_pattern = regex.compile(r'[')  # Invalid pattern
except regex.error as e:
    # Report the failure using the exception's string form and its msg field.
    print(f"Regex error: {e}")
    print(f"Error message: {e.msg}")
    # Only print the position when the exception exposes one.
    if hasattr(e, 'pos'):
        print(f"Error position: {e.pos}")

# Case-insensitive multiline matching
CASE_INSENSITIVE_MULTILINE = regex.IGNORECASE | regex.MULTILINE
# Full Unicode support with word boundaries
UNICODE_WORDS = regex.UNICODE | regex.WORD
# Enhanced fuzzy matching
FUZZY_BEST = regex.BESTMATCH | regex.ENHANCEMATCH
# Version 1 with full case folding
ENHANCED_CASE = regex.VERSION1 | regex.IGNORECASE
# Debug verbose mode
DEBUG_VERBOSE = regex.DEBUG | regex.VERBOSE
# Example usage
pattern = regex.compile(r'''
\b # Word boundary
(?e) # ENHANCEMATCH: improve the fit of the fuzzy match
(search){e<=2} # Allow up to 2 errors
\b # Word boundary
''', FUZZY_BEST | DEBUG_VERBOSE)


def build_pattern_flags(case_sensitive=True, multiline=False,
                        fuzzy=False, unicode_aware=True):
    """Build flags based on requirements.

    Args:
        case_sensitive: When false, adds IGNORECASE together with FULLCASE.
        multiline: When true, adds MULTILINE.
        fuzzy: When true, adds BESTMATCH and ENHANCEMATCH. These only tune
            fuzzy matching; the pattern itself must still carry a fuzzy
            constraint such as ``{e<=1}``.
        unicode_aware: When true (the default), adds UNICODE and WORD.

    Returns:
        The combined flag value, or ``0`` when no option selects a flag.
    """
    flags = 0
    if not case_sensitive:
        flags |= regex.IGNORECASE
        flags |= regex.FULLCASE  # Enhanced case folding
    if multiline:
        flags |= regex.MULTILINE
    if fuzzy:
        flags |= regex.BESTMATCH | regex.ENHANCEMATCH
    if unicode_aware:
        flags |= regex.UNICODE | regex.WORD
    return flags
# Usage
flags = build_pattern_flags(case_sensitive=False, fuzzy=True)
pattern = regex.compile(r'(?e)(search){e<=1}', flags)


def analyze_pattern_flags(pattern):
    """Analyze flags used in a compiled pattern.

    Walks ``dir(regex)`` and collects every upper-case, integer-valued
    attribute of at most 12 characters whose bits overlap ``pattern.flags``.
    Long names and their short aliases are separate attributes, so both are
    reported when set.

    Args:
        pattern: A compiled pattern object exposing a ``flags`` attribute.

    Returns:
        A dict with ``'flags_value'`` (the raw flags), ``'flags_hex'`` (the
        same value formatted as ``0x...``) and ``'active_flags'`` (the list
        of matching attribute names, in ``dir()`` order).
    """
    flags = pattern.flags
    flag_names = []
    for flag_name in dir(regex):
        # Heuristic name filter; the length cap presumably exists to skip
        # longer upper-case constants such as DEFAULT_VERSION.
        if flag_name.isupper() and len(flag_name) <= 12:  # Flag names
            flag_value = getattr(regex, flag_name)
            if isinstance(flag_value, int) and flags & flag_value:
                flag_names.append(flag_name)
    return {
        'flags_value': flags,
        'flags_hex': f'0x{flags:x}',
        'active_flags': flag_names
    }
# Example
pattern = regex.compile(r'test', regex.IGNORECASE | regex.MULTILINE)
info = analyze_pattern_flags(pattern)
print(info)

# Pre-define flag combinations for reuse
STANDARD_TEXT = regex.IGNORECASE | regex.MULTILINE | regex.DOTALL
FUZZY_SEARCH = regex.BESTMATCH | regex.ENHANCEMATCH | regex.IGNORECASE
UNICODE_FULL = regex.UNICODE | regex.WORD | regex.FULLCASE
# Cache compiled patterns with flags
_pattern_cache = {}


def get_cached_pattern(pattern_str, flags):
    """Return the compiled pattern for ``(pattern_str, flags)``, compiling once.

    Compiled patterns are stored in the module-level ``_pattern_cache`` dict,
    keyed by the ``(pattern_str, flags)`` tuple. The cache is never evicted.

    Args:
        pattern_str: Pattern source string.
        flags: Flags to compile with; also part of the cache key.

    Returns:
        The cached compiled pattern object.
    """
    cache_key = (pattern_str, flags)
    if cache_key not in _pattern_cache:
        _pattern_cache[cache_key] = regex.compile(pattern_str, flags)
    return _pattern_cache[cache_key]
# Usage
email_pattern = get_cached_pattern(r'\b[\w.-]+@[\w.-]+\.\w+\b', STANDARD_TEXT)

# Conditional flag application
def smart_search(pattern, text, **options):
    """Search ``text`` for ``pattern`` with flags chosen from the input.

    Args:
        pattern: Pattern source string.
        text: String to search.
        **options: Optional switches:
            case_insensitive (default True): add IGNORECASE.
            fuzzy (default False): add BESTMATCH and rewrite the pattern
                as a fuzzy group.
            errors (default 1): error budget for the fuzzy group; only
                read when ``fuzzy`` is truthy.

    Returns:
        Whatever ``regex.search`` returns for the final pattern and flags.
    """
    flags = 0
    # Apply flags based on text characteristics
    if any(ord(c) > 127 for c in text):  # Contains non-ASCII
        flags |= regex.UNICODE | regex.WORD | regex.FULLCASE
    if '\n' in text:  # Multi-line text
        flags |= regex.MULTILINE
    if options.get('case_insensitive', True):
        flags |= regex.IGNORECASE
    if options.get('fuzzy', False):
        flags |= regex.BESTMATCH
        # The caller's pattern is wrapped in a capturing group, so any
        # groups it defines are shifted up by one in the resulting match.
        pattern = f'(?e)({pattern}){{e<={options.get("errors", 1)}}}'
    return regex.search(pattern, text, flags)
# Example usage
result = smart_search('hello', 'Hello, мир!', case_insensitive=True, fuzzy=True)

Install with Tessl CLI
npx tessl i tessl/pypi-regex