tessl/pypi-regex

Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.

—

Pending

Overview

Eval results

Files

Flags and Constants

Name: tessl/pypi-regex
Author: tessl

The regex module provides a comprehensive flag system: the standard regular-expression flags, enhanced flags for fuzzy matching and Unicode handling, version-selection flags, and global constants that control library behavior. Together these flags give fine-grained control over pattern matching and enable the module's advanced features.

Capabilities

Standard Regular Expression Flags

Traditional regex flags that control basic matching behavior, compatible with Python's standard re module while providing enhanced functionality.

# Case and Character Class Flags
# Every flag is bound to both a descriptive name and a short alias with the same value.
ASCII = A = 0x80              # ASCII-only character class matching (e.g. \w stops at accented letters)
IGNORECASE = I = 0x2          # Case-insensitive matching
LOCALE = L = 0x4              # Locale-dependent character classes
UNICODE = U = 0x20            # Unicode-dependent character classes

# Pattern Behavior Flags
MULTILINE = M = 0x8           # Multi-line mode: ^ and $ also match at line boundaries
DOTALL = S = 0x10             # Make . match any character, including newline
VERBOSE = X = 0x40            # Verbose mode: whitespace and # comments are allowed inside the pattern
TEMPLATE = T = 0x1            # Template mode (kept for compatibility with the re module)

Usage Examples:

import regex

# Case-insensitive matching: the lowercase pattern matches the uppercase text
result = regex.search(r'hello', 'HELLO WORLD', regex.IGNORECASE)
print(result.group())  # 'HELLO'

# Multi-line mode - ^ and $ match at the boundaries of every line,
# so each of the three lines is found as a separate match
text = 'line1\nline2\nline3'
matches = regex.findall(r'^line\d$', text, regex.MULTILINE)
print(matches)  # ['line1', 'line2', 'line3']

# Dot matches newlines, so .* can span several lines
result = regex.search(r'start.*end', 'start\nmiddle\nend', regex.DOTALL)
print(result.group())  # 'start\nmiddle\nend'

# Verbose mode with comments: whitespace and # comments inside the pattern are
# ignored, so each part of this e-mail-like pattern can be annotated in place
pattern = regex.compile(r'''
    \b                    # Word boundary
    (\w+)                 # Username (group 1)
    @                     # Literal @
    ([\w.-]+)             # Domain name (group 2)
    \.                    # Literal dot
    (\w+)                 # TLD (group 3)
    \b                    # Word boundary
''', regex.VERBOSE)

# Combining flags: flags are bit values, so they are merged with bitwise OR
combined = regex.IGNORECASE | regex.MULTILINE | regex.DOTALL
result = regex.search(r'^hello.*world$', 'HELLO\nBEAUTIFUL\nWORLD', combined)

# ASCII vs Unicode character classes
text = 'café naïve résumé'
# Unicode mode (default for str patterns): accented letters count as word characters
unicode_words = regex.findall(r'\w+', text, regex.UNICODE)
print(unicode_words)  # ['café', 'naïve', 'résumé']

# ASCII mode: \w only matches ASCII characters, so each word is split at its accented letters
ascii_words = regex.findall(r'\w+', text, regex.ASCII)
print(ascii_words)  # ['caf', 'na', 've', 'r', 'sum']

Enhanced Regular Expression Flags

Advanced flags unique to the regex module that enable fuzzy matching, improved Unicode support, and specialized matching behaviors.

# Fuzzy Matching Flags
BESTMATCH = B = 0x1000        # Find the best fuzzy match instead of the first one
ENHANCEMATCH = E = 0x8000     # Improve the fit of a fuzzy match after it is first found

# Unicode Enhancement Flags
FULLCASE = F = 0x4000         # Full case-folding for Unicode case-insensitive matching (e.g. ß matches SS)
WORD = W = 0x800              # Unicode word boundaries and line breaks

# Matching Behavior Flags
POSIX = P = 0x10000           # POSIX-standard leftmost-longest matching
REVERSE = R = 0x400           # Search backwards through the string
DEBUG = D = 0x200             # Print the parsed pattern for debugging

Usage Examples:

import regex

# Fuzzy matching with best match
# NOTE(review): best-match mode is requested twice here, inline via (?b) and via
# the regex.BESTMATCH argument; presumably either one alone is enough.
pattern = r'(?b)(python){e<=2}'  # {e<=2} allows up to 2 errors; (?b) asks for the best match
text = 'pyton pythom python pyth'
result = regex.search(pattern, text, regex.BESTMATCH)
print(result.group())  # 'python' (exact match is best)

# Enhanced fuzzy matching: {e<=1} allows one error, (?e) asks for an improved fit
pattern = r'(?e)(search){e<=1}'
result = regex.search(pattern, 'serch found', regex.ENHANCEMATCH)
print(result.group())  # 'serch' with improved fit

# Full case-folding for Unicode: FULLCASE is combined with IGNORECASE
pattern = r'STRASSE'
text = 'Hauptstraße in München'  # German ß should match SS
result = regex.search(pattern, text, regex.IGNORECASE | regex.FULLCASE)
print(result.group())  # 'straße'

# Word boundaries with Unicode: \b follows Unicode word-boundary rules under WORD
text = 'hello мир world'
words = regex.findall(r'\b\w+\b', text, regex.WORD)
print(words)  # ['hello', 'мир', 'world'] - properly handles Unicode

# POSIX leftmost-longest matching
pattern = r'a|ab'
text = 'ab'
# Normal (first match): the first alternative that succeeds wins
result1 = regex.search(pattern, text)
print(result1.group())  # 'a'

# POSIX (longest match): the longest alternative wins
result2 = regex.search(pattern, text, regex.POSIX)
print(result2.group())  # 'ab'

# Reverse searching: the search runs backwards from the end of the string
text = 'first second third'
result = regex.search(r'\w+', text, regex.REVERSE)
print(result.group())  # 'third' (last word when searching backwards)

# Debug mode - prints parsed pattern
pattern = regex.compile(r'(a+)(b+)', regex.DEBUG)
# Prints internal pattern structure to stdout

Version Control Flags

These flags select which behavior version a pattern uses: the legacy, re-compatible behavior (VERSION0) or the enhanced regex behavior (VERSION1).

# Version Control Flags
VERSION0 = V0 = 0x2000        # Legacy re-compatible behavior
VERSION1 = V1 = 0x100         # Enhanced behavior mode (case-insensitive matching uses full case-folding)

# Global Version Setting
DEFAULT_VERSION               # Version applied to patterns that do not specify one (VERSION0 by default)

Usage Examples:

import regex

# Version 0 (legacy re-compatible behavior), selected inline with (?V0)
pattern_v0 = regex.compile(r'(?V0)\w+', regex.IGNORECASE)

# Version 1 (enhanced behavior with full case-folding), selected inline with (?V1)
pattern_v1 = regex.compile(r'(?V1)\w+', regex.IGNORECASE)

# Compare behavior with Unicode case-folding
text = 'Straße'  # German word with ß

# Version 0 - basic case folding, so ß is not treated as SS
result_v0 = regex.search(r'(?V0)STRASSE', text, regex.IGNORECASE)
print(f"V0 result: {result_v0}")  # May not match

# Version 1 - full case folding (automatic with IGNORECASE)
# The conditional guards against a None result before calling .group()
result_v1 = regex.search(r'(?V1)STRASSE', text, regex.IGNORECASE)
print(f"V1 result: {result_v1.group() if result_v1 else None}")  # 'Straße'

# Global default version setting
print(f"Current default: {regex.DEFAULT_VERSION}")

# Set global default (affects patterns without explicit version)
# Left commented out on purpose, because it would change behavior module-wide:
# regex.DEFAULT_VERSION = regex.VERSION1  # Would change global default

# Inline version specification in patterns: (?V1) selects version 1 behavior
# for this pattern only. The searched text has to contain the whole phrase for
# the search to succeed, so the sample text spells it out in upper case.
pattern = r'(?V1)case insensitive with full folding'
result = regex.search(pattern, 'CASE INSENSITIVE WITH FULL FOLDING', regex.IGNORECASE)

# Mixed version usage
def compare_versions(pattern_str, text, flags=0):
    """Search ``text`` with ``pattern_str`` under both behavior versions.

    The pattern is prefixed with the inline ``(?V0)`` and ``(?V1)`` version
    selectors in turn and searched with the same ``flags``.

    Returns:
        A dict with the keys ``'v0'`` and ``'v1'``; each value is the matched
        text for that version, or ``None`` when that search found nothing.
    """
    outcome = {}
    for label, selector in (('v0', '(?V0)'), ('v1', '(?V1)')):
        found = regex.search(f'{selector}{pattern_str}', text, flags)
        outcome[label] = found.group() if found else None
    return outcome

Module Constants and Metadata

Global constants and version information for the regex module.

# Module Information
__version__ = "2.5.161"       # Module version string (the value depends on the installed release)
__doc__                       # Module documentation string

# Function Aliases
Regex                         # Alias for the compile function (used in pattern repr)

# Exception Class
error                         # Exception class raised for regex errors such as an invalid pattern

Usage Examples:

import regex

# Check module version
print(f"regex module version: {regex.__version__}")

# Read module documentation
print(f"Module doc length: {len(regex.__doc__)} characters")

# Using Regex alias (mainly for internal use)
pattern = regex.Regex(r'\d+')  # Same as regex.compile(r'\d+')

# Exception handling: compiling an invalid pattern raises regex.error
try:
    bad_pattern = regex.compile(r'[')  # Invalid pattern (unterminated character set)
except regex.error as e:
    print(f"Regex error: {e}")
    print(f"Error message: {e.msg}")
    # pos is read defensively in case the exception does not carry a position
    if hasattr(e, 'pos'):
        print(f"Error position: {e.pos}")

Flag Combinations and Usage Patterns

Common Flag Combinations

# Flags are bit values, so presets are built by OR-ing individual flags together.

# Case-insensitive multiline matching
CASE_INSENSITIVE_MULTILINE = regex.IGNORECASE | regex.MULTILINE

# Full Unicode support with word boundaries
UNICODE_WORDS = regex.UNICODE | regex.WORD

# Enhanced fuzzy matching: best match plus improved fit
FUZZY_BEST = regex.BESTMATCH | regex.ENHANCEMATCH

# Version 1 with full case folding
ENHANCED_CASE = regex.VERSION1 | regex.IGNORECASE

# Debug verbose mode: annotated pattern source plus parsed-pattern output
DEBUG_VERBOSE = regex.DEBUG | regex.VERBOSE

# Example usage: a fuzzy search for "search" compiled with the combined presets.
# VERBOSE allows the in-pattern comments; DEBUG prints the parsed pattern on compile.
pattern = regex.compile(r'''
    \b                  # Word boundary
    (?e)                # Inline ENHANCEMATCH flag (improves the fuzzy fit)
    (search){e<=2}      # Fuzzy match: allow up to 2 errors
    \b                  # Word boundary
''', FUZZY_BEST | DEBUG_VERBOSE)

Dynamic Flag Handling

def build_pattern_flags(case_sensitive=True, multiline=False,
                        fuzzy=False, unicode_aware=True):
    """Translate boolean requirements into a combined regex flag value.

    Args:
        case_sensitive: When false, adds IGNORECASE and FULLCASE.
        multiline: When true, adds MULTILINE.
        fuzzy: When true, adds BESTMATCH and ENHANCEMATCH.
        unicode_aware: When true, adds UNICODE and WORD.

    Returns:
        The bitwise OR of every selected flag, or 0 when nothing is selected.
    """
    # Collect the flags first; each regex attribute is only looked up when its
    # option is actually requested.
    selected = []

    if not case_sensitive:
        selected += [regex.IGNORECASE, regex.FULLCASE]  # Enhanced case folding

    if multiline:
        selected.append(regex.MULTILINE)

    if fuzzy:
        selected += [regex.BESTMATCH, regex.ENHANCEMATCH]

    if unicode_aware:
        selected += [regex.UNICODE, regex.WORD]

    combined = 0
    for flag in selected:
        combined |= flag
    return combined

# Usage: build case-insensitive, fuzzy flags (unicode_aware keeps its default of True)
flags = build_pattern_flags(case_sensitive=False, fuzzy=True)
# The fuzzy constraint {e<=1} allows one error in "search"
pattern = regex.compile(r'(?e)(search){e<=1}', flags)

Flag Testing and Introspection

def analyze_pattern_flags(pattern):
    """Report which regex flags are set on a compiled pattern.

    Flag names are discovered heuristically: every all-uppercase attribute of
    the regex module whose name is at most 12 characters long and whose value
    is an int is treated as a flag. Long names and short aliases are both
    reported when their bit is set.

    Args:
        pattern: A compiled pattern object exposing a ``flags`` attribute.

    Returns:
        A dict with ``'flags_value'`` (the raw flags), ``'flags_hex'`` (the
        flags formatted as hexadecimal) and ``'active_flags'`` (the matching
        names, in ``dir(regex)`` order).
    """
    mask = pattern.flags

    def _is_active(name):
        # Name filter first, so getattr only runs for likely flag names.
        if not name.isupper() or len(name) > 12:
            return False
        candidate = getattr(regex, name)
        return isinstance(candidate, int) and bool(mask & candidate)

    active = [name for name in dir(regex) if _is_active(name)]

    return {
        'flags_value': mask,
        'flags_hex': f'0x{mask:x}',
        'active_flags': active,
    }

# Example: inspect a pattern compiled with two flags
pattern = regex.compile(r'test', regex.IGNORECASE | regex.MULTILINE)
info = analyze_pattern_flags(pattern)
# Prints a dict with the keys 'flags_value', 'flags_hex' and 'active_flags'
print(info)

Performance Considerations

# Pre-define flag combinations for reuse
STANDARD_TEXT = regex.IGNORECASE | regex.MULTILINE | regex.DOTALL       # general text searching
FUZZY_SEARCH = regex.BESTMATCH | regex.ENHANCEMATCH | regex.IGNORECASE  # case-insensitive fuzzy search
UNICODE_FULL = regex.UNICODE | regex.WORD | regex.FULLCASE              # Unicode classes, boundaries and case-folding

# Cache compiled patterns with flags
# Maps (pattern_str, flags) to the compiled pattern; entries are never evicted.
_pattern_cache = {}

def get_cached_pattern(pattern_str, flags):
    """Return the compiled pattern for ``(pattern_str, flags)``.

    The pattern is compiled the first time a given pair is requested and the
    same compiled object is returned on later calls with that pair.
    """
    key = (pattern_str, flags)
    compiled = _pattern_cache.get(key)
    if compiled is None:
        compiled = _pattern_cache[key] = regex.compile(pattern_str, flags)
    return compiled

# Usage: compile an e-mail-like pattern once with the STANDARD_TEXT preset
email_pattern = get_cached_pattern(r'\b[\w.-]+@[\w.-]+\.\w+\b', STANDARD_TEXT)

Advanced Flag Usage

# Conditional flag application
def smart_search(pattern, text, **options):
    """Search ``text`` for ``pattern`` with flags chosen from the text itself.

    Flags are derived as follows:
      * non-ASCII characters in ``text`` add UNICODE, WORD and FULLCASE;
      * a newline in ``text`` adds MULTILINE;
      * ``case_insensitive`` (default True) adds IGNORECASE;
      * ``fuzzy`` (default False) adds BESTMATCH and rewrites the pattern so
        the whole pattern may match with up to ``errors`` (default 1) errors.

    Args:
        pattern: Pattern source string.
        text: Text to search.
        **options: Optional ``case_insensitive``, ``fuzzy`` and ``errors``.

    Returns:
        Whatever ``regex.search`` returns for the final pattern and flags:
        a match object, or ``None`` when nothing matches.
    """
    flags = 0

    # Apply flags based on text characteristics
    if any(ord(c) > 127 for c in text):  # Contains non-ASCII
        flags |= regex.UNICODE | regex.WORD | regex.FULLCASE

    if '\n' in text:  # Multi-line text
        flags |= regex.MULTILINE

    if options.get('case_insensitive', True):
        flags |= regex.IGNORECASE

    if options.get('fuzzy', False):
        flags |= regex.BESTMATCH
        max_errors = options.get('errors', 1)
        # Wrap in a NON-capturing group: a capturing wrapper would shift the
        # caller's own group numbers and break numeric backreferences.
        pattern = f'(?e)(?:{pattern}){{e<={max_errors}}}'

    return regex.search(pattern, text, flags)

# Example usage: the text contains non-ASCII characters, so the Unicode-related
# flags are added, and fuzzy=True allows one error (the default) in 'hello'
result = smart_search('hello', 'Hello, мир!', case_insensitive=True, fuzzy=True)

Install with Tessl CLI