tessl/pypi-regex

Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.

docs/classes-types.md

Advanced Classes and Types

Pattern and Match objects providing compiled pattern functionality and match result access, plus Scanner for tokenization and RegexFlag enumeration for proper flag handling. These classes form the core object-oriented interface for advanced regex operations.

Capabilities

Pattern Class

Compiled regular expression pattern object. Compiling once and reusing the object avoids re-parsing the pattern on every call; its methods mirror the module-level functions, with positional bounds (pos, endpos) plus overlapped, partial, concurrent, and timeout options.

class Pattern:
    """Compiled regular expression pattern object with matching methods."""
    
    def match(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern at start of string, returning Match object or None."""
    
    def fullmatch(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Try to apply pattern against entire string, returning Match object or None."""
    
    def search(self, string, pos=None, endpos=None, concurrent=None, partial=False, timeout=None):
        """Search through string for pattern match, returning Match object or None."""
    
    def findall(self, string, pos=None, endpos=None, overlapped=False, concurrent=None, timeout=None):
        """Return list of all matches in string."""
    
    def finditer(self, string, pos=None, endpos=None, overlapped=False, partial=False, concurrent=None, timeout=None):
        """Return iterator over all matches in string."""
    
    def sub(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences with replacement string."""
    
    def subf(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Replace pattern occurrences using format string."""
    
    def subn(self, repl, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (new_string, number_of_substitutions_made) tuple."""
    
    def subfn(self, format, string, count=0, pos=None, endpos=None, concurrent=None, timeout=None):
        """Return (formatted_string, number_of_substitutions_made) tuple."""
    
    def split(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Split string by pattern occurrences, returning list of substrings."""
    
    def splititer(self, string, maxsplit=0, concurrent=None, timeout=None):
        """Return iterator yielding split string parts."""
    
    # Pattern properties
    pattern: str        # Original pattern string
    flags: int         # Compilation flags
    groups: int        # Number of capturing groups
    groupindex: dict   # Mapping of group names to numbers

Usage Examples:

import regex

# Compile and use pattern object
email_pattern = regex.compile(r'\b([\w.-]+)@([\w.-]+\.\w+)\b')

# Use pattern methods
text = "Contact: john@example.com or admin@site.org"
matches = email_pattern.findall(text)
print(matches)  # [('john', 'example.com'), ('admin', 'site.org')]

# Pattern properties
print(f"Pattern: {email_pattern.pattern}")
print(f"Groups: {email_pattern.groups}")
print(f"Flags: {email_pattern.flags}")

# Multiple operations on same pattern
def analyze_email_text(text, pattern):
    # Count emails
    all_emails = pattern.findall(text)
    
    # Find first email
    first_match = pattern.search(text)
    
    # Replace emails with placeholder
    anonymized = pattern.sub('[EMAIL]', text)
    
    return {
        'count': len(all_emails),
        'first': first_match.group() if first_match else None,
        'anonymized': anonymized
    }

# Advanced pattern usage with concurrent execution
with open('large_file.txt') as f:
    large_text = f.read()
results = email_pattern.findall(large_text, concurrent=True)

# Pattern with timeout (an expired timeout raises the built-in TimeoutError)
try:
    complex_pattern = regex.compile(r'(a+)+b')
    result = complex_pattern.search('a' * 30, timeout=1.0)
except TimeoutError as e:
    print(f"Pattern timed out: {e}")

Match Class

Match object containing information about a successful pattern match, providing access to matched text, groups, and position information.

class Match:
    """Match object containing match information and results."""
    
    def group(self, *groups):
        """Return one or more subgroups of the match."""
    
    def groups(self, default=None):
        """Return tuple of all subgroups of the match."""
    
    def groupdict(self, default=None):
        """Return dictionary of all named subgroups."""
    
    def start(self, group=0):
        """Return start position of substring matched by group."""
    
    def end(self, group=0):
        """Return end position of substring matched by group."""
    
    def span(self, group=0):
        """Return (start, end) positions of substring matched by group."""
    
    def expand(self, template):
        """Return string obtained by template substitution."""
    
    def expandf(self, format):
        """Return string obtained by format substitution."""
    
    # Match properties
    string: str        # String passed to match function
    pos: int           # Start position for search
    endpos: int        # End position for search
    lastindex: int     # Index of last matched capturing group (None if no group matched)
    lastgroup: str     # Name of last matched named group (None if no named group matched)
    re: Pattern        # Pattern object that produced this match

Usage Examples:

import regex

# Basic match operations
pattern = regex.compile(r'(\w+)@(\w+\.\w+)')
match = pattern.search('Email: john@example.com is valid')

if match:
    print(f"Full match: {match.group()}")      # 'john@example.com'
    print(f"Username: {match.group(1)}")       # 'john'
    print(f"Domain: {match.group(2)}")         # 'example.com'
    print(f"All groups: {match.groups()}")     # ('john', 'example.com')
    print(f"Match span: {match.span()}")       # (7, 21)

# Named groups
pattern = regex.compile(r'(?P<user>\w+)@(?P<domain>\w+\.\w+)')
match = pattern.search('Contact: admin@site.org')

if match:
    print(f"User: {match.group('user')}")           # 'admin'
    print(f"Domain: {match.group('domain')}")       # 'site.org'
    print(f"Group dict: {match.groupdict()}")       # {'user': 'admin', 'domain': 'site.org'}

# Multiple group access
match = regex.search(r'(\d{4})-(\d{2})-(\d{2})', 'Date: 2023-12-25')
if match:
    year, month, day = match.groups()
    print(f"Date parts: {year}, {month}, {day}")    # '2023', '12', '25'
    
    # Individual positions
    print(f"Year at: {match.span(1)}")              # (6, 10)
    print(f"Month at: {match.span(2)}")             # (11, 13)
    print(f"Day at: {match.span(3)}")               # (14, 16)

# Template expansion
match = regex.search(r'(\w+)\s+(\w+)', 'John Doe')
if match:
    # Traditional template
    formatted = match.expand(r'\2, \1')
    print(formatted)  # 'Doe, John'
    
    # Format-style template ({0} is the whole match, so groups start at {1})
    formatted = match.expandf('{2}, {1}')
    print(formatted)  # 'Doe, John'

# Match object properties
print(f"Original string: {match.string}")
print(f"Search bounds: {match.pos}-{match.endpos}")
print(f"Last group index: {match.lastindex}")
print(f"Pattern object: {match.re}")

Scanner Class

Tokenizing scanner that consumes a string using a list of (pattern, action) pairs, useful for lexical analysis and text processing. Patterns are tried in order at each position, and an action of None silently skips the matched text.

class Scanner:
    """Scanner for tokenizing strings using pattern-action pairs."""
    
    def __init__(self, lexicon, flags=0):
        """
        Initialize scanner with lexicon of pattern-action pairs.
        
        Args:
            lexicon (list): List of (pattern, action) tuples
            flags (int, optional): Regex flags for all patterns
        """
    
    def scan(self, string):
        """
        Scan string, applying each matched pattern's action in turn.
        
        Args:
            string (str): String to scan
            
        Returns:
            tuple: (results_list, remaining_unmatched_string)
        """

Usage Examples:

import regex

# Basic tokenizer
def make_number(scanner, token):
    return ('NUMBER', int(token))

def make_word(scanner, token):
    return ('WORD', token)

def make_operator(scanner, token):
    return ('OP', token)

# Define lexicon (pattern, action) pairs
lexicon = [
    (r'\d+', make_number),
    (r'\w+', make_word),
    (r'[+\-*/]', make_operator),
    (r'\s+', None),  # Skip whitespace
]

scanner = regex.Scanner(lexicon)
tokens, remainder = scanner.scan('age + 25 * factor')
print(tokens)  # [('WORD', 'age'), ('OP', '+'), ('NUMBER', 25), ('OP', '*'), ('WORD', 'factor')]
print(f"Remainder: '{remainder}'")  # Should be empty

# Advanced tokenizer with state
class StatefulScanner:
    def __init__(self):
        self.in_string = False
        
    def string_start(self, scanner, token):
        self.in_string = True
        return ('STRING_START', token)
        
    def string_content(self, scanner, token):
        return ('STRING_CONTENT', token)
        
    def string_end(self, scanner, token):
        self.in_string = False
        return ('STRING_END', token)
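
# Wiring the stateful actions into a Scanner (an illustrative sketch;
# this quote-delimited lexicon is an assumption, not part of the API)
st = StatefulScanner()
string_lexicon = [
    (r'"', lambda s, t: st.string_end(s, t) if st.in_string
                        else st.string_start(s, t)),
    (r'[^"]+', lambda s, t: st.string_content(s, t) if st.in_string
                            else ('TEXT', t)),
]
string_scanner = regex.Scanner(string_lexicon)
tokens, _ = string_scanner.scan('say "hello" loudly')
print(tokens)  # TEXT, STRING_START, STRING_CONTENT, STRING_END, TEXT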

# HTML/XML tokenizer
def make_tag_open(scanner, token):
    return ('TAG_OPEN', token)

def make_tag_close(scanner, token):
    return ('TAG_CLOSE', token)

def make_text(scanner, token):
    return ('TEXT', token.strip())

html_lexicon = [
    (r'</\w+>', make_tag_close),
    (r'<\w+[^>]*>', make_tag_open),
    (r'[^<]+', make_text),
]

html_scanner = regex.Scanner(html_lexicon)
tokens, remainder = html_scanner.scan('<div>Hello <span>world</span></div>')
print(tokens)

# Programming language tokenizer
def tokenize_code(code):
    lexicon = [
        (r'#.*$', lambda s, t: ('COMMENT', t)),          # Comments
        (r'\b(if|else|while|for|def|class|return)\b', lambda s, t: ('KEYWORD', t)),  # Keywords
        (r'\b[a-zA-Z_]\w*\b', lambda s, t: ('IDENTIFIER', t)),  # Identifiers
        (r'\b\d+\.\d+\b', lambda s, t: ('FLOAT', float(t))),    # Float numbers
        (r'\b\d+\b', lambda s, t: ('INTEGER', int(t))),         # Integers
        (r'[+\-*/=<>!]+', lambda s, t: ('OPERATOR', t)),        # Operators
        (r'[(){}[\]:;,.]', lambda s, t: ('DELIMITER', t)),      # Delimiters (':' included)
        (r'"[^"]*"', lambda s, t: ('STRING', t[1:-1])),         # String literals
        (r'\s+', None),  # Skip whitespace
    ]
    
    scanner = regex.Scanner(lexicon, regex.MULTILINE)
    tokens, remainder = scanner.scan(code)
    
    if remainder:
        print(f"Warning: Could not tokenize: '{remainder}'")
    
    return tokens

# Example usage
code = '''
def hello(name):
    # Print greeting
    print("Hello, " + name)
    return 42
'''

tokens = tokenize_code(code)
for token in tokens:
    print(token)

RegexFlag Enumeration

Enumeration of regex flags built on enum.IntFlag, giving type-safe values that combine cleanly with the bitwise-or operator.

class RegexFlag(enum.IntFlag):
    """Enumeration of regex flags with proper combination support."""
    
    # Standard flags
    ASCII = A = 0x80
    IGNORECASE = I = 0x2
    LOCALE = L = 0x4
    MULTILINE = M = 0x8
    DOTALL = S = 0x10
    VERBOSE = X = 0x40
    UNICODE = U = 0x20
    
    # Enhanced flags
    BESTMATCH = B = 0x1000
    DEBUG = D = 0x200
    ENHANCEMATCH = E = 0x8000
    FULLCASE = F = 0x4000
    POSIX = P = 0x10000
    REVERSE = R = 0x400
    TEMPLATE = T = 0x1
    WORD = W = 0x800
    
    # Version flags
    VERSION0 = V0 = 0x2000
    VERSION1 = V1 = 0x100

Usage Examples:

import regex
from regex import RegexFlag

# Using flag enumeration
flags = RegexFlag.IGNORECASE | RegexFlag.MULTILINE
pattern = regex.compile(r'^hello.*world$', flags)

# Check flag combinations
combined_flags = RegexFlag.IGNORECASE | RegexFlag.DOTALL | RegexFlag.VERBOSE
print(f"Combined flags value: {combined_flags}")

# Test flag presence
if RegexFlag.IGNORECASE in combined_flags:
    print("Case-insensitive matching enabled")

# Enhanced flags
fuzzy_flags = RegexFlag.BESTMATCH | RegexFlag.ENHANCEMATCH
pattern = regex.compile(r'(?e)(search){e<=2}', fuzzy_flags)

# Version-specific flags
v1_flags = RegexFlag.VERSION1 | RegexFlag.IGNORECASE | RegexFlag.FULLCASE
pattern = regex.compile(r'unicode', v1_flags)

# All flag names and values
print("Available flags:")
for flag in RegexFlag:
    print(f"{flag.name}: {flag.value} (0x{flag.value:x})")

Advanced Usage Patterns

Pattern Object Reuse

import regex

# Efficient pattern reuse
class TextProcessor:
    def __init__(self):
        # Pre-compile frequently used patterns
        self.email_pattern = regex.compile(r'\b[\w.-]+@[\w.-]+\.\w+\b')
        self.phone_pattern = regex.compile(r'\b\d{3}-\d{3}-\d{4}\b')
        self.url_pattern = regex.compile(r'https?://[^\s]+')
    
    def extract_contacts(self, text):
        return {
            'emails': self.email_pattern.findall(text),
            'phones': self.phone_pattern.findall(text),
            'urls': self.url_pattern.findall(text)
        }
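
# Example use of the processor above (sample strings are illustrative)
processor = TextProcessor()
info = processor.extract_contacts('Mail admin@site.org or call 555-123-4567')
print(info['emails'])  # ['admin@site.org']
print(info['phones'])  # ['555-123-4567']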

Match Object Chaining

import regex

def process_structured_data(text):
    # Chain match operations
    date_pattern = regex.compile(r'(\d{4})-(\d{2})-(\d{2})')
    
    results = []
    for match in date_pattern.finditer(text):
        # Extract date components
        year, month, day = match.groups()
        
        # Use match position to get context
        start, end = match.span()
        context_start = max(0, start - 20)
        context_end = min(len(text), end + 20)
        context = text[context_start:context_end]
        
        results.append({
            'date': f"{year}-{month}-{day}",
            'position': (start, end),
            'context': context.strip()
        })
    
    return results

Scanner State Management

class AdvancedScanner:
    def __init__(self):
        self.context_stack = []
        self.current_context = 'normal'
        
    def enter_context(self, scanner, token):
        self.context_stack.append(self.current_context)
        self.current_context = 'special'
        return ('CONTEXT_ENTER', token)
        
    def exit_context(self, scanner, token):
        if self.context_stack:
            self.current_context = self.context_stack.pop()
        return ('CONTEXT_EXIT', token)
        
    def process_token(self, scanner, token):
        return (f'{self.current_context.upper()}_TOKEN', token)
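
# Driving the context-tracking scanner above (an illustrative sketch;
# the bracket-based lexicon is an assumption, not a fixed API)
import regex

adv = AdvancedScanner()
context_lexicon = [
    (r'\[', adv.enter_context),
    (r'\]', adv.exit_context),
    (r'[^\[\]\s]+', adv.process_token),
    (r'\s+', None),  # Skip whitespace
]
context_scanner = regex.Scanner(context_lexicon)
tokens, _ = context_scanner.scan('plain [special words] plain')
print(tokens)
# [('NORMAL_TOKEN', 'plain'), ('CONTEXT_ENTER', '['),
#  ('SPECIAL_TOKEN', 'special'), ('SPECIAL_TOKEN', 'words'),
#  ('CONTEXT_EXIT', ']'), ('NORMAL_TOKEN', 'plain')]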

Install with Tessl CLI

npx tessl i tessl/pypi-regex
