tessl/pypi-google-re2

RE2 - A regular expression library with linear time guarantees

—

Pending

Overview

Eval results

Files

Advanced Features

Name: tessl/pypi-google-re2
Author: tessl

Specialized functionality for high-performance scenarios including pattern sets for matching multiple patterns simultaneously and filtered matching for optimized multi-pattern operations. These features are designed for applications that need to match against many patterns efficiently.

Capabilities

Pattern Sets

Pattern sets allow efficient matching of text against multiple regular expressions simultaneously, returning which patterns matched.

class Set:
    """Collection of patterns that can be matched simultaneously.

    Typical use: obtain a set (directly, or through one of the
    ``SearchSet`` / ``MatchSet`` / ``FullMatchSet`` constructors), ``Add``
    every pattern, ``Compile`` once, then call ``Match`` as often as needed.
    """

    def __init__(self, anchor, options=None):
        """
        Create a new pattern set.

        Args:
            anchor: Anchoring mode for matches
            options (Options, optional): Compilation options
        """

    def Add(self, pattern):
        """
        Add a pattern to the set.

        Patterns must be added before the set is compiled.

        Args:
            pattern (str): Regular expression pattern to add

        Returns:
            int: Pattern index in the set

        Raises:
            error: If pattern is invalid or set is already compiled
        """

    def Compile(self):
        """
        Compile all patterns in the set for matching.

        Returns:
            bool: True if compilation successful

        Raises:
            error: If compilation fails
        """

    def Match(self, text):
        """
        Match text against all patterns in the set.

        Args:
            text (str): Text to match against patterns

        Returns:
            list: List of pattern indices that matched

        NOTE(review): the upstream wrapper may return None instead of an
        empty list when nothing matches - confirm before relying on ``in``.
        """

    # The alternate constructors below take ``cls`` as their first parameter.
    # Without it, ``@classmethod`` binds the class to ``options`` and any
    # explicitly passed options raise TypeError.

    @classmethod
    def SearchSet(cls, options=None):
        """
        Create a set for searching (unanchored matching).

        Args:
            options (Options, optional): Compilation options

        Returns:
            Set: New set configured for searching
        """

    @classmethod
    def MatchSet(cls, options=None):
        """
        Create a set for matching at start of text.

        Args:
            options (Options, optional): Compilation options

        Returns:
            Set: New set configured for start matching
        """

    @classmethod
    def FullMatchSet(cls, options=None):
        """
        Create a set for full text matching.

        Args:
            options (Options, optional): Compilation options

        Returns:
            Set: New set configured for full matching
        """

Example usage:

import re2

# Create a search set (unanchored: a pattern may match anywhere in the text)
pattern_set = re2.Set.SearchSet()

# Add multiple patterns. Add() returns the index that identifies the pattern
# in the result of Match(), so keep it for the checks below.
# The TLD class is [A-Za-z]; writing [A-Z|a-z] would also accept a literal '|'.
email_idx = pattern_set.Add(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
phone_idx = pattern_set.Add(r'\b\d{3}-\d{3}-\d{4}\b')
url_idx = pattern_set.Add(r'https?://[^\s]+')

# Compile the set (required once, after all patterns have been added)
pattern_set.Compile()

# Match against text.
# `or []` keeps the membership tests below safe if Match() hands back an
# empty/None result when nothing matches.
text = "Contact: john@example.com or call 555-123-4567"
matches = pattern_set.Match(text) or []

if email_idx in matches:
    print("Found email address")
if phone_idx in matches:
    print("Found phone number")
if url_idx in matches:
    print("Found URL")

Filtered Matching

Filtered matching provides optimized multi-pattern matching with prefiltering for high-performance scenarios.

class Filter:
    """Optimized multi-pattern matcher with prefiltering.

    Usage mirrors the examples in this document: ``Add`` each pattern,
    ``Compile`` once, then ``Match`` text; ``re(index)`` gives access to the
    individual compiled pattern for detailed matching.
    """

    def __init__(self):
        """Create a new, empty filtered matcher."""

    def Add(self, pattern, options=None):
        """
        Add a pattern to the filter.

        Args:
            pattern (str): Regular expression pattern
            options (Options, optional): Compilation options for this pattern

        Returns:
            int: Pattern index in the filter; the same index appears in the
                result of ``Match`` and is accepted by ``re``

        Raises:
            error: If pattern is invalid
        """

    def Compile(self):
        """
        Compile all patterns for filtered matching.

        This prepares the filter for high-performance matching
        by analyzing patterns and building prefilter structures.

        Returns:
            bool: True if compilation successful
        """

    def Match(self, text, potential=False):
        """
        Match text against all patterns.

        Args:
            text (str): Text to match
            potential (bool): If True, return potential matches for two-phase
                matching; the caller is then expected to confirm each
                candidate, e.g. with ``re(index).search(text)``

        Returns:
            list: List of pattern indices that matched

        NOTE(review): the upstream wrapper may return None instead of an
        empty list when nothing matches - confirm before iterating blindly.
        """

    def re(self, index):
        """
        Get the compiled RE2 object for a specific pattern.

        Args:
            index (int): Pattern index, as returned by ``Add``

        Returns:
            _Regexp: Compiled pattern object
        """

Example usage:

import re2

# Create filtered matcher
filter_matcher = re2.Filter()

# Add patterns for different data types.
# The email TLD class is [A-Za-z]; [A-Z|a-z] would also accept a literal '|'.
patterns = [
    r'\b\d{3}-\d{2}-\d{4}\b',           # SSN
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b',  # Email
    r'\b\d{4}-\d{4}-\d{4}-\d{4}\b',     # Credit card
    r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\b',  # IP address
]
# Human-readable labels, in the same order as `patterns`.
data_types = ['SSN', 'Email', 'Credit Card', 'IP Address']

# Add() returns the index the filter assigned to each pattern.
pattern_indices = [filter_matcher.Add(pattern) for pattern in patterns]

# Map each assigned index straight to its label so reporting a match is a
# single dict lookup instead of a list scan per match.
label_by_index = dict(zip(pattern_indices, data_types))

# Compile for optimized matching
filter_matcher.Compile()

# Match large text efficiently
large_text = """
John's email is john@example.com and his SSN is 123-45-6789.
The server IP is 192.168.1.100 and payment was made with card 1234-5678-9012-3456.
"""

matches = filter_matcher.Match(large_text)

# `or []` keeps the loop safe if Match() yields an empty/None result.
for match_idx in matches or []:
    print(f"Found {label_by_index[match_idx]}")

    # Get specific pattern for detailed matching
    specific_pattern = filter_matcher.re(match_idx)
    match_obj = specific_pattern.search(large_text)
    if match_obj:
        print(f"  Value: {match_obj.group()}")

Two-Phase Matching

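For very high-throughput scenarios, use two-phase matching: first call Match(text, potential=True) to get the patterns that might match, then confirm each candidate with a full search on its compiled pattern: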

import re2

# Set up filter for two-phase matching
filter_matcher = re2.Filter()

# Add many patterns.
# The email TLD class is [A-Za-z]; [A-Z|a-z] would also accept a literal '|'.
sensitive_patterns = [
    r'\b\d{3}-\d{2}-\d{4}\b',      # SSN
    r'\b\d{4}-\d{4}-\d{4}-\d{4}\b', # Credit card
    r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', # Email
    # ... many more patterns
]

for pattern in sensitive_patterns:
    filter_matcher.Add(pattern)

# Compile once, after every pattern has been added.
filter_matcher.Compile()

def scan_text_efficiently(text):
    """Return the indices of the filter patterns that really occur in *text*.

    Runs the two-phase scheme against the module-level ``filter_matcher``:
    a prefilter pass selects candidate patterns, then each candidate is
    confirmed with a full search.
    """
    # Phase 1: cheap prefilter pass yielding candidate pattern indices.
    candidates = filter_matcher.Match(text, potential=True)
    if not candidates:
        # Nothing can match, so phase 2 is skipped entirely.
        return []

    # Phase 2: confirm each candidate with its own compiled pattern.
    return [
        candidate
        for candidate in candidates
        if filter_matcher.re(candidate).search(text)
    ]

# Run the two-phase scan over a batch of documents and report the hits.
texts_to_scan = [
    "Document 1 with email@example.com",
    "Document 2 with SSN 123-45-6789",
    "Document 3 with no sensitive data",
    # ... thousands of documents
]

for i, text in enumerate(texts_to_scan):
    matches = scan_text_efficiently(text)
    if not matches:
        # Clean document: nothing to report.
        continue
    print(f"Document {i+1} contains sensitive data (patterns: {matches})")

Anchor Modes for Sets

import re2

# Different anchor modes for pattern sets.
# The same text is matched three times; `matches` is overwritten by each call,
# so inspect it right after the call you are interested in.
text = "email@example.com is my address"

# Search set (unanchored) - finds patterns anywhere in the text
search_set = re2.Set.SearchSet()
search_set.Add(r'email@\w+\.com')
search_set.Compile()
matches = search_set.Match(text)  # Will find the email

# Match set (anchored at start) - requires pattern at beginning of the text
match_set = re2.Set.MatchSet()
match_set.Add(r'email@\w+\.com')
match_set.Compile()
matches = match_set.Match(text)  # Will find the email (it's at start)

# Full match set - requires pattern to match entire text, hence the longer
# pattern that also covers " is my address"
full_set = re2.Set.FullMatchSet()
full_set.Add(r'email@\w+\.com is my address')
full_set.Compile()
matches = full_set.Match(text)  # Will match (pattern matches entire text)

Performance Considerations

import re2

# For maximum performance with many patterns:

# 1. Use Filter for better prefiltering
filter_matcher = re2.Filter()

# 2. Use performance-optimized options
options = re2.Options()
options.never_capture = True    # Disable capturing if not needed
# NOTE(review): never_nl presumably changes matching semantics (newlines are
# never matched) rather than being a pure optimization - confirm before enabling.
options.never_nl = True        # Optimize newline handling
options.max_mem = 67108864     # Increase memory limit if needed (64 MiB)

# 3. Add patterns with optimized options.
# `large_pattern_list` is not defined in this snippet: supply your own
# iterable of regular expression strings.
for pattern in large_pattern_list:
    filter_matcher.Add(pattern, options)

# Compile once, after every pattern has been added.
filter_matcher.Compile()

# 4. Use two-phase matching for large texts
def efficient_scan(text):
    """Return indices of the ``filter_matcher`` patterns found in *text*."""
    candidates = filter_matcher.Match(text, potential=True)
    if candidates:
        # Pay for a full search only on patterns the prefilter let through.
        confirmed = [
            candidate
            for candidate in candidates
            if filter_matcher.re(candidate).search(text)
        ]
        return confirmed
    return []

Error Handling for Advanced Features

import re2

# Handle compilation errors.
# Both Add() and Compile() sit inside the try block, so a failure in either
# one is reported by the same handler; for an invalid pattern the error is
# presumably raised by Add(), before Compile() is reached.
try:
    pattern_set = re2.Set.SearchSet()
    pattern_set.Add(r'[invalid')  # Invalid pattern
    pattern_set.Compile()
except re2.error as e:
    print(f"Set compilation failed: {e}")

# Handle filter errors (same structure as for Set)
try:
    filter_matcher = re2.Filter()
    filter_matcher.Add(r'(?P<invalid')  # Invalid named group
    filter_matcher.Compile()
except re2.error as e:
    print(f"Filter compilation failed: {e}")

# Add a pattern without letting an invalid one abort the caller
def safe_add_to_set(pattern_set, pattern):
    """Add *pattern* to *pattern_set*; return its index, or None if rejected.

    A rejected pattern (``re2.error``) is reported on stdout and skipped.
    """
    try:
        index = pattern_set.Add(pattern)
    except re2.error:
        print(f"Skipping invalid pattern: {pattern}")
        index = None
    return index

Install with Tessl CLI