RE2 - A regular expression library with linear time guarantees
—
Functions for splitting text and performing substitutions using regular expressions. These operations are fundamental for text processing, data cleaning, and string manipulation tasks.
Splits text into a list using a regular expression pattern as the delimiter, with optional control over the maximum number of splits.
def split(pattern, text, maxsplit=0, options=None):
"""
Split text by pattern occurrences.
Args:
pattern (str): Regular expression pattern used as delimiter
text (str): Input text to split
maxsplit (int): Maximum number of splits (0 = no limit)
options (Options, optional): Compilation options
Returns:
list: List of text segments
"""
Example usage:
import re2
# Split on whitespace
text = "apple banana cherry"
parts = re2.split(r'\s+', text)
print(parts) # ['apple', 'banana', 'cherry']
# Split with limit
text = "one,two,three,four"
parts = re2.split(r',', text, maxsplit=2)
print(parts) # ['one', 'two', 'three,four']
# Split capturing delimiter
text = "word1,word2;word3"
parts = re2.split(r'([,;])', text)
print(parts) # ['word1', ',', 'word2', ';', 'word3']
Replaces occurrences of a pattern with a replacement string, with optional control over the number of replacements.
def sub(pattern, repl, text, count=0, options=None):
"""
Replace pattern occurrences with replacement string.
Args:
pattern (str): Regular expression pattern to match
repl (str or callable): Replacement string or function
text (str): Input text to process
count (int): Maximum number of replacements (0 = all)
options (Options, optional): Compilation options
Returns:
str: Text with replacements made
"""
Example usage:
import re2
# Simple replacement
text = "Hello world"
result = re2.sub(r'world', 'universe', text)
print(result) # "Hello universe"
# Replace with group references
text = "John Smith, Jane Doe"
result = re2.sub(r'(\w+) (\w+)', r'\2, \1', text)
print(result) # "Smith, John, Doe, Jane"
# Limited replacements
text = "foo foo foo"
result = re2.sub(r'foo', 'bar', text, count=2)
print(result) # "bar bar foo"
# Using callable replacement
def upper_match(match):
    return match.group().upper()
text = "hello world"
result = re2.sub(r'\w+', upper_match, text)
print(result) # "HELLO WORLD"
Performs substitution like sub() but returns both the modified text and the number of substitutions made.
def subn(pattern, repl, text, count=0, options=None):
"""
Replace pattern occurrences and return (result, count).
Args:
pattern (str): Regular expression pattern to match
repl (str or callable): Replacement string or function
text (str): Input text to process
count (int): Maximum number of replacements (0 = all)
options (Options, optional): Compilation options
Returns:
tuple: (modified_text, substitution_count)
"""
Example usage:
import re2
# Get substitution count
text = "The quick brown fox jumps over the lazy dog"
result, num_subs = re2.subn(r'\b\w{4}\b', 'WORD', text)
print(result) # "The quick brown fox jumps WORD the WORD dog"
print(num_subs) # 2
# Check if any substitutions were made
text = "no matches here"
result, num_subs = re2.subn(r'\d+', 'NUMBER', text)
if num_subs == 0:
    print("No changes made")
Additional text processing utilities for escaping special characters and managing the compiled pattern cache.
def escape(pattern):
"""
Escape special regex characters in pattern.
Args:
pattern (str): String to escape
Returns:
str: Pattern with special characters escaped
"""
def purge():
"""
Clear the compiled regular expression cache.
This function clears the internal LRU cache that stores
compiled pattern objects for better performance.
"""
Example usage:
import re2
# Escape special characters
literal_text = "Price: $19.99 (20% off)"
escaped = re2.escape(literal_text)
# RE2 escapes every character except ASCII letters, digits and underscore
print(escaped) # "Price\:\ \$19\.99\ \(20\%\ off\)"
# Use escaped text as literal pattern
text = "Item costs $19.99 (20% off) today"
match = re2.search(escaped, text)
print(match is not None) # True
# Clear pattern cache (useful for memory management)
re2.purge()
When using compiled pattern objects, text processing methods are available as instance methods:
class _Regexp:
"""Compiled regular expression pattern object."""
def split(self, text, maxsplit=0):
"""Split text using this pattern as delimiter."""
def sub(self, repl, text, count=0):
"""Replace matches with replacement string."""
def subn(self, repl, text, count=0):
"""Replace matches and return (result, count)."""
Example usage:
import re2
# Compile pattern once, use multiple times
pattern = re2.compile(r'[,;]\s*')
text1 = "apple, banana; cherry"
text2 = "red,green;blue"
# Split multiple texts with same pattern
parts1 = pattern.split(text1) # ['apple', 'banana', 'cherry']
parts2 = pattern.split(text2) # ['red', 'green', 'blue']
# Replace using compiled pattern
result = pattern.sub(' | ', text1) # "apple | banana | cherry"
Install with Tessl CLI
npx tessl i tessl/pypi-google-re2