Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.
—
Pattern-based string splitting capabilities that provide enhanced control over text segmentation operations. These functions support maximum split limits, concurrent execution, timeout handling, and memory-efficient iterator-based processing for large texts.
Split a string by pattern occurrences, returning a list of substrings with enhanced control over the splitting operation.
def split(pattern, string, maxsplit=0, flags=0, concurrent=None,
timeout=None, ignore_unused=False, **kwargs):
"""
Split string by pattern occurrences, returning a list containing the resulting substrings.
Args:
pattern (str): Regular expression pattern to split on
string (str): String to split
maxsplit (int, optional): Maximum number of splits (0 = no limit)
flags (int, optional): Regex flags to modify matching behavior
concurrent (bool, optional): Release GIL during matching for multithreading
timeout (float, optional): Timeout in seconds for matching operation
ignore_unused (bool, optional): Ignore unused keyword arguments
**kwargs: Additional pattern compilation arguments
Returns:
list: List of substrings after splitting
"""Usage Examples:
import regex
# Basic splitting on whitespace
result = regex.split(r'\s+', 'one two\tthree\nfour')
print(result) # ['one', 'two', 'three', 'four']
# Split with maximum splits
result = regex.split(r',\s*', 'apple, banana, cherry, date', maxsplit=2)
print(result) # ['apple', 'banana', 'cherry, date']
# Split on multiple delimiters
result = regex.split(r'[,;:|]+', 'data,separated;by:various|delimiters')
print(result) # ['data', 'separated', 'by', 'various', 'delimiters']
# Split preserving capture groups
result = regex.split(r'(\s+)', 'one two three')
print(result) # ['one', ' ', 'two', ' ', 'three']
# Split on word boundaries
result = regex.split(r'\b', 'hello-world test')
print(result) # ['', 'hello', '-', 'world', ' ', 'test', '']
# Case-insensitive splitting
result = regex.split(r'and', 'cats AND dogs and birds', flags=regex.IGNORECASE)
print(result) # ['cats ', ' dogs ', ' birds']

Return an iterator yielding split string parts, providing memory-efficient processing for large texts or when you need to process splits incrementally.
def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
timeout=None, ignore_unused=False, **kwargs):
"""
Return an iterator yielding the parts of a split string.
Args:
pattern (str): Regular expression pattern to split on
string (str): String to split
maxsplit (int, optional): Maximum number of splits (0 = no limit)
flags (int, optional): Regex flags to modify matching behavior
concurrent (bool, optional): Release GIL during matching for multithreading
timeout (float, optional): Timeout in seconds for matching operation
ignore_unused (bool, optional): Ignore unused keyword arguments
**kwargs: Additional pattern compilation arguments
Returns:
iterator: Iterator yielding string parts
"""Usage Examples:
import regex
# Memory-efficient splitting of large text
def process_large_file(filename):
with open(filename, 'r') as f:
content = f.read()
# Process one paragraph at a time without loading all splits into memory
for paragraph in regex.splititer(r'\n\s*\n', content):
if paragraph.strip(): # Skip empty paragraphs
yield process_paragraph(paragraph)
# Iterator over sentence splits
text = 'First sentence. Second sentence! Third sentence?'
for i, sentence in enumerate(regex.splititer(r'[.!?]+\s*', text)):
if sentence.strip():
print(f"Sentence {i+1}: {sentence.strip()}")
# Lazy evaluation with maximum splits
text = 'a,b,c,d,e,f,g,h,i,j'
splits = regex.splititer(r',', text, maxsplit=3)
for i, part in enumerate(splits):
print(f"Part {i}: {part}")
if i >= 2: # Process only first few parts
break
# Generator for processing CSV-like data
def parse_csv_line(line):
for field in regex.splititer(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', line):
yield field.strip().strip('"')
line = 'name,"description, with comma",price,quantity'
fields = list(parse_csv_line(line))
print(fields) # ['name', 'description, with comma', 'price', 'quantity']

When capture groups are present in the pattern, they are included in the result:
# Include delimiters in result
result = regex.split(r'(\s+)', 'word1 word2\tword3')
print(result) # ['word1', ' ', 'word2', '\t', 'word3']
# Multiple capture groups
result = regex.split(r'(\d+)([a-z]+)', 'abc123def456ghi')
print(result) # ['abc', '123', 'def', '', '456', 'ghi', ''] (note: an empty string appears between adjacent matches and at the end)
# Named capture groups
result = regex.split(r'(?P<num>\d+)(?P<sep>[,-])', 'item1,item2-item3')
print(result) # ['item', '1', ',', 'item', '2', '-', 'item3']

Understanding how empty strings are handled in splits:
# Leading/trailing delimiters create empty strings
result = regex.split(r',', ',a,b,c,')
print(result) # ['', 'a', 'b', 'c', '']
# Consecutive delimiters
result = regex.split(r',+', 'a,,b,,,c')
print(result) # ['a', 'b', 'c']
# Filter empty strings if needed
result = [s for s in regex.split(r',', ',a,,b,c,') if s]
print(result) # ['a', 'b', 'c']

Advanced splitting patterns for specific use cases:
# Split on balanced parentheses
def split_balanced_parens(text):
# This is a simplified example - full balanced parentheses require recursive patterns
return regex.split(r'\([^)]*\)', text)
# Split preserving quoted strings
result = regex.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', 'a,"b,c",d')
print(result) # ['a', '"b,c"', 'd']
# Split on word boundaries but preserve certain characters
result = regex.split(r'(?<=\w)(?=\W)|(?<=\W)(?=\w)', 'hello-world.test')
print(result) # ['hello', '-', 'world', '.', 'test']

# Use concurrent execution for large texts
large_text = open('large_file.txt').read()
result = regex.split(r'\n', large_text, concurrent=True)
# Set timeout for complex patterns
try:
result = regex.split(complex_pattern, text, timeout=5.0)
except regex.error as e:
print(f"Split operation timed out: {e}")
# Use iterator for memory efficiency
def count_paragraphs(text):
count = 0
for paragraph in regex.splititer(r'\n\s*\n', text):
if paragraph.strip():
count += 1
return count

Use the REVERSE flag for right-to-left splitting:
# Split from right to left with maximum splits
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1, flags=regex.REVERSE)
print(result) # ['path.to.file', 'ext']
# Compare with normal left-to-right splitting
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1)
print(result) # ['path', 'to.file.ext']

# Unicode-aware word boundary splitting
result = regex.split(r'\b', 'hello мир world', flags=regex.UNICODE)
print(result) # Properly handles Unicode word boundaries
# Locale-aware character class splitting
result = regex.split(r'[[:space:]]+', 'word1\xa0word2\u2000word3', flags=regex.LOCALE)
print(result) # Handles locale-specific whitespace characters

Install with Tessl CLI
npx tessl i tessl/pypi-regex