Alternative regular expression module providing enhanced pattern matching, fuzzy matching, and advanced Unicode support as a replacement for Python's re module.
—
Pattern-based string splitting capabilities that provide enhanced control over text segmentation operations. These functions support maximum split limits, concurrent execution, timeout handling, and memory-efficient iterator-based processing for large texts.
Split a string by pattern occurrences, returning a list of substrings with enhanced control over the splitting operation.
def split(pattern, string, maxsplit=0, flags=0, concurrent=None,
timeout=None, ignore_unused=False, **kwargs):
"""
Split string by pattern occurrences, returning a list containing the resulting substrings.
Args:
pattern (str): Regular expression pattern to split on
string (str): String to split
maxsplit (int, optional): Maximum number of splits (0 = no limit)
flags (int, optional): Regex flags to modify matching behavior
concurrent (bool, optional): Release GIL during matching for multithreading
timeout (float, optional): Timeout in seconds for matching operation
ignore_unused (bool, optional): Ignore unused keyword arguments
**kwargs: Additional pattern compilation arguments
Returns:
list: List of substrings after splitting
"""Usage Examples:
import regex
# Basic splitting on whitespace
result = regex.split(r'\s+', 'one two\tthree\nfour')
print(result) # ['one', 'two', 'three', 'four']
# Split with maximum splits
result = regex.split(r',\s*', 'apple, banana, cherry, date', maxsplit=2)
print(result) # ['apple', 'banana', 'cherry, date']
# Split on multiple delimiters
result = regex.split(r'[,;:|]+', 'data,separated;by:various|delimiters')
print(result) # ['data', 'separated', 'by', 'various', 'delimiters']
# Split preserving capture groups
result = regex.split(r'(\s+)', 'one two three')
print(result) # ['one', ' ', 'two', ' ', 'three']
# Split on word boundaries
result = regex.split(r'\b', 'hello-world test')
print(result) # ['', 'hello', '-', 'world', ' ', 'test', '']
# Case-insensitive splitting
result = regex.split(r'and', 'cats AND dogs and birds', flags=regex.IGNORECASE)
print(result) # ['cats ', ' dogs ', ' birds']

Return an iterator yielding split string parts, providing memory-efficient processing for large texts or when you need to process splits incrementally.
def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
timeout=None, ignore_unused=False, **kwargs):
"""
Return an iterator yielding the parts of a split string.
Args:
pattern (str): Regular expression pattern to split on
string (str): String to split
maxsplit (int, optional): Maximum number of splits (0 = no limit)
flags (int, optional): Regex flags to modify matching behavior
concurrent (bool, optional): Release GIL during matching for multithreading
timeout (float, optional): Timeout in seconds for matching operation
ignore_unused (bool, optional): Ignore unused keyword arguments
**kwargs: Additional pattern compilation arguments
Returns:
iterator: Iterator yielding string parts
"""Usage Examples:
import regex
# Memory-efficient splitting of large text
def process_large_file(filename):
with open(filename, 'r') as f:
content = f.read()
# Process one paragraph at a time without loading all splits into memory
for paragraph in regex.splititer(r'\n\s*\n', content):
if paragraph.strip(): # Skip empty paragraphs
yield process_paragraph(paragraph)
# Iterator over sentence splits
text = 'First sentence. Second sentence! Third sentence?'
for i, sentence in enumerate(regex.splititer(r'[.!?]+\s*', text)):
if sentence.strip():
print(f"Sentence {i+1}: {sentence.strip()}")
# Lazy evaluation with maximum splits
text = 'a,b,c,d,e,f,g,h,i,j'
splits = regex.splititer(r',', text, maxsplit=3)
for i, part in enumerate(splits):
print(f"Part {i}: {part}")
if i >= 2: # Process only first few parts
break
# Generator for processing CSV-like data
def parse_csv_line(line):
for field in regex.splititer(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', line):
yield field.strip().strip('"')
line = 'name,"description, with comma",price,quantity'
fields = list(parse_csv_line(line))
print(fields) # ['name', 'description, with comma', 'price', 'quantity']

When capture groups are present in the pattern, they are included in the result:
# Include delimiters in result
result = regex.split(r'(\s+)', 'word1 word2\tword3')
print(result) # ['word1', ' ', 'word2', '\t', 'word3']
# Multiple capture groups
result = regex.split(r'(\d+)([a-z]+)', 'abc123def456ghi')
print(result) # ['abc', '123', 'def', '', '456', 'ghi', ''] (note: an empty string appears between adjacent matches and at the end)
# Named capture groups
result = regex.split(r'(?P<num>\d+)(?P<sep>[,-])', 'item1,item2-item3')
print(result) # ['item', '1', ',', 'item', '2', '-', 'item3']

Understanding how empty strings are handled in splits:
# Leading/trailing delimiters create empty strings
result = regex.split(r',', ',a,b,c,')
print(result) # ['', 'a', 'b', 'c', '']
# Consecutive delimiters
result = regex.split(r',+', 'a,,b,,,c')
print(result) # ['a', 'b', 'c']
# Filter empty strings if needed
result = [s for s in regex.split(r',', ',a,,b,c,') if s]
print(result) # ['a', 'b', 'c']

Advanced splitting patterns for specific use cases:
# Split on balanced parentheses
def split_balanced_parens(text):
# This is a simplified example - full balanced parentheses require recursive patterns
return regex.split(r'\([^)]*\)', text)
# Split preserving quoted strings
result = regex.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', 'a,"b,c",d')
print(result) # ['a', '"b,c"', 'd']
# Split on word boundaries but preserve certain characters
result = regex.split(r'(?<=\w)(?=\W)|(?<=\W)(?=\w)', 'hello-world.test')
print(result) # ['hello', '-', 'world', '.', 'test']

# Use concurrent execution for large texts
large_text = open('large_file.txt').read()
result = regex.split(r'\n', large_text, concurrent=True)
# Set timeout for complex patterns
try:
result = regex.split(complex_pattern, text, timeout=5.0)
except regex.error as e:
print(f"Split operation timed out: {e}")
# Use iterator for memory efficiency
def count_paragraphs(text):
count = 0
for paragraph in regex.splititer(r'\n\s*\n', text):
if paragraph.strip():
count += 1
return count

Use the REVERSE flag for right-to-left splitting:
# Split from right to left with maximum splits
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1, flags=regex.REVERSE)
print(result) # ['path.to.file', 'ext']
# Compare with normal left-to-right splitting
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1)
print(result) # ['path', 'to.file.ext']

# Unicode-aware word boundary splitting
result = regex.split(r'\b', 'hello мир world', flags=regex.UNICODE)
print(result) # Properly handles Unicode word boundaries
# Locale-aware character class splitting
result = regex.split(r'[[:space:]]+', 'word1\xa0word2\u2000word3', flags=regex.LOCALE)
print(result) # Handles locale-specific whitespace characters

Install with Tessl CLI
npx tessl i tessl/pypi-regex