# String Splitting Functions

Pattern-based string splitting capabilities that provide enhanced control over text segmentation operations. These functions support maximum split limits, concurrent execution, timeout handling, and memory-efficient iterator-based processing for large texts.

## Capabilities

### Pattern-Based String Splitting

Split a string by pattern occurrences, returning a list of substrings with enhanced control over the splitting operation.

```python { .api }
def split(pattern, string, maxsplit=0, flags=0, concurrent=None,
          timeout=None, ignore_unused=False, **kwargs):
    """
    Split string by pattern occurrences, returning a list containing the resulting substrings.

    Args:
        pattern (str): Regular expression pattern to split on
        string (str): String to split
        maxsplit (int, optional): Maximum number of splits (0 = no limit)
        flags (int, optional): Regex flags to modify matching behavior
        concurrent (bool, optional): Release the GIL during matching for multithreading
        timeout (float, optional): Timeout in seconds for the matching operation
        ignore_unused (bool, optional): Ignore unused keyword arguments
        **kwargs: Additional pattern compilation arguments

    Returns:
        list: List of substrings after splitting
    """
```
**Usage Examples:**

```python
import regex

# Basic splitting on whitespace
result = regex.split(r'\s+', 'one two\tthree\nfour')
print(result)  # ['one', 'two', 'three', 'four']

# Split with maximum splits
result = regex.split(r',\s*', 'apple, banana, cherry, date', maxsplit=2)
print(result)  # ['apple', 'banana', 'cherry, date']

# Split on multiple delimiters
result = regex.split(r'[,;:|]+', 'data,separated;by:various|delimiters')
print(result)  # ['data', 'separated', 'by', 'various', 'delimiters']

# Split preserving capture groups
result = regex.split(r'(\s+)', 'one two three')
print(result)  # ['one', ' ', 'two', ' ', 'three']

# Split on word boundaries (splitting on zero-width matches
# requires VERSION1 behaviour, enabled with the V1 flag)
result = regex.split(r'\b', 'hello-world test', flags=regex.V1)
print(result)  # ['', 'hello', '-', 'world', ' ', 'test', '']

# Case-insensitive splitting
result = regex.split(r'and', 'cats AND dogs and birds', flags=regex.IGNORECASE)
print(result)  # ['cats ', ' dogs ', ' birds']
```
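When the same pattern is applied to many inputs, it can be compiled once with `regex.compile` and its `split` method reused, avoiding re-parsing the pattern on every call. A minimal sketch:

```python
import regex

# Compile once, reuse for many inputs
whitespace = regex.compile(r'\s+')

lines = ['one two', 'three\tfour', 'five\nsix']
tokens = [whitespace.split(line) for line in lines]
print(tokens)  # [['one', 'two'], ['three', 'four'], ['five', 'six']]
```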
60
61
### Iterator-Based String Splitting
62
63
Return an iterator yielding split string parts, providing memory-efficient processing for large texts or when you need to process splits incrementally.
64
65
```python { .api }
66
def splititer(pattern, string, maxsplit=0, flags=0, concurrent=None,
67
timeout=None, ignore_unused=False, **kwargs):
68
"""
69
Return an iterator yielding the parts of a split string.
70
71
Args:
72
pattern (str): Regular expression pattern to split on
73
string (str): String to split
74
maxsplit (int, optional): Maximum number of splits (0 = no limit)
75
flags (int, optional): Regex flags to modify matching behavior
76
concurrent (bool, optional): Release GIL during matching for multithreading
77
timeout (float, optional): Timeout in seconds for matching operation
78
ignore_unused (bool, optional): Ignore unused keyword arguments
79
**kwargs: Additional pattern compilation arguments
80
81
Returns:
82
iterator: Iterator yielding string parts
83
"""
84
```
**Usage Examples:**

```python
import regex

# Memory-efficient splitting of large text
def process_large_file(filename):
    with open(filename, 'r') as f:
        content = f.read()

    # Process one paragraph at a time without building a list of all splits
    for paragraph in regex.splititer(r'\n\s*\n', content):
        if paragraph.strip():  # Skip empty paragraphs
            yield process_paragraph(paragraph)

# Iterate over sentence splits
text = 'First sentence. Second sentence! Third sentence?'
for i, sentence in enumerate(regex.splititer(r'[.!?]+\s*', text)):
    if sentence.strip():
        print(f"Sentence {i+1}: {sentence.strip()}")

# Lazy evaluation with maximum splits
text = 'a,b,c,d,e,f,g,h,i,j'
splits = regex.splititer(r',', text, maxsplit=3)
for i, part in enumerate(splits):
    print(f"Part {i}: {part}")
    if i >= 2:  # Process only the first few parts
        break

# Generator for processing CSV-like data
def parse_csv_line(line):
    # Split on commas that are outside double-quoted fields
    for field in regex.splititer(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', line):
        yield field.strip().strip('"')

line = 'name,"description, with comma",price,quantity'
fields = list(parse_csv_line(line))
print(fields)  # ['name', 'description, with comma', 'price', 'quantity']
```
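Because `splititer` is lazy, it pairs naturally with `itertools.islice` to take only the first few parts of a very long string without splitting the rest. A small sketch (the million-element string is purely illustrative):

```python
import regex
from itertools import islice

# A long comma-separated string; splititer never materialises the full list
text = ','.join(str(i) for i in range(1_000_000))

first_five = list(islice(regex.splititer(r',', text), 5))
print(first_five)  # ['0', '1', '2', '3', '4']
```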
124
125
## Advanced Splitting Features
126
127
### Splitting with Capture Groups
128
129
When capture groups are present in the pattern, they are included in the result:
130
131
```python
132
# Include delimiters in result
133
result = regex.split(r'(\s+)', 'word1 word2\tword3')
134
print(result) # ['word1', ' ', 'word2', '\t', 'word3']
135
136
# Multiple capture groups
137
result = regex.split(r'(\d+)([a-z]+)', 'abc123def456ghi')
138
print(result) # ['abc', '123', 'def', '456', 'ghi']
139
140
# Named capture groups
141
result = regex.split(r'(?P<num>\d+)(?P<sep>[,-])', 'item1,item2-item3')
142
print(result) # ['item', '1', ',', 'item2', '2', '-', 'item3']
143
```
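When a single capture group spans the whole delimiter, the split is lossless: concatenating the parts reconstructs the original string, which is handy for transform-and-rejoin pipelines. A quick sketch:

```python
import regex

text = 'word1 word2\tword3'
parts = regex.split(r'(\s+)', text)  # delimiters are captured, so nothing is lost
assert ''.join(parts) == text

# Even indices are text parts, odd indices are the captured delimiters;
# transform only the text, then rejoin with the original spacing intact
upper = ''.join(p.upper() if i % 2 == 0 else p for i, p in enumerate(parts))
print(upper)  # 'WORD1 WORD2\tWORD3'
```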

### Empty String Handling

Understanding how empty strings appear in split results:

```python
# Leading/trailing delimiters create empty strings
result = regex.split(r',', ',a,b,c,')
print(result)  # ['', 'a', 'b', 'c', '']

# Consecutive delimiters can be collapsed by quantifying the pattern
result = regex.split(r',+', 'a,,b,,,c')
print(result)  # ['a', 'b', 'c']

# Filter empty strings if needed
result = [s for s in regex.split(r',', ',a,,b,c,') if s]
print(result)  # ['a', 'b', 'c']
```
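When only the non-empty fields matter, `findall` with a negated character class is an alternative to splitting and then filtering; it never produces empty strings in the first place:

```python
import regex

data = ',a,,b,c,'

# Two approaches, same result
filtered = [s for s in regex.split(r',', data) if s]
matched = regex.findall(r'[^,]+', data)

print(filtered)  # ['a', 'b', 'c']
print(matched)   # ['a', 'b', 'c']
```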
162
163
### Complex Pattern Splitting
164
165
Advanced splitting patterns for specific use cases:
166
167
```python
168
# Split on balanced parentheses
169
def split_balanced_parens(text):
170
# This is a simplified example - full balanced parentheses require recursive patterns
171
return regex.split(r'\([^)]*\)', text)
172
173
# Split preserving quoted strings
174
result = regex.split(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', 'a,"b,c",d')
175
print(result) # ['a', '"b,c"', 'd']
176
177
# Split on word boundaries but preserve certain characters
178
result = regex.split(r'(?<=\w)(?=\W)|(?<=\W)(?=\w)', 'hello-world.test')
179
print(result) # ['hello', '-', 'world', '.', 'test']
180
```
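The `regex` module supports pattern recursion with `(?R)`, so truly balanced parentheses can be matched, unlike the simplified `[^)]*` version above. A sketch of splitting on balanced groups:

```python
import regex

# (?R) recurses into the whole pattern, matching arbitrarily nested parens
balanced = regex.compile(r'\((?:[^()]|(?R))*\)')

result = balanced.split('a(b(c)d)e(f)g')
print(result)  # ['a', 'e', 'g']
```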

### Performance Considerations

```python
# Use concurrent execution for large texts
with open('large_file.txt') as f:
    large_text = f.read()
result = regex.split(r'\n', large_text, concurrent=True)

# Set a timeout for complex patterns; on expiry, TimeoutError is raised
try:
    result = regex.split(complex_pattern, text, timeout=5.0)
except TimeoutError as e:
    print(f"Split operation timed out: {e}")

# Use an iterator for memory efficiency
def count_paragraphs(text):
    count = 0
    for paragraph in regex.splititer(r'\n\s*\n', text):
        if paragraph.strip():
            count += 1
    return count
```
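Since `concurrent=True` releases the GIL during matching, independent pieces of text can be split in parallel with ordinary threads. A minimal sketch, assuming the input has already been divided into chunks:

```python
import regex
from concurrent.futures import ThreadPoolExecutor

comma = regex.compile(r',')
chunks = ['a,b,c', 'd,e', 'f,g,h']

def split_chunk(chunk):
    # concurrent=True lets other threads run while this match executes
    return comma.split(chunk, concurrent=True)

with ThreadPoolExecutor(max_workers=4) as pool:
    results = list(pool.map(split_chunk, chunks))

print(results)  # [['a', 'b', 'c'], ['d', 'e'], ['f', 'g', 'h']]
```

For short strings like these, thread overhead outweighs any gain; the pattern pays off only when each chunk involves substantial matching work.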

### Reverse Splitting

Use the REVERSE flag to split from right to left:

```python
# Split from the right with a maximum number of splits
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1, flags=regex.REVERSE)
print(result)  # ['path.to.file', 'ext']

# Compare with normal left-to-right splitting
result = regex.split(r'\.', 'path.to.file.ext', maxsplit=1)
print(result)  # ['path', 'to.file.ext']
```
217
218
### Unicode and Locale-Aware Splitting
219
220
```python
221
# Unicode-aware word boundary splitting
222
result = regex.split(r'\b', 'hello мир world', flags=regex.UNICODE)
223
print(result) # Properly handles Unicode word boundaries
224
225
# Locale-aware character class splitting
226
result = regex.split(r'[[:space:]]+', 'word1\xa0word2\u2000word3', flags=regex.LOCALE)
227
print(result) # Handles locale-specific whitespace characters
228
```