0
# Utilities and Helpers
1
2
Utility functions for text processing, version handling, encoding detection, file I/O operations, and caching that support the parsing infrastructure and provide convenient helpers for working with Python code.
3
4
## Capabilities
5
6
### Text Processing
7
8
Utilities for handling Python source code text with proper encoding and line splitting.
9
10
```python { .api }
11
def split_lines(string, keepends=False):
    """
    Split Python code into lines, handling form feeds correctly.

    Unlike str.splitlines(), this treats form feeds as normal characters
    and only splits on \\n and \\r\\n, which matches Python's behavior.

    Args:
        string (str): Text to split
        keepends (bool): Keep line endings in result (default: False)

    Returns:
        list[str]: List of lines, always returns at least [''] for empty input
    """
25
26
def python_bytes_to_unicode(source, encoding='utf-8', errors='strict'):
    """
    Convert bytes to unicode with Python encoding detection.

    Handles Unicode BOMs and PEP 263 encoding declarations automatically.

    Args:
        source (str | bytes): Source code as string or bytes
        encoding (str): Default encoding if none detected (default: 'utf-8')
        errors (str): Error handling strategy ('strict', 'replace', 'ignore')

    Returns:
        str: Unicode string representation of source code

    Raises:
        LookupError: If detected encoding is invalid and errors='strict'
    """
43
```
44
45
#### Usage Examples
46
47
```python
48
from parso.utils import split_lines, python_bytes_to_unicode
49
50
# Line splitting - Python-aware
code = "line1\nline2\r\nline3\f\nline4"  # Form feed at the end of line3, before its newline
lines = split_lines(code)
# The form feed is not a line break, so it stays attached to 'line3'
print("Lines:", lines)  # ['line1', 'line2', 'line3\x0c', 'line4']

# With line endings preserved
lines_with_ends = split_lines(code, keepends=True)
print("With endings:", lines_with_ends)

# Encoding detection from bytes
latin1_code = b'# -*- coding: latin-1 -*-\ntext = "caf\xe9"'
unicode_code = python_bytes_to_unicode(latin1_code)
print("Detected and converted:", repr(unicode_code))

# UTF-8 BOM handling
utf8_bom = b'\xef\xbb\xbfprint("hello world")'
clean_code = python_bytes_to_unicode(utf8_bom)
print("BOM removed:", repr(clean_code))

# Error handling options
invalid_bytes = b'\xff\xfe invalid encoding'
safe_code = python_bytes_to_unicode(invalid_bytes, errors='replace')
print("With replacements:", repr(safe_code))
73
```
74
75
### Version Handling
76
77
Classes and functions for working with Python version information.
78
79
```python { .api }
80
class Version:
    """
    Parso version information.

    Attributes:
        major (int): Major version number
        minor (int): Minor version number
        micro (int): Micro version number
    """
89
90
class PythonVersionInfo:
    """
    Python version information for grammar selection.

    Attributes:
        major (int): Python major version (e.g., 3)
        minor (int): Python minor version (e.g., 9)
    """

    def __gt__(self, other):
        """Compare versions (supports tuples)."""

    def __eq__(self, other):
        """Check version equality (supports tuples)."""
104
105
def parse_version_string(version=None):
    """
    Parse Python version string into version info.

    Args:
        version (str, optional): Version string like '3.8' or '3.10.1'.
            Defaults to current Python version

    Returns:
        PythonVersionInfo: Parsed version information

    Raises:
        ValueError: If version format is invalid
        TypeError: If version is not a string
    """
120
121
def version_info():
    """
    Get parso library version information.

    Returns:
        Version: Parso version as named tuple
    """
128
```
129
130
#### Usage Examples
131
132
```python
133
from parso.utils import parse_version_string, version_info, PythonVersionInfo
134
135
# Parse version strings
py38 = parse_version_string("3.8")
py310 = parse_version_string("3.10.5")  # Micro version ignored
current = parse_version_string()  # Uses sys.version_info

print(f"Python 3.8: {py38.major}.{py38.minor}")
print(f"Python 3.10: {py310.major}.{py310.minor}")
print(f"Current: {current.major}.{current.minor}")

# Version comparisons
if py310 > py38:
    print("3.10 is newer than 3.8")

if py38 == (3, 8):  # Compare with tuple
    print("Version matches tuple")

# Get parso version
parso_version = version_info()
print(f"Parso version: {parso_version.major}.{parso_version.minor}.{parso_version.micro}")

# Version-specific feature detection
def supports_walrus_operator(version_info):
    """Check if Python version supports walrus operator."""
    return version_info >= (3, 8)

def supports_match_statements(version_info):
    """Check if Python version supports match statements."""
    return version_info >= (3, 10)

py_version = parse_version_string("3.9")
print(f"3.9 supports walrus: {supports_walrus_operator(py_version)}")
print(f"3.9 supports match: {supports_match_statements(py_version)}")
167
```
168
169
### File I/O Classes
170
171
File handling abstractions that support caching and content management.
172
173
```python { .api }
174
class FileIO:
    """
    File I/O abstraction for reading Python source files.

    Attributes:
        path (Path): File path as pathlib.Path object
    """

    def __init__(self, path):
        """
        Initialize file I/O handler.

        Args:
            path (str | Path): File path to read
        """

    def read(self):
        """
        Read file contents as bytes.

        Returns:
            bytes: Raw file contents
        """

    def get_last_modified(self):
        """
        Get file modification timestamp.

        Returns:
            float | None: Timestamp or None if file doesn't exist
        """
205
206
class KnownContentFileIO(FileIO):
    """
    File I/O wrapper for content that's already known.

    Useful for parsing strings while maintaining file-like interface.
    """

    def __init__(self, path, content):
        """
        Initialize with known content.

        Args:
            path (str | Path | None): File path (can be None)
            content (str | bytes): Known file content
        """

    def read(self):
        """
        Return the known content.

        Returns:
            str | bytes: The provided content
        """
229
```
230
231
#### Usage Examples
232
233
```python
234
from parso.file_io import FileIO, KnownContentFileIO
235
import parso
236
237
# Read from actual file
file_io = FileIO("/path/to/script.py")
content = file_io.read()
last_modified = file_io.get_last_modified()

# Parse using FileIO
grammar = parso.load_grammar()
module = grammar.parse(file_io=file_io, cache=True)

# Use known content (useful for in-memory parsing)
code = '''
def example():
    return "hello world"
'''

known_io = KnownContentFileIO("virtual_file.py", code)
module = grammar.parse(file_io=known_io)

# File I/O with caching
def parse_file_with_caching(file_path):
    """Parse file with automatic caching."""
    file_io = FileIO(file_path)

    # Check if file exists and get modification time
    mod_time = file_io.get_last_modified()
    if mod_time is None:
        raise FileNotFoundError(f"File not found: {file_path}")

    grammar = parso.load_grammar()
    return grammar.parse(file_io=file_io, cache=True)

# Virtual file for testing
def create_test_module(code_string, filename="test.py"):
    """Create module from string with virtual filename."""
    file_io = KnownContentFileIO(filename, code_string)
    grammar = parso.load_grammar()
    return grammar.parse(file_io=file_io)

test_module = create_test_module('x = 42')
276
```
277
278
### Cache Management
279
280
Functions for managing parso's parser cache system.
281
282
```python { .api }
283
def load_module(hashed_grammar, file_io, cache_path=None):
    """
    Load cached parsed module.

    Args:
        hashed_grammar (str): Grammar hash identifier
        file_io (FileIO): File I/O handler
        cache_path (Path, optional): Custom cache directory

    Returns:
        NodeOrLeaf | None: Cached module or None if not cached/outdated
    """
295
296
def try_to_save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_path=None):
    """
    Save parsed module to cache.

    Args:
        hashed_grammar (str): Grammar hash
        file_io (FileIO): File I/O handler
        module (NodeOrLeaf): Parsed module to cache
        lines (list[str]): Source code lines
        pickling (bool): Enable disk caching (default: True)
        cache_path (Path, optional): Custom cache directory
    """
308
309
def clear_cache(cache_path=None):
    """
    Clear all cached files and in-memory cache.

    Args:
        cache_path (Path, optional): Cache directory to clear
    """
316
317
def clear_inactive_cache(cache_path=None, inactivity_threshold=2592000):
    """
    Clear cached files that haven't been accessed recently.

    Args:
        cache_path (Path, optional): Cache directory
        inactivity_threshold (int): Seconds of inactivity before removal
            (default: 2592000, i.e. 30 days)

    Returns:
        bool: True if cleanup completed successfully
    """
328
```
329
330
#### Usage Examples
331
332
```python
333
import parso
334
import parso.cache
335
from pathlib import Path
336
337
# Manual cache management
def process_files_with_caching(file_paths):
    """Process multiple files with shared cache."""
    grammar = parso.load_grammar()

    for file_path in file_paths:
        try:
            # Parse with caching enabled
            module = grammar.parse(path=file_path, cache=True)
            print(f"Processed {file_path}: {len(module.children)} statements")
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

# Cache statistics
def get_cache_stats():
    """Get information about current cache state."""
    cache = parso.cache.parser_cache

    total_grammars = len(cache)
    total_files = sum(len(files) for files in cache.values())

    return {
        'grammars_cached': total_grammars,
        'files_cached': total_files,
        'cache_keys': list(cache.keys())
    }

stats = get_cache_stats()
print("Cache statistics:", stats)

# Periodic cache cleanup
def cleanup_old_cache():
    """Clean up old cache files."""
    print("Clearing inactive cache files...")
    success = parso.cache.clear_inactive_cache()

    if success:
        print("Cache cleanup completed")
    else:
        print("Cache cleanup had issues")

# Custom cache directory
custom_cache = Path.home() / '.my_parso_cache'
grammar = parso.load_grammar()
module = grammar.parse(
    path="example.py",
    cache=True,
    cache_path=custom_cache
)

# Clear specific cache directory
parso.cache.clear_cache(cache_path=custom_cache)
389
```
390
391
## Integration Patterns
392
393
### Encoding-Safe File Processing
394
395
```python
396
from parso.utils import python_bytes_to_unicode
397
from parso.file_io import FileIO
398
import parso
399
400
def safe_parse_file(file_path):
    """
    Safely parse file handling encoding issues.

    Args:
        file_path (str | Path): Path of the Python file to parse

    Returns:
        The parsed module, or None if reading or parsing failed
    """
    try:
        # Read as bytes first
        with open(file_path, 'rb') as f:
            raw_content = f.read()

        # Convert to unicode with encoding detection
        unicode_content = python_bytes_to_unicode(raw_content, errors='replace')

        # Parse the content
        grammar = parso.load_grammar()
        return grammar.parse(unicode_content)

    except Exception as e:
        # Deliberately best-effort: report the failure and keep going
        print(f"Error parsing {file_path}: {e}")
        return None
417
418
# Process directory of Python files
def process_python_directory(directory):
    """
    Process all Python files in directory safely.

    Args:
        directory (str | Path): Root directory, searched recursively for *.py
    """
    from pathlib import Path

    python_files = Path(directory).glob("**/*.py")

    for py_file in python_files:
        module = safe_parse_file(py_file)
        # safe_parse_file signals failure with None, so test for that explicitly
        if module is not None:
            print(f"Successfully parsed: {py_file}")
        else:
            print(f"Failed to parse: {py_file}")
431
```
432
433
### Version-Aware Parsing
434
435
```python
436
from parso.utils import parse_version_string
437
import parso
438
439
def detect_version_features(code):
    """
    Detect the minimum Python version from code features.

    This is a crude textual heuristic, not a parse. Features of the newest
    version are checked first, so code that mixes e.g. a walrus operator
    with a match statement is reported as "3.10" rather than "3.8".

    Args:
        code (str): Python source code

    Returns:
        str: "3.10", "3.8" or the safe default "3.6"
    """
    if 'match ' in code and 'case ' in code:  # Match statements
        return "3.10"
    if '|' in code and 'Union' not in code:  # Union types
        return "3.10"
    if ':=' in code:  # Walrus operator
        return "3.8"
    return "3.6"  # Safe default


def parse_with_version_detection(code):
    """
    Parse code with automatic version detection.

    Args:
        code (str): Python source code

    Returns:
        tuple: (parsed module, detected version string)
    """
    # Try to detect version from code features
    detected_version = detect_version_features(code)
    version_info = parse_version_string(detected_version)

    grammar = parso.load_grammar(version=f"{version_info.major}.{version_info.minor}")
    return grammar.parse(code), detected_version
458
459
# Usage
code_samples = [
    'x = 42',  # Basic
    'if (n := len(items)) > 5: pass',  # Python 3.8 walrus
    '''match value:
    case 1: print("one")''',  # Python 3.10 match
]

for code in code_samples:
    module, version = parse_with_version_detection(code)
    print(f"Parsed with Python {version}: {code[:30]}...")
470
```
471
472
### High-Performance Parsing
473
474
```python
475
import parso
476
from parso.cache import clear_inactive_cache
477
import time
478
479
class HighPerformanceParser:
    """Optimized parser for processing many files."""

    def __init__(self, cache_cleanup_interval=3600):  # 1 hour
        """
        Load the grammar once and set up cache-cleanup bookkeeping.

        Args:
            cache_cleanup_interval (int | float): Minimum number of seconds
                between two cache cleanups (default: 3600)
        """
        self.grammar = parso.load_grammar()
        self.last_cleanup = time.time()
        self.cleanup_interval = cache_cleanup_interval
        self.files_processed = 0

    def parse_file(self, file_path):
        """
        Parse single file with optimizations.

        Args:
            file_path (str | Path): File to parse

        Returns:
            The parsed module, or None if parsing raised an exception
        """
        try:
            # Use caching and differential parsing for performance
            module = self.grammar.parse(
                path=file_path,
                cache=True,
                diff_cache=True
            )

            self.files_processed += 1

            # Periodic cache cleanup
            if time.time() - self.last_cleanup > self.cleanup_interval:
                clear_inactive_cache()
                self.last_cleanup = time.time()
                print(f"Cleaned cache after processing {self.files_processed} files")

            return module

        except Exception as e:
            # Deliberately best-effort: one bad file must not stop a batch
            print(f"Error parsing {file_path}: {e}")
            return None

    def batch_parse(self, file_paths):
        """
        Parse multiple files efficiently.

        Args:
            file_paths (Iterable[str | Path]): Files to parse

        Returns:
            list[tuple]: (file_path, module) pairs for the files that parsed
        """
        results = []

        for file_path in file_paths:
            result = self.parse_file(file_path)
            # parse_file signals failure with None, so test for that explicitly
            if result is not None:
                results.append((file_path, result))

        return results
522
523
# Usage
parser = HighPerformanceParser()
file_paths = ["file1.py", "file2.py", "file3.py"]
results = parser.batch_parse(file_paths)
print(f"Successfully parsed {len(results)} files")
528
```