0
# Dialects and Configuration
1
2
Dialect classes and configuration utilities for managing CSV parsing parameters. CleverCSV provides enhanced dialect support with the SimpleDialect class and utilities for working with various CSV formats and configurations.
3
4
## Capabilities
5
6
### SimpleDialect Class
7
8
Enhanced dialect class that provides a simplified and more flexible alternative to Python's csv.Dialect, with better support for CleverCSV's detection algorithms.
9
10
```python { .api }
11
class SimpleDialect:
    """
    Simplified dialect object for CSV parsing configuration.

    For delimiter, quotechar, and escapechar:
    - Empty string ('') means no delimiter/quotechar/escapechar in the file
    - None is used to mark it as undefined
    """

    def __init__(
        self,
        delimiter: Optional[str],
        quotechar: Optional[str],
        escapechar: Optional[str],
        strict: bool = False
    ):
        """
        Create a new SimpleDialect.

        Parameters:
        - delimiter: Field delimiter character
          ('' = the file has none, None = undefined)
        - quotechar: Quote character for fields containing special characters
          ('' = the file has none, None = undefined)
        - escapechar: Escape character for escaping delimiters/quotes
          ('' = the file has none, None = undefined)
        - strict: Whether to enforce strict parsing (defaults to False)
        """

    def validate(self) -> None:
        """
        Validate dialect parameters.

        Raises:
            ValueError: If any parameter is invalid
        """

    def to_csv_dialect(self) -> csv.Dialect:
        """
        Convert to standard csv.Dialect object.

        Returns:
            csv.Dialect compatible object
        """

    def to_dict(self) -> Dict[str, Union[str, bool, None]]:
        """
        Convert dialect to dictionary representation.

        Returns:
            Dictionary with dialect parameters
        """

    def serialize(self) -> str:
        """
        Serialize dialect to JSON string.

        Returns:
            JSON string representation of dialect
        """

    @classmethod
    def deserialize(cls, obj: str) -> 'SimpleDialect':
        """
        Deserialize dialect from JSON string.

        Parameters:
        - obj: JSON string representation

        Returns:
            SimpleDialect instance
        """

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> 'SimpleDialect':
        """
        Create SimpleDialect from dictionary.

        Parameters:
        - d: Dictionary with dialect parameters

        Returns:
            SimpleDialect instance
        """

    @classmethod
    def from_csv_dialect(cls, d: csv.Dialect) -> 'SimpleDialect':
        """
        Create SimpleDialect from csv.Dialect.

        Parameters:
        - d: csv.Dialect instance

        Returns:
            SimpleDialect instance
        """
104
```
105
106
#### Usage Examples
107
108
```python
import csv
import json

import clevercsv

# Create custom dialect
dialect = clevercsv.SimpleDialect(',', '"', '\\', strict=True)
print(f"Delimiter: '{dialect.delimiter}'")
print(f"Quote char: '{dialect.quotechar}'")
print(f"Escape char: '{dialect.escapechar}'")

# Validate dialect
try:
    dialect.validate()
    print("Dialect is valid")
except ValueError as e:
    print(f"Invalid dialect: {e}")

# Convert to csv.Dialect for use with standard library
# (this is why the standard `csv` module is imported above)
csv_dialect = dialect.to_csv_dialect()
with open('data.csv', 'r', newline='') as f:
    reader = csv.reader(f, dialect=csv_dialect)
    data = list(reader)

# Serialize dialect for storage
serialized = dialect.serialize()
print(f"Serialized: {serialized}")

# Deserialize dialect
restored_dialect = clevercsv.SimpleDialect.deserialize(serialized)
print(f"Restored: {restored_dialect}")

# Create from dictionary
dialect_dict = {'delimiter': ';', 'quotechar': "'", 'escapechar': '', 'strict': False}
dialect_from_dict = clevercsv.SimpleDialect.from_dict(dialect_dict)

# Create from csv.Dialect
csv_excel = csv.excel
simple_from_csv = clevercsv.SimpleDialect.from_csv_dialect(csv_excel)
```
147
148
### Predefined Dialects
149
150
CleverCSV provides access to standard CSV dialects for common formats.
151
152
```python { .api }
# Standard CSV dialects (passed to readers as e.g. `dialect=clevercsv.excel`)
excel: csv.Dialect         # Excel-compatible format (comma-separated, quoted fields)
excel_tab: csv.Dialect     # Excel tab-separated format
# The standard library's unix_dialect quotes every field and ends lines with '\n'.
unix_dialect: csv.Dialect  # Unix-style format (comma-separated, all fields quoted)
```
158
159
#### Usage Examples
160
161
```python
import clevercsv

# Use predefined dialects
with open('data.csv', 'r', newline='') as f:
    reader = clevercsv.reader(f, dialect=clevercsv.excel)
    data = list(reader)

# Compare dialects by converting each one to a SimpleDialect
print("Excel dialect:")
excel_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.excel)
print(f" Delimiter: '{excel_simple.delimiter}'")
print(f" Quote char: '{excel_simple.quotechar}'")

print("Unix dialect:")
unix_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.unix_dialect)
print(f" Delimiter: '{unix_simple.delimiter}'")
print(f" Quote char: '{unix_simple.quotechar}'")
print(f" Escape char: '{unix_simple.escapechar}'")
```
181
182
### Configuration Utilities
183
184
Utility functions for managing CSV parsing configuration and field size limits.
185
186
```python { .api }
def field_size_limit(*args, **kwargs) -> int:
    """
    Get or set the field size limit for CSV parsing.

    Call it without an argument to read the limit, or with a single
    argument to install a new one.

    Parameters:
    - limit (optional): New field size limit in characters

    Returns:
        Previous field size limit

    Raises:
        TypeError: If limit is not an integer or too many arguments provided

    Notes:
    - Default limit is 128KB (131,072 characters)
    - Setting limit to 0 removes the limit (use with caution)
      NOTE(review): the standard library csv module rejects any field whose
      length reaches the limit, so 0 may block all input rather than lift
      the limit -- confirm CleverCSV's behavior before relying on this.
    - Large limits may impact performance and memory usage
    """
```
206
207
#### Usage Examples
208
209
```python
import clevercsv

# Get current field size limit (kept so it can be restored at the end)
current_limit = clevercsv.field_size_limit()
print(f"Current field size limit: {current_limit} characters")

# Set new field size limit; the call returns the limit that was in effect before
old_limit = clevercsv.field_size_limit(256 * 1024)  # 256KB
print(f"Previous limit: {old_limit}, New limit: {clevercsv.field_size_limit()}")

# Remove field size limit (use with caution)
# NOTE(review): confirm that 0 really disables the check in CleverCSV.
clevercsv.field_size_limit(0)
print("Field size limit removed")

# Restore the limit that was active when this example started, rather than
# assuming the 128KB default was in effect
clevercsv.field_size_limit(current_limit)
```
227
228
## Advanced Dialect Management
229
230
### Custom Dialect Creation
231
232
Create specialized dialects for unique CSV formats:
233
234
```python
235
import clevercsv
236
237
def create_pipe_separated_dialect():
    """Create dialect for pipe-separated values.

    Returns:
        SimpleDialect with '|' delimiter, '"' quote char and '\\\\' escape char.
    """
    return clevercsv.SimpleDialect('|', '"', '\\')
240
241
def create_tab_separated_no_quotes():
    """Create dialect for tab-separated without quotes.

    Returns:
        SimpleDialect with a tab delimiter; the empty strings mark the quote
        and escape characters as absent from the file.
    """
    return clevercsv.SimpleDialect('\t', '', '')
244
245
def create_semicolon_single_quotes():
    """Create dialect for semicolon-separated with single quotes.

    Returns:
        SimpleDialect with ';' delimiter, single-quote quote char and no
        escape char (empty string).
    """
    return clevercsv.SimpleDialect(';', "'", '')
248
249
# Usage: build the dialect once, then hand it to the reader
pipe_dialect = create_pipe_separated_dialect()
with open('pipe_data.csv', 'r', newline='') as f:
    reader = clevercsv.reader(f, dialect=pipe_dialect)
    data = list(reader)
254
```
255
256
### Dialect Comparison and Analysis
257
258
Compare and analyze different dialects:
259
260
```python
261
import clevercsv
262
263
def compare_dialects(file_path, dialects, sample_size=1000):
    """Compare how different dialects parse the same file.

    Parameters:
    - file_path: Path of the CSV file to sample
    - dialects: Mapping of label -> dialect to try
    - sample_size: Number of leading characters to parse (default 1000)

    Returns:
        Dictionary keyed by label. Each value is either
        {'rows', 'columns', 'sample_row'} for a successful parse or
        {'error'} holding the exception message.
    """
    with open(file_path, 'r', newline='') as f:
        # Only a prefix is parsed, so the final row may be cut short.
        sample = f.read(sample_size)

    results = {}
    for name, dialect in dialects.items():
        try:
            # Parse sample with this dialect
            # NOTE(review): confirm parse_string is exported at package level;
            # if it is not, every dialect will be reported as an error here.
            rows = list(clevercsv.parse_string(sample, dialect))
        except Exception as e:
            # Best-effort comparison: record the failure and keep going.
            results[name] = {'error': str(e)}
            continue

        first_row = rows[0] if rows else []
        results[name] = {
            'rows': len(rows),
            'columns': len(first_row),
            'sample_row': first_row,
        }

    return results
284
285
# Usage: try four candidate delimiters against the same file
dialects = {
    'comma': clevercsv.SimpleDialect(',', '"', ''),
    'semicolon': clevercsv.SimpleDialect(';', '"', ''),
    'pipe': clevercsv.SimpleDialect('|', '"', ''),
    'tab': clevercsv.SimpleDialect('\t', '"', ''),
}

comparison = compare_dialects('ambiguous.csv', dialects)
for name, result in comparison.items():
    print(f"{name}: {result}")
296
```
297
298
### Dialect Persistence
299
300
Save and load dialect configurations:
301
302
```python
303
import clevercsv
304
import json
305
306
class DialectManager:
    """Manage dialect configurations with persistence.

    Dialects are kept in memory as plain dictionaries (the output of
    ``to_dict()``) keyed by name, and mirrored to a JSON file after every
    change.
    """

    def __init__(self, config_file='dialects.json'):
        """Remember the config path and load any dialects already stored there."""
        self.config_file = config_file
        self.dialects = {}
        self.load_dialects()

    def save_dialect(self, name, dialect):
        """Save a dialect configuration under *name* and write the file."""
        self.dialects[name] = dialect.to_dict()
        self._save_to_file()

    def load_dialect(self, name):
        """Load a dialect configuration.

        Returns:
            SimpleDialect rebuilt from the stored dictionary, or None when no
            dialect is stored under *name*.
        """
        if name not in self.dialects:
            return None
        return clevercsv.SimpleDialect.from_dict(self.dialects[name])

    def list_dialects(self):
        """List the names of all saved dialects."""
        return list(self.dialects)

    def delete_dialect(self, name):
        """Delete a dialect configuration; unknown names are ignored."""
        if name in self.dialects:
            del self.dialects[name]
            self._save_to_file()

    def load_dialects(self):
        """Load dialects from file; a missing file yields an empty collection."""
        try:
            # Explicit UTF-8 keeps the config readable across platforms.
            with open(self.config_file, 'r', encoding='utf-8') as f:
                self.dialects = json.load(f)
        except FileNotFoundError:
            self.dialects = {}

    def _save_to_file(self):
        """Save dialects to file as indented JSON."""
        with open(self.config_file, 'w', encoding='utf-8') as f:
            json.dump(self.dialects, f, indent=2)
347
348
# Usage
manager = DialectManager()

# Save custom dialects
custom_dialect = clevercsv.SimpleDialect('|', "'", '\\')
manager.save_dialect('pipe_single_quote', custom_dialect)

# Load and use saved dialect (load_dialect returns None for unknown names)
loaded_dialect = manager.load_dialect('pipe_single_quote')
if loaded_dialect:
    with open('data.csv', 'r', newline='') as f:
        reader = clevercsv.reader(f, dialect=loaded_dialect)
        data = list(reader)
361
```
362
363
## Dialect Detection Integration
364
365
### Combining Detection and Configuration
366
367
Use detected dialects with configuration management:
368
369
```python
370
import clevercsv
371
372
def _first_multicolumn_dialect(file_path, candidates):
    """Return the first candidate whose first parsed row has more than one column."""
    for test_dialect in candidates:
        try:
            with open(file_path, 'r', newline='') as f:
                reader = clevercsv.reader(f, dialect=test_dialect)
                first_row = next(reader)
        except Exception:
            # A candidate that cannot parse the file (or an empty file, where
            # next() raises StopIteration) is skipped. Unlike a bare `except:`,
            # this lets KeyboardInterrupt and SystemExit propagate.
            continue
        if len(first_row) > 1:  # Reasonable number of columns
            return test_dialect
    return None


def smart_csv_processing(file_path):
    """Process CSV with detection fallback to configuration.

    Parameters:
    - file_path: Path of the CSV file to read

    Returns:
        List of parsed rows.

    Raises:
        ValueError: If detection fails and no fallback dialect splits the
            first row into more than one column.
    """
    # Try automatic detection first
    detected_dialect = clevercsv.detect_dialect(file_path)

    if detected_dialect:
        print(f"Using detected dialect: {detected_dialect}")
        dialect = detected_dialect
    else:
        # Fallback to common dialects
        print("Detection failed, trying common dialects...")

        common_dialects = [
            clevercsv.SimpleDialect(',', '"', ''),   # Standard CSV
            clevercsv.SimpleDialect(';', '"', ''),   # European CSV
            clevercsv.SimpleDialect('\t', '"', ''),  # Tab-separated
            clevercsv.SimpleDialect('|', '"', ''),   # Pipe-separated
        ]

        dialect = _first_multicolumn_dialect(file_path, common_dialects)
        if not dialect:
            raise ValueError("Could not determine appropriate dialect")
        print(f"Using fallback dialect: {dialect}")

    # Process file with determined dialect
    with open(file_path, 'r', newline='') as f:
        reader = clevercsv.reader(f, dialect=dialect)
        return list(reader)
412
413
# Usage: ValueError signals that no dialect could be determined
try:
    data = smart_csv_processing('difficult_file.csv')
except ValueError as e:
    print(f"Processing failed: {e}")
else:
    print(f"Successfully processed {len(data)} rows")
419
```
420
421
### Dialect Validation and Testing
422
423
Validate dialects against actual CSV files:
424
425
```python
426
import clevercsv
427
428
def validate_dialect_for_file(file_path, dialect):
    """Validate that a dialect works correctly for a file.

    Parameters:
    - file_path: Path of the CSV file to parse
    - dialect: Dialect to parse it with

    Returns:
        Dictionary with three keys:
        - 'valid': False when parsing raised or produced no rows, else True
        - 'issues': list of human-readable problem descriptions
        - 'statistics': row/column figures (empty when 'valid' is False)

        Inconsistent column counts are recorded in 'issues' but do not by
        themselves set 'valid' to False.
    """
    # Imported locally so the snippet needs no extra top-level import.
    from collections import Counter

    validation_results = {
        'valid': True,
        'issues': [],
        'statistics': {}
    }

    try:
        with open(file_path, 'r', newline='') as f:
            reader = clevercsv.reader(f, dialect=dialect)
            rows = list(reader)
    except Exception as e:
        validation_results['valid'] = False
        validation_results['issues'].append(f'Parsing error: {str(e)}')
        return validation_results

    if not rows:
        validation_results['valid'] = False
        validation_results['issues'].append('No rows parsed')
        return validation_results

    # Check for consistent column count (single pass instead of list.count per width)
    column_counts = [len(row) for row in rows]
    count_tally = Counter(column_counts)

    if len(count_tally) > 1:
        validation_results['issues'].append(
            f'Inconsistent column counts: {sorted(count_tally)}'
        )

    # Gather statistics
    validation_results['statistics'] = {
        'total_rows': len(rows),
        'column_counts': {width: count_tally[width] for width in sorted(count_tally)},
        'average_columns': sum(column_counts) / len(column_counts),
        # default=0 covers files whose rows are all empty; max() on an empty
        # sequence would otherwise raise and be misreported as a parsing error.
        'max_field_length': max(
            (len(field) for row in rows for field in row), default=0
        ),
    }

    return validation_results
469
470
# Usage
test_dialect = clevercsv.SimpleDialect(',', '"', '')
results = validate_dialect_for_file('test.csv', test_dialect)

if results['valid']:
    print("Dialect validation passed")
    print(f"Statistics: {results['statistics']}")
    # A passing result can still carry issues (e.g. inconsistent column counts)
    if results['issues']:
        print(f"Issues: {results['issues']}")
else:
    print("Dialect validation failed")
    print(f"Issues: {results['issues']}")
480
```