A Python package for handling messy CSV files with enhanced dialect detection capabilities
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Dialect classes and configuration utilities for managing CSV parsing parameters. CleverCSV provides enhanced dialect support with the SimpleDialect class and utilities for working with various CSV formats and configurations.
Enhanced dialect class that provides a simplified and more flexible alternative to Python's csv.Dialect, with better support for CleverCSV's detection algorithms.
class SimpleDialect:
    """
    Simplified dialect object for CSV parsing configuration.

    For delimiter, quotechar, and escapechar:
    - Empty string ('') means no delimiter/quotechar/escapechar in the file
    - None is used to mark it as undefined
    """

    def __init__(
        self,
        delimiter: Optional[str],
        quotechar: Optional[str],
        escapechar: Optional[str],
        strict: bool = False
    ):
        """
        Create a new SimpleDialect.

        Parameters:
        - delimiter: Field delimiter character ('' for none, None if undefined)
        - quotechar: Quote character for fields containing special characters
          ('' for none, None if undefined)
        - escapechar: Escape character for escaping delimiters/quotes
          ('' for none, None if undefined)
        - strict: Whether to enforce strict parsing (defaults to False)
        """

    def validate(self) -> None:
        """
        Validate dialect parameters.

        Returns:
        None when the dialect is valid

        Raises:
        ValueError: If any parameter is invalid
        """

    def to_csv_dialect(self) -> csv.Dialect:
        """
        Convert to standard csv.Dialect object.

        Returns:
        csv.Dialect compatible object
        """

    def to_dict(self) -> Dict[str, Union[str, bool, None]]:
        """
        Convert dialect to dictionary representation.

        Returns:
        Dictionary with dialect parameters
        """

    def serialize(self) -> str:
        """
        Serialize dialect to JSON string.

        Returns:
        JSON string representation of dialect
        """

    @classmethod
    def deserialize(cls, obj: str) -> 'SimpleDialect':
        """
        Deserialize dialect from JSON string.

        Parameters:
        - obj: JSON string representation

        Returns:
        SimpleDialect instance
        """

    @classmethod
    def from_dict(cls, d: Dict[str, Any]) -> 'SimpleDialect':
        """
        Create SimpleDialect from dictionary.

        Parameters:
        - d: Dictionary with dialect parameters, e.g.
          {'delimiter': ';', 'quotechar': "'", 'escapechar': '', 'strict': False}

        Returns:
        SimpleDialect instance
        """

    @classmethod
    def from_csv_dialect(cls, d: csv.Dialect) -> 'SimpleDialect':
        """
        Create SimpleDialect from csv.Dialect.

        Parameters:
        - d: csv.Dialect instance

        Returns:
        SimpleDialect instance
        """


import clevercsv
import json
# Create custom dialect
dialect = clevercsv.SimpleDialect(',', '"', '\\', strict=True)
print(f"Delimiter: '{dialect.delimiter}'")
print(f"Quote char: '{dialect.quotechar}'")
print(f"Escape char: '{dialect.escapechar}'")
# Validate dialect
try:
dialect.validate()
print("Dialect is valid")
except ValueError as e:
print(f"Invalid dialect: {e}")
# Convert to csv.Dialect for use with standard library
csv_dialect = dialect.to_csv_dialect()
with open('data.csv', 'r', newline='') as f:
reader = csv.reader(f, dialect=csv_dialect)
data = list(reader)
# Serialize dialect for storage
serialized = dialect.serialize()
print(f"Serialized: {serialized}")
# Deserialize dialect
restored_dialect = clevercsv.SimpleDialect.deserialize(serialized)
print(f"Restored: {restored_dialect}")
# Create from dictionary
dialect_dict = {'delimiter': ';', 'quotechar': "'", 'escapechar': '', 'strict': False}
dialect_from_dict = clevercsv.SimpleDialect.from_dict(dialect_dict)
# Create from csv.Dialect
csv_excel = csv.excel
simple_from_csv = clevercsv.SimpleDialect.from_csv_dialect(csv_excel)CleverCSV provides access to standard CSV dialects for common formats.
# Standard CSV dialects
excel: csv.Dialect # Excel-compatible format (comma-separated, quoted fields)
excel_tab: csv.Dialect # Excel tab-separated format
unix_dialect: csv.Dialect # Unix-style format (comma-separated, quoted fields, escaped quotes)
import clevercsv
# Use predefined dialects
with open('data.csv', 'r', newline='') as f:
    reader = clevercsv.reader(f, dialect=clevercsv.excel)
    data = list(reader)

# Compare dialects: convert each one to a SimpleDialect and print its fields
print("Excel dialect:")
excel_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.excel)
print(f" Delimiter: '{excel_simple.delimiter}'")
print(f" Quote char: '{excel_simple.quotechar}'")
print("Unix dialect:")
unix_simple = clevercsv.SimpleDialect.from_csv_dialect(clevercsv.unix_dialect)
print(f" Delimiter: '{unix_simple.delimiter}'")
print(f" Quote char: '{unix_simple.quotechar}'")
print(f" Escape char: '{unix_simple.escapechar}'")

Utility functions for managing CSV parsing configuration and field size limits.
def field_size_limit(*args, **kwargs) -> int:
    """
    Get or set the field size limit for CSV parsing.

    Called without arguments it only reports the limit; called with one
    argument it installs that value as the new limit.

    Parameters:
    - limit (optional): New field size limit in characters

    Returns:
    Previous field size limit

    Raises:
    TypeError: If limit is not an integer or too many arguments provided

    Notes:
    - Default limit is 128KB (131,072 characters)
    - Setting limit to 0 removes the limit (use with caution)
    - Large limits may impact performance and memory usage
    """


import clevercsv
# Get current field size limit
current_limit = clevercsv.field_size_limit()
print(f"Current field size limit: {current_limit} characters")
# Set new field size limit
old_limit = clevercsv.field_size_limit(256 * 1024) # 256KB
print(f"Previous limit: {old_limit}, New limit: {clevercsv.field_size_limit()}")
# Remove field size limit (use with caution)
clevercsv.field_size_limit(0)
print("Field size limit removed")
# Restore reasonable limit
clevercsv.field_size_limit(128 * 1024) # 128KB defaultCreate specialized dialects for unique CSV formats:
import clevercsv
def create_pipe_separated_dialect():
    """Build the dialect for pipe-separated values.

    Fields are split on '|', quoted with double quotes and escaped with a
    backslash.
    """
    return clevercsv.SimpleDialect(
        delimiter='|',
        quotechar='"',
        escapechar='\\',
    )
def create_tab_separated_no_quotes():
    """Build the dialect for tab-separated values without quoting.

    The empty strings state that the file has no quote character and no
    escape character.
    """
    return clevercsv.SimpleDialect(
        delimiter='\t',
        quotechar='',
        escapechar='',
    )
def create_semicolon_single_quotes():
    """Build the dialect for semicolon-separated values with single quotes.

    The empty escapechar states that the file has no escape character.
    """
    return clevercsv.SimpleDialect(
        delimiter=';',
        quotechar="'",
        escapechar='',
    )
# Usage: read a pipe-separated file with the custom dialect
pipe_dialect = create_pipe_separated_dialect()
with open('pipe_data.csv', 'r', newline='') as f:
    reader = clevercsv.reader(f, dialect=pipe_dialect)
    data = list(reader)

Compare and analyze different dialects:
import clevercsv
def compare_dialects(file_path, dialects, sample_size=1000):
    """Compare how different dialects parse the same file.

    Only the first ``sample_size`` characters of the file are parsed, so the
    last row of the sample may be cut short.

    Parameters:
    - file_path: Path of the CSV file to sample
    - dialects: Mapping of a display name to the dialect to try
    - sample_size: Number of characters to read from the start of the file
      (defaults to 1000)

    Returns:
    Dictionary keyed by dialect name. Each value holds 'rows', 'columns' and
    'sample_row' when parsing succeeded, or 'error' with the exception
    message when parsing raised.
    """
    with open(file_path, 'r', newline='') as f:
        sample = f.read(sample_size)

    results = {}
    for name, dialect in dialects.items():
        try:
            # Parse sample with this dialect
            rows = list(clevercsv.parse_string(sample, dialect))
        except Exception as e:
            # Best-effort comparison: record the failure and try the next one
            results[name] = {'error': str(e)}
            continue

        first_row = rows[0] if rows else []
        results[name] = {
            'rows': len(rows),
            'columns': len(first_row),
            'sample_row': first_row,
        }
    return results
# Usage: try four candidate delimiters against the same file
dialects = {
    'comma': clevercsv.SimpleDialect(',', '"', ''),
    'semicolon': clevercsv.SimpleDialect(';', '"', ''),
    'pipe': clevercsv.SimpleDialect('|', '"', ''),
    'tab': clevercsv.SimpleDialect('\t', '"', '')
}
comparison = compare_dialects('ambiguous.csv', dialects)
for name, result in comparison.items():
    print(f"{name}: {result}")

Save and load dialect configurations:
import clevercsv
import json
class DialectManager:
    """Manage dialect configurations with persistence.

    Dialects are held in memory as plain dictionaries keyed by name and
    written to a JSON file after every change.
    """

    def __init__(self, config_file='dialects.json'):
        """Remember the config file path and load any dialects stored there.

        Parameters:
        - config_file: Path of the JSON file used for persistence
        """
        self.config_file = config_file
        self.dialects = {}
        self.load_dialects()

    def save_dialect(self, name, dialect):
        """Save a dialect configuration.

        Parameters:
        - name: Key under which the dialect is stored
        - dialect: Object providing ``to_dict()`` (e.g. a SimpleDialect)
        """
        self.dialects[name] = dialect.to_dict()
        self._save_to_file()

    def load_dialect(self, name):
        """Load a dialect configuration.

        Returns:
        SimpleDialect built from the stored dictionary, or None when no
        dialect is stored under ``name``
        """
        try:
            stored = self.dialects[name]
        except KeyError:
            return None
        return clevercsv.SimpleDialect.from_dict(stored)

    def list_dialects(self):
        """List all saved dialects.

        Returns:
        List of the stored dialect names
        """
        return list(self.dialects)

    def delete_dialect(self, name):
        """Delete a dialect configuration.

        Unknown names are ignored and the file is left untouched.
        """
        if name in self.dialects:
            del self.dialects[name]
            self._save_to_file()

    def load_dialects(self):
        """Load dialects from file.

        A missing config file is treated as an empty collection.
        """
        try:
            # Explicit encoding: do not depend on the platform default
            with open(self.config_file, 'r', encoding='utf-8') as f:
                self.dialects = json.load(f)
        except FileNotFoundError:
            self.dialects = {}

    def _save_to_file(self):
        """Save dialects to file.

        Raises:
        TypeError: If a stored value cannot be serialized to JSON; the
        existing file is left unchanged in that case
        """
        # Serialize before opening: mode 'w' truncates the file immediately,
        # so a failure while dumping would otherwise wipe the saved dialects.
        payload = json.dumps(self.dialects, indent=2)
        with open(self.config_file, 'w', encoding='utf-8') as f:
            f.write(payload)
# Usage
manager = DialectManager()

# Save custom dialects
custom_dialect = clevercsv.SimpleDialect('|', "'", '\\')
manager.save_dialect('pipe_single_quote', custom_dialect)

# Load and use saved dialect (load_dialect returns None for unknown names)
loaded_dialect = manager.load_dialect('pipe_single_quote')
if loaded_dialect:
    with open('data.csv', 'r', newline='') as f:
        reader = clevercsv.reader(f, dialect=loaded_dialect)
        data = list(reader)

Use detected dialects with configuration management:
import clevercsv
def smart_csv_processing(file_path):
    """Process CSV with detection fallback to configuration.

    Automatic detection is tried first. When it yields nothing, a fixed list
    of common dialects is tried in order and the first one that splits the
    first row into more than one column is used.

    Parameters:
    - file_path: Path of the CSV file to read

    Returns:
    List of all rows parsed with the chosen dialect

    Raises:
    ValueError: If neither detection nor any fallback dialect fits the file
    """
    # Try automatic detection first
    detected_dialect = clevercsv.detect_dialect(file_path)
    if detected_dialect:
        print(f"Using detected dialect: {detected_dialect}")
        dialect = detected_dialect
    else:
        # Fallback to common dialects
        print("Detection failed, trying common dialects...")
        common_dialects = [
            clevercsv.SimpleDialect(',', '"', ''),  # Standard CSV
            clevercsv.SimpleDialect(';', '"', ''),  # European CSV
            clevercsv.SimpleDialect('\t', '"', ''),  # Tab-separated
            clevercsv.SimpleDialect('|', '"', ''),  # Pipe-separated
        ]
        dialect = None
        for test_dialect in common_dialects:
            try:
                with open(file_path, 'r', newline='') as f:
                    reader = clevercsv.reader(f, dialect=test_dialect)
                    # An empty file yields no row; treat it as zero columns
                    first_row = next(reader, [])
            except Exception:
                # A candidate that cannot parse the file is simply skipped.
                # Unlike a bare ``except:``, this still lets
                # KeyboardInterrupt and SystemExit propagate.
                continue
            if len(first_row) > 1:  # Reasonable number of columns
                dialect = test_dialect
                print(f"Using fallback dialect: {dialect}")
                break
        if dialect is None:
            raise ValueError("Could not determine appropriate dialect")

    # Process file with determined dialect
    with open(file_path, 'r', newline='') as f:
        reader = clevercsv.reader(f, dialect=dialect)
        return list(reader)
# Usage: smart_csv_processing raises ValueError when no dialect fits the file
try:
    data = smart_csv_processing('difficult_file.csv')
    print(f"Successfully processed {len(data)} rows")
except ValueError as e:
    print(f"Processing failed: {e}")

Validate dialects against actual CSV files:
import clevercsv
def validate_dialect_for_file(file_path, dialect):
    """Validate that a dialect works correctly for a file.

    Parameters:
    - file_path: Path of the CSV file to parse
    - dialect: Dialect passed to ``clevercsv.reader``

    Returns:
    Dictionary with three keys:
    - 'valid': False when the file could not be read/parsed or produced no
      rows, True otherwise
    - 'issues': List of human-readable problem descriptions; inconsistent
      column counts are reported here without clearing 'valid'
    - 'statistics': 'total_rows', 'column_counts' (column count -> number of
      rows), 'average_columns' and 'max_field_length'; empty when 'valid'
      is False
    """
    from collections import Counter

    validation_results = {
        'valid': True,
        'issues': [],
        'statistics': {}
    }

    # Only opening and parsing can fail here; keeping the statistics outside
    # the try stops unrelated errors from being reported as parsing errors.
    try:
        with open(file_path, 'r', newline='') as f:
            rows = list(clevercsv.reader(f, dialect=dialect))
    except Exception as e:
        validation_results['valid'] = False
        validation_results['issues'].append(f'Parsing error: {str(e)}')
        return validation_results

    if not rows:
        validation_results['valid'] = False
        validation_results['issues'].append('No rows parsed')
        return validation_results

    # Check for consistent column count
    column_counts = [len(row) for row in rows]
    count_frequency = Counter(column_counts)
    if len(count_frequency) > 1:
        validation_results['issues'].append(
            f'Inconsistent column counts: {sorted(count_frequency)}'
        )

    # Gather statistics
    validation_results['statistics'] = {
        'total_rows': len(rows),
        'column_counts': dict(count_frequency),
        'average_columns': sum(column_counts) / len(column_counts),
        # default=0 covers files whose rows contain no fields at all
        'max_field_length': max(
            (len(field) for row in rows for field in row), default=0
        ),
    }
    return validation_results
# Usage: check a comma-separated dialect against a file and report the outcome
test_dialect = clevercsv.SimpleDialect(',', '"', '')
results = validate_dialect_for_file('test.csv', test_dialect)
if results['valid']:
    print("Dialect validation passed")
    print(f"Statistics: {results['statistics']}")
else:
    print("Dialect validation failed")
    print(f"Issues: {results['issues']}")

Install with Tessl CLI
npx tessl i tessl/pypi-clevercsv