A Python package for handling messy CSV files with enhanced dialect detection capabilities
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Advanced dialect detection capabilities using pattern analysis and consistency measures. CleverCSV provides sophisticated algorithms to automatically identify CSV dialects with 97% accuracy, offering a significant improvement over standard library approaches for messy CSV files.
The `Detector` class is the main dialect detection engine. It provides both a modern interface (`detect`) and a `csv.Sniffer`-compatible interface (`sniff`, `has_header`) for CSV dialect detection.
class Detector:
"""
Detect CSV dialects using normal forms or data consistency measures.
Provides a drop-in replacement for Python's csv.Sniffer.
"""
def detect(
self,
sample: str,
delimiters: Optional[Iterable[str]] = None,
verbose: bool = False,
method: Union[DetectionMethod, str] = 'auto',
skip: bool = True
) -> Optional[SimpleDialect]:
"""
Detect the dialect of a CSV file sample.
Parameters:
- sample: Text sample from CSV file (entire file recommended for best results)
- delimiters: Set of delimiters to consider (auto-detected if None)
- verbose: Enable progress output
- method: Detection method ('auto', 'normal', 'consistency')
- skip: Skip low-scoring dialects in consistency detection
Returns:
Detected SimpleDialect or None if detection failed
"""
def sniff(
self,
sample: str,
delimiters: Optional[Iterable[str]] = None,
verbose: bool = False
) -> Optional[SimpleDialect]:
"""
Compatibility method for Python csv.Sniffer interface.
Parameters:
- sample: Text sample from CSV file
- delimiters: Set of delimiters to consider
- verbose: Enable progress output
Returns:
Detected SimpleDialect or None if detection failed
"""
def has_header(self, sample: str, max_rows_to_check: int = 20) -> bool:
"""
Detect if a CSV sample has a header row.
Parameters:
- sample: Text sample from CSV file
- max_rows_to_check: Maximum number of rows to analyze
Returns:
True if header row detected, False otherwise
Raises:
NoDetectionResult: If dialect detection fails
"""

CleverCSV supports multiple detection strategies that can be selected based on your needs and file characteristics.
class DetectionMethod(str, Enum):
"""Available detection methods for dialect detection."""
AUTO = 'auto' # Try normal form first, then consistency
NORMAL = 'normal' # Normal form detection only
CONSISTENCY = 'consistency' # Data consistency measure only

import clevercsv
# Basic detection with auto method
detector = clevercsv.Detector()
with open('data.csv', 'r') as f:
sample = f.read()
dialect = detector.detect(sample)
print(f"Detected: {dialect}")
# Use specific detection method
dialect = detector.detect(sample, method='normal', verbose=True)
# Compatibility with csv.Sniffer
dialect = detector.sniff(sample)
# Check for header row
has_header = detector.has_header(sample)
print(f"Has header: {has_header}")
# Custom delimiters
custom_delims = [',', ';', '|', '\t']
dialect = detector.detect(sample, delimiters=custom_delims)

High-level function for direct file-based dialect detection without manual file handling.
def detect_dialect(
filename: Union[str, PathLike],
num_chars: Optional[int] = None,
encoding: Optional[str] = None,
verbose: bool = False,
method: str = 'auto',
skip: bool = True
) -> Optional[SimpleDialect]:
"""
Detect the dialect of a CSV file.
Parameters:
- filename: Path to the CSV file
- num_chars: Number of characters to read (entire file if None)
- encoding: File encoding (auto-detected if None)
- verbose: Enable progress output
- method: Detection method ('auto', 'normal', 'consistency')
- skip: Skip low-scoring dialects in consistency detection
Returns:
Detected SimpleDialect or None if detection failed
"""

import clevercsv
# Simple file-based detection
dialect = clevercsv.detect_dialect('data.csv')
if dialect:
print(f"Delimiter: '{dialect.delimiter}'")
print(f"Quote char: '{dialect.quotechar}'")
print(f"Escape char: '{dialect.escapechar}'")
# Fast detection for large files
dialect = clevercsv.detect_dialect('large_file.csv', num_chars=50000)
# Verbose detection with specific method
dialect = clevercsv.detect_dialect(
'messy_file.csv',
method='consistency',
verbose=True
)
# Custom encoding
dialect = clevercsv.detect_dialect('data.csv', encoding='latin-1')

The primary detection method that analyzes patterns in row lengths and data types to identify the most likely dialect. This method is fast and highly accurate for well-structured CSV files.
How it works:
Best for:
Fallback method that uses data consistency scoring when normal form detection is inconclusive. This method is more robust for irregular or messy CSV files.
How it works:
Best for:
The default method that combines both approaches for optimal results:
# For speed on large files (may reduce accuracy)
dialect = clevercsv.detect_dialect('huge_file.csv', num_chars=10000)
# For maximum accuracy (slower on large files)
dialect = clevercsv.detect_dialect('file.csv') # reads entire file
# Balanced approach for very large files
dialect = clevercsv.detect_dialect('file.csv', num_chars=100000)

# Fastest: normal form only (good for regular files)
dialect = clevercsv.detect_dialect('file.csv', method='normal')
# Most robust: consistency only (good for messy files)
dialect = clevercsv.detect_dialect('file.csv', method='consistency')
# Balanced: auto method (recommended default)
dialect = clevercsv.detect_dialect('file.csv', method='auto')

import clevercsv
dialect = clevercsv.detect_dialect('problematic.csv')
if dialect is None:
print("Detection failed - file may not be valid CSV")
# Fallback options:
# 1. Try with specific delimiters
# 2. Use manual dialect specification
# 3. Preprocess the file

try:
detector = clevercsv.Detector()
has_header = detector.has_header(sample)
except clevercsv.NoDetectionResult:
print("Could not detect dialect for header analysis")
# Fallback: assume no header or use domain knowledge

# For files with unusual delimiters
exotic_delims = ['|', '§', '¦', '•']
detector = clevercsv.Detector()
dialect = detector.detect(sample, delimiters=exotic_delims)

import clevercsv
import csv
# Detect with CleverCSV, use with standard csv
dialect = clevercsv.detect_dialect('data.csv')
csv_dialect = dialect.to_csv_dialect()
with open('data.csv', 'r') as f:
reader = csv.reader(f, dialect=csv_dialect)
data = list(reader)

import clevercsv
import pandas as pd
# Manual detection then pandas
dialect = clevercsv.detect_dialect('data.csv')
df = pd.read_csv('data.csv', dialect=dialect.to_csv_dialect())
# Or use CleverCSV's integrated function
df = clevercsv.read_dataframe('data.csv') # Detection handled automatically

Install with Tessl CLI
npx tessl i tessl/pypi-clevercsv