A Python package for handling messy CSV files with enhanced dialect detection capabilities
npx @tessl/cli install tessl/pypi-clevercsv@0.8.0

A comprehensive Python library that provides a drop-in replacement for the built-in csv module with enhanced dialect detection capabilities for handling messy and inconsistent CSV files. The package offers advanced pattern recognition algorithms to automatically detect row and type patterns in CSV data, enabling reliable parsing of files that would otherwise cause issues with standard CSV parsers.
pip install clevercsv (core) or pip install clevercsv[full] (with CLI tools)

import clevercsv

Drop-in replacement usage:
import clevercsv as csv

import clevercsv
# Automatic dialect detection and reading
rows = clevercsv.read_table('./data.csv')
# Read as pandas DataFrame (requires pandas)
df = clevercsv.read_dataframe('./data.csv')
# Read as dictionaries (first row as headers)
records = clevercsv.read_dicts('./data.csv')
# Traditional csv-style usage with automatic detection
with open('./data.csv', newline='') as csvfile:
    dialect = clevercsv.Sniffer().sniff(csvfile.read())
    csvfile.seek(0)
    reader = clevercsv.reader(csvfile, dialect)
    rows = list(reader)
# Manual dialect detection
dialect = clevercsv.detect_dialect('./data.csv')
print(f"Detected: {dialect}")

CleverCSV employs a multi-stage dialect detection system:
This design enables CleverCSV to achieve 97% accuracy for dialect detection with a 21% improvement on non-standard CSV files compared to Python's standard library.
Convenient wrapper functions that automatically detect dialects and encodings, providing the easiest way to work with CSV files without manual configuration.
def read_table(filename, dialect=None, encoding=None, num_chars=None, verbose=False) -> List[List[str]]: ...
def read_dicts(filename, dialect=None, encoding=None, num_chars=None, verbose=False) -> List[Dict[str, str]]: ...
def read_dataframe(filename, *args, num_chars=None, **kwargs): ...
def stream_table(filename, dialect=None, encoding=None, num_chars=None, verbose=False) -> Iterator[List[str]]: ...
def stream_dicts(filename, dialect=None, encoding=None, num_chars=None, verbose=False) -> Iterator[Dict[str, str]]: ...

Advanced dialect detection capabilities using pattern analysis and consistency measures, with support for custom detection parameters and manual dialect specification.
class Detector:
    def detect(self, sample, delimiters=None, verbose=False, method='auto', skip=True) -> Optional[SimpleDialect]: ...
    def sniff(self, sample, delimiters=None, verbose=False) -> Optional[SimpleDialect]: ...
    def has_header(self, sample, max_rows_to_check=20) -> bool: ...

def detect_dialect(filename, num_chars=None, encoding=None, verbose=False, method='auto', skip=True) -> Optional[SimpleDialect]: ...

Low-level CSV reader and writer classes that provide drop-in compatibility with Python's csv module while supporting CleverCSV's enhanced dialect handling.
class reader:
    def __init__(self, csvfile, dialect='excel', **fmtparams): ...
    def __iter__(self) -> Iterator[List[str]]: ...
    def __next__(self) -> List[str]: ...

class writer:
    def __init__(self, csvfile, dialect='excel', **fmtparams): ...
    def writerow(self, row) -> Any: ...
    def writerows(self, rows) -> Any: ...

Dictionary-based reading and writing that treats the first row as headers, providing a more convenient interface for structured CSV data.
class DictReader:
    def __init__(self, f, fieldnames=None, restkey=None, restval=None, dialect='excel', *args, **kwds): ...
    def __iter__(self) -> Iterator[Dict[str, str]]: ...
    def __next__(self) -> Dict[str, str]: ...

class DictWriter:
    def __init__(self, f, fieldnames, restval='', extrasaction='raise', dialect='excel', *args, **kwds): ...
    def writeheader(self) -> Any: ...
    def writerow(self, rowdict) -> Any: ...
    def writerows(self, rowdicts) -> None: ...

Dialect classes and configuration utilities for managing CSV parsing parameters, including predefined dialects and custom dialect creation.
class SimpleDialect:
    def __init__(self, delimiter, quotechar, escapechar, strict=False): ...
    def validate(self) -> None: ...
    def to_csv_dialect(self): ...
    def to_dict(self) -> Dict[str, Union[str, bool, None]]: ...

# Predefined dialects
excel: csv.Dialect
excel_tab: csv.Dialect
unix_dialect: csv.Dialect

High-level function for writing tabular data to CSV files with automatic formatting and RFC-4180 compliance by default.
def write_table(table, filename, dialect='excel', transpose=False, encoding=None) -> None: ...

# Detection results
Optional[SimpleDialect]
# File paths
Union[str, PathLike]
# CSV data structures
List[List[str]] # Table data
List[Dict[str, str]] # Dictionary records
Iterator[List[str]] # Streaming table data
Iterator[Dict[str, str]] # Streaming dictionary records
# Dialect specifications
Union[str, SimpleDialect, csv.Dialect]
# Detection methods
Literal['auto', 'normal', 'consistency']

# Quoting constants (from csv module)
QUOTE_ALL: int
QUOTE_MINIMAL: int
QUOTE_NONE: int
QUOTE_NONNUMERIC: int

class Error(Exception):
    """General CleverCSV error"""

class NoDetectionResult(Exception):
    """Raised when dialect detection fails"""