A Python package for handling messy CSV files with enhanced dialect detection capabilities
—
Quality: Pending — best-practices review has not yet been performed.
Impact: Pending — no eval scenarios have been run.
Dictionary-based CSV reading and writing that treats the first row as column headers, providing a more convenient interface for structured CSV data. These classes mirror Python's csv.DictReader and csv.DictWriter but with CleverCSV's enhanced dialect support.
Dictionary-based CSV reader that automatically uses the first row as field names (headers) and returns each subsequent row as a dictionary.
class DictReader:
    """
    CSV reader that returns rows as dictionaries.

    Uses the first row as field names unless ``fieldnames`` is explicitly
    provided. Mirrors :class:`csv.DictReader` with CleverCSV dialect support.
    """

    def __init__(
        self,
        f: Iterable[str],
        fieldnames: Optional[Sequence[str]] = None,
        restkey: Optional[str] = None,
        restval: Optional[str] = None,
        dialect: Union[str, SimpleDialect, csv.Dialect] = 'excel',
        *args,
        **kwds
    ):
        """
        Initialize dictionary CSV reader.

        Parameters:
        - f: File-like object or iterable of strings
        - fieldnames: Field names to use (first row is used as header if None)
        - restkey: Key under which values beyond the fieldnames length are collected
        - restval: Value substituted for fields missing from a short row
        - dialect: Dialect specification (name, SimpleDialect, or csv.Dialect)
        - *args, **kwds: Additional arguments passed to the underlying reader
        """

    def __iter__(self) -> Iterator[Dict[str, str]]:
        """Return iterator over dictionary rows."""

    def __next__(self) -> Dict[str, str]:
        """
        Return next row as dictionary.

        Returns:
        Dictionary mapping field names to values

        Raises:
        StopIteration: When no more rows available
        """

    @property
    def fieldnames(self) -> Sequence[str]:
        """Field names (column headers) used as the dictionary keys."""

    @fieldnames.setter
    def fieldnames(self, value: Sequence[str]) -> None:
        """Set field names explicitly."""
@property
def line_num(self) -> int:
"""Current line number being processed."""

import clevercsv
# Basic dictionary reading: each row becomes a dict keyed by the header row.
with open('employees.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f)
    for row in reader:
        print(f"Name: {row['name']}, Age: {row['age']}, Department: {row['dept']}")

# With automatic dialect detection: sniff the dialect from the file contents,
# then rewind before constructing the reader so no data is skipped.
with open('data.csv', 'r', newline='') as f:
    sample = f.read()
    dialect = clevercsv.Detector().detect(sample)
    f.seek(0)
    reader = clevercsv.DictReader(f, dialect=dialect)
    records = list(reader)

# Custom field names. NOTE: when fieldnames is given, the first file row is
# read as a data row, not consumed as a header (csv.DictReader semantics).
fieldnames = ['id', 'name', 'score', 'grade']
with open('data.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f, fieldnames=fieldnames)
    for row in reader:
        print(f"Student {row['name']} scored {row['score']}")

# Handle extra/missing fields: surplus values are gathered under restkey,
# missing values are filled with restval.
with open('irregular.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f, restkey='extra_fields', restval='N/A')
    for row in reader:
        print(f"Regular data: {row}")
        if 'extra_fields' in row:
print(f"Extra fields: {row['extra_fields']}")

Dictionary-based CSV writer that writes dictionaries as CSV rows, using field names to determine column order and handling.
class DictWriter:
    """
    CSV writer that accepts dictionaries and writes them as CSV rows.

    Requires fieldnames to determine column order and content. Mirrors
    :class:`csv.DictWriter` with CleverCSV dialect support.
    """

    def __init__(
        self,
        f: SupportsWrite[str],
        fieldnames: Collection[str],
        restval: Optional[Any] = '',
        extrasaction: Literal['raise', 'ignore'] = 'raise',
        dialect: Union[str, SimpleDialect, csv.Dialect] = 'excel',
        *args,
        **kwds
    ):
        """
        Initialize dictionary CSV writer.

        Parameters:
        - f: File-like object that supports writing
        - fieldnames: Field names that determine column order
        - restval: Value written for keys missing from a row's dictionary
        - extrasaction: Action for extra dictionary keys ('raise' or 'ignore')
        - dialect: Dialect specification (name, SimpleDialect, or csv.Dialect)
        - *args, **kwds: Additional arguments passed to the underlying writer
        """

    def writeheader(self) -> Any:
        """
        Write header row containing field names.

        Returns:
        Return value from underlying writerow call
        """

    def writerow(self, rowdict: Mapping[str, Any]) -> Any:
        """
        Write a single dictionary as a CSV row.

        Parameters:
        - rowdict: Dictionary with field values

        Returns:
        Return value from underlying writerow call

        Raises:
        ValueError: If extrasaction='raise' and dictionary contains extra keys
        """

    def writerows(self, rowdicts: Iterable[Mapping[str, Any]]) -> None:
        """
        Write multiple dictionaries as CSV rows.

        Parameters:
        - rowdicts: Iterable of dictionaries to write

        Raises:
        ValueError: If extrasaction='raise' and any dictionary contains extra keys
        """
@property
def fieldnames(self) -> Collection[str]:
"""Field names that determine column order."""

import clevercsv
# Basic dictionary writing: fieldnames fixes the column order.
data = [
    {'name': 'Alice', 'age': 30, 'city': 'New York'},
    {'name': 'Bob', 'age': 25, 'city': 'San Francisco'},
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}
]
fieldnames = ['name', 'age', 'city']
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

# With a specific dialect (';'-delimited, '"' quote, no escape char)
dialect = clevercsv.SimpleDialect(';', '"', '')
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=fieldnames, dialect=dialect)
    writer.writeheader()
    for row in data:
        writer.writerow(row)

# Handle missing values: restval fills any field absent from a row's dict.
data_with_missing = [
    {'name': 'Alice', 'age': 30},  # Missing 'city'
    {'name': 'Bob', 'city': 'SF'},  # Missing 'age'
]
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age', 'city'], restval='Unknown')
    writer.writeheader()
    writer.writerows(data_with_missing)

# Handle extra fields
data_with_extra = [
    {'name': 'Alice', 'age': 30, 'city': 'NYC', 'country': 'USA'},  # Extra 'country'
]
# Ignore extra fields (default extrasaction='raise' would raise ValueError)
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age', 'city'], extrasaction='ignore')
    writer.writeheader()
writer.writerows(data_with_extra)

Process CSV data through transformation pipelines while maintaining dictionary structure:
import clevercsv
def process_employee_data(input_file, output_file):
    """Read employee rows, normalise their fields, and write the result.

    Parameters:
    - input_file: path of the CSV file to read
    - output_file: path of the CSV file to write

    Each row gets a title-cased name, integer age (0 if non-numeric),
    numeric salary, and a computed 'seniority' field.
    """
    processed = []
    with open(input_file, 'r', newline='') as infile:
        for row in clevercsv.DictReader(infile):
            # Normalise the raw string fields in place.
            row['name'] = row['name'].title()  # Capitalize names
            row['age'] = int(row['age']) if row['age'].isdigit() else 0
            row['salary'] = float(row['salary'].replace('$', '').replace(',', ''))
            # Derived field computed from the cleaned age.
            row['seniority'] = 'Senior' if int(row['age']) > 40 else 'Junior'
            processed.append(row)
    # Nothing to write when the input had no data rows.
    if not processed:
        return
    # Column order is taken from the first processed row.
    header = list(processed[0].keys())
    with open(output_file, 'w', newline='') as outfile:
        writer = clevercsv.DictWriter(outfile, fieldnames=header)
        writer.writeheader()
        writer.writerows(processed)
# Usage
process_employee_data('employees.csv', 'processed_employees.csv')

Validate and filter CSV data using dictionary operations:
import clevercsv
def validate_and_filter_data(filename, validation_rules):
    """Split the rows of *filename* into valid and invalid records.

    Parameters:
    - filename: path of the CSV file to read
    - validation_rules: mapping of field name -> predicate; a row is invalid
      when a required field is missing or its predicate returns False

    Returns:
    (valid_records, invalid_records) where each invalid entry records the
    1-based row number, the row data, and the list of error messages.
    """
    valid_records = []
    invalid_records = []
    with open(filename, 'r', newline='') as f:
        reader = clevercsv.DictReader(f)
        for row_num, row in enumerate(reader, 1):
            errors = []
            # Collect every rule violation for this row.
            for field, rule in validation_rules.items():
                if field not in row:
                    errors.append(f"Missing required field: {field}")
                elif not rule(row[field]):
                    errors.append(f"Invalid {field}: {row[field]}")
            if not errors:
                valid_records.append(row)
            else:
                invalid_records.append({
                    'row_number': row_num,
                    'data': row,
                    'errors': errors
                })
    return valid_records, invalid_records
# Usage: each rule is a predicate over the raw string value of one field.
validation_rules = {
    'email': lambda x: '@' in x and '.' in x,  # crude shape check, not full RFC validation
    'age': lambda x: x.isdigit() and 0 < int(x) < 120,
    'salary': lambda x: x.replace('$', '').replace(',', '').replace('.', '').isdigit()
}
valid_data, invalid_data = validate_and_filter_data('employees.csv', validation_rules)
print(f"Valid records: {len(valid_data)}")
print(f"Invalid records: {len(invalid_data)}")

Map and rename columns during CSV processing:
import clevercsv
def remap_csv_columns(input_file, output_file, column_mapping):
    """Rename columns per *column_mapping* and write the reorganised rows.

    Parameters:
    - input_file: path of the CSV file to read
    - output_file: path of the CSV file to write
    - column_mapping: mapping of old column name -> new column name; columns
      absent from a row default to '' in the output

    Columns not listed in the mapping are dropped.
    """
    with open(input_file, 'r', newline='') as infile:
        # Build each output row by looking up the old column, '' if missing.
        remapped = [
            {new_name: row.get(old_name, '') for old_name, new_name in column_mapping.items()}
            for row in clevercsv.DictReader(infile)
        ]
    # Only write a file when there was at least one data row.
    if remapped:
        with open(output_file, 'w', newline='') as outfile:
            writer = clevercsv.DictWriter(outfile, fieldnames=list(column_mapping.values()))
            writer.writeheader()
            writer.writerows(remapped)
# Usage: keys are the existing column names, values are the new names.
column_mapping = {
    'full_name': 'name',
    'years_old': 'age',
    'home_city': 'city',
    'job_title': 'position'
}
remap_csv_columns('input.csv', 'output.csv', column_mapping)

CleverCSV warns about duplicate field names in headers:
import clevercsv
import warnings

# Capture warnings about duplicate headers: record=True collects emitted
# warnings into the list `w` instead of printing them.
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")  # ensure duplicates are not suppressed
    with open('file_with_duplicate_headers.csv', 'r', newline='') as f:
        reader = clevercsv.DictReader(f)
        data = list(reader)
# Report anything that was captured while reading.
if w:
    for warning in w:
print(f"Warning: {warning.message}")

import clevercsv
# Raise error on extra fields: with extrasaction='raise' (the default),
# writing a dict containing keys outside fieldnames raises ValueError.
try:
    with open('output.csv', 'w', newline='') as f:
        writer = clevercsv.DictWriter(f, fieldnames=['a', 'b'], extrasaction='raise')
        writer.writerow({'a': '1', 'b': '2', 'c': '3'})  # 'c' is extra
except ValueError as e:
    print(f"Extra field error: {e}")

# Ignore extra fields silently
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['a', 'b'], extrasaction='ignore')
writer.writerow({'a': '1', 'b': '2', 'c': '3'})  # 'c' ignored

import clevercsv
# Use restval for missing fields: absent keys are written as 'N/A'.
data = [{'name': 'Alice'}, {'name': 'Bob', 'age': 25}]  # Missing 'age' in first row
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age'], restval='N/A')
    writer.writeheader()
writer.writerows(data)

import clevercsv
def process_large_csv_efficiently(filename):
    """Stream a large CSV file, handling each record as it is read.

    Parameters:
    - filename: path of the CSV file to process

    Rows are consumed one at a time and never accumulated, so memory
    usage stays constant regardless of file size.
    """
    with open(filename, 'r', newline='') as f:
        for record in clevercsv.DictReader(f):
            # Handle the record immediately; nothing is stored.
            process_single_record(record)
def process_large_csv_inefficiently(filename):
    """Inefficient approach that loads everything into memory.

    Counter-example: shown only to contrast with the streaming version.
    """
    with open(filename, 'r', newline='') as f:
        reader = clevercsv.DictReader(f)
        all_records = list(reader)  # Materialises the entire file in memory at once
        for record in all_records:
process_single_record(record)

# Efficient: Access fieldnames once
reader = clevercsv.DictReader(file)
fieldnames = reader.fieldnames  # Cache fieldnames once, outside the loop
for row in reader:
    # Use cached fieldnames if needed
    process_row(row, fieldnames)

# Less efficient: Access fieldnames repeatedly in loop
reader = clevercsv.DictReader(file)
for row in reader:
    fieldnames = reader.fieldnames  # Repeated property access on every iteration
process_row(row, fieldnames)

Install with Tessl CLI
npx tessl i tessl/pypi-clevercsv