CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-clevercsv

A Python package for handling messy CSV files with enhanced dialect detection capabilities

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

dictionary-operations.mddocs/

Dictionary Operations

Dictionary-based CSV reading and writing that treats the first row as column headers, providing a more convenient interface for structured CSV data. These classes mirror Python's csv.DictReader and csv.DictWriter but with CleverCSV's enhanced dialect support.

Capabilities

DictReader Class

Dictionary-based CSV reader that automatically uses the first row as field names (headers) and returns each subsequent row as a dictionary.

class DictReader:
    """
    CSV reader that returns rows as dictionaries.
    Uses first row as field names unless fieldnames are explicitly provided.

    Mirrors the standard library's csv.DictReader, but the dialect may
    also be given as a CleverCSV SimpleDialect.
    """

    def __init__(
        self,
        f: Iterable[str],
        fieldnames: Optional[Sequence[str]] = None,
        restkey: Optional[str] = None,
        restval: Optional[str] = None,
        dialect: Union[str, SimpleDialect, csv.Dialect] = 'excel',
        *args,
        **kwds
    ):
        """
        Initialize dictionary CSV reader.

        Parameters:
        - f: File-like object or iterable of strings
        - fieldnames: Field names to use (the first row is consumed as the
          header when None)
        - restkey: Key under which values beyond the fieldnames length are
          collected
        - restval: Fill value used for fields missing from a short row
        - dialect: Dialect specification (dialect name, SimpleDialect, or
          csv.Dialect)
        - *args, **kwds: Additional arguments passed to underlying reader
        """

    def __iter__(self) -> Iterator[Dict[str, str]]:
        """Return iterator over dictionary rows."""

    def __next__(self) -> Dict[str, str]:
        """
        Return next row as dictionary.

        Returns:
        Dictionary mapping field names to values

        Raises:
        StopIteration: When no more rows available
        """

    @property
    def fieldnames(self) -> Sequence[str]:
        """Field names (column headers) used as the dictionary keys."""

    @fieldnames.setter
    def fieldnames(self, value: Sequence[str]) -> None:
        """Set field names explicitly (overrides header detection)."""

    @property
    def line_num(self) -> int:
        """Current line number being processed."""

Usage Examples

import clevercsv

# Basic dictionary reading
with open('employees.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f)
    for row in reader:
        print(f"Name: {row['name']}, Age: {row['age']}, Department: {row['dept']}")

# With automatic dialect detection
with open('data.csv', 'r', newline='') as f:
    sample = f.read()
    dialect = clevercsv.Detector().detect(sample)
    f.seek(0)
    reader = clevercsv.DictReader(f, dialect=dialect)
    records = list(reader)

# Custom field names (ignore first row)
fieldnames = ['id', 'name', 'score', 'grade']
with open('data.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f, fieldnames=fieldnames)
    for row in reader:
        print(f"Student {row['name']} scored {row['score']}")

# Handle extra/missing fields
with open('irregular.csv', 'r', newline='') as f:
    reader = clevercsv.DictReader(f, restkey='extra_fields', restval='N/A')
    for row in reader:
        print(f"Regular data: {row}")
        if 'extra_fields' in row:
            print(f"Extra fields: {row['extra_fields']}")

DictWriter Class

Dictionary-based CSV writer that writes dictionaries as CSV rows, using the field names to determine column order and to decide how missing or extra dictionary keys are handled.

class DictWriter:
    """
    CSV writer that accepts dictionaries and writes them as CSV rows.
    Requires fieldnames to determine column order and content.

    Mirrors the standard library's csv.DictWriter, but the dialect may
    also be given as a CleverCSV SimpleDialect.
    """

    def __init__(
        self,
        f: SupportsWrite[str],
        fieldnames: Collection[str],
        restval: Optional[Any] = '',
        extrasaction: Literal['raise', 'ignore'] = 'raise',
        dialect: Union[str, SimpleDialect, csv.Dialect] = 'excel',
        *args,
        **kwds
    ):
        """
        Initialize dictionary CSV writer.

        Parameters:
        - f: File-like object that supports writing
        - fieldnames: Field names that determine column order
        - restval: Value written for fieldnames missing from a row dict
        - extrasaction: Action for extra dictionary keys ('raise' or 'ignore')
        - dialect: Dialect specification (dialect name, SimpleDialect, or
          csv.Dialect)
        - *args, **kwds: Additional arguments passed to underlying writer
        """

    def writeheader(self) -> Any:
        """
        Write header row containing field names.

        Returns:
        Return value from underlying writerow call
        """

    def writerow(self, rowdict: Mapping[str, Any]) -> Any:
        """
        Write a single dictionary as a CSV row.

        Parameters:
        - rowdict: Dictionary with field values

        Returns:
        Return value from underlying writerow call

        Raises:
        ValueError: If extrasaction='raise' and dictionary contains keys
        not listed in fieldnames
        """

    def writerows(self, rowdicts: Iterable[Mapping[str, Any]]) -> None:
        """
        Write multiple dictionaries as CSV rows.

        Parameters:
        - rowdicts: Iterable of dictionaries to write

        Raises:
        ValueError: If extrasaction='raise' and any dictionary contains
        keys not listed in fieldnames
        """

    @property
    def fieldnames(self) -> Collection[str]:
        """Field names that determine column order."""

Usage Examples

import clevercsv

# Basic dictionary writing
data = [
    {'name': 'Alice', 'age': 30, 'city': 'New York'},
    {'name': 'Bob', 'age': 25, 'city': 'San Francisco'},
    {'name': 'Charlie', 'age': 35, 'city': 'Chicago'}
]

fieldnames = ['name', 'age', 'city']
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

# With specific dialect
dialect = clevercsv.SimpleDialect(';', '"', '')
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=fieldnames, dialect=dialect)
    writer.writeheader()
    for row in data:
        writer.writerow(row)

# Handle missing values
data_with_missing = [
    {'name': 'Alice', 'age': 30},  # Missing 'city'
    {'name': 'Bob', 'city': 'SF'},  # Missing 'age'
]

with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age', 'city'], restval='Unknown')
    writer.writeheader()
    writer.writerows(data_with_missing)

# Handle extra fields
data_with_extra = [
    {'name': 'Alice', 'age': 30, 'city': 'NYC', 'country': 'USA'},  # Extra 'country'
]

# Ignore extra fields
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age', 'city'], extrasaction='ignore')
    writer.writeheader()
    writer.writerows(data_with_extra)

Advanced Usage Patterns

Data Processing Pipeline

Process CSV data through transformation pipelines while maintaining dictionary structure:

import clevercsv

def process_employee_data(input_file, output_file):
    """Read employee records, normalize their fields, and write the result.

    Per-row transformations:
    - name: title-cased
    - age: converted to int (0 when not a plain digit string)
    - salary: '$' and ',' stripped, then converted to float
    - seniority: derived field, 'Senior' if age > 40 else 'Junior'

    Parameters:
    - input_file: path of the CSV file to read (first row is the header)
    - output_file: path of the CSV file to write; not written at all when
      the input has no data rows

    Raises:
    - KeyError: if a row lacks a 'name', 'age', or 'salary' column
    - ValueError: if a salary value cannot be parsed as a float
    """
    processed_rows = []

    with open(input_file, 'r', newline='') as infile:
        reader = clevercsv.DictReader(infile)

        for row in reader:
            # Normalize the raw string fields in place.
            row['name'] = row['name'].title()
            row['age'] = int(row['age']) if row['age'].isdigit() else 0
            row['salary'] = float(row['salary'].replace('$', '').replace(',', ''))

            # Derived field: row['age'] is already an int at this point,
            # so no re-conversion is needed.
            row['seniority'] = 'Senior' if row['age'] > 40 else 'Junior'

            processed_rows.append(row)

    # Write processed data, preserving the (augmented) column order of
    # the first processed row.
    if processed_rows:
        fieldnames = list(processed_rows[0].keys())
        with open(output_file, 'w', newline='') as outfile:
            writer = clevercsv.DictWriter(outfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(processed_rows)

# Usage
process_employee_data('employees.csv', 'processed_employees.csv')

Data Validation and Filtering

Validate and filter CSV data using dictionary operations:

import clevercsv

def validate_and_filter_data(filename, validation_rules):
    """Split the rows of a CSV file into valid and invalid records.

    ``validation_rules`` maps a field name to a predicate. A row is valid
    only when every rule's field is present and its predicate returns a
    truthy value for that field's string value.

    Returns a ``(valid_records, invalid_records)`` tuple; each invalid
    entry records the 1-based data-row number, the row itself, and the
    list of error messages.
    """
    valid_records = []
    invalid_records = []

    with open(filename, 'r', newline='') as f:
        reader = clevercsv.DictReader(f)

        for row_num, row in enumerate(reader, 1):
            # Collect one message per failing rule, in rule order.
            errors = [
                f"Invalid {field}: {row[field]}"
                if field in row
                else f"Missing required field: {field}"
                for field, rule in validation_rules.items()
                if field not in row or not rule(row[field])
            ]

            if not errors:
                valid_records.append(row)
                continue

            invalid_records.append({
                'row_number': row_num,
                'data': row,
                'errors': errors,
            })

    return valid_records, invalid_records

# Usage
validation_rules = {
    'email': lambda x: '@' in x and '.' in x,
    'age': lambda x: x.isdigit() and 0 < int(x) < 120,
    'salary': lambda x: x.replace('$', '').replace(',', '').replace('.', '').isdigit()
}

valid_data, invalid_data = validate_and_filter_data('employees.csv', validation_rules)
print(f"Valid records: {len(valid_data)}")
print(f"Invalid records: {len(invalid_data)}")

Column Mapping and Renaming

Map and rename columns during CSV processing:

import clevercsv

def remap_csv_columns(input_file, output_file, column_mapping):
    """Rewrite a CSV file with columns renamed via ``column_mapping``.

    ``column_mapping`` maps old column names to new ones. Input columns
    missing from a row are emitted as empty strings; input columns not
    mentioned in the mapping are dropped. When the input has no data
    rows, no output file is written.
    """
    with open(input_file, 'r', newline='') as src:
        # Build each output row directly from the mapping; dict.get
        # supplies '' for columns absent from the input row.
        remapped_data = [
            {new_name: row.get(old_name, '') for old_name, new_name in column_mapping.items()}
            for row in clevercsv.DictReader(src)
        ]

    if not remapped_data:
        return

    # Output column order follows the mapping's insertion order.
    with open(output_file, 'w', newline='') as dst:
        writer = clevercsv.DictWriter(dst, fieldnames=list(column_mapping.values()))
        writer.writeheader()
        writer.writerows(remapped_data)

# Usage
column_mapping = {
    'full_name': 'name',
    'years_old': 'age',
    'home_city': 'city',
    'job_title': 'position'
}

remap_csv_columns('input.csv', 'output.csv', column_mapping)

Error Handling

Handling Duplicate Field Names

CleverCSV warns about duplicate field names in headers:

import clevercsv
import warnings

# Capture warnings about duplicate headers
with warnings.catch_warnings(record=True) as w:
    warnings.simplefilter("always")
    
    with open('file_with_duplicate_headers.csv', 'r', newline='') as f:
        reader = clevercsv.DictReader(f)
        data = list(reader)
    
    if w:
        for warning in w:
            print(f"Warning: {warning.message}")

Handling Extra Fields

import clevercsv

# Raise error on extra fields
try:
    with open('output.csv', 'w', newline='') as f:
        writer = clevercsv.DictWriter(f, fieldnames=['a', 'b'], extrasaction='raise')
        writer.writerow({'a': '1', 'b': '2', 'c': '3'})  # 'c' is extra
except ValueError as e:
    print(f"Extra field error: {e}")

# Ignore extra fields silently
with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['a', 'b'], extrasaction='ignore')
    writer.writerow({'a': '1', 'b': '2', 'c': '3'})  # 'c' ignored

Handling Missing Fields

import clevercsv

# Use restval for missing fields
data = [{'name': 'Alice'}, {'name': 'Bob', 'age': 25}]  # Missing 'age' in first row

with open('output.csv', 'w', newline='') as f:
    writer = clevercsv.DictWriter(f, fieldnames=['name', 'age'], restval='N/A')
    writer.writeheader()
    writer.writerows(data)

Performance Considerations

Memory Efficiency for Large Files

import clevercsv

def process_large_csv_efficiently(filename):
    """Stream a CSV file row by row, keeping memory usage constant.

    Each row is handed to ``process_single_record`` as soon as it is
    parsed; nothing is accumulated, so arbitrarily large files can be
    handled.
    """
    with open(filename, 'r', newline='') as f:
        for record in clevercsv.DictReader(f):
            # Handle the record immediately instead of collecting rows.
            process_single_record(record)

def process_large_csv_inefficiently(filename):
    """Counter-example: materialize the whole CSV file before processing.

    ``list(...)`` pulls every parsed row into memory at once, so peak
    memory grows with file size — avoid this pattern for large inputs.
    """
    with open(filename, 'r', newline='') as f:
        rows_in_memory = list(clevercsv.DictReader(f))
        for record in rows_in_memory:
            process_single_record(record)

Field Name Optimization

# Efficient: Access fieldnames once
reader = clevercsv.DictReader(file)
fieldnames = reader.fieldnames  # Cache fieldnames
for row in reader:
    # Use cached fieldnames if needed
    process_row(row, fieldnames)

# Less efficient: Access fieldnames repeatedly in loop
reader = clevercsv.DictReader(file)
for row in reader:
    fieldnames = reader.fieldnames  # Repeated access
    process_row(row, fieldnames)

Install with Tessl CLI

npx tessl i tessl/pypi-clevercsv

docs

core-reading-writing.md

data-reading.md

data-writing.md

dialect-detection.md

dialects-configuration.md

dictionary-operations.md

index.md

tile.json