A Python package for extracting, transforming and loading tables of data.

Tools for data validation, quality assessment, and statistical analysis. Includes data profiling, constraint validation, summary statistics, and visualization utilities for understanding and validating data quality.
Validate data against constraints and rules to ensure data quality.
def validate(table, constraints=None, header=None) -> Table:
"""
Validate table data against constraints and expected header structure.
Parameters:
- table: Input table to validate
- constraints: List of constraint dictionaries with the following keys:
- name (str): Name of the constraint
- field (str, optional): Field name to validate (omit for row-level constraints)
- test (callable, optional): Function to test field values (e.g., int, float)
- assertion (callable, optional): Function returning True/False for validation
- optional (bool, optional): If True, missing fields are allowed
- header: Expected header tuple/list for structure validation
Returns:
Table with columns: name, row, field, value, error
Example constraint formats:
- Field type test: dict(name='foo_int', field='foo', test=int)
- Field assertion: dict(name='bar_enum', field='bar', assertion=lambda v: v in ['A', 'B'])
- Row-level assertion: dict(name='no_nulls', assertion=lambda row: None not in row)
- Optional field: dict(name='optional_field', field='opt', test=str, optional=True)
"""Tools for examining and understanding table structure and content.
def look(table, limit=0, vrepr=None, index_header=None, style=None, truncate=None, width=None):
"""
Print a table in a formatted display.
Parameters:
- table: Input table
- limit: Maximum number of rows to display (0 for all)
- vrepr: Function for value representation
- index_header: Whether to show row indices
- style: Display style ('grid', 'simple', 'minimal')
- truncate: Maximum length for cell values
- width: Maximum display width
"""
def lookall(table, **kwargs):
"""Print entire table with formatting options."""
def lookstr(table, limit=0, **kwargs) -> str:
"""
Return table as formatted string.
Parameters:
- table: Input table
- limit: Maximum number of rows
- kwargs: Formatting options
Returns:
Formatted string representation of table
"""
def lookallstr(table, **kwargs) -> str:
"""Return entire table as formatted string."""
def see(table, limit=0, vrepr=None, index_header=None):
"""
Print table with field details and statistics.
Parameters:
- table: Input table
- limit: Maximum number of rows to analyze
- vrepr: Function for value representation
- index_header: Whether to show row indices
"""Calculate summary statistics and analyze data distributions.
def stats(table, field):
"""
Return basic statistics for a numeric field.
Parameters:
- table: Input table
- field: Numeric field name
Returns:
Dictionary with count, min, max, sum, mean, stddev
"""
def limits(table, field):
"""
Return minimum and maximum values for a field.
Parameters:
- table: Input table
- field: Field name
Returns:
Tuple of (min_value, max_value)
"""
def nrows(table) -> int:
"""
Count the number of data rows in the table.
Parameters:
- table: Input table
Returns:
Integer count of rows (excluding header)
"""Analyze data types, patterns, and quality metrics.
def typeset(table, field):
"""
Return set of types found in the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Set of Python types found in the field
"""
def typecounter(table, field):
"""
Return a Counter of Python types found in the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Counter object with type counts
"""
def typecounts(table, field) -> Table:
"""
Return a table with type counts for the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Table with type names and counts
"""
def stringpatterncounter(table, field):
"""
Count string patterns in the specified field.
Parameters:
- table: Input table
- field: Field name containing string values
Returns:
Counter with string pattern counts
"""
def stringpatterns(table, field) -> Table:
"""
Return a table with string pattern counts.
Parameters:
- table: Input table
- field: Field name containing string values
Returns:
Table with patterns and their frequencies
"""
def rowlengths(table) -> Table:
"""
Return counts of row lengths in the table.
Parameters:
- table: Input table
Returns:
Table with row lengths and their frequencies
"""Analyze value distributions and frequencies.
def valuecount(table, field, value, missing=None) -> int:
"""
Count occurrences of a specific value in a field.
Parameters:
- table: Input table
- field: Field name to search
- value: Value to count
- missing: Value to treat as missing
Returns:
Integer count of occurrences
"""
def valuecounter(table, *field, **kwargs):
"""
Return a Counter of values in the specified field(s).
Parameters:
- table: Input table
- field: Field name(s) to count
- kwargs: Additional options
Returns:
Counter object with value frequencies
"""
def valuecounts(table, *field, **kwargs) -> Table:
"""
Return a table with value counts for the specified field(s).
Parameters:
- table: Input table
- field: Field name(s) to count
- kwargs: Additional options
Returns:
Table with values and their frequencies
"""Identify data quality issues and inconsistencies.
def diffheaders(t1, t2):
"""
Find differences between table headers.
Parameters:
- t1: First table
- t2: Second table
Returns:
Information about header differences
"""
def diffvalues(t1, t2, field):
"""
Find different values between tables for a field.
Parameters:
- t1: First table
- t2: Second table
- field: Field name to compare
Returns:
Information about value differences
"""
def parsecounter(table, field, parsers=(('int', int), ('float', float))):
"""
Count successful parsing attempts with different parsers.
Parameters:
- table: Input table
- field: Field name to test parsing
- parsers: List of (name, parser_function) tuples
Returns:
Counter with parsing success counts
"""
def parsecounts(table, field, parsers=(('int', int), ('float', float))) -> Table:
"""
Return a table with parsing attempt counts.
Parameters:
- table: Input table
- field: Field name to test parsing
- parsers: List of (name, parser_function) tuples
Returns:
Table with parser names and success counts
"""Monitor and measure data processing performance.
def progress(table, batchsize=1000, prefix="", out=None) -> Table:
"""
Report progress while iterating through table.
Parameters:
- table: Input table
- batchsize: Number of rows between progress reports
- prefix: Prefix for progress messages
- out: Output stream for progress messages
Returns:
Table that reports progress during iteration
"""
def clock(table) -> Table:
"""
Time table operations.
Parameters:
- table: Input table
Returns:
Table that measures processing time
"""Generate test data for validation and testing purposes.
def randomtable(numflds=5, numrows=100, wait=0, seed=None) -> Table:
"""
Generate a table with random data.
Parameters:
- numflds: Number of fields
- numrows: Number of rows
- wait: Delay between row generation (for testing)
- seed: Random seed for reproducibility
Returns:
Table with random data
"""
def dummytable(numflds=5, numrows=100) -> Table:
"""
Generate a table with dummy data.
Parameters:
- numflds: Number of fields
- numrows: Number of rows
Returns:
Table with predictable dummy data
"""import petl as etl
data = etl.fromcsv('dataset.csv')
# Quick look at data structure
etl.look(data, limit=10)
# Detailed analysis with statistics
etl.see(data, limit=20)
# Get formatted string representation
table_str = etl.lookstr(data, limit=5, style='simple')
print(table_str)

import petl as etl
sales = etl.fromcsv('sales.csv')
# Basic statistics for numeric field
sales_stats = etl.stats(sales, 'amount')
print(f"Mean: {sales_stats['mean']}, StdDev: {sales_stats['stddev']}")
# Find data range
min_val, max_val = etl.limits(sales, 'amount')
print(f"Range: {min_val} to {max_val}")
# Count total rows
total_rows = etl.nrows(sales)
print(f"Total records: {total_rows}")import petl as etl
customer_data = etl.fromcsv('customers.csv')
# Analyze data types
age_types = etl.typecounts(customer_data, 'age')
etl.look(age_types)
# Check string patterns
phone_patterns = etl.stringpatterns(customer_data, 'phone')
etl.look(phone_patterns)
# Analyze row structure
row_lengths = etl.rowlengths(customer_data)
etl.look(row_lengths)
# Test parsing capabilities
income_parsing = etl.parsecounts(customer_data, 'income', [
('int', int),
('float', float),
('currency', lambda x: float(x.replace('$', '').replace(',', '')))
])
etl.look(income_parsing)

import petl as etl
survey = etl.fromcsv('survey.csv')
# Count specific value
male_count = etl.valuecount(survey, 'gender', 'Male')
print(f"Male respondents: {male_count}")
# Get value frequency table
gender_dist = etl.valuecounts(survey, 'gender')
etl.look(gender_dist)
# Cross-tabulation
region_gender = etl.valuecounts(survey, 'region', 'gender')
etl.look(region_gender)
# Get counter for programmatic access
age_counter = etl.valuecounter(survey, 'age_group')
most_common = age_counter.most_common(3)
print(f"Top 3 age groups: {most_common}")import petl as etl
# Compare two datasets
current_data = etl.fromcsv('current.csv')
previous_data = etl.fromcsv('previous.csv')
# Check header consistency
header_diff = etl.diffheaders(current_data, previous_data)
if header_diff:
print("Header differences found:", header_diff)
# Compare specific field values
value_diff = etl.diffvalues(current_data, previous_data, 'status')
if value_diff:
print("Value differences in status field:", value_diff)
# Validate data format consistency
email_types = etl.typeset(current_data, 'email')
if len(email_types) > 1:
print("Mixed data types found in email field:", email_types)import petl as etl
products = etl.fromcsv('products.csv')
# Define validation constraints: a list of constraint dicts with
# 'name', 'field', and a 'test' or 'assertion' key, per the validate API
constraints = [
    dict(name='price_positive', field='price',
         assertion=lambda x: x is not None and float(x) > 0),
    dict(name='category_enum', field='category',
         assertion=lambda x: x in ['Electronics', 'Clothing', 'Books']),
    dict(name='stock_non_negative', field='stock', test=int,
         assertion=lambda x: int(x) >= 0)
]
# Validate data; the result table has columns: name, row, field, value, error
validation_results = etl.validate(products, constraints=constraints)
etl.look(validation_results)
# Each row of the results table describes one validation problem
failure_count = etl.nrows(validation_results)
print(f"Validation failures: {failure_count}")

import petl as etl
large_dataset = etl.fromcsv('large_file.csv')
# Monitor processing progress
processed = (large_dataset
.progress(batchsize=10000, prefix="Processing: ")
.select('status', 'active')
.convert('amount', float)
.sort('timestamp'))
# Time the operations
timed_result = etl.clock(processed)
# Process with timing
result = etl.tocsv(timed_result, 'output.csv')

Install with Tessl CLI:
npx tessl i tessl/pypi-petl