A Python package for extracting, transforming and loading tables of data.

Tools for data validation, quality assessment, and statistical analysis. Includes data profiling, constraint validation, summary statistics, and visualization utilities for understanding and validating data quality.
Validate data against constraints and rules to ensure data quality.
def validate(table, constraints=None, header=None) -> Table:
"""
Validate table data against constraints and expected header structure.
Parameters:
- table: Input table to validate
- constraints: List of constraint dictionaries with the following keys:
- name (str): Name of the constraint
- field (str, optional): Field name to validate (omit for row-level constraints)
- test (callable, optional): Function to test field values (e.g., int, float)
- assertion (callable, optional): Function returning True/False for validation
- optional (bool, optional): If True, missing fields are allowed
- header: Expected header tuple/list for structure validation
Returns:
Table with columns: name, row, field, value, error
Example constraint formats:
- Field type test: dict(name='foo_int', field='foo', test=int)
- Field assertion: dict(name='bar_enum', field='bar', assertion=lambda v: v in ['A', 'B'])
- Row-level assertion: dict(name='no_nulls', assertion=lambda row: None not in row)
- Optional field: dict(name='optional_field', field='opt', test=str, optional=True)
"""Tools for examining and understanding table structure and content.
def look(table, limit=0, vrepr=None, index_header=None, style=None, truncate=None, width=None):
"""
Print a table in a formatted display.
Parameters:
- table: Input table
- limit: Maximum number of rows to display (0 for all)
- vrepr: Function for value representation
- index_header: Whether to show row indices
- style: Display style ('grid', 'simple', 'minimal')
- truncate: Maximum length for cell values
- width: Maximum display width
"""
def lookall(table, **kwargs):
"""Print entire table with formatting options."""
def lookstr(table, limit=0, **kwargs) -> str:
"""
Return table as formatted string.
Parameters:
- table: Input table
- limit: Maximum number of rows
- kwargs: Formatting options
Returns:
Formatted string representation of table
"""
def lookallstr(table, **kwargs) -> str:
"""Return entire table as formatted string."""
def see(table, limit=0, vrepr=None, index_header=None):
"""
Print table with field details and statistics.
Parameters:
- table: Input table
- limit: Maximum number of rows to analyze
- vrepr: Function for value representation
- index_header: Whether to show row indices
"""Calculate summary statistics and analyze data distributions.
def stats(table, field):
"""
Return basic statistics for a numeric field.
Parameters:
- table: Input table
- field: Numeric field name
Returns:
Dictionary with count, min, max, sum, mean, stddev
"""
def limits(table, field):
"""
Return minimum and maximum values for a field.
Parameters:
- table: Input table
- field: Field name
Returns:
Tuple of (min_value, max_value)
"""
def nrows(table) -> int:
"""
Count the number of data rows in the table.
Parameters:
- table: Input table
Returns:
Integer count of rows (excluding header)
"""Analyze data types, patterns, and quality metrics.
def typeset(table, field):
"""
Return set of types found in the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Set of Python types found in the field
"""
def typecounter(table, field):
"""
Return a Counter of Python types found in the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Counter object with type counts
"""
def typecounts(table, field) -> Table:
"""
Return a table with type counts for the specified field.
Parameters:
- table: Input table
- field: Field name to analyze
Returns:
Table with type names and counts
"""
def stringpatterncounter(table, field):
"""
Count string patterns in the specified field.
Parameters:
- table: Input table
- field: Field name containing string values
Returns:
Counter with string pattern counts
"""
def stringpatterns(table, field) -> Table:
"""
Return a table with string pattern counts.
Parameters:
- table: Input table
- field: Field name containing string values
Returns:
Table with patterns and their frequencies
"""
def rowlengths(table) -> Table:
"""
Return counts of row lengths in the table.
Parameters:
- table: Input table
Returns:
Table with row lengths and their frequencies
"""Analyze value distributions and frequencies.
def valuecount(table, field, value, missing=None) -> int:
"""
Count occurrences of a specific value in a field.
Parameters:
- table: Input table
- field: Field name to search
- value: Value to count
- missing: Value to treat as missing
Returns:
Integer count of occurrences
"""
def valuecounter(table, *field, **kwargs):
"""
Return a Counter of values in the specified field(s).
Parameters:
- table: Input table
- field: Field name(s) to count
- kwargs: Additional options
Returns:
Counter object with value frequencies
"""
def valuecounts(table, *field, **kwargs) -> Table:
"""
Return a table with value counts for the specified field(s).
Parameters:
- table: Input table
- field: Field name(s) to count
- kwargs: Additional options
Returns:
Table with values and their frequencies
"""Identify data quality issues and inconsistencies.
def diffheaders(t1, t2):
"""
Find differences between table headers.
Parameters:
- t1: First table
- t2: Second table
Returns:
Information about header differences
"""
def diffvalues(t1, t2, field):
"""
Find different values between tables for a field.
Parameters:
- t1: First table
- t2: Second table
- field: Field name to compare
Returns:
Information about value differences
"""
def parsecounter(table, field, parsers=(('int', int), ('float', float))):
"""
Count successful parsing attempts with different parsers.
Parameters:
- table: Input table
- field: Field name to test parsing
- parsers: List of (name, parser_function) tuples
Returns:
Counter with parsing success counts
"""
def parsecounts(table, field, parsers=(('int', int), ('float', float))) -> Table:
"""
Return a table with parsing attempt counts.
Parameters:
- table: Input table
- field: Field name to test parsing
- parsers: List of (name, parser_function) tuples
Returns:
Table with parser names and success counts
"""Monitor and measure data processing performance.
def progress(table, batchsize=1000, prefix="", out=None) -> Table:
"""
Report progress while iterating through table.
Parameters:
- table: Input table
- batchsize: Number of rows between progress reports
- prefix: Prefix for progress messages
- out: Output stream for progress messages
Returns:
Table that reports progress during iteration
"""
def clock(table) -> Table:
"""
Time table operations.
Parameters:
- table: Input table
Returns:
Table that measures processing time
"""Generate test data for validation and testing purposes.
def randomtable(numflds=5, numrows=100, wait=0, seed=None) -> Table:
"""
Generate a table with random data.
Parameters:
- numflds: Number of fields
- numrows: Number of rows
- wait: Delay between row generation (for testing)
- seed: Random seed for reproducibility
Returns:
Table with random data
"""
def dummytable(numflds=5, numrows=100) -> Table:
"""
Generate a table with dummy data.
Parameters:
- numflds: Number of fields
- numrows: Number of rows
Returns:
Table with predictable dummy data
"""import petl as etl
data = etl.fromcsv('dataset.csv')
# Quick look at data structure
etl.look(data, limit=10)
# Detailed analysis with statistics
etl.see(data, limit=20)
# Get formatted string representation
table_str = etl.lookstr(data, limit=5, style='simple')
print(table_str)

import petl as etl
sales = etl.fromcsv('sales.csv')
# Basic statistics for numeric field
sales_stats = etl.stats(sales, 'amount')
print(f"Mean: {sales_stats['mean']}, StdDev: {sales_stats['stddev']}")
# Find data range
min_val, max_val = etl.limits(sales, 'amount')
print(f"Range: {min_val} to {max_val}")
# Count total rows
total_rows = etl.nrows(sales)
print(f"Total records: {total_rows}")import petl as etl
customer_data = etl.fromcsv('customers.csv')
# Analyze data types
age_types = etl.typecounts(customer_data, 'age')
etl.look(age_types)
# Check string patterns
phone_patterns = etl.stringpatterns(customer_data, 'phone')
etl.look(phone_patterns)
# Analyze row structure
row_lengths = etl.rowlengths(customer_data)
etl.look(row_lengths)
# Test parsing capabilities
income_parsing = etl.parsecounts(customer_data, 'income', [
('int', int),
('float', float),
('currency', lambda x: float(x.replace('$', '').replace(',', '')))
])
etl.look(income_parsing)

import petl as etl
survey = etl.fromcsv('survey.csv')
# Count specific value
male_count = etl.valuecount(survey, 'gender', 'Male')
print(f"Male respondents: {male_count}")
# Get value frequency table
gender_dist = etl.valuecounts(survey, 'gender')
etl.look(gender_dist)
# Cross-tabulation
region_gender = etl.valuecounts(survey, 'region', 'gender')
etl.look(region_gender)
# Get counter for programmatic access
age_counter = etl.valuecounter(survey, 'age_group')
most_common = age_counter.most_common(3)
print(f"Top 3 age groups: {most_common}")import petl as etl
# Compare two datasets
current_data = etl.fromcsv('current.csv')
previous_data = etl.fromcsv('previous.csv')
# Check header consistency
header_diff = etl.diffheaders(current_data, previous_data)
if header_diff:
print("Header differences found:", header_diff)
# Compare specific field values
value_diff = etl.diffvalues(current_data, previous_data, 'status')
if value_diff:
print("Value differences in status field:", value_diff)
# Validate data format consistency
email_types = etl.typeset(current_data, 'email')
if len(email_types) > 1:
print("Mixed data types found in email field:", email_types)import petl as etl
products = etl.fromcsv('products.csv')
# Define validation constraints: a list of constraint dicts with
# 'name', 'field', and a 'test' or 'assertion' key, per the validate API
constraints = [
    dict(name='price_positive', field='price',
         assertion=lambda x: x is not None and float(x) > 0),
    dict(name='category_enum', field='category',
         assertion=lambda x: x in ['Electronics', 'Clothing', 'Books']),
    dict(name='stock_non_negative', field='stock', test=int,
         assertion=lambda x: int(x) >= 0)
]
# Validate data; the result table has columns: name, row, field, value, error
validation_results = etl.validate(products, constraints=constraints)
etl.look(validation_results)
# Each row of the results table describes one validation problem
failure_count = etl.nrows(validation_results)
print(f"Validation failures: {failure_count}")

import petl as etl
large_dataset = etl.fromcsv('large_file.csv')
# Monitor processing progress
processed = (large_dataset
.progress(batchsize=10000, prefix="Processing: ")
.select('status', 'active')
.convert('amount', float)
.sort('timestamp'))
# Time the operations
timed_result = etl.clock(processed)
# Process with timing
result = etl.tocsv(timed_result, 'output.csv')

Install with Tessl CLI:
npx tessl i tessl/pypi-petl