A Python package for extracting, transforming and loading tables of data.
---
Advanced reshaping operations, including pivoting, melting, transposing, and data normalization. Essential for converting between wide and long data formats and for restructuring data for analysis and reporting.
Transform between wide and long data formats for different analysis needs.
def melt(table, key=None, variables=None, variablefield='variable', valuefield='value') -> Table:
    """
    Reshape a table from wide to long format.

    Each melted field produces one (variable, value) row per input row.

    Parameters:
    - table: input table
    - key: field name or list of field names kept as identifier variables
    - variables: fields to melt (default: all fields not listed in ``key``)
    - variablefield: name of the output column holding the melted field names
    - valuefield: name of the output column holding the melted values

    Returns:
    Long-format table with one row per (key, variable) combination.
    """
def recast(table, key=None, variablefield='variable', valuefield='value',
           samplesize=1000, missing=None, reducers=None, fill=None) -> Table:
    """
    Reshape a table from long to wide format (inverse of melt).

    Parameters:
    - table: input table in long format
    - key: fields that identify observations (become rows in the output)
    - variablefield: field whose values become the new column names
    - valuefield: field supplying the cell values
    - samplesize: number of rows sampled to detect the output structure
    - missing: value used for missing data
    - reducers: functions used when several values fall into one output cell
    - fill: value used to fill missing (key, variable) combinations

    Returns:
    Wide-format table.
    """
Create pivot tables and cross-tabulations for data analysis.
def pivot(table, f1, f2, f3, aggfun, missing=None, presorted=False,
          buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Construct a pivot table (cross-tabulation).

    Parameters:
    - table: input table
    - f1: field supplying the row labels
    - f2: field supplying the column labels
    - f3: field containing the values to aggregate
    - aggfun: aggregation function applied to each cell's values
      (e.g. sum, count, mean)
    - missing: value used for missing data
    - presorted: if True, the table is assumed to be already sorted
    - buffersize: buffer size used when sorting
    - tempdir: directory for temporary files created while sorting
    - cache: whether to cache results

    Returns:
    Pivot table with f1 values as rows, f2 values as columns, and
    aggregated f3 values as cells.
    """
Fundamental operations for changing table structure and layout.
def transpose(table) -> Table:
    """
    Transpose the table, swapping rows and columns.

    Parameters:
    - table: input table

    Returns:
    Transposed table in which the original columns become rows.
    """
def flatten(table) -> Table:
    """
    Flatten nested field values.

    Parameters:
    - table: input table with nested/iterable field values

    Returns:
    Table with flattened field values.
    """
def unflatten(*args, **kwargs) -> Table:
    """
    Restore nested structure to previously flattened data (inverse of flatten).

    Parameters:
    - args: unflatten specifications
    - kwargs: additional options

    Returns:
    Table with nested structure restored.
    """
Extract nested data structures into separate fields.
def unpack(table, field, newfields=None, include_original=False, missing=None) -> Table:
    """
    Unpack iterable field values (lists, tuples) into separate fields.

    Parameters:
    - table: input table
    - field: field containing iterable values
    - newfields: names for the unpacked fields
    - include_original: whether to keep the original field
    - missing: value used for missing data

    Returns:
    Table with the iterable elements as separate fields.
    """
def unpackdict(table, field, keys=None, includeoriginal=False,
               samplesize=1000, missing=None) -> Table:
    """
    Unpack dictionary field values into separate fields, one per key.

    Parameters:
    - table: input table
    - field: field containing dictionary values
    - keys: specific keys to unpack (default: all keys found by sampling)
    - includeoriginal: whether to keep the original field
    - samplesize: number of rows sampled to detect the dictionary keys
    - missing: value used for keys absent from a row's dictionary

    Returns:
    Table with the dictionary keys as separate fields.
    """
Use regular expressions to extract and reshape data from text fields.
def capture(table, field, pattern, newfields=None, include_original=False,
            flags=0, fill=None) -> Table:
    """
    Extract data into new fields using regular expression capture groups.

    Parameters:
    - table: input table
    - field: field the regex pattern is applied to
    - pattern: regular expression with capture groups
    - newfields: names for the captured groups
    - include_original: whether to keep the original field
    - flags: regular expression flags (e.g. re.IGNORECASE)
    - fill: values used for rows that do not match the pattern

    Returns:
    Table with one new field per capture group.
    """
def split(table, field, pattern, newfields=None, include_original=False,
          maxsplit=0, flags=0) -> Table:
    """
    Split field values into several fields using a regular expression.

    Parameters:
    - table: input table
    - field: field to split
    - pattern: regular expression used as the split delimiter
    - newfields: names for the split parts
    - include_original: whether to keep the original field
    - maxsplit: maximum number of splits (0 splits on all occurrences,
      as in ``re.split``)
    - flags: regular expression flags

    Returns:
    Table with the split parts as separate fields.
    """
def splitdown(table, field, pattern, maxsplit=0, flags=0) -> Table:
    """
    Split field values and spread the parts over multiple rows
    (rather than multiple columns, as ``split`` does).

    Parameters:
    - table: input table
    - field: field to split
    - pattern: regular expression used as the split delimiter
    - maxsplit: maximum number of splits
    - flags: regular expression flags

    Returns:
    Table with additional rows for the split values.
    """
import petl as etl
# Wide format data
wide_data = [
['id', 'name', 'jan_sales', 'feb_sales', 'mar_sales'],
[1, 'Alice', 100, 150, 200],
[2, 'Bob', 120, 180, 160]
]
wide_table = etl.wrap(wide_data)
# Convert to long format
long_table = etl.melt(wide_table,
key=['id', 'name'],
variablefield='month',
valuefield='sales')
# Result: id, name, month, sales
# 1, Alice, jan_sales, 100
# 1, Alice, feb_sales, 150
# etc.import petl as etl
long_data = [
['id', 'name', 'month', 'sales'],
[1, 'Alice', 'jan', 100],
[1, 'Alice', 'feb', 150],
[2, 'Bob', 'jan', 120],
[2, 'Bob', 'feb', 180]
]
long_table = etl.wrap(long_data)
# Convert to wide format
wide_table = etl.recast(long_table,
key=['id', 'name'],
variablefield='month',
valuefield='sales')
# Result: id, name, jan, feb
# 1, Alice, 100, 150
# 2, Bob, 120, 180import petl as etl
sales = etl.fromcsv('sales.csv')  # region, product, quarter, amount
# NOTE(review): fromcsv yields string values; 'amount' presumably needs
# converting to a numeric type before sum() behaves as intended
# (e.g. etl.convert(sales, 'amount', float)) — confirm.
# Create pivot table: regions as rows, quarters as columns,
# sum of amounts as cell values
pivot_table = etl.pivot(sales, 'region', 'quarter', 'amount', sum)
# Multiple aggregations via a custom function returning a tuple
pivot_with_count = etl.pivot(sales, 'product', 'region', 'amount',
                             lambda values: (sum(values), len(values)))
import petl as etl
# Unpack list/tuple fields
data_with_coords = [
['name', 'coordinates'],
['Location A', (40.7128, -74.0060)],
['Location B', (34.0522, -118.2437)]
]
table = etl.wrap(data_with_coords)
unpacked = etl.unpack(table, 'coordinates', ['latitude', 'longitude'])
# Result: name, latitude, longitude
# Unpack dictionary fields
data_with_dict = [
['name', 'details'],
['Alice', {'age': 30, 'city': 'NYC', 'salary': 75000}],
['Bob', {'age': 25, 'city': 'LA', 'salary': 65000}]
]
dict_table = etl.wrap(data_with_dict)
unpacked_dict = etl.unpackdict(dict_table, 'details')
# Result: name, age, city, salaryimport petl as etl
# Extract structured data from text fields using regular expressions
log_data = [
    ['timestamp', 'log_entry'],
    ['2023-01-01', '2023-01-01 10:30:45 ERROR user123 failed login'],
    ['2023-01-01', '2023-01-01 10:31:12 INFO user456 successful login']
]
log_table = etl.wrap(log_data)
# Extract components using regex: one new field per capture group
extracted = etl.capture(log_table, 'log_entry',
                        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (\w+) (.+)',
                        ['datetime', 'level', 'user', 'message'])
# Split email addresses on the '@' delimiter into two new fields
email_data = [
    ['contact'],
    ['alice@company.com'],
    ['bob@organization.org']
]
email_table = etl.wrap(email_data)
split_emails = etl.split(email_table, 'contact', '@',
                         ['username', 'domain'])
# Fixed: was "import etl as etl" — module 'etl' does not exist; every
# other example in this document imports petl under the alias etl.
import petl as etl
# Multi-step reshaping process
raw_data = etl.fromcsv('complex_data.csv')
# 1. Unpack nested JSON-like field
step1 = etl.unpackdict(raw_data, 'metadata')
# 2. Melt measurement columns
step2 = etl.melt(step1,
key=['id', 'timestamp', 'location'],
variables=['temp', 'humidity', 'pressure'],
variablefield='measurement_type',
valuefield='value')
# 3. Extract date components
step3 = etl.capture(step2, 'timestamp',
r'(\d{4})-(\d{2})-(\d{2})',
['year', 'month', 'day'])
# 4. Pivot for final analysis format
final = etl.pivot(step3, ['location', 'year'], 'measurement_type', 'value',
lambda vals: sum(vals) / len(vals)) # averageInstall with Tessl CLI
npx tessl i tessl/pypi-petl