A Python package for extracting, transforming and loading tables of data.
---
Advanced reshaping operations, including pivoting, melting, transposing, and data normalization. Essential for converting between wide and long data formats and for restructuring data for analysis and reporting.
Transform between wide and long data formats for different analysis needs.
def melt(table, key=None, variables=None, variablefield='variable', valuefield='value') -> Table:
    """
    Reshape a table from wide to long format.

    Each melted field produces one (variable, value) row per input row.

    Parameters:
    - table: input table
    - key: field name or list of field names kept as identifier variables
    - variables: fields to melt (default: all fields not listed in ``key``)
    - variablefield: name of the output column holding the melted field names
    - valuefield: name of the output column holding the melted values

    Returns:
    Long-format table with one row per (key, variable) combination.
    """
def recast(table, key=None, variablefield='variable', valuefield='value',
           samplesize=1000, missing=None, reducers=None, fill=None) -> Table:
    """
    Reshape a table from long to wide format (inverse of melt).

    Parameters:
    - table: input table in long format
    - key: fields that identify observations (become rows in the output)
    - variablefield: field whose values become the new column names
    - valuefield: field supplying the cell values
    - samplesize: number of rows sampled to detect the output structure
    - missing: value used for missing data
    - reducers: functions used when several values fall into one output cell
    - fill: value used to fill missing (key, variable) combinations

    Returns:
    Wide-format table.
    """
Create pivot tables and cross-tabulations for data analysis.
def pivot(table, f1, f2, f3, aggfun, missing=None, presorted=False,
          buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Construct a pivot table (cross-tabulation).

    Parameters:
    - table: input table
    - f1: field supplying the row labels
    - f2: field supplying the column labels
    - f3: field containing the values to aggregate
    - aggfun: aggregation function applied to each cell's values
      (e.g. sum, count, mean)
    - missing: value used for missing data
    - presorted: if True, the table is assumed to be already sorted
    - buffersize: buffer size used when sorting
    - tempdir: directory for temporary files created while sorting
    - cache: whether to cache results

    Returns:
    Pivot table with f1 values as rows, f2 values as columns, and
    aggregated f3 values as cells.
    """
Fundamental operations for changing table structure and layout.
def transpose(table) -> Table:
    """
    Transpose the table, swapping rows and columns.

    Parameters:
    - table: input table

    Returns:
    Transposed table in which the original columns become rows.
    """
def flatten(table) -> Table:
    """
    Flatten nested field values.

    Parameters:
    - table: input table with nested/iterable field values

    Returns:
    Table with flattened field values.
    """
def unflatten(*args, **kwargs) -> Table:
    """
    Restore nested structure to previously flattened data (inverse of flatten).

    Parameters:
    - args: unflatten specifications
    - kwargs: additional options

    Returns:
    Table with nested structure restored.
    """
Extract nested data structures into separate fields.
def unpack(table, field, newfields=None, include_original=False, missing=None) -> Table:
    """
    Unpack iterable field values (lists, tuples) into separate fields.

    Parameters:
    - table: input table
    - field: field containing iterable values
    - newfields: names for the unpacked fields
    - include_original: whether to keep the original field
    - missing: value used for missing data

    Returns:
    Table with the iterable elements as separate fields.
    """
def unpackdict(table, field, keys=None, includeoriginal=False,
               samplesize=1000, missing=None) -> Table:
    """
    Unpack dictionary field values into separate fields, one per key.

    Parameters:
    - table: input table
    - field: field containing dictionary values
    - keys: specific keys to unpack (default: all keys found by sampling)
    - includeoriginal: whether to keep the original field
    - samplesize: number of rows sampled to detect the dictionary keys
    - missing: value used for keys absent from a row's dictionary

    Returns:
    Table with the dictionary keys as separate fields.
    """
Use regular expressions to extract and reshape data from text fields.
def capture(table, field, pattern, newfields=None, include_original=False,
            flags=0, fill=None) -> Table:
    """
    Extract data into new fields using regular expression capture groups.

    Parameters:
    - table: input table
    - field: field the regex pattern is applied to
    - pattern: regular expression with capture groups
    - newfields: names for the captured groups
    - include_original: whether to keep the original field
    - flags: regular expression flags (e.g. re.IGNORECASE)
    - fill: values used for rows that do not match the pattern

    Returns:
    Table with one new field per capture group.
    """
def split(table, field, pattern, newfields=None, include_original=False,
          maxsplit=0, flags=0) -> Table:
    """
    Split field values into several fields using a regular expression.

    Parameters:
    - table: input table
    - field: field to split
    - pattern: regular expression used as the split delimiter
    - newfields: names for the split parts
    - include_original: whether to keep the original field
    - maxsplit: maximum number of splits (0 splits on all occurrences,
      as in ``re.split``)
    - flags: regular expression flags

    Returns:
    Table with the split parts as separate fields.
    """
def splitdown(table, field, pattern, maxsplit=0, flags=0) -> Table:
    """
    Split field values and spread the parts over multiple rows
    (rather than multiple columns, as ``split`` does).

    Parameters:
    - table: input table
    - field: field to split
    - pattern: regular expression used as the split delimiter
    - maxsplit: maximum number of splits
    - flags: regular expression flags

    Returns:
    Table with additional rows for the split values.
    """
import petl as etl
# Wide format data
wide_data = [
['id', 'name', 'jan_sales', 'feb_sales', 'mar_sales'],
[1, 'Alice', 100, 150, 200],
[2, 'Bob', 120, 180, 160]
]
wide_table = etl.wrap(wide_data)
# Convert to long format
long_table = etl.melt(wide_table,
key=['id', 'name'],
variablefield='month',
valuefield='sales')
# Result: id, name, month, sales
# 1, Alice, jan_sales, 100
# 1, Alice, feb_sales, 150
# etc.import petl as etl
long_data = [
['id', 'name', 'month', 'sales'],
[1, 'Alice', 'jan', 100],
[1, 'Alice', 'feb', 150],
[2, 'Bob', 'jan', 120],
[2, 'Bob', 'feb', 180]
]
long_table = etl.wrap(long_data)
# Convert to wide format
wide_table = etl.recast(long_table,
key=['id', 'name'],
variablefield='month',
valuefield='sales')
# Result: id, name, jan, feb
# 1, Alice, 100, 150
# 2, Bob, 120, 180import petl as etl
sales = etl.fromcsv('sales.csv')  # region, product, quarter, amount
# NOTE(review): fromcsv yields string values; 'amount' presumably needs
# converting to a numeric type before sum() behaves as intended
# (e.g. etl.convert(sales, 'amount', float)) — confirm.
# Create pivot table: regions as rows, quarters as columns,
# sum of amounts as cell values
pivot_table = etl.pivot(sales, 'region', 'quarter', 'amount', sum)
# Multiple aggregations via a custom function returning a tuple
pivot_with_count = etl.pivot(sales, 'product', 'region', 'amount',
                             lambda values: (sum(values), len(values)))
import petl as etl
# Unpack list/tuple fields
data_with_coords = [
['name', 'coordinates'],
['Location A', (40.7128, -74.0060)],
['Location B', (34.0522, -118.2437)]
]
table = etl.wrap(data_with_coords)
unpacked = etl.unpack(table, 'coordinates', ['latitude', 'longitude'])
# Result: name, latitude, longitude
# Unpack dictionary fields
data_with_dict = [
['name', 'details'],
['Alice', {'age': 30, 'city': 'NYC', 'salary': 75000}],
['Bob', {'age': 25, 'city': 'LA', 'salary': 65000}]
]
dict_table = etl.wrap(data_with_dict)
unpacked_dict = etl.unpackdict(dict_table, 'details')
# Result: name, age, city, salaryimport petl as etl
# Extract structured data from text fields using regular expressions
log_data = [
    ['timestamp', 'log_entry'],
    ['2023-01-01', '2023-01-01 10:30:45 ERROR user123 failed login'],
    ['2023-01-01', '2023-01-01 10:31:12 INFO user456 successful login']
]
log_table = etl.wrap(log_data)
# Extract components using regex: one new field per capture group
extracted = etl.capture(log_table, 'log_entry',
                        r'(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) (\w+) (\w+) (.+)',
                        ['datetime', 'level', 'user', 'message'])
# Split email addresses on the '@' delimiter into two new fields
email_data = [
    ['contact'],
    ['alice@company.com'],
    ['bob@organization.org']
]
email_table = etl.wrap(email_data)
split_emails = etl.split(email_table, 'contact', '@',
                         ['username', 'domain'])
# Fixed: was "import etl as etl" — module 'etl' does not exist; every
# other example in this document imports petl under the alias etl.
import petl as etl
# Multi-step reshaping process
raw_data = etl.fromcsv('complex_data.csv')
# 1. Unpack nested JSON-like field
step1 = etl.unpackdict(raw_data, 'metadata')
# 2. Melt measurement columns
step2 = etl.melt(step1,
key=['id', 'timestamp', 'location'],
variables=['temp', 'humidity', 'pressure'],
variablefield='measurement_type',
valuefield='value')
# 3. Extract date components
step3 = etl.capture(step2, 'timestamp',
r'(\d{4})-(\d{2})-(\d{2})',
['year', 'month', 'day'])
# 4. Pivot for final analysis format
final = etl.pivot(step3, ['location', 'year'], 'measurement_type', 'value',
lambda vals: sum(vals) / len(vals)) # averageInstall with Tessl CLI
npx tessl i tessl/pypi-petl