A Python package for extracting, transforming and loading tables of data.
—
Advanced operations for sorting data and combining multiple tables through various join types. PETL provides both memory-based and disk-based sorting for large datasets, along with comprehensive join operations for data integration and analysis.
Sort table data using various keys and algorithms, with support for large datasets through external sorting.
def sort(table, key=None, reverse=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Sort table rows by key.
Parameters:
- table: Input table
- key: Sort key (field name, index, function, or tuple of keys)
- reverse: If True, sort in descending order
- buffersize: Buffer size for external sorting (default: 100000)
- tempdir: Directory for temporary files during external sorting
- cache: Whether to cache sorted results
Returns:
Sorted table
"""
def mergesort(*tables, **kwargs) -> Table:
"""
Merge multiple pre-sorted tables.
Parameters:
- tables: Pre-sorted tables to merge
- key: Sort key used for merging
- reverse: Whether tables are sorted in reverse order
- kwargs: Additional sorting options
Returns:
Single merged and sorted table
"""
def issorted(table, key=None, reverse=False, strict=False) -> bool:
"""
Test if table is sorted by key.
Parameters:
- table: Input table
- key: Sort key to test
- reverse: Whether to test for reverse sort order
- strict: If True, require strict ordering (no equal values)
Returns:
Boolean indicating if table is sorted
"""Combine tables by matching key values, returning only rows with matches in both tables.
def join(table1, table2, key=None, lkey=None, rkey=None, presorted=False,
buffersize=None, tempdir=None, cache=True) -> Table:
"""
Join two tables on matching key values.
Parameters:
- table1: Left table
- table2: Right table
- key: Join key (used for both tables if lkey/rkey not specified)
- lkey: Left table join key
- rkey: Right table join key
- presorted: If True, tables are already sorted by join key
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with matching rows from both tables
"""
def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True,
lprefix=None, rprefix=None) -> Table:
"""
Hash-based inner join (memory efficient for large tables).
Parameters:
- left: Left table
- right: Right table
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- cache: Whether to cache hash table
- lprefix: Prefix for left table field names in conflicts
- rprefix: Prefix for right table field names in conflicts
Returns:
Table with hash-joined results
"""Include rows that may not have matches in both tables.
def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Left outer join two tables.
Parameters:
- left: Left table (all rows preserved)
- right: Right table
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- missing: Value for missing fields from right table
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with all left rows plus matching right rows
"""
def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Right outer join two tables.
Parameters:
- left: Left table
- right: Right table (all rows preserved)
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- missing: Value for missing fields from left table
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with all right rows plus matching left rows
"""
def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Full outer join two tables.
Parameters:
- left: Left table
- right: Right table
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- missing: Value for missing fields
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with all rows from both tables
"""
def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
cache=True, lprefix=None, rprefix=None) -> Table:
"""Hash-based left outer join."""
def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
cache=True, lprefix=None, rprefix=None) -> Table:
"""Hash-based right outer join."""Specialized join operations for data analysis and lookup scenarios.
def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False,
buffersize=None, tempdir=None, cache=True) -> Table:
"""
Return rows from left table with no match in right table.
Parameters:
- left: Left table
- right: Right table
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with left rows that have no match in right table
"""
def hashantijoin(left, right, key=None, lkey=None, rkey=None) -> Table:
"""Hash-based anti-join."""
def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Join using lookup from right table.
Parameters:
- left: Left table
- right: Right table
- key: Join key
- lkey: Left table join key
- rkey: Right table join key
- missing: Value for missing lookups
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with lookup values from right table
"""
def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None,
lprefix=None, rprefix=None) -> Table:
"""Hash-based lookup join."""Operations for Cartesian products and table decomposition.
def crossjoin(*tables, **kwargs) -> Table:
"""
Cartesian product of tables.
Parameters:
- tables: Tables to cross join
- kwargs: Additional options
Returns:
Table with all possible row combinations
"""
def unjoin(table, value, key=None, autoincrement=None, presorted=False,
buffersize=None, tempdir=None, cache=True) -> tuple:
"""
Split a table into two by extracting key and value fields.
Parameters:
- table: Input table
- value: Value field name
- key: Key field name (default: remaining fields)
- autoincrement: If True, add auto-incrementing key
- presorted: If True, table is pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Tuple of (key_table, value_table)
"""Operations treating tables as sets for data comparison and analysis.
def complement(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Return rows in table a but not in table b.
Parameters:
- a: First table
- b: Second table
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with rows in a but not in b
"""
def intersection(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
"""
Return rows present in both tables.
Parameters:
- a: First table
- b: Second table
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
Returns:
Table with rows present in both tables
"""
def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True, strict=False) -> Table:
"""
Return rows that differ between tables.
Parameters:
- a: First table
- b: Second table
- presorted: If True, tables are pre-sorted
- buffersize: Buffer size for sorting
- tempdir: Directory for temporary files
- cache: Whether to cache results
- strict: If True, use strict comparison
Returns:
Table with differing rows
"""
def hashcomplement(a, b, strict=False) -> Table:
"""Hash-based complement operation."""
def hashintersection(a, b) -> Table:
"""Hash-based intersection operation."""import petl as etl
table = etl.fromcsv('employees.csv')
# Sort by single field
sorted_by_name = etl.sort(table, 'name')
# Sort by multiple fields
sorted_multi = etl.sort(table, ('department', 'salary'))
# Sort in reverse order
sorted_desc = etl.sort(table, 'salary', reverse=True)
# Sort with custom function
sorted_custom = etl.sort(table, lambda row: (row.department, -row.salary))
# Check if table is sorted
is_sorted = etl.issorted(table, 'name')import petl as etl
employees = etl.fromcsv('employees.csv') # id, name, dept_id
departments = etl.fromcsv('departments.csv') # id, dept_name
# Simple join on matching field names
joined = etl.join(employees, departments, 'id')
# Join with different key names
joined = etl.join(employees, departments,
lkey='dept_id', rkey='id')
# Hash join for better performance with large tables
hash_joined = etl.hashjoin(employees, departments,
lkey='dept_id', rkey='id')import petl as etl
customers = etl.fromcsv('customers.csv')
orders = etl.fromcsv('orders.csv')
# Left join - all customers, with orders where they exist
customer_orders = etl.leftjoin(customers, orders,
lkey='id', rkey='customer_id',
missing='No orders')
# Right join - all orders with customer details
order_details = etl.rightjoin(customers, orders,
lkey='id', rkey='customer_id')
# Full outer join - all customers and all orders
full_join = etl.outerjoin(customers, orders,
lkey='id', rkey='customer_id')import petl as etl
all_products = etl.fromcsv('all_products.csv')
sold_products = etl.fromcsv('sold_products.csv')
# Find products that haven't been sold
unsold = etl.antijoin(all_products, sold_products, 'product_id')
# Lookup product names
orders = etl.fromcsv('orders.csv')
products = etl.fromcsv('products.csv')
orders_with_names = etl.lookupjoin(orders, products,
lkey='product_id', rkey='id')import petl as etl
current_employees = etl.fromcsv('current_employees.csv')
former_employees = etl.fromcsv('former_employees.csv')
# Find employees who are only current (not former)
only_current = etl.complement(current_employees, former_employees)
# Find employees who have been both current and former
both = etl.intersection(current_employees, former_employees)
# Find all differences between two employee lists
differences = etl.diff(current_employees, former_employees)import petl as etl
# Multi-table joins
employees = etl.fromcsv('employees.csv')
departments = etl.fromcsv('departments.csv')
locations = etl.fromcsv('locations.csv')
# Chain joins for multi-table relationships
emp_dept = etl.join(employees, departments,
lkey='dept_id', rkey='id')
full_info = etl.join(emp_dept, locations,
lkey='location_id', rkey='id')
# Self-join for hierarchical data
managers = etl.join(employees, employees,
lkey='manager_id', rkey='id',
rprefix='mgr_')
# Conditional joins with preprocessing
high_performers = etl.select(employees, 'rating', lambda x: x > 4.0)
bonus_eligible = etl.join(high_performers, departments,
lkey='dept_id', rkey='id')import petl as etl
large_table1 = etl.fromcsv('large_data1.csv')
large_table2 = etl.fromcsv('large_data2.csv')
# Use hash joins for better performance
fast_join = etl.hashjoin(large_table1, large_table2, 'id')
# Pre-sort for sort-merge joins
sorted1 = etl.sort(large_table1, 'id')
sorted2 = etl.sort(large_table2, 'id')
efficient_join = etl.join(sorted1, sorted2, 'id', presorted=True)
# External sorting for very large datasets
huge_table = etl.fromcsv('huge_data.csv')
sorted_huge = etl.sort(huge_table, 'timestamp',
buffersize=50000, # Smaller buffer
tempdir='/tmp/sorting')Install with Tessl CLI
npx tessl i tessl/pypi-petl