tessl/pypi-petl

A Python package for extracting, transforming and loading tables of data.

—

Pending

Overview

Eval results

Files

Sorting and Joins

Name: tessl/pypi-petl
Author: tessl

Advanced operations for sorting data and combining multiple tables through various join types. PETL provides both memory-based and disk-based sorting for large datasets, along with comprehensive join operations for data integration and analysis.

Capabilities

Sorting Operations

Sort table data using various keys and algorithms, with support for large datasets through external sorting.

def sort(table, key=None, reverse=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Sort table rows by key.
    
    Parameters:
    - table: Input table
    - key: Sort key (field name, index, function, or tuple of keys)
    - reverse: If True, sort in descending order
    - buffersize: Buffer size for external sorting (default: 100000)
    - tempdir: Directory for temporary files during external sorting
    - cache: Whether to cache sorted results
    
    Returns:
    Sorted table
    """

def mergesort(*tables, **kwargs) -> Table:
    """
    Merge multiple pre-sorted tables.
    
    Parameters:
    - tables: Pre-sorted tables to merge
    - key: Sort key used for merging
    - reverse: Whether tables are sorted in reverse order
    - kwargs: Additional sorting options
    
    Returns:
    Single merged and sorted table
    """

def issorted(table, key=None, reverse=False, strict=False) -> bool:
    """
    Test if table is sorted by key.
    
    Parameters:
    - table: Input table
    - key: Sort key to test
    - reverse: Whether to test for reverse sort order
    - strict: If True, require strict ordering (no equal values)
    
    Returns:
    Boolean indicating if table is sorted
    """

Inner Joins

Combine tables by matching key values, returning only rows with matches in both tables.

def join(table1, table2, key=None, lkey=None, rkey=None, presorted=False, 
         buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Join two tables on matching key values.
    
    Parameters:
    - table1: Left table
    - table2: Right table
    - key: Join key (used for both tables if lkey/rkey not specified)
    - lkey: Left table join key
    - rkey: Right table join key
    - presorted: If True, tables are already sorted by join key
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with matching rows from both tables
    """

def hashjoin(left, right, key=None, lkey=None, rkey=None, cache=True, 
             lprefix=None, rprefix=None) -> Table:
    """
    Hash-based inner join (memory efficient for large tables).
    
    Parameters:
    - left: Left table
    - right: Right table
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - cache: Whether to cache hash table
    - lprefix: Prefix for left table field names in conflicts
    - rprefix: Prefix for right table field names in conflicts
    
    Returns:
    Table with hash-joined results
    """

Outer Joins

Include rows that may not have matches in both tables.

def leftjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
             presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Left outer join two tables.
    
    Parameters:
    - left: Left table (all rows preserved)
    - right: Right table
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - missing: Value for missing fields from right table
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with all left rows plus matching right rows
    """

def rightjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
              presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Right outer join two tables.
    
    Parameters:
    - left: Left table
    - right: Right table (all rows preserved)
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - missing: Value for missing fields from left table
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with all right rows plus matching left rows
    """

def outerjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
              presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Full outer join two tables.
    
    Parameters:
    - left: Left table
    - right: Right table
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - missing: Value for missing fields
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with all rows from both tables
    """

def hashleftjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
                 cache=True, lprefix=None, rprefix=None) -> Table:
    """Hash-based left outer join."""

def hashrightjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
                  cache=True, lprefix=None, rprefix=None) -> Table:
    """Hash-based right outer join."""

Anti-Joins and Lookup Joins

Specialized join operations for data analysis and lookup scenarios.

def antijoin(left, right, key=None, lkey=None, rkey=None, presorted=False, 
             buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Return rows from left table with no match in right table.
    
    Parameters:
    - left: Left table
    - right: Right table
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with left rows that have no match in right table
    """

def hashantijoin(left, right, key=None, lkey=None, rkey=None) -> Table:
    """Hash-based anti-join."""

def lookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
               presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Join using lookup from right table.
    
    Parameters:
    - left: Left table
    - right: Right table
    - key: Join key
    - lkey: Left table join key
    - rkey: Right table join key
    - missing: Value for missing lookups
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with lookup values from right table
    """

def hashlookupjoin(left, right, key=None, lkey=None, rkey=None, missing=None, 
                   lprefix=None, rprefix=None) -> Table:
    """Hash-based lookup join."""

Cross Joins and Table Splitting

Operations for Cartesian products and table decomposition.

def crossjoin(*tables, **kwargs) -> Table:
    """
    Cartesian product of tables.
    
    Parameters:
    - tables: Tables to cross join
    - kwargs: Additional options
    
    Returns:
    Table with all possible row combinations
    """

def unjoin(table, value, key=None, autoincrement=None, presorted=False, 
           buffersize=None, tempdir=None, cache=True) -> tuple:
    """
    Split a table into two by extracting key and value fields.
    
    Parameters:
    - table: Input table
    - value: Value field name
    - key: Key field name (default: remaining fields)
    - autoincrement: If True, add auto-incrementing key
    - presorted: If True, table is pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Tuple of (key_table, value_table)
    """

Set Operations

Operations treating tables as sets for data comparison and analysis.

def complement(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Return rows in table a but not in table b.
    
    Parameters:
    - a: First table
    - b: Second table
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with rows in a but not in b
    """

def intersection(a, b, presorted=False, buffersize=None, tempdir=None, cache=True) -> Table:
    """
    Return rows present in both tables.
    
    Parameters:
    - a: First table
    - b: Second table
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    
    Returns:
    Table with rows present in both tables
    """

def diff(a, b, presorted=False, buffersize=None, tempdir=None, cache=True, strict=False) -> Table:
    """
    Return rows that differ between tables.
    
    Parameters:
    - a: First table
    - b: Second table
    - presorted: If True, tables are pre-sorted
    - buffersize: Buffer size for sorting
    - tempdir: Directory for temporary files
    - cache: Whether to cache results
    - strict: If True, use strict comparison
    
    Returns:
    Table with differing rows
    """

def hashcomplement(a, b, strict=False) -> Table:
    """Hash-based complement operation."""

def hashintersection(a, b) -> Table:
    """Hash-based intersection operation."""

Usage Examples

Basic Sorting

import petl as etl

table = etl.fromcsv('employees.csv')

# Sort by single field
sorted_by_name = etl.sort(table, 'name')

# Sort by multiple fields
sorted_multi = etl.sort(table, ('department', 'salary'))

# Sort in reverse order
sorted_desc = etl.sort(table, 'salary', reverse=True)

# Sort with custom function
sorted_custom = etl.sort(table, lambda row: (row.department, -row.salary))

# Check if table is sorted
is_sorted = etl.issorted(table, 'name')

Inner Joins

import petl as etl

employees = etl.fromcsv('employees.csv')  # id, name, dept_id
departments = etl.fromcsv('departments.csv')  # id, dept_name

# Simple join on matching field names
joined = etl.join(employees, departments, 'id')

# Join with different key names
joined = etl.join(employees, departments, 
                 lkey='dept_id', rkey='id')

# Hash join for better performance with large tables
hash_joined = etl.hashjoin(employees, departments, 
                          lkey='dept_id', rkey='id')

Outer Joins

import petl as etl

customers = etl.fromcsv('customers.csv')
orders = etl.fromcsv('orders.csv')

# Left join - all customers, with orders where they exist
customer_orders = etl.leftjoin(customers, orders, 
                              lkey='id', rkey='customer_id',
                              missing='No orders')

# Right join - all orders with customer details
order_details = etl.rightjoin(customers, orders,
                             lkey='id', rkey='customer_id')

# Full outer join - all customers and all orders
full_join = etl.outerjoin(customers, orders,
                         lkey='id', rkey='customer_id')

Anti-Joins and Lookups

import petl as etl

all_products = etl.fromcsv('all_products.csv')
sold_products = etl.fromcsv('sold_products.csv')

# Find products that haven't been sold
unsold = etl.antijoin(all_products, sold_products, 'product_id')

# Lookup product names
orders = etl.fromcsv('orders.csv')
products = etl.fromcsv('products.csv')

orders_with_names = etl.lookupjoin(orders, products,
                                  lkey='product_id', rkey='id')

Set Operations

import petl as etl

current_employees = etl.fromcsv('current_employees.csv')
former_employees = etl.fromcsv('former_employees.csv')

# Find employees who are only current (not former)
only_current = etl.complement(current_employees, former_employees)

# Find employees who have been both current and former
both = etl.intersection(current_employees, former_employees)

# Find all differences between two employee lists
differences = etl.diff(current_employees, former_employees)

Complex Join Scenarios

import petl as etl

# Multi-table joins
employees = etl.fromcsv('employees.csv')
departments = etl.fromcsv('departments.csv')
locations = etl.fromcsv('locations.csv')

# Chain joins for multi-table relationships
emp_dept = etl.join(employees, departments, 
                   lkey='dept_id', rkey='id')
full_info = etl.join(emp_dept, locations,
                    lkey='location_id', rkey='id')

# Self-join for hierarchical data
managers = etl.join(employees, employees, 
                   lkey='manager_id', rkey='id',
                   rprefix='mgr_')

# Conditional joins with preprocessing
high_performers = etl.select(employees, 'rating', lambda x: x > 4.0)
bonus_eligible = etl.join(high_performers, departments,
                         lkey='dept_id', rkey='id')

Performance Optimization

import petl as etl

large_table1 = etl.fromcsv('large_data1.csv')
large_table2 = etl.fromcsv('large_data2.csv')

# Use hash joins for better performance
fast_join = etl.hashjoin(large_table1, large_table2, 'id')

# Pre-sort for sort-merge joins
sorted1 = etl.sort(large_table1, 'id')
sorted2 = etl.sort(large_table2, 'id')
efficient_join = etl.join(sorted1, sorted2, 'id', presorted=True)

# External sorting for very large datasets
huge_table = etl.fromcsv('huge_data.csv')
sorted_huge = etl.sort(huge_table, 'timestamp', 
                      buffersize=50000,  # Smaller buffer
                      tempdir='/tmp/sorting')

Install with Tessl CLI