A Python package for extracting, transforming and loading tables of data.
—
A comprehensive Python library for extracting, transforming and loading tables of data. PETL enables developers to efficiently process, manipulate, and transform data tables through a fluent API that supports various data sources including CSV, JSON, databases, Excel files, and remote sources. The library offers extensive transformation capabilities including filtering, sorting, joining, aggregating, pivoting, and data type conversions, with built-in support for handling large datasets through memory-efficient iterators and streaming operations.
pip install petl

import petl

Common usage pattern:

import petl as etl

All functions can be imported directly:

from petl import fromcsv, tocsv, cut, select, join

import petl as etl
# Load data from CSV
table = etl.fromcsv('data.csv')
# Transform data with method chaining
result = table.cut('name', 'age', 'city').select('age', lambda age: age > 18).sort('name')
# Save to new file
result.tocsv('adults.csv')
# Alternative functional style
result = etl.tocsv(
etl.sort(
etl.select(
etl.cut(
etl.fromcsv('data.csv'),
'name', 'age', 'city'
),
'age', lambda age: age > 18
),
'name'
),
'adults.csv'
)

PETL is built around the Table abstraction - an iterable container that represents tabular data. The core design principles include:
The library is organized into six main modules: util (table operations), transform (data transformations), io (input/output), comparison (sorting utilities), errors (exceptions), and config (settings).
Core functionality for creating, accessing, and manipulating table structures. Includes table creation from various sources, header manipulation, data access, and conversion to different formats.
def empty() -> Table: ...
def header(table): ...
def fieldnames(table): ...
def data(table, *sliceargs): ...
def records(table, *sliceargs, **kwargs): ...
def dicts(table, *sliceargs, **kwargs): ...
def namedtuples(table, *sliceargs, **kwargs): ...
def values(table, *field, **kwargs): ...
def columns(table, missing=None): ...
def facetcolumns(table, key, missing=None): ...
def expr(expression_text, trusted=True): ...
def rowgroupby(table, key, value=None): ...

Comprehensive support for reading and writing data from various sources including CSV, JSON, XML, Excel, databases, and many specialized formats. Supports both file-based and streaming I/O operations.
def fromcsv(source=None, **kwargs) -> Table: ...
def tocsv(table, source=None, **kwargs): ...
def fromjson(source, **kwargs) -> Table: ...
def tojson(table, source=None, **kwargs): ...
def fromdb(dbo, query, **kwargs) -> Table: ...
def todb(table, dbo, tablename, **kwargs): ...

Core transformation operations for reshaping, filtering, and manipulating table data. Includes field operations, row selection, data conversion, and structural transformations.
def cut(table, *args, **kwargs) -> Table: ...
def select(table, *args, **kwargs) -> Table: ...
def convert(table, *args, **kwargs) -> Table: ...
def rename(table, *args, **kwargs) -> Table: ...
def addfield(table, field, value=None, **kwargs) -> Table: ...

Advanced operations for sorting data and combining multiple tables through various join types. Includes both memory-based and disk-based sorting for large datasets.
def sort(table, key=None, reverse=False, **kwargs) -> Table: ...
def join(table1, table2, key=None, **kwargs) -> Table: ...
def leftjoin(left, right, key=None, **kwargs) -> Table: ...
def outerjoin(left, right, key=None, **kwargs) -> Table: ...
def hashjoin(left, right, key=None, **kwargs) -> Table: ...

Functions for grouping data and performing aggregation operations. Supports custom aggregation functions and reduction operations on grouped data.
def aggregate(table, key, aggregation=None, **kwargs) -> Table: ...
def rowreduce(table, key, reducer, **kwargs) -> Table: ...
def merge(table, key, **kwargs) -> Table: ...
def fold(table, key, f, **kwargs) -> Table: ...

Advanced reshaping operations including pivoting, melting, transposing, and data normalization. Essential for converting between wide and long data formats.
def melt(table, key=None, **kwargs) -> Table: ...
def recast(table, key=None, **kwargs) -> Table: ...
def pivot(table, f1, f2, f3, aggfun, **kwargs) -> Table: ...
def transpose(table) -> Table: ...
def flatten(table) -> Table: ...

Tools for data validation, quality assessment, and statistical analysis. Includes data profiling, constraint validation, and summary statistics.
def validate(table, constraints=None, **kwargs) -> Table: ...
def look(table, limit=0, **kwargs): ...
def see(table, limit=0, **kwargs): ...
def stats(table, field): ...
def valuecounts(table, *field, **kwargs) -> Table: ...
def limits(table, field): ...
def nrows(table): ...
def typecounts(table, field) -> Table: ...

class Table(IterContainer):
"""Main table abstraction supporting iteration and transformations."""
def __iter__(self): ...
def __getitem__(self, item): ... # Field access via string or row slicing
def cut(self, *args, **kwargs) -> 'Table': ...
def select(self, *args, **kwargs) -> 'Table': ...
def sort(self, key=None, **kwargs) -> 'Table': ...
# ... all transformation methods available as methods
class Record(tuple):
"""Named tuple subclass for table rows with field access by name or index."""
def __init__(self, row, flds, missing=None): ...
def __getitem__(self, f): ... # Access by field name or index
def __getattr__(self, f): ... # Access by field name as attribute
def get(self, key, default=None): ... # Safe field access
class RecordsView(IterContainer):
"""Container for table records with record-based access."""
def __iter__(self): ...
def __getitem__(self, key): ...
class ValuesView(IterContainer):
"""Container for values from a specific field."""
def __iter__(self): ...
def __getitem__(self, key): ...
class IterContainer:
"""Base container class for iterable data structures with utility methods."""
def __iter__(self): ...
def __getitem__(self, key): ...
def __len__(self): ...
def __contains__(self, item): ...
def list(self): ...
def tuple(self): ...
def set(self): ...
def counter(self): ...
# Source classes for data I/O
class FileSource:
"""File-based data source."""
def __init__(self, filename, mode='r', **kwargs): ...
class URLSource:
"""URL-based data source for web resources."""
def __init__(self, url, **kwargs): ...
class GzipSource:
"""Gzip-compressed file source."""
def __init__(self, filename, mode='r', **kwargs): ...
class ZipSource:
"""ZIP archive file source."""
def __init__(self, filename, member=None, **kwargs): ...
# Exception classes
class DuplicateKeyError(Exception):
"""Raised when duplicate keys are found where unique keys are expected."""
pass
class FieldSelectionError(Exception):
"""Raised when field selection is invalid or ambiguous."""
pass
class ArgumentError(Exception):
"""Raised when function arguments are invalid."""
pass

Install with Tessl CLI:
npx tessl i tessl/pypi-petl