A Python package for extracting, transforming and loading tables of data.
---
Comprehensive support for reading and writing data from various sources including CSV, JSON, XML, Excel, databases, and many specialized formats. PETL provides a unified interface for data I/O operations with support for different encodings, formats, and data sources.
Read and write comma-separated values and tab-separated values with extensive formatting options.
def fromcsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
"""
Extract a table from a CSV file or source.
Parameters:
- source: File path, file object, or URL
- encoding: Character encoding (default: system default)
- errors: Error handling strategy ('strict', 'ignore', 'replace')
- header: Row number for header (0-based) or None for no header
- csvargs: Additional CSV reader arguments (delimiter, quotechar, etc.)
Returns:
Table object
"""
def fromtsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
"""Extract a table from a TSV (tab-separated values) file."""
def tocsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
"""
Write a table to a CSV file.
Parameters:
- table: Input table
- source: Output file path or file object
- encoding: Character encoding
- errors: Error handling strategy
- write_header: Whether to write header row
- csvargs: Additional CSV writer arguments
"""
def totsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
"""Write a table to a TSV file."""
def appendcsv(table, source=None, encoding=None, errors='strict', **csvargs):
"""Append a table to an existing CSV file."""
def appendtsv(table, source=None, encoding=None, errors='strict', **csvargs):
"""Append a table to an existing TSV file."""
def teecsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
"""Write a table to CSV while returning the table for further processing."""
def teetsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
"""Write a table to TSV while returning the table for further processing."""Handle JSON data in various formats including JSON Lines and arrays of objects.
def fromjson(source, *args, **kwargs) -> Table:
"""
Extract a table from JSON lines format or array of objects.
Parameters:
- source: File path, file object, or URL containing JSON data
- args: Additional arguments passed to json.loads
- kwargs: Additional keyword arguments
Returns:
Table object
"""
def fromdicts(dicts, header=None, sample=1000, missing=None) -> Table:
"""
Construct a table from an iterable of dictionaries.
Parameters:
- dicts: Iterable of dictionary objects
- header: Explicit field names (optional)
- sample: Number of records to sample for field detection
- missing: Value for missing fields
Returns:
Table object
"""
def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
"""
Write a table to JSON format.
Parameters:
- table: Input table
- source: Output file path or file object
- prefix: String to prepend to output
- suffix: String to append to output
- args: Additional arguments passed to json.dumps
- kwargs: Additional keyword arguments
"""
def tojsonarrays(table, source=None, prefix=None, suffix=None, output_header=False, *args, **kwargs):
"""
Write a table to JSON array format.
Parameters:
- table: Input table
- source: Output file path or file object
- prefix: String to prepend to output
- suffix: String to append to output
- output_header: Whether to include header in output
- args: Additional arguments
- kwargs: Additional keyword arguments
"""Connect to and work with various database systems using SQLAlchemy.
def fromdb(dbo, query, *args, **kwargs) -> Table:
"""
Extract a table from a database query.
Parameters:
- dbo: Database connection object (SQLAlchemy engine/connection)
- query: SQL query string
- args: Query parameters
- kwargs: Additional arguments
Returns:
Table object with query results
"""
def todb(table, dbo, tablename, schema=None, commit=True, create=False, drop=False,
constraints=True, metadata=None, dialect=None, typeconv=None):
"""
Write a table to a database.
Parameters:
- table: Input table
- dbo: Database connection object
- tablename: Target table name
- schema: Database schema name
- commit: Whether to commit transaction
- create: Whether to create table if it doesn't exist
- drop: Whether to drop existing table
- constraints: Whether to create constraints
- metadata: SQLAlchemy metadata object
- dialect: Database dialect
- typeconv: Type conversion dictionary
"""
def appenddb(table, dbo, tablename, schema=None, commit=True):
"""
Append a table to a database table.
Parameters:
- table: Input table
- dbo: Database connection object
- tablename: Target table name
- schema: Database schema name
- commit: Whether to commit transaction
"""Read and write Microsoft Excel files in both legacy (.xls) and modern (.xlsx) formats.
def fromxls(filename, sheet=None, use_view=True, **kwargs) -> Table:
"""
Extract a table from an Excel .xls file.
Parameters:
- filename: Path to Excel file
- sheet: Sheet name or index (default: first sheet)
- use_view: Whether to use optimized view
- kwargs: Additional arguments
Returns:
Table object
"""
def fromxlsx(filename, sheet=None, range_string=None, min_row=None, max_row=None,
min_col=None, max_col=None, read_only=False, **kwargs) -> Table:
"""
Extract a table from an Excel .xlsx file.
Parameters:
- filename: Path to Excel file
- sheet: Sheet name or index
- range_string: Excel range string (e.g., 'A1:C10')
- min_row, max_row: Row range limits
- min_col, max_col: Column range limits
- read_only: Whether to open in read-only mode
- kwargs: Additional arguments
Returns:
Table object
"""
def toxls(table, filename, sheet=None, encoding=None, style_compression=0, **kwargs):
"""Write a table to an Excel .xls file."""
def toxlsx(table, filename, sheet=None, write_header=True, mode="replace"):
"""Write a table to an Excel .xlsx file."""
def appendxlsx(table, filename, sheet=None, write_header=False):
"""Append a table to an Excel .xlsx file."""Process XML and HTML data for web scraping and data integration.
def fromxml(source, *args, **kwargs) -> Table:
"""
Extract a table from an XML file.
Parameters:
- source: XML file path or file object
- args: Additional arguments for XML parsing
- kwargs: Additional keyword arguments
Returns:
Table object
"""
def toxml(table, target=None, root='table', head='row', **kwargs):
"""
Write a table to XML format.
Parameters:
- table: Input table
- target: Output file path or file object
- root: Root element name
- head: Row element name
- kwargs: Additional XML formatting options
"""
def tohtml(table, source=None, encoding=None, errors='strict', caption=None,
vrepr=str, lineterminator='\r\n', class_=None, **kwargs):
"""
Write a table to HTML format.
Parameters:
- table: Input table
- source: Output file path or file object
- encoding: Character encoding
- errors: Error handling strategy
- caption: HTML table caption
- vrepr: Value representation function
- lineterminator: Line ending character
- class_: CSS class for table element
- kwargs: Additional HTML formatting options
"""
def teehtml(table, source=None, encoding=None, errors='strict', caption=None,
vrepr=str, lineterminator='\r\n', class_=None, **kwargs) -> Table:
"""Write a table to HTML while returning the table for further processing."""Support for various specialized data formats and scientific computing libraries.
def fromarray(array) -> Table:
"""
Construct a table from a NumPy array.
Parameters:
- array: NumPy array
Returns:
Table object
"""
def toarray(table, dtype=None, count=-1, sample=1000):
"""
Convert a table to a NumPy array.
Parameters:
- table: Input table
- dtype: NumPy data type
- count: Maximum number of rows (-1 for all)
- sample: Number of rows to sample for type detection
Returns:
NumPy array
"""
def fromdataframe(df, include_index=False) -> Table:
"""
Construct a table from a Pandas DataFrame.
Parameters:
- df: Pandas DataFrame
- include_index: Whether to include DataFrame index as a column
Returns:
Table object
"""
def todataframe(table, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):
"""
Convert a table to a Pandas DataFrame.
Parameters:
- table: Input table
- index: Column to use as DataFrame index
- exclude: Columns to exclude
- columns: Column names to use
- coerce_float: Whether to coerce numeric strings to float
- nrows: Maximum number of rows
Returns:
Pandas DataFrame
"""
def frompickle(source=None) -> Table:
"""Extract a table from a pickle file."""
def topickle(table, source=None, protocol=-1, write_header=True):
"""Write a table to a pickle file."""
def fromavro(source, limit=None, skips=0, **avro_args) -> Table:
"""Extract a table from an Apache Avro file."""
def toavro(table, target, schema=None, sample=9, mode='wb', **avro_args):
"""Write a table to Apache Avro format."""
def fromhdf5(source, where=None, name=None, condition=None,
condvars=None, start=None, stop=None, step=None, **kwargs) -> Table:
"""
Extract a table from an HDF5 file using PyTables.
Parameters:
- source: HDF5 file path or file object
- where: Path to HDF5 table within file
- name: Name of table within HDF5 file
- condition: Selection condition
- condvars: Variables for selection condition
- start: Start index for selection
- stop: Stop index for selection
- step: Step size for selection
- kwargs: Additional PyTables arguments
Returns:
Table object
"""
def tohdf5(table, source, where=None, name='table', create=False,
description=None, title='', filters=None, expectedrows=10000,
chunkshape=None, byteorder=None, createparents=False,
sample=1000, **kwargs):
"""Write a table to HDF5 format using PyTables."""
def fromhdf5sorted(source, *args, **kwargs) -> Table:
"""Extract a table from HDF5 with sorted index reading."""
def frombcolz(source, expression=None, outcols=None, limit=None, skip=0) -> Table:
"""
Extract a table from a Bcolz compressed array.
Parameters:
- source: Bcolz source file or object
- expression: Query expression for filtering
- outcols: Output columns to select
- limit: Maximum number of rows to read
- skip: Number of rows to skip
Returns:
Table object
"""
def tobcolz(table, source=None, mode='w', **bcolz_args):
"""Write a table to Bcolz compressed format."""
def fromgsheet(url=None, title=None, sheet_name=None, encoding='utf-8',
auth_method='service_account', scopes=None, credentials=None,
**gsheet_args) -> Table:
"""
Extract a table from a Google Sheets document.
Parameters:
- url: Google Sheets URL
- title: Sheet title
- sheet_name: Name of specific sheet
- encoding: Character encoding
- auth_method: Authentication method
- scopes: OAuth scopes
- credentials: Authentication credentials
- gsheet_args: Additional Google Sheets arguments
Returns:
Table object
"""
def togsheet(table, url=None, title=None, sheet_name=None,
auth_method='service_account', scopes=None, credentials=None,
**gsheet_args):
"""Write a table to Google Sheets."""Flexible data source classes for handling various input/output scenarios.
class FileSource:
"""File source for reading/writing local files."""
def __init__(self, filename, mode='rb'): ...
class URLSource:
"""URL-based data source for remote files."""
def __init__(self, url, **kwargs): ...
class StringSource:
"""In-memory string source."""
def __init__(self, string_data): ...
class GzipSource:
"""Compressed gzip file source."""
def __init__(self, filename, mode='rb'): ...
class RemoteSource:
"""Remote file system source using fsspec."""
def __init__(self, url, **kwargs): ...
class SMBSource:
"""SMB/CIFS network share source."""
def __init__(self, url, **kwargs): ...

import petl as etl
# Read CSV with custom options
table = etl.fromcsv('data.csv', delimiter=';', encoding='utf-8')
# Write CSV with specific formatting
etl.tocsv(table, 'output.csv', delimiter='|', quotechar='"')
# Chain operations with CSV I/O
result = (etl.fromcsv('input.csv')
.select('age', lambda age: age > 18)
.cut('name', 'age')
.sort('name'))
result.tocsv('adults.csv')

import petl as etl
from sqlalchemy import create_engine
# Connect to database
engine = create_engine('postgresql://user:pass@host:port/db')
# Extract data from database
table = etl.fromdb(engine, 'SELECT * FROM customers WHERE active = true')
# Process and write back to database
processed = table.convert('email', str.lower).convert('phone', lambda x: x.replace('-', ''))
etl.todb(processed, engine, 'customers_clean', create=True)

import petl as etl
# Read JSON Lines file
table = etl.fromjson('data.jsonl')
# Convert list of dictionaries to table
data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
table = etl.fromdicts(data)
# Write to JSON format
etl.tojson(table, 'output.json')

import petl as etl
# Read specific sheet and range
table = etl.fromxlsx('report.xlsx', sheet='Sales', range_string='A1:F100')
# Write to Excel with formatting
etl.toxlsx(table, 'summary.xlsx', sheet='Results')
# Append to existing Excel file
etl.appendxlsx(additional_data, 'summary.xlsx', sheet='Results')

Install with Tessl CLI
npx tessl i tessl/pypi-petl