A Python package for extracting, transforming and loading tables of data.
---
Comprehensive support for reading and writing data from various sources including CSV, JSON, XML, Excel, databases, and many specialized formats. PETL provides a unified interface for data I/O operations with support for different encodings, formats, and data sources.
Read and write comma-separated values and tab-separated values with extensive formatting options.
def fromcsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
"""
Extract a table from a CSV file or source.
Parameters:
- source: File path, file object, or URL
- encoding: Character encoding (default: system default)
- errors: Error handling strategy ('strict', 'ignore', 'replace')
- header: Row number for header (0-based) or None for no header
- csvargs: Additional CSV reader arguments (delimiter, quotechar, etc.)
Returns:
Table object
"""
def fromtsv(source=None, encoding=None, errors='strict', header=None, **csvargs) -> Table:
"""Extract a table from a TSV (tab-separated values) file."""
def tocsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
"""
Write a table to a CSV file.
Parameters:
- table: Input table
- source: Output file path or file object
- encoding: Character encoding
- errors: Error handling strategy
- write_header: Whether to write header row
- csvargs: Additional CSV writer arguments
"""
def totsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs):
"""Write a table to a TSV file."""
def appendcsv(table, source=None, encoding=None, errors='strict', **csvargs):
"""Append a table to an existing CSV file."""
def appendtsv(table, source=None, encoding=None, errors='strict', **csvargs):
"""Append a table to an existing TSV file."""
def teecsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
"""Write a table to CSV while returning the table for further processing."""
def teetsv(table, source=None, encoding=None, errors='strict', write_header=True, **csvargs) -> Table:
"""Write a table to TSV while returning the table for further processing."""Handle JSON data in various formats including JSON Lines and arrays of objects.
def fromjson(source, *args, **kwargs) -> Table:
"""
Extract a table from JSON lines format or array of objects.
Parameters:
- source: File path, file object, or URL containing JSON data
- args: Additional arguments passed to json.loads
- kwargs: Additional keyword arguments
Returns:
Table object
"""
def fromdicts(dicts, header=None, sample=1000, missing=None) -> Table:
"""
Construct a table from an iterable of dictionaries.
Parameters:
- dicts: Iterable of dictionary objects
- header: Explicit field names (optional)
- sample: Number of records to sample for field detection
- missing: Value for missing fields
Returns:
Table object
"""
def tojson(table, source=None, prefix=None, suffix=None, *args, **kwargs):
"""
Write a table to JSON format.
Parameters:
- table: Input table
- source: Output file path or file object
- prefix: String to prepend to output
- suffix: String to append to output
- args: Additional arguments passed to json.dumps
- kwargs: Additional keyword arguments
"""
def tojsonarrays(table, source=None, prefix=None, suffix=None, output_header=False, *args, **kwargs):
"""
Write a table to JSON array format.
Parameters:
- table: Input table
- source: Output file path or file object
- prefix: String to prepend to output
- suffix: String to append to output
- output_header: Whether to include header in output
- args: Additional arguments
- kwargs: Additional keyword arguments
"""Connect to and work with various database systems using SQLAlchemy.
def fromdb(dbo, query, *args, **kwargs) -> Table:
"""
Extract a table from a database query.
Parameters:
- dbo: Database connection object (SQLAlchemy engine/connection)
- query: SQL query string
- args: Query parameters
- kwargs: Additional arguments
Returns:
Table object with query results
"""
def todb(table, dbo, tablename, schema=None, commit=True, create=False, drop=False,
constraints=True, metadata=None, dialect=None, typeconv=None):
"""
Write a table to a database.
Parameters:
- table: Input table
- dbo: Database connection object
- tablename: Target table name
- schema: Database schema name
- commit: Whether to commit transaction
- create: Whether to create table if it doesn't exist
- drop: Whether to drop existing table
- constraints: Whether to create constraints
- metadata: SQLAlchemy metadata object
- dialect: Database dialect
- typeconv: Type conversion dictionary
"""
def appenddb(table, dbo, tablename, schema=None, commit=True):
"""
Append a table to a database table.
Parameters:
- table: Input table
- dbo: Database connection object
- tablename: Target table name
- schema: Database schema name
- commit: Whether to commit transaction
"""Read and write Microsoft Excel files in both legacy (.xls) and modern (.xlsx) formats.
def fromxls(filename, sheet=None, use_view=True, **kwargs) -> Table:
"""
Extract a table from an Excel .xls file.
Parameters:
- filename: Path to Excel file
- sheet: Sheet name or index (default: first sheet)
- use_view: Whether to use optimized view
- kwargs: Additional arguments
Returns:
Table object
"""
def fromxlsx(filename, sheet=None, range_string=None, min_row=None, max_row=None,
min_col=None, max_col=None, read_only=False, **kwargs) -> Table:
"""
Extract a table from an Excel .xlsx file.
Parameters:
- filename: Path to Excel file
- sheet: Sheet name or index
- range_string: Excel range string (e.g., 'A1:C10')
- min_row, max_row: Row range limits
- min_col, max_col: Column range limits
- read_only: Whether to open in read-only mode
- kwargs: Additional arguments
Returns:
Table object
"""
def toxls(table, filename, sheet=None, encoding=None, style_compression=0, **kwargs):
"""Write a table to an Excel .xls file."""
def toxlsx(table, filename, sheet=None, write_header=True, mode="replace"):
"""Write a table to an Excel .xlsx file."""
def appendxlsx(table, filename, sheet=None, write_header=False):
"""Append a table to an Excel .xlsx file."""Process XML and HTML data for web scraping and data integration.
def fromxml(source, *args, **kwargs) -> Table:
"""
Extract a table from an XML file.
Parameters:
- source: XML file path or file object
- args: Additional arguments for XML parsing
- kwargs: Additional keyword arguments
Returns:
Table object
"""
def toxml(table, target=None, root='table', head='row', **kwargs):
"""
Write a table to XML format.
Parameters:
- table: Input table
- target: Output file path or file object
- root: Root element name
- head: Row element name
- kwargs: Additional XML formatting options
"""
def tohtml(table, source=None, encoding=None, errors='strict', caption=None,
vrepr=str, lineterminator='\r\n', class_=None, **kwargs):
"""
Write a table to HTML format.
Parameters:
- table: Input table
- source: Output file path or file object
- encoding: Character encoding
- errors: Error handling strategy
- caption: HTML table caption
- vrepr: Value representation function
- lineterminator: Line ending character
- class_: CSS class for table element
- kwargs: Additional HTML formatting options
"""
def teehtml(table, source=None, encoding=None, errors='strict', caption=None,
vrepr=str, lineterminator='\r\n', class_=None, **kwargs) -> Table:
"""Write a table to HTML while returning the table for further processing."""Support for various specialized data formats and scientific computing libraries.
def fromarray(array) -> Table:
"""
Construct a table from a NumPy array.
Parameters:
- array: NumPy array
Returns:
Table object
"""
def toarray(table, dtype=None, count=-1, sample=1000):
"""
Convert a table to a NumPy array.
Parameters:
- table: Input table
- dtype: NumPy data type
- count: Maximum number of rows (-1 for all)
- sample: Number of rows to sample for type detection
Returns:
NumPy array
"""
def fromdataframe(df, include_index=False) -> Table:
"""
Construct a table from a Pandas DataFrame.
Parameters:
- df: Pandas DataFrame
- include_index: Whether to include DataFrame index as a column
Returns:
Table object
"""
def todataframe(table, index=None, exclude=None, columns=None,
coerce_float=False, nrows=None):
"""
Convert a table to a Pandas DataFrame.
Parameters:
- table: Input table
- index: Column to use as DataFrame index
- exclude: Columns to exclude
- columns: Column names to use
- coerce_float: Whether to coerce numeric strings to float
- nrows: Maximum number of rows
Returns:
Pandas DataFrame
"""
def frompickle(source=None) -> Table:
"""Extract a table from a pickle file."""
def topickle(table, source=None, protocol=-1, write_header=True):
"""Write a table to a pickle file."""
def fromavro(source, limit=None, skips=0, **avro_args) -> Table:
"""Extract a table from an Apache Avro file."""
def toavro(table, target, schema=None, sample=9, mode='wb', **avro_args):
"""Write a table to Apache Avro format."""
def fromhdf5(source, where=None, name=None, condition=None,
condvars=None, start=None, stop=None, step=None, **kwargs) -> Table:
"""
Extract a table from an HDF5 file using PyTables.
Parameters:
- source: HDF5 file path or file object
- where: Path to HDF5 table within file
- name: Name of table within HDF5 file
- condition: Selection condition
- condvars: Variables for selection condition
- start: Start index for selection
- stop: Stop index for selection
- step: Step size for selection
- kwargs: Additional PyTables arguments
Returns:
Table object
"""
def tohdf5(table, source, where=None, name='table', create=False,
description=None, title='', filters=None, expectedrows=10000,
chunkshape=None, byteorder=None, createparents=False,
sample=1000, **kwargs):
"""Write a table to HDF5 format using PyTables."""
def fromhdf5sorted(source, *args, **kwargs) -> Table:
"""Extract a table from HDF5 with sorted index reading."""
def frombcolz(source, expression=None, outcols=None, limit=None, skip=0) -> Table:
"""
Extract a table from a Bcolz compressed array.
Parameters:
- source: Bcolz source file or object
- expression: Query expression for filtering
- outcols: Output columns to select
- limit: Maximum number of rows to read
- skip: Number of rows to skip
Returns:
Table object
"""
def tobcolz(table, source=None, mode='w', **bcolz_args):
"""Write a table to Bcolz compressed format."""
def fromgsheet(url=None, title=None, sheet_name=None, encoding='utf-8',
auth_method='service_account', scopes=None, credentials=None,
**gsheet_args) -> Table:
"""
Extract a table from a Google Sheets document.
Parameters:
- url: Google Sheets URL
- title: Sheet title
- sheet_name: Name of specific sheet
- encoding: Character encoding
- auth_method: Authentication method
- scopes: OAuth scopes
- credentials: Authentication credentials
- gsheet_args: Additional Google Sheets arguments
Returns:
Table object
"""
def togsheet(table, url=None, title=None, sheet_name=None,
auth_method='service_account', scopes=None, credentials=None,
**gsheet_args):
"""Write a table to Google Sheets."""Flexible data source classes for handling various input/output scenarios.
class FileSource:
"""File source for reading/writing local files."""
def __init__(self, filename, mode='rb'): ...
class URLSource:
"""URL-based data source for remote files."""
def __init__(self, url, **kwargs): ...
class StringSource:
"""In-memory string source."""
def __init__(self, string_data): ...
class GzipSource:
"""Compressed gzip file source."""
def __init__(self, filename, mode='rb'): ...
class RemoteSource:
"""Remote file system source using fsspec."""
def __init__(self, url, **kwargs): ...
class SMBSource:
"""SMB/CIFS network share source."""
def __init__(self, url, **kwargs): ...

import petl as etl
# Read CSV with custom options
table = etl.fromcsv('data.csv', delimiter=';', encoding='utf-8')
# Write CSV with specific formatting
etl.tocsv(table, 'output.csv', delimiter='|', quotechar='"')
# Chain operations with CSV I/O
result = (etl.fromcsv('input.csv')
.select('age', lambda age: age > 18)
.cut('name', 'age')
.sort('name'))
result.tocsv('adults.csv')

import petl as etl
from sqlalchemy import create_engine
# Connect to database
engine = create_engine('postgresql://user:pass@host:port/db')
# Extract data from database
table = etl.fromdb(engine, 'SELECT * FROM customers WHERE active = true')
# Process and write back to database
processed = table.convert('email', str.lower).convert('phone', lambda x: x.replace('-', ''))
etl.todb(processed, engine, 'customers_clean', create=True)

import petl as etl
# Read JSON Lines file
table = etl.fromjson('data.jsonl')
# Convert list of dictionaries to table
data = [{'name': 'Alice', 'age': 25}, {'name': 'Bob', 'age': 30}]
table = etl.fromdicts(data)
# Write to JSON format
etl.tojson(table, 'output.json')

import petl as etl
# Read specific sheet and range
table = etl.fromxlsx('report.xlsx', sheet='Sales', range_string='A1:F100')
# Write to Excel with formatting
etl.toxlsx(table, 'summary.xlsx', sheet='Results')
# Append to existing Excel file
etl.appendxlsx(additional_data, 'summary.xlsx', sheet='Results')

Install with Tessl CLI
npx tessl i tessl/pypi-petl