CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-awkward

Manipulate JSON-like data with NumPy-like idioms for scientific computing and high-energy physics.

Pending

Quality

Pending

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

Overview
Eval results
Files

docs/data-conversion.md

Data Conversion and I/O

Extensive support for reading from and writing to various data formats including Arrow, Parquet, JSON, NumPy, and integration with popular frameworks like PyTorch, TensorFlow, and JAX. These functions enable seamless interoperability with the broader data science ecosystem.

Capabilities

NumPy Integration

Conversion functions for seamless integration with NumPy arrays, the foundation of the Python scientific computing ecosystem.

def to_numpy(array, allow_missing=True):
    """
    Convert array to NumPy format.
    
    Parameters:
    - array: Array to convert
    - allow_missing: bool, if False raise error for arrays with missing values
    
    Returns:
    numpy.ndarray containing the array data
    
    Raises:
    ValueError if array contains variable-length lists, or if it contains missing values when allow_missing=False
    """

def to_list(array, behavior=None):
    """
    Convert array to Python list of nested objects.
    
    Parameters:
    - array: Array to convert
    - behavior: dict, custom behavior for conversion
    
    Returns:
    Python list/dict structure containing the array data
    """

def to_packed(array, highlevel=True, behavior=None):
    """
    Pack array into contiguous memory layout for efficient I/O.
    
    Parameters:
    - array: Array to pack
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result
    
    Returns:
    Array with packed, contiguous memory layout
    """

Apache Arrow Integration

Comprehensive support for Apache Arrow format, enabling high-performance data exchange and columnar analytics.

def to_arrow(array, list_to32=False, string_to32=True, bytestring_to32=True, 
           extension_array=True, count_nulls=True, extensionarray=None, 
           categorical_as_dictionary=False):
    """
    Convert array to Apache Arrow format.
    
    Parameters:
    - array: Array to convert
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets  
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary
    
    Returns:
    pyarrow.Array containing the converted data
    """

def to_arrow_table(arrays, list_to32=False, string_to32=True, bytestring_to32=True,
                  extension_array=True, count_nulls=True, extensionarray=None,
                  categorical_as_dictionary=False):
    """
    Convert arrays to Apache Arrow Table format.
    
    Parameters:
    - arrays: dict mapping column names to Arrays, or single Array
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets  
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary
    
    Returns:
    pyarrow.Table containing the converted data
    """

def to_buffers(array):
    """
    Convert array to buffers format for serialization.
    
    Parameters:
    - array: Array to convert to buffers
    
    Returns:
    dict containing form, length, and container with named buffers
    """

Parquet File Format

High-performance columnar storage with compression, metadata preservation, and chunked I/O support.

def to_parquet(array, destination, list_to32=False, string_to32=True, 
              bytestring_to32=True, extension_array=True, count_nulls=True,
              compression="zstd", compression_level=None, row_group_size=64*1024*1024,
              data_page_size=None, parquet_flavor=None, parquet_version="2.4",
              parquet_page_version="1.0", parquet_metadata_statistics=True,
              parquet_dictionary_encoding=True, parquet_byte_stream_split=False,
              parquet_coerce_timestamps=None, parquet_old_int96_timestamps=None,
              parquet_compliant_nested=False, parquet_extra_options=None):
    """
    Write array to Parquet file format.
    
    Parameters:
    - array: Array to write
    - destination: str, file path or file-like object
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - compression: str, compression algorithm ("none", "snappy", "gzip", "lz4", "zstd", "brotli")
    - compression_level: int, compression level (algorithm-specific)
    - row_group_size: int, maximum number of entries (rows) per row group
    - data_page_size: int, target data page size in bytes
    - parquet_flavor: str, Parquet flavor ("spark", None)
    - parquet_version: str, Parquet format version
    - parquet_page_version: str, Parquet page format version
    - parquet_metadata_statistics: bool, include column statistics
    - parquet_dictionary_encoding: bool, use dictionary encoding
    - parquet_byte_stream_split: bool, use byte stream split encoding
    - parquet_coerce_timestamps: str, timestamp coercion behavior
    - parquet_old_int96_timestamps: bool, use old int96 timestamp format
    - parquet_compliant_nested: bool, use Parquet-compliant nested encoding
    - parquet_extra_options: dict, additional Parquet options
    """

def to_parquet_dataset(arrays, destination, **kwargs):
    """
    Write arrays as Parquet dataset with partitioning.
    
    Parameters:
    - arrays: dict mapping column names to Arrays
    - destination: str, directory path for dataset
    - kwargs: additional arguments passed to to_parquet
    """

def to_parquet_row_groups(arrays, destination, **kwargs):
    """
    Write arrays as Parquet file with multiple row groups.
    
    Parameters:  
    - arrays: sequence of dicts, each containing Arrays for one row group
    - destination: str, file path
    - kwargs: additional arguments passed to to_parquet
    """

Feather/Arrow IPC Format

Fast binary columnar format for efficient data exchange between processes and languages.

def to_feather(array, file, compression="zstd", compression_level=None):
    """
    Write array to Feather (Arrow IPC) format.
    
    Parameters:
    - array: Array to write  
    - file: str, file path or file-like object
    - compression: str, compression algorithm ("none", "zstd", "lz4")
    - compression_level: int, compression level
    """

JSON Format

Human-readable text format supporting complex nested structures and mixed data types.

def to_json(array, destination=None, pretty=False, maxdecimals=None,
           convert_bytes=None, convert_other=None):
    """
    Convert array to JSON format.
    
    Parameters:
    - array: Array to convert
    - destination: str or file-like, output destination (None for string return)
    - pretty: bool, if True format with indentation
    - maxdecimals: int, maximum decimal places for floats
    - convert_bytes: callable, function to convert bytes objects
    - convert_other: callable, function to convert unrecognized types
    
    Returns:
    str containing JSON data if destination is None
    """

DataFrame Integration

Conversion to and from Pandas DataFrames for integration with data analysis workflows.

def to_dataframe(array, how="inner", levelname="sublevel", anonymous="values"):
    """
    Convert array to Pandas DataFrame.
    
    Parameters:
    - array: Array to convert
    - how: str, how to handle nested structure ("inner", "outer") 
    - levelname: str, name for MultiIndex levels
    - anonymous: str, name for arrays without field names
    
    Returns:
    pandas.DataFrame containing the array data
    """

def to_rdataframe(array):
    """
    Convert array to ROOT RDataFrame.
    
    Parameters:
    - array: Array to convert
    
    Returns:
    ROOT.RDataFrame containing the array data
    """

Machine Learning Framework Integration

Seamless conversion to and from popular ML frameworks for deep learning and numerical computing workflows.

def to_torch(array, device=None):
    """
    Convert array to PyTorch tensor.
    
    Parameters:
    - array: Array to convert (must be rectangular/regular)
    - device: torch.device, target device for tensor
    
    Returns:
    torch.Tensor containing the array data
    """

def to_tensorflow(array):
    """
    Convert array to TensorFlow tensor.
    
    Parameters:
    - array: Array to convert (must be rectangular/regular)
    
    Returns:
    tf.Tensor containing the array data
    """

def to_raggedtensor(array):
    """
    Convert array to TensorFlow RaggedTensor.
    
    Parameters:
    - array: Array to convert
    
    Returns:
    tf.RaggedTensor containing the array data with nested structure
    """

def to_jax(array):
    """
    Convert array to JAX array.
    
    Parameters:
    - array: Array to convert (must be rectangular/regular)
    
    Returns:
    jax.numpy.ndarray containing the array data
    """

def to_cupy(array):
    """
    Convert array to CuPy array for GPU computation.
    
    Parameters:
    - array: Array to convert (must be rectangular/regular)
    
    Returns:
    cupy.ndarray containing the array data
    """

def to_cudf(array):
    """
    Convert array to cuDF DataFrame for GPU-accelerated analytics.
    
    Parameters:
    - array: Array to convert
    
    Returns:
    cudf.DataFrame containing the array data
    """

Type and Layout Conversion

Functions for converting between different array representations and type systems.

def to_layout(array):
    """
    Get low-level Content layout from high-level Array.
    
    Parameters:
    - array: Array to get layout from
    
    Returns:
    Content layout object representing array structure
    """

def to_regular(array, axis=1, highlevel=True, behavior=None):
    """
    Convert variable-length lists to regular (fixed-length) array.
    
    Parameters:
    - array: Array to convert
    - axis: int, axis along which to regularize
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result
    
    Returns:
    Array with regular structure (fails if lists have different lengths)
    """

def values_astype(array, to, highlevel=True, behavior=None):
    """
    Cast array values to specified dtype.
    
    Parameters:
    - array: Array to cast
    - to: numpy.dtype or str, target data type
    - highlevel: bool, if True return Array, if False return Content layout  
    - behavior: dict, custom behavior for the result
    
    Returns:
    Array with values cast to new type
    """

def strings_astype(array, to, highlevel=True, behavior=None):
    """
    Cast string array to specified type by parsing.
    
    Parameters:
    - array: Array of strings to parse
    - to: numpy.dtype or str, target data type  
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result
    
    Returns:
    Array with strings parsed to new type
    """

def categories(array):
    """
    Get categories from categorical array.
    
    Parameters:
    - array: Categorical Array
    
    Returns:
    Array containing the category values
    """

Backend Management

Functions for managing computational backends and moving data between different execution environments.

def backend(array):
    """
    Get the computational backend used by array.
    
    Parameters:
    - array: Array to check backend for
    
    Returns:
    str indicating backend ("cpu", "cuda", "jax", etc.)
    """

def to_backend(array, backend, highlevel=True, behavior=None):
    """
    Move array to specified computational backend.
    
    Parameters:
    - array: Array to move
    - backend: str, target backend ("cpu", "cuda", "jax")  
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result
    
    Returns:
    Array moved to target backend
    """

Specialized Formats

Support for domain-specific data formats common in scientific computing.

def to_avro(array, file, schema=None):
    """
    Write array to Avro format.
    
    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - schema: dict, Avro schema (inferred if None)
    """

Usage Examples

Basic Conversions

import awkward as ak
import numpy as np

# Create nested array
data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Python lists
python_list = ak.to_list(data)  # [[1, 2, 3], [4], [5, 6]]

# Convert flat data to NumPy
flat_data = ak.Array([1, 2, 3, 4, 5])
numpy_array = ak.to_numpy(flat_data)  # np.array([1, 2, 3, 4, 5])

File I/O

import awkward as ak

# Create sample data
records = ak.Array([
    {"x": [1, 2], "y": 3.14, "name": "alice"}, 
    {"x": [4], "y": 2.71, "name": "bob"}
])

# Write to Parquet
ak.to_parquet(records, "data.parquet")

# Write to JSON
ak.to_json(records, "data.json", pretty=True)

# Write to Feather
ak.to_feather(records, "data.feather")

Arrow Integration

import awkward as ak
import pyarrow as pa

data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Arrow array
arrow_array = ak.to_arrow(data)

# Convert to Arrow table  
table_data = {"numbers": data, "counts": ak.num(data)}
arrow_table = ak.to_arrow_table(table_data)

DataFrame Conversion

import awkward as ak
import pandas as pd

# Nested data
records = ak.Array([
    {"a": [1, 2], "b": "x"},
    {"a": [3, 4, 5], "b": "y"}  
])

# Convert to DataFrame (flattens nested structure)
df = ak.to_dataframe(records)
print(df)
#      a  b
# 0    1  x
# 1    2  x  
# 2    3  y
# 3    4  y
# 4    5  y

ML Framework Integration

import awkward as ak
import torch
import tensorflow as tf

# Regular (rectangular) data for ML frameworks
regular_data = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# Convert to PyTorch
torch_tensor = ak.to_torch(regular_data)  # torch.Tensor([[1,2], [3,4], [5,6]])

# Convert to TensorFlow
tf_tensor = ak.to_tensorflow(regular_data)  # tf.Tensor([[1,2], [3,4], [5,6]])

# Variable-length data for TensorFlow RaggedTensor
variable_data = ak.Array([[1, 2, 3], [4], [5, 6]])
ragged_tensor = ak.to_raggedtensor(variable_data)

Type Conversion

import awkward as ak
import numpy as np

# String to numeric conversion
strings = ak.Array(["1.5", "2.7", "3.14"])
floats = ak.strings_astype(strings, np.float64)

# Change numeric type  
integers = ak.Array([1, 2, 3])
floats = ak.values_astype(integers, np.float32)

# Convert to regular array (if possible)
data = ak.Array([[1, 2], [3, 4], [5, 6]])  # All lists length 2
regular = ak.to_regular(data)  # RegularArray with size=2

Install with Tessl CLI

npx tessl i tessl/pypi-awkward

docs

array-creation.md

array-manipulation.md

data-conversion.md

index.md

integration.md

mathematical-operations.md

string-operations.md

type-system.md

tile.json