CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pyarrow

Python library for Apache Arrow columnar memory format and computing libraries

Pending
Overview
Eval results
Files

core-data-structures.mddocs/

Core Data Structures

Fundamental data containers that form the foundation of PyArrow's columnar data processing capabilities. These structures provide efficient storage and manipulation of typed data in memory-optimized columnar layouts.

Capabilities

Arrays

One-dimensional sequences of values with a specific data type. Arrays are immutable and provide the basic building blocks for all other data structures in PyArrow.

def array(obj, type=None, mask=None, size=None, from_pandas=None, safe=True):
    """
    Create Arrow array from Python sequence, NumPy array, or pandas data.
    
    Parameters:
    - obj: sequence, NumPy array, or pandas Series to convert
    - type: DataType, explicit type for the array
    - mask: array-like, boolean mask for null values
    - size: int, length of array if obj is scalar
    - from_pandas: bool, interpret pandas-specific data
    - safe: bool, check for overflow/truncation during conversion
    
    Returns:
    Array: Arrow array with specified type
    """

def chunked_array(arrays, type=None):
    """
    Create chunked array from list of arrays.
    
    Parameters:
    - arrays: sequence of Array objects
    - type: DataType, explicit type (must match all arrays)
    
    Returns:
    ChunkedArray: Chunked array composed of input arrays
    """

def nulls(size, type=None):
    """
    Create array of null values.
    
    Parameters:
    - size: int, length of array
    - type: DataType, type of nulls (default: null type)
    
    Returns:
    Array: Array of null values
    """

def repeat(value, size):
    """
    Create array by repeating a single value.
    
    Parameters:
    - value: scalar value to repeat
    - size: int, number of repetitions
    
    Returns:
    Array: Array with repeated value
    """

def arange(start, stop=None, step=1, dtype=None):
    """
    Create array with range of values.
    
    Parameters:
    - start: int, start value (or stop if stop is None)
    - stop: int, stop value (exclusive)
    - step: int, step size
    - dtype: DataType, array data type
    
    Returns:
    Array: Array with range values
    """

class Array:
    """
    Base class for all Arrow arrays.
    
    Attributes:
    - type: DataType of the array
    - length: Number of elements
    - null_count: Number of null values
    - is_valid: Boolean array indicating non-null values
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def to_pylist(self):
        """Convert to Python list."""
    
    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""
    
    def to_numpy(self, **kwargs):
        """Convert to NumPy array."""
    
    def slice(self, offset=0, length=None):
        """Return slice of array."""
    
    def take(self, indices):
        """Select elements by indices."""
    
    def filter(self, mask):
        """Filter array by boolean mask."""
    
    def sort(self, **kwargs):
        """Return sorted array."""
    
    def unique(self):
        """Return array of unique values."""
    
    def value_counts(self):
        """Return struct array of value counts."""

class ChunkedArray:
    """
    Array composed of multiple contiguous arrays (chunks).
    
    Attributes:
    - type: DataType of the chunked array
    - length: Total number of elements across chunks
    - null_count: Total number of null values
    - num_chunks: Number of chunks
    - chunks: List of Array chunks
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def chunk(self, i):
        """Get chunk at index i."""
    
    def to_pylist(self):
        """Convert to Python list."""
    
    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""
    
    def slice(self, offset=0, length=None):
        """Return slice of chunked array."""
    
    def take(self, indices):
        """Select elements by indices."""
    
    def filter(self, mask):
        """Filter by boolean mask."""
    
    def combine_chunks(self):
        """Combine chunks into single array."""

Tables

Two-dimensional datasets with named columns, similar to SQL tables or pandas DataFrames. Tables provide the primary interface for working with tabular data in PyArrow.

def table(data, schema=None, metadata=None, columns=None):
    """
    Create Arrow table from various data sources.
    
    Parameters:
    - data: dict, list of arrays, pandas DataFrame, or RecordBatch
    - schema: Schema, explicit schema for the table
    - metadata: dict, key-value metadata
    - columns: list of str, column names (when data is list)
    
    Returns:
    Table: Arrow table with specified schema
    """

def record_batch(data, schema=None, metadata=None):
    """
    Create RecordBatch from data.
    
    Parameters:
    - data: dict, list of arrays, or sequence
    - schema: Schema, explicit schema
    - metadata: dict, key-value metadata
    
    Returns:
    RecordBatch: Single batch of columnar data
    """

def concat_tables(tables, promote=False):
    """
    Concatenate tables vertically.
    
    Parameters:
    - tables: sequence of Table objects
    - promote: bool, promote schemas to compatible type
    
    Returns:
    Table: Concatenated table
    """

def concat_arrays(arrays):
    """
    Concatenate arrays into single array.
    
    Parameters:
    - arrays: sequence of Array objects with same type
    
    Returns:
    Array: Concatenated array
    """

def concat_batches(batches, promote=False):
    """
    Concatenate record batches.
    
    Parameters:
    - batches: sequence of RecordBatch objects
    - promote: bool, promote schemas to compatible type
    
    Returns:
    RecordBatch: RecordBatch created from concatenated batches
    """

class Table:
    """
    Two-dimensional table of columnar data.
    
    Attributes:
    - schema: Schema of the table
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of ChunkedArray columns
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def column(self, i):
        """Get column by index or name."""
    
    def select(self, columns):
        """Select subset of columns."""
    
    def slice(self, offset=0, length=None):
        """Return slice of table."""
    
    def filter(self, mask):
        """Filter rows by boolean mask."""
    
    def take(self, indices):
        """Select rows by indices."""
    
    def sort_by(self, sorting):
        """Sort table by columns."""
    
    def group_by(self, keys):
        """Group table by columns."""
    
    def join(self, right_table, **kwargs):
        """Join with another table."""
    
    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""
    
    def to_pydict(self):
        """Convert to dictionary of Python lists."""
    
    def to_batches(self, max_chunksize=None):
        """Convert to iterator of RecordBatch objects."""
    
    def add_column(self, i, field, column):
        """Add column at position i."""
    
    def append_column(self, field, column):
        """Append column to table."""
    
    def remove_column(self, i):
        """Remove column at position i."""
    
    def rename_columns(self, names):
        """Rename columns."""
    
    def drop(self, columns):
        """Drop columns by name."""
    
    def replace_schema_metadata(self, metadata):
        """Replace table metadata."""

class RecordBatch:
    """
    Collection of arrays with shared length representing a single batch.
    
    Attributes:
    - schema: Schema of the batch
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of Array columns
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def column(self, i):
        """Get column by index or name."""
    
    def select(self, columns):
        """Select subset of columns."""
    
    def slice(self, offset=0, length=None):
        """Return slice of batch."""
    
    def filter(self, mask):
        """Filter rows by boolean mask."""
    
    def take(self, indices):
        """Select rows by indices."""
    
    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""
    
    def to_pydict(self):
        """Convert to dictionary of Python lists."""
    
    def add_column(self, i, field, column):
        """Add column at position i."""
    
    def remove_column(self, i):
        """Remove column at position i."""
    
    def rename_columns(self, names):
        """Rename columns."""

class RecordBatchReader:
    """
    Interface for reading stream of record batches.
    """
    
    def __iter__(self): ...
    
    def read_next_batch(self):
        """Read next batch from stream."""
    
    def read_all(self):
        """Read all batches into table."""
    
    def schema(self):
        """Get schema of batches."""

class TableGroupBy:
    """
    Grouped table operations.
    """
    
    def aggregate(self, aggregations):
        """Perform aggregations on groups."""

Schemas and Fields

Schema definitions that describe table structure, column types, and metadata. Schemas provide type safety and enable efficient data processing by defining the expected structure of tabular data.

def schema(fields, metadata=None):
    """
    Create schema from list of fields.
    
    Parameters:
    - fields: sequence of Field objects or (name, type) tuples
    - metadata: dict, key-value metadata for schema
    
    Returns:
    Schema: Schema object with specified fields
    """

def field(name, type, nullable=True, metadata=None):
    """
    Create field with name and type.
    
    Parameters:
    - name: str, field name
    - type: DataType, field data type
    - nullable: bool, whether field can contain nulls
    - metadata: dict, key-value metadata for field
    
    Returns:
    Field: Field object with specified properties
    """

def unify_schemas(schemas):
    """
    Unify multiple schemas into compatible schema.
    
    Parameters:
    - schemas: sequence of Schema objects
    
    Returns:
    Schema: Unified schema compatible with all input schemas
    """

class Schema:
    """
    Schema defining structure of tabular data.
    
    Attributes:
    - names: List of field names
    - types: List of field types
    - metadata: Key-value metadata
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def field(self, i):
        """Get field by index or name."""
    
    def get_field_index(self, name):
        """Get index of field by name."""
    
    def select(self, names):
        """Select subset of fields."""
    
    def insert(self, i, field):
        """Insert field at position i."""
    
    def append(self, field):
        """Append field to schema."""
    
    def remove(self, i):
        """Remove field at position i."""
    
    def with_metadata(self, metadata):
        """Return schema with new metadata."""
    
    def equals(self, other, check_metadata=True):
        """Check equality with another schema."""
    
    def to_string(self, **kwargs):
        """String representation of schema."""

class Field:
    """
    Named field in a schema with type and metadata.
    
    Attributes:
    - name: Field name
    - type: DataType of field
    - nullable: Whether field can contain nulls
    - metadata: Key-value metadata
    """
    
    def with_name(self, name):
        """Return field with new name."""
    
    def with_type(self, type):
        """Return field with new type."""
    
    def with_nullable(self, nullable):
        """Return field with new nullable setting."""
    
    def with_metadata(self, metadata):
        """Return field with new metadata."""
    
    def equals(self, other, check_metadata=True):
        """Check equality with another field."""
    
    def to_string(self, **kwargs):
        """String representation of field."""

class KeyValueMetadata:
    """
    Key-value metadata container.
    """
    
    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...
    
    def get(self, key, default=None):
        """Get value by key."""
    
    def keys(self):
        """Get all keys."""
    
    def values(self):
        """Get all values."""
    
    def items(self):
        """Get key-value pairs."""
    
    def to_dict(self):
        """Convert to Python dictionary."""

Scalars

Single typed values that provide consistent interface for working with individual data elements. Scalars maintain type information and null state, enabling type-safe operations on individual values.

def scalar(value, type=None):
    """
    Create scalar from Python value.
    
    Parameters:
    - value: Python value to wrap
    - type: DataType, explicit type for scalar
    
    Returns:
    Scalar: Typed scalar value
    """

# Scalar constants
NA = ...  # Not Available scalar
NULL = ...  # Null scalar

class Scalar:
    """
    Base class for typed scalar values.
    
    Attributes:
    - type: DataType of scalar
    - is_valid: Whether scalar is non-null
    """
    
    def __eq__(self, other): ...
    def __hash__(self): ...
    
    def as_py(self):
        """Convert to Python value."""
    
    def cast(self, target_type, safe=True):
        """Cast to different type."""
    
    def equals(self, other):
        """Check equality with another scalar."""

# Specific scalar types are available for all Arrow data types:
# NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
# UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar,
# FloatScalar, DoubleScalar, Decimal128Scalar, StringScalar, BinaryScalar,
# Date32Scalar, Date64Scalar, TimestampScalar, Time32Scalar, Time64Scalar,
# DurationScalar, ListScalar, StructScalar, MapScalar, DictionaryScalar, etc.

Tensors and Sparse Data

Multi-dimensional arrays and sparse data structures for advanced numerical computing and machine learning applications.

class Tensor:
    """
    Multi-dimensional array with Arrow data.
    
    Attributes:
    - type: DataType of tensor elements
    - shape: Shape tuple of tensor dimensions
    - strides: Strides tuple for memory layout
    - is_mutable: Whether tensor data is mutable
    """
    
    def __getitem__(self, key): ...
    
    def to_numpy(self):
        """Convert to NumPy array."""
    
    def equals(self, other):
        """Check equality with another tensor."""

class SparseCOOTensor:
    """Sparse tensor in COOrdinate format."""
    
class SparseCSRMatrix:
    """Sparse matrix in Compressed Sparse Row format."""
    
class SparseCSCMatrix:
    """Sparse matrix in Compressed Sparse Column format."""

class SparseCSFTensor:
    """Sparse tensor in Compressed Sparse Fiber format."""

Type Definitions

Memory Management

class DictionaryMemo:
    """
    Memo for dictionary encoding to ensure consistent dictionaries.
    """
    
    def __init__(self): ...
    
    def get_dictionary(self, type):
        """Get dictionary for type."""
    
    def set_dictionary(self, type, dictionary):
        """Set dictionary for type.</""

Usage Examples

Creating and Manipulating Arrays

import pyarrow as pa
import numpy as np

# Create arrays from various sources
int_array = pa.array([1, 2, 3, 4, 5])
str_array = pa.array(['apple', 'banana', 'cherry', None])
np_array = pa.array(np.random.randn(1000))

# Create chunked array
chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
chunked = pa.chunked_array(chunks)

# Array operations
filtered = int_array.filter(pa.array([True, False, True, False, True]))
sorted_array = str_array.sort()
unique_values = str_array.unique()

# Convert to other formats
python_list = int_array.to_pylist()
pandas_series = int_array.to_pandas()
numpy_array = int_array.to_numpy()

Working with Tables

import pyarrow as pa

# Create table from dictionary
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000.0, 60000.0, 70000.0, 55000.0, 65000.0]
}
table = pa.table(data)

# Table operations
subset = table.select(['name', 'age'])
filtered = table.filter(pa.compute.greater(table['age'], 30))
sorted_table = table.sort_by([('age', 'descending')])

# Add/remove columns
new_table = table.add_column(4, pa.field('bonus', pa.float64()), 
                            pa.array([5000.0, 6000.0, 7000.0, 5500.0, 6500.0]))
dropped = table.drop(['salary'])

# Convert to pandas
df = table.to_pandas()

Schema Definition

import pyarrow as pa

# Define schema explicitly
schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('scores', pa.list_(pa.float64())),
    pa.field('metadata', pa.map_(pa.string(), pa.string()))
])

# Create table with schema
table = pa.table({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'scores': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    'metadata': [{'key': 'value'}, {}, {'foo': 'bar'}]
}, schema=schema)

# Schema operations
field = schema.field('name')
field_index = schema.get_field_index('scores')
partial_schema = schema.select(['id', 'name'])

Install with Tessl CLI

npx tessl i tessl/pypi-pyarrow

docs

advanced-features.md

arrow-flight.md

compute-functions.md

core-data-structures.md

data-types.md

dataset-operations.md

file-formats.md

index.md

memory-io.md

tile.json