CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pyarrow

Python library for Apache Arrow columnar memory format and computing libraries

Pending
Overview
Eval results
Files

docs/data-types.md

Data Types

Comprehensive type system supporting primitive types, nested structures, temporal types, and custom extension types. PyArrow's type system provides rich data modeling capabilities with type checking, conversion, and inference for robust data processing workflows.

Capabilities

Type Factory Functions

Functions for creating Arrow data types. These factory functions return DataType objects that can be used to define schemas and create typed arrays.

# Primitive types
def null():
    """Return the null type, whose values are all null."""

def bool_():
    """Return the boolean (true/false) type."""

def int8():
    """Return the signed 8-bit integer type."""

def int16():
    """Return the signed 16-bit integer type."""

def int32():
    """Return the signed 32-bit integer type."""

def int64():
    """Return the signed 64-bit integer type."""

def uint8():
    """Return the unsigned 8-bit integer type."""

def uint16():
    """Return the unsigned 16-bit integer type."""

def uint32():
    """Return the unsigned 32-bit integer type."""

def uint64():
    """Return the unsigned 64-bit integer type."""

def float16():
    """Return the 16-bit (half precision) floating point type."""

def float32():
    """Return the 32-bit (single precision) floating point type."""

def float64():
    """Return the 64-bit (double precision) floating point type."""

# Decimal types
def decimal32(precision, scale=0):
    """
    Create a 32-bit decimal type.

    Parameters:
    - precision: int, total number of digits (1-9)
    - scale: int, number of digits after decimal point

    Returns:
    Decimal32Type: 32-bit decimal type
    """

def decimal64(precision, scale=0):
    """
    Create a 64-bit decimal type.

    Parameters:
    - precision: int, total number of digits (1-18)
    - scale: int, number of digits after decimal point

    Returns:
    Decimal64Type: 64-bit decimal type
    """

def decimal128(precision, scale=0):
    """
    Create a 128-bit decimal type.

    Parameters:
    - precision: int, total number of digits (1-38)
    - scale: int, number of digits after decimal point

    Returns:
    Decimal128Type: 128-bit decimal type
    """

def decimal256(precision, scale=0):
    """
    Create a 256-bit decimal type.

    Parameters:
    - precision: int, total number of digits (1-76)
    - scale: int, number of digits after decimal point

    Returns:
    Decimal256Type: 256-bit decimal type
    """

# Temporal types
def time32(unit):
    """
    Create a 32-bit time-of-day type.

    Parameters:
    - unit: str, required time unit ('s' for seconds, 'ms' for milliseconds)

    Returns:
    Time32Type: 32-bit time type
    """

def time64(unit):
    """
    Create a 64-bit time-of-day type.

    Parameters:
    - unit: str, required time unit ('us' for microseconds, 'ns' for nanoseconds)

    Returns:
    Time64Type: 64-bit time type
    """

def timestamp(unit, tz=None):
    """
    Create a timestamp type, optionally timezone-aware.

    Parameters:
    - unit: str, time unit ('s', 'ms', 'us', 'ns')
    - tz: str, timezone identifier (e.g., 'UTC', 'America/New_York');
      None (the default) leaves the type without a timezone

    Returns:
    TimestampType: Timestamp type with specified precision and timezone
    """

def date32():
    """Return the 32-bit date type (days since epoch)."""

def date64():
    """Return the 64-bit date type (milliseconds since epoch)."""

def duration(unit):
    """
    Create a duration type.

    Parameters:
    - unit: str, time unit ('s', 'ms', 'us', 'ns')

    Returns:
    DurationType: Duration type with specified unit
    """

def month_day_nano_interval():
    """Return the month-day-nanosecond interval type."""

# Binary and string types
def binary():
    """Return the variable-length binary type."""

def string():
    """Return the variable-length UTF-8 string type."""

def utf8():
    """Alias for string() - UTF-8 encoded strings."""

def large_binary():
    """Return the large variable-length binary type (64-bit offsets)."""

def large_string():
    """Return the large variable-length string type (64-bit offsets)."""

def large_utf8():
    """Alias for large_string() - large UTF-8 strings."""

def binary_view():
    """Return the variable-length binary view type."""

def string_view():
    """Return the variable-length UTF-8 string view type."""

def fixed_size_binary(byte_width):
    """
    Fixed-size binary type.

    NOTE(review): pyarrow's public API does not appear to export a
    top-level ``fixed_size_binary`` factory; fixed-width binary types are
    created with ``pa.binary(length)``. Confirm against the installed
    pyarrow version.

    Parameters:
    - byte_width: int, number of bytes per value

    Returns:
    FixedSizeBinaryType: Fixed-size binary type
    """

# Container types
def list_(value_type, list_size=-1):
    """
    Create a list type.

    Parameters:
    - value_type: DataType, type of list elements
    - list_size: int, number of elements per list; the default of -1
      yields a variable-length list, a non-negative value yields a
      fixed-size list

    Returns:
    ListType or FixedSizeListType: List type with specified element type
    """

def large_list(value_type):
    """
    Create a large variable-length list type (64-bit offsets).

    Parameters:
    - value_type: DataType, type of list elements

    Returns:
    LargeListType: Large list type with specified element type
    """

def fixed_size_list(value_type, list_size):
    """
    Fixed-size list type.

    NOTE(review): pyarrow's public API does not appear to export a
    top-level ``fixed_size_list`` factory; fixed-size lists are created
    with ``pa.list_(value_type, list_size)``. Confirm against the
    installed pyarrow version.

    Parameters:
    - value_type: DataType, type of list elements
    - list_size: int, number of elements per list

    Returns:
    FixedSizeListType: Fixed-size list type
    """

def list_view(value_type):
    """
    Create a list view type.

    Parameters:
    - value_type: DataType, type of list elements

    Returns:
    ListViewType: List view type with specified element type
    """

def large_list_view(value_type):
    """
    Create a large list view type.

    Parameters:
    - value_type: DataType, type of list elements

    Returns:
    LargeListViewType: Large list view type with specified element type
    """

def map_(key_type, item_type, keys_sorted=False):
    """
    Create a map type (key-value pairs).

    Parameters:
    - key_type: DataType, type of map keys
    - item_type: DataType, type of map values
    - keys_sorted: bool, whether keys are sorted

    Returns:
    MapType: Map type with specified key and value types
    """

def struct(fields):
    """
    Create a struct type with named fields.

    Parameters:
    - fields: list of Field objects or (name, type) tuples

    Returns:
    StructType: Struct type with specified fields
    """

def union(fields, mode, type_codes=None):
    """
    Create a union type supporting multiple value types.

    Parameters:
    - fields: list of Field objects, one per child type
    - mode: str, required union mode ('sparse' or 'dense')
    - type_codes: list of int, optional type codes for the children

    Returns:
    UnionType: Union type with specified fields and mode
    """

def sparse_union(fields, type_codes=None):
    """
    Create a sparse union type.

    Parameters:
    - fields: list of Field objects, one per child type
    - type_codes: list of int, optional type codes for the children

    Returns:
    SparseUnionType: Sparse union type
    """

def dense_union(fields, type_codes=None):
    """
    Create a dense union type.

    Parameters:
    - fields: list of Field objects, one per child type
    - type_codes: list of int, optional type codes for the children

    Returns:
    DenseUnionType: Dense union type
    """

def dictionary(index_type, value_type, ordered=False):
    """
    Create a dictionary-encoded type.

    Parameters:
    - index_type: DataType, type of dictionary indices
    - value_type: DataType, type of dictionary values
    - ordered: bool, whether dictionary is ordered

    Returns:
    DictionaryType: Dictionary type
    """

def run_end_encoded(run_end_type, value_type):
    """
    Create a run-end encoded type for efficient storage of repeated values.

    Parameters:
    - run_end_type: DataType, type for run end indices
    - value_type: DataType, type of encoded values

    Returns:
    RunEndEncodedType: Run-end encoded type
    """

# Advanced types
def fixed_shape_tensor(value_type, shape, dim_names=None, permutation=None):
    """
    Create a fixed-shape tensor type.

    Parameters:
    - value_type: DataType, type of tensor elements
    - shape: tuple or list of int, tensor shape
    - dim_names: tuple or list of str, optional names of the tensor dimensions
    - permutation: tuple or list of int, optional ordering of the
      original dimensions

    Returns:
    FixedShapeTensorType: Fixed-shape tensor type
    """

def json_():
    """Return the JSON type for storing JSON documents."""

def opaque(storage_type, type_name, vendor_name):
    """
    Create an opaque type for application-specific data.

    Parameters:
    - storage_type: DataType, underlying storage type
    - type_name: str, name of the type in the external system
    - vendor_name: str, name of the external system

    Returns:
    OpaqueType: Opaque type
    """

def uuid():
    """Return the UUID type for universally unique identifiers."""

Type System Functions

Utility functions for working with types, including type inference, conversion, and registration of custom types.

def type_for_alias(name):
    """
    Look up the Arrow type registered under a string alias.

    Parameters:
    - name: str, type alias (e.g., 'int64', 'string', 'float32')

    Returns:
    DataType: Arrow type the alias refers to
    """

def from_numpy_dtype(dtype):
    """
    Translate a NumPy dtype into the matching Arrow type.

    Parameters:
    - dtype: numpy.dtype, NumPy data type

    Returns:
    DataType: Arrow type equivalent to the dtype
    """

def infer_type(values, mask=None, from_pandas=False):
    """
    Work out the Arrow type of a Python sequence.

    Parameters:
    - values: sequence, data whose type should be inferred
    - mask: array-like, boolean mask for null values
    - from_pandas: bool, use pandas-specific inference

    Returns:
    DataType: Arrow type inferred from the values
    """

def register_extension_type(ext_type):
    """
    Add a custom extension type to the registry.

    Parameters:
    - ext_type: ExtensionType, extension type to register
    """

def unregister_extension_type(type_name):
    """
    Remove an extension type from the registry.

    Parameters:
    - type_name: str, name of extension type to unregister
    """

Type Classes

Base classes and specific implementations for all Arrow data types. These classes provide type information and enable type-safe operations.

class DataType:
    """
    Common base class of every Arrow data type.

    Attributes:
    - id: Type identifier
    """

    def __eq__(self, other): ...
    def __hash__(self): ...

    def equals(self, other):
        """Report whether this type equals ``other``."""

    def to_pandas_dtype(self):
        """Return the pandas dtype corresponding to this type."""

class DictionaryType(DataType):
    """
    Type of dictionary-encoded data.

    Attributes:
    - index_type: Type of dictionary indices
    - value_type: Type of dictionary values
    - ordered: Whether dictionary is ordered
    """

class StructType(DataType):
    """
    Type of a struct made of named fields.

    Attributes:
    - num_fields: Number of fields
    """

    def field(self, i):
        """Return the field at index ``i``."""

    def get_field_index(self, name):
        """Return the index of the field called ``name``."""

    def get_all_field_indices(self, name):
        """Return the indices of every field called ``name``."""

class ListType(DataType):
    """
    Type of a variable-length list.

    Attributes:
    - value_type: Type of list elements
    """

class LargeListType(DataType):
    """
    Type of a large variable-length list.

    Attributes:
    - value_type: Type of list elements
    """

class FixedSizeListType(DataType):
    """
    Type of a fixed-size list.

    Attributes:
    - value_type: Type of list elements
    - list_size: Number of elements per list
    """

class ListViewType(DataType):
    """
    Type of a list view.

    Attributes:
    - value_type: Type of list elements
    """

class LargeListViewType(DataType):
    """
    Type of a large list view.

    Attributes:
    - value_type: Type of list elements
    """

class MapType(DataType):
    """
    Type of a map of key-value pairs.

    Attributes:
    - key_type: Type of map keys
    - item_type: Type of map values
    - keys_sorted: Whether keys are sorted
    """

class UnionType(DataType):
    """
    Common base class of the union types.

    Attributes:
    - mode: Union mode ('sparse' or 'dense')
    - num_fields: Number of union fields
    """

class SparseUnionType(UnionType):
    """Type of a sparse union."""

class DenseUnionType(UnionType):
    """Type of a dense union."""

class TimestampType(DataType):
    """
    Type of a timestamp.

    Attributes:
    - unit: Time unit ('s', 'ms', 'us', 'ns')
    - tz: Timezone identifier
    """

class Time32Type(DataType):
    """
    Type of a 32-bit time value.

    Attributes:
    - unit: Time unit ('s', 'ms')
    """

class Time64Type(DataType):
    """
    Type of a 64-bit time value.

    Attributes:
    - unit: Time unit ('us', 'ns')
    """

class DurationType(DataType):
    """
    Type of a duration.

    Attributes:
    - unit: Time unit ('s', 'ms', 'us', 'ns')
    """

class FixedSizeBinaryType(DataType):
    """
    Type of fixed-size binary data.

    Attributes:
    - byte_width: Number of bytes per value
    """

class Decimal32Type(DataType):
    """
    Type of a 32-bit decimal.

    Attributes:
    - precision: Total number of digits
    - scale: Number of digits after decimal point
    """

class Decimal64Type(DataType):
    """
    Type of a 64-bit decimal.

    Attributes:
    - precision: Total number of digits
    - scale: Number of digits after decimal point
    """

class Decimal128Type(DataType):
    """
    Type of a 128-bit decimal.

    Attributes:
    - precision: Total number of digits
    - scale: Number of digits after decimal point
    """

class Decimal256Type(DataType):
    """
    Type of a 256-bit decimal.

    Attributes:
    - precision: Total number of digits
    - scale: Number of digits after decimal point
    """

class BaseExtensionType(DataType):
    """Base class for extension types."""

class ExtensionType(BaseExtensionType):
    """
    User-defined extension type.

    Subclasses implement the two ``__arrow_ext_*__`` hooks below so the
    type's metadata can be serialized and the type rebuilt from it.

    Attributes:
    - extension_name: Name of extension type
    - storage_type: Underlying storage type
    """

    def __arrow_ext_serialize__(self):
        """Serialize extension type metadata and return it as bytes."""

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        """
        Rebuild an extension type instance from its serialized metadata.

        This hook is a classmethod: it is called on the class, not on an
        existing instance.

        Parameters:
        - storage_type: DataType, underlying storage type
        - serialized: bytes, metadata produced by __arrow_ext_serialize__

        Returns:
        ExtensionType: Instance of the extension type
        """

class RunEndEncodedType(DataType):
    """
    Type of run-end encoded data.

    Attributes:
    - run_end_type: Type of run end indices
    - value_type: Type of encoded values
    """

class FixedShapeTensorType(DataType):
    """
    Type of a fixed-shape tensor.

    Attributes:
    - shape: Tensor shape
    - value_type: Type of tensor elements
    - dim_names: Names of the tensor dimensions, if given
    - permutation: Ordering of the original dimensions, if given
    """

class JsonType(DataType):
    """JSON document type."""

class OpaqueType(DataType):
    """
    Opaque type for application-specific data.

    Attributes:
    - storage_type: Underlying storage type
    - type_name: Name of the type in the external system
    - vendor_name: Name of the external system
    """

class UuidType(DataType):
    """UUID type."""

class UnknownExtensionType(ExtensionType):
    """Placeholder for an extension type that is not known."""

Type Checking Functions

Functions to check and validate Arrow data types. These predicates are exposed from the `pyarrow.types` module (for example, `pa.types.is_integer(t)`) and enable type-safe programming and conditional logic based on type information.

# Primitive type checks
def is_null(type):
    """Return whether ``type`` is the null type."""

def is_boolean(type):
    """Return whether ``type`` is the boolean type."""

def is_integer(type):
    """Return whether ``type`` is any integer type."""

def is_signed_integer(type):
    """Return whether ``type`` is a signed integer type."""

def is_unsigned_integer(type):
    """Return whether ``type`` is an unsigned integer type."""

def is_int8(type):
    """Return whether ``type`` is the 8-bit signed integer type."""

def is_int16(type):
    """Return whether ``type`` is the 16-bit signed integer type."""

def is_int32(type):
    """Return whether ``type`` is the 32-bit signed integer type."""

def is_int64(type):
    """Return whether ``type`` is the 64-bit signed integer type."""

def is_uint8(type):
    """Return whether ``type`` is the 8-bit unsigned integer type."""

def is_uint16(type):
    """Return whether ``type`` is the 16-bit unsigned integer type."""

def is_uint32(type):
    """Return whether ``type`` is the 32-bit unsigned integer type."""

def is_uint64(type):
    """Return whether ``type`` is the 64-bit unsigned integer type."""

def is_floating(type):
    """Return whether ``type`` is a floating point type."""

def is_float16(type):
    """Return whether ``type`` is the 16-bit floating point type."""

def is_float32(type):
    """Return whether ``type`` is the 32-bit floating point type."""

def is_float64(type):
    """Return whether ``type`` is the 64-bit floating point type."""

# Container type checks
def is_list(type):
    """Return whether ``type`` is a variable-length list type."""

def is_large_list(type):
    """Return whether ``type`` is a large variable-length list type."""

def is_fixed_size_list(type):
    """Return whether ``type`` is a fixed-size list type."""

def is_list_view(type):
    """Return whether ``type`` is a list view type."""

def is_large_list_view(type):
    """Return whether ``type`` is a large list view type."""

def is_struct(type):
    """Return whether ``type`` is a struct type."""

def is_union(type):
    """Return whether ``type`` is a union type."""

def is_nested(type):
    """Return whether ``type`` is nested (list, struct, map, union)."""

def is_run_end_encoded(type):
    """Return whether ``type`` is a run-end encoded type."""

# Temporal type checks
def is_temporal(type):
    """Return whether ``type`` is temporal (timestamp, date, time, duration)."""

def is_timestamp(type):
    """Return whether ``type`` is a timestamp type."""

def is_duration(type):
    """Return whether ``type`` is a duration type."""

def is_time(type):
    """Return whether ``type`` is a time type (32-bit or 64-bit)."""

def is_time32(type):
    """Return whether ``type`` is the 32-bit time type."""

def is_time64(type):
    """Return whether ``type`` is the 64-bit time type."""

def is_date(type):
    """Return whether ``type`` is a date type (32-bit or 64-bit)."""

def is_date32(type):
    """Return whether ``type`` is the 32-bit date type."""

def is_date64(type):
    """Return whether ``type`` is the 64-bit date type."""

# Binary and string type checks
def is_binary(type):
    """Return whether ``type`` is the variable-length binary type."""

def is_large_binary(type):
    """Return whether ``type`` is the large variable-length binary type."""

def is_string(type):
    """Return whether ``type`` is the variable-length string type."""

def is_large_string(type):
    """Return whether ``type`` is the large variable-length string type."""

def is_binary_view(type):
    """Return whether ``type`` is the binary view type."""

def is_string_view(type):
    """Return whether ``type`` is the string view type."""

def is_fixed_size_binary(type):
    """Return whether ``type`` is a fixed-size binary type."""

# Other type checks
def is_map(type):
    """Return whether ``type`` is a map type."""

def is_decimal(type):
    """Return whether ``type`` is any decimal type."""

def is_decimal32(type):
    """Return whether ``type`` is a 32-bit decimal type."""

def is_decimal64(type):
    """Return whether ``type`` is a 64-bit decimal type."""

def is_decimal128(type):
    """Return whether ``type`` is a 128-bit decimal type."""

def is_decimal256(type):
    """Return whether ``type`` is a 256-bit decimal type."""

def is_dictionary(type):
    """Return whether ``type`` is a dictionary-encoded type."""

def is_interval(type):
    """Return whether ``type`` is an interval type."""

def is_primitive(type):
    """Return whether ``type`` is primitive (non-nested)."""

Usage Examples

Creating and Using Types

import pyarrow as pa

# Primitive types come from zero-argument factories
int_type = pa.int64()
str_type = pa.string()
float_type = pa.float64()

# Temporal types take a unit (and, for timestamps, an optional timezone)
timestamp_type = pa.timestamp('ms', tz='UTC')
date_type = pa.date32()
duration_type = pa.duration('us')

# Decimal types take a precision and a scale
decimal_type = pa.decimal128(precision=10, scale=2)

# Nested types are composed from other types
list_type = pa.list_(pa.int32())
person_fields = [
    pa.field('name', pa.string()),
    pa.field('age', pa.int32()),
    pa.field('scores', pa.list_(pa.float64())),
]
struct_type = pa.struct(person_fields)
map_type = pa.map_(pa.string(), pa.int64())

Type Checking and Conversion

import numpy as np
import pyarrow as pa

# Ask the predicates in pa.types what kind of type this is
data_type = pa.int64()
print(pa.types.is_integer(data_type))  # True
print(pa.types.is_floating(data_type))  # False
print(pa.types.is_signed_integer(data_type))  # True

# Let Arrow infer the type of a Python sequence
values = [1, 2, 3, 4, 5]
inferred_type = pa.infer_type(values)
print(inferred_type)  # int64

# Map a NumPy dtype onto the matching Arrow type
numpy_dtype = np.dtype('float32')
arrow_type = pa.from_numpy_dtype(numpy_dtype)
print(arrow_type)  # float32

# Resolve types from their string aliases
string_type = pa.type_for_alias('string')
int_type = pa.type_for_alias('int64')

Working with Complex Types

import datetime

import pyarrow as pa

# Create schema with complex types
schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('tags', pa.list_(pa.string())),
    pa.field('metadata', pa.map_(pa.string(), pa.string())),
    pa.field('location', pa.struct([
        pa.field('lat', pa.float64()),
        pa.field('lon', pa.float64())
    ])),
    pa.field('timestamp', pa.timestamp('ms', tz='UTC'))
])

# Create arrays with complex types
tags_array = pa.array([['python', 'data'], ['arrow', 'columnar'], ['analytics']])

# Python dicts are inferred as structs, so a map column needs an explicit
# map type; each row is then given as a list of (key, value) pairs.
metadata_array = pa.array([
    [('version', '1.0'), ('author', 'alice')],
    [('version', '2.0')],
    []
], type=pa.map_(pa.string(), pa.string()))

# Dicts with a uniform set of keys are inferred as struct<lat, lon>
location_array = pa.array([
    {'lat': 40.7128, 'lon': -74.0060},
    {'lat': 51.5074, 'lon': -0.1278},
    {'lat': 35.6762, 'lon': 139.6503}
])

# pa.array() does not parse ISO-8601 strings into timestamps, so build
# the column from timezone-aware datetime objects instead.
utc = datetime.timezone.utc
timestamp_array = pa.array([
    datetime.datetime(2023, 1, 1, tzinfo=utc),
    datetime.datetime(2023, 1, 2, tzinfo=utc),
    datetime.datetime(2023, 1, 3, tzinfo=utc)
], type=pa.timestamp('ms', tz='UTC'))

# Create table with complex data
table = pa.table({
    'id': [1, 2, 3],
    'name': ['New York', 'London', 'Tokyo'],
    'tags': tags_array,
    'metadata': metadata_array,
    'location': location_array,
    'timestamp': timestamp_array
}, schema=schema)

Extension Types

import pyarrow as pa

# Define custom extension type
class UuidType(pa.ExtensionType):
    """Extension type named "uuid" that stores values as 16-byte binary."""

    def __init__(self):
        # Storage type first, then the extension name
        super().__init__(pa.binary(16), "uuid")

    def __arrow_ext_serialize__(self):
        # The type has no parameters, so there is no metadata to store
        return b''

    @classmethod
    def __arrow_ext_deserialize__(cls, storage_type, serialized):
        # Nothing to decode: every instance is identical
        return UuidType()

# Register extension type
pa.register_extension_type(UuidType())

# Create array with extension type
uuid_type = UuidType()
uuid_values = [
    b'\x12\x34\x56\x78\x90\xab\xcd\xef\x12\x34\x56\x78\x90\xab\xcd\xef',
    b'\xfe\xdc\xba\x98\x76\x54\x32\x10\xfe\xdc\xba\x98\x76\x54\x32\x10',
]
uuid_array = pa.array(uuid_values, type=uuid_type)

Install with Tessl CLI

npx tessl i tessl/pypi-pyarrow

docs

advanced-features.md

arrow-flight.md

compute-functions.md

core-data-structures.md

data-types.md

dataset-operations.md

file-formats.md

index.md

memory-io.md

tile.json