Python support for Parquet file format with high performance reading and writing capabilities
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Tools for working with parquet schemas, type conversions, and metadata management to ensure proper data representation and compatibility between pandas and parquet formats.
Utility class for working with parquet schema structures and navigating complex nested schemas.
class SchemaHelper:
    """
    Helper class for parquet schema navigation and analysis.

    Provides methods to understand and work with parquet schema structures,
    including nested types and logical type conversions.
    """

    def __init__(self, schema_elements: list):
        """
        Initialize SchemaHelper with parquet schema elements.

        Parameters:
        - schema_elements: list, parquet schema element list
        """

    def schema_element(self, name):
        """
        Get schema element by name or path.

        Parameters:
        - name: str or list, column name or path in schema

        Returns:
        SchemaElement: The schema element for the specified path
        """

    def is_required(self, name: str) -> bool:
        """
        Check if field is required (not optional).

        Parameters:
        - name: str, column name to check

        Returns:
        bool: True if field is required, False if optional
        """

    def max_repetition_level(self, parts: list) -> int:
        """
        Calculate maximum repetition level for column path.

        Parameters:
        - parts: list, column path components

        Returns:
        int: Maximum repetition level
        """

    def max_definition_level(self, parts: list) -> int:
        """
        Calculate maximum definition level for column path.

        Parameters:
        - parts: list, column path components

        Returns:
        int: Maximum definition level
        """

    @property
    def text(self):
"""Human-readable schema representation."""Functions for building and analyzing parquet schema structures.
def schema_tree(schema: list, i: int = 0) -> int:
    """
    Build tree structure from flat schema list.

    Parameters:
    - schema: list, flat list of schema elements
    - i: int, starting index in schema list

    Returns:
    int: Final index after processing tree

    NOTE(review): the built tree links are presumably attached to the schema
    elements in place, since only the final index is returned — confirm
    against the implementation.
    """
def schema_to_text(root, indent=None):
    """
    Convert schema to human-readable text representation.

    Parameters:
    - root: SchemaElement, root schema element
    - indent: list or None, indentation tracking for nested elements.
      Defaults to a fresh empty list. (The previous signature used the
      mutable default ``indent=[]``, which is shared across calls and
      accumulates state — the classic Python mutable-default pitfall.)

    Returns:
    str: Human-readable schema representation
    """
    # Bind a new list per call instead of sharing one default instance.
    if indent is None:
        indent = []
def flatten(schema_helper, path=None) -> list:
    """
    Flatten nested schema into list of column paths.

    Parameters:
    - schema_helper: SchemaHelper, schema navigation helper
    - path: list or None, current path for recursion (None at the top level)

    Returns:
    list: List of flattened column paths
    """
def _is_list_like(helper, name) -> bool:
    """
    Check if column represents a list-like structure.

    Parameters:
    - helper: SchemaHelper, schema navigation helper
    - name: str, column name to check

    Returns:
    bool: True if column is list-like, False otherwise
    """
def _is_map_like(helper, name):
"""
Check if column represents a map-like structure.
Parameters:
- helper: SchemaHelper, schema navigation helper
- name: str, column name to check
Returns:
bool: True if column is map-like, False otherwise
"""Convert pandas data types to appropriate parquet representations.
def find_type(data, fixed_text=None, object_encoding=None,
              times='int64', is_index=None) -> tuple:
    """
    Determine appropriate parquet type codes for pandas Series.

    Parameters:
    - data: pandas.Series, input data to analyze
    - fixed_text: int or None, fixed-length string size for string/bytes columns
    - object_encoding: str or None, encoding method for object dtype columns
      ('infer' auto-detects the encoding)
    - times: str, timestamp encoding format ('int64' or 'int96')
    - is_index: bool or None, whether data represents DataFrame index

    Returns:
    tuple: (SchemaElement, parquet_type_code)
    """
def convert(data, se):
"""
Convert pandas data according to schema element specification.
Parameters:
- data: pandas.Series, input data to convert
- se: SchemaElement, target parquet schema element
Returns:
numpy.ndarray: Converted data ready for parquet encoding
"""Handle encoding of Python object types to parquet-compatible formats.
def infer_object_encoding(data):
"""
Automatically infer appropriate encoding for object dtype column.
Parameters:
- data: pandas.Series, object dtype data to analyze
Returns:
str: Inferred encoding type ('utf8', 'bytes', 'json', 'bool', etc.)
"""Convert parquet data back to appropriate pandas types.
def convert(data, schema_element, metadata=None):
    """
    Convert raw parquet data to appropriate pandas types.

    Parameters:
    - data: numpy.ndarray, raw parquet data
    - schema_element: SchemaElement, parquet schema information
    - metadata: dict or None, additional pandas metadata for conversion

    Returns:
    numpy.ndarray: Converted data suitable for pandas
    """
# Type mapping constants
simple = {
    # Mapping from parquet primitive types to numpy dtypes
    'INT32': 'int32',
    'INT64': 'int64',
    'FLOAT': 'float32',    # parquet FLOAT is 4-byte, hence float32
    'DOUBLE': 'float64',   # parquet DOUBLE is 8-byte, hence float64
    'BOOLEAN': 'bool',
    # Variable-length byte payloads have no fixed-width numpy dtype,
    # so both byte-array forms fall back to object arrays.
    'BYTE_ARRAY': 'object',
    'FIXED_LEN_BYTE_ARRAY': 'object'
}
complex = {
# Mapping from parquet logical types to numpy dtypes
'UTF8': 'object',
'JSON': 'object',
'BSON': 'object',
'DECIMAL': 'float64',
'TIMESTAMP_MILLIS': 'datetime64[ms]',
'TIMESTAMP_MICROS': 'datetime64[us]',
'TIME_MICROS': 'timedelta64[us]'
}

Functions for working with Binary JSON (BSON) encoded data.
def tobson(obj) -> bytes:
    """
    Convert Python object to BSON binary format.

    Parameters:
    - obj: Any, Python object to encode

    Returns:
    bytes: BSON-encoded binary data

    NOTE(review): implementation not shown here; presumably delegates to a
    BSON codec library — confirm against the module's imports.
    """
def unbson(data):
"""
Convert BSON binary data back to Python object.
Parameters:
- data: bytes, BSON-encoded binary data
Returns:
Any: Decoded Python object
"""Generate pandas-compatible metadata for columns during the writing process.
def get_column_metadata(column, name: str, object_dtype=None) -> dict:
    """
    Generate pandas column metadata for parquet storage.

    Parameters:
    - column: pandas.Series, source column data
    - name: str, column name
    - object_dtype: str or None, specific object encoding type

    Returns:
    dict: Pandas metadata dictionary with type and encoding info
    """
def get_numpy_type(dtype):
"""
Get numpy type string representation for pandas dtype.
Parameters:
- dtype: pandas.dtype, input pandas data type
Returns:
str: String representation of equivalent numpy type
"""Utilities for inferring and validating data types during conversion.
def infer_dtype(column) -> str:
    """
    Infer pandas dtype of column data.

    Parameters:
    - column: pandas.Series, data to analyze

    Returns:
    str: Inferred pandas dtype string
    """
def groupby_types(iterable):
"""
Group objects by their Python type.
Parameters:
- iterable: Iterable, collection of objects to group
Returns:
dict: Mapping from type to list of objects of that type
"""from fastparquet import ParquetFile
# Read schema information
pf = ParquetFile('data.parquet')
schema = pf.schema
# Print schema structure
print(schema) # Human-readable schema representation
# Access individual schema elements
for column_name in pf.columns:
element = schema.schema_element([column_name])
print(f"{column_name}: {element.type}")import pandas as pd
from fastparquet.writer import find_type, convert
# Analyze pandas data for parquet conversion
data = pd.Series([1, 2, 3, 4, 5], name='numbers')
schema_element, type_code = find_type(data)
print(f"Parquet type: {schema_element.type}")
print(f"Converted type: {schema_element.converted_type}")
# Convert data for writing
converted_data = convert(data, schema_element)
print(f"Converted shape: {converted_data.shape}")from fastparquet.writer import find_type
# String data (automatic UTF-8 encoding)
text_data = pd.Series(['hello', 'world', 'test'])
se, _ = find_type(text_data, object_encoding='utf8')
# JSON data
json_data = pd.Series([{'a': 1}, {'b': 2}, {'c': 3}])
se, _ = find_type(json_data, object_encoding='json')
# Binary data
binary_data = pd.Series([b'data1', b'data2', b'data3'])
se, _ = find_type(binary_data, object_encoding='bytes')
# Automatic inference
mixed_data = pd.Series(['text1', 'text2', 'text3'])
se, _ = find_type(mixed_data, object_encoding='infer')

from fastparquet.writer import find_type
# Fixed-length strings
fixed_text_data = pd.Series(['ABC', 'DEF', 'GHI'])
se, _ = find_type(fixed_text_data, fixed_text=3)
# Decimal data
decimal_data = pd.Series([1.234, 5.678, 9.012])
se, _ = find_type(decimal_data, object_encoding='decimal')
# Timestamp with different encodings
timestamp_data = pd.Series(pd.date_range('2023-01-01', periods=3))
# 64-bit integer timestamps (default)
se_int64, _ = find_type(timestamp_data, times='int64')
# 96-bit timestamps (legacy compatibility)
se_int96, _ = find_type(timestamp_data, times='int96')

from fastparquet.util import get_column_metadata
# Generate metadata for different column types
df = pd.DataFrame({
'int_col': [1, 2, 3],
'float_col': [1.1, 2.2, 3.3],
'str_col': ['a', 'b', 'c'],
'cat_col': pd.Categorical(['X', 'Y', 'Z']),
'date_col': pd.date_range('2023-01-01', periods=3)
})
for col_name, col_data in df.items():
metadata = get_column_metadata(col_data, col_name)
print(f"{col_name}: {metadata['pandas_type']} -> {metadata['numpy_type']}")# Object encoding options
ObjectEncoding = Literal[
'infer', # Automatically detect encoding
'utf8', # UTF-8 text encoding
'bytes', # Raw binary data
'json', # JSON serialization
'bson', # Binary JSON encoding
'bool', # Boolean values
'int', # Integer values (64-bit)
'int32', # Integer values (32-bit)
'float', # Floating point values
'decimal' # Decimal number handling
]
# Timestamp encoding formats
TimeEncoding = Literal[
'int64', # 64-bit integer (nanosecond precision)
'int96' # 96-bit format (legacy compatibility)
]
# Parquet primitive types
ParquetPrimitiveType = Literal[
'BOOLEAN',
'INT32',
'INT64',
'FLOAT',
'DOUBLE',
'BYTE_ARRAY',
'FIXED_LEN_BYTE_ARRAY',
'INT96'
]
# Parquet logical types
ParquetLogicalType = Literal[
'UTF8',
'JSON',
'BSON',
'DECIMAL',
'TIMESTAMP_MILLIS',
'TIMESTAMP_MICROS',
'TIME_MICROS',
'INT_8',
'INT_16',
'INT_32',
'INT_64',
'UINT_8',
'UINT_16',
'UINT_32',
'UINT_64'
]
# Schema element structure
SchemaElement = Any # parquet_thrift.SchemaElement
# Column metadata structure
ColumnMetadata = Dict[str, Union[str, Dict[str, Any]]]

Install with Tessl CLI
npx tessl i tessl/pypi-fastparquet