chDB is an in-process SQL OLAP Engine powered by ClickHouse
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Data type utilities for converting between formats, inferring data types, and handling nested data structures in analytical workflows. These functions support data preparation and type management for chDB operations.
Convert nested dictionary structures into flat key-value pairs for tabular data processing.
def flatten_dict(d: dict, parent_key: str = "", sep: str = "_") -> dict:
"""
Flatten nested dictionary structure with configurable key separation.
Parameters:
- d: Dictionary to flatten (may contain nested dicts and lists)
- parent_key: Base key to prepend to flattened keys
- sep: Separator for concatenating nested keys (default: "_")
Returns:
dict: Flattened dictionary with composite keys
Notes:
- Nested dictionaries become flattened with sep-separated keys
- Lists of dictionaries are serialized to JSON strings
- Mixed lists have individual items flattened with index suffixes
- Non-dict list items become indexed keys
"""Transform lists of dictionaries into columnar format for analytical processing.
def convert_to_columnar(items: list) -> dict:
"""
Convert list of dictionaries to columnar format.
Parameters:
- items: List of dictionaries to convert
Returns:
dict: Dictionary with column names as keys and lists of values
Notes:
- Missing values represented as None
- Automatically flattens nested dictionaries in input
- Handles variable dictionary schemas across list items
- Optimized for analytical query operations
"""Analyze data values to determine appropriate SQL data types for columns.
def infer_data_type(values: list) -> str:
"""
Infer most suitable SQL data type for list of values.
Parameters:
- values: List of values to analyze (any types)
Returns:
str: Inferred data type name
Possible return values:
- Integer types: "int8", "int16", "int32", "int64", "int128", "int256"
- Unsigned types: "uint8", "uint16", "uint32", "uint64", "uint128", "uint256"
- Decimal types: "decimal128", "decimal256"
- Float types: "float32", "float64"
- Default: "string"
Notes:
- Returns "string" if any non-numeric values found
- Returns "string" if all values are None
- Chooses smallest integer type that fits value range
- Prefers unsigned types when all values >= 0
"""
def infer_data_types(column_data: dict, n_rows: int = 10000) -> list:
"""
Infer data types for all columns in columnar data structure.
Parameters:
- column_data: Dictionary with column names as keys, value lists as values
- n_rows: Number of rows to sample for inference (default: 10000)
Returns:
list: List of (column_name, data_type) tuples
Notes:
- Samples first n_rows for performance on large datasets
- Uses infer_data_type() for individual column analysis
- Maintains column ordering from input dictionary
"""from chdb.utils import flatten_dict
# Simple nested dictionary: three levels of nesting under "user".
nested_data = {
    "user": {
        "id": 123,
        "profile": {
            "name": "Alice",
            "settings": {
                "theme": "dark",
                "notifications": True,
            },
        },
    },
    "timestamp": 1642612345,
}

# Flatten with the default "_" separator; keys become path-like composites.
flattened = flatten_dict(nested_data)
print("Flattened dictionary:")
for key, value in flattened.items():
    print(f" {key}: {value}")
# Output:
# user_id: 123
# user_profile_name: Alice
# user_profile_settings_theme: dark
# user_profile_settings_notifications: True
# timestamp: 1642612345
# timestamp: 1642612345

from chdb.utils import flatten_dict
data = {
    "metrics": {
        "cpu": {"usage": 75.5, "cores": 4},
        "memory": {"used": 8192, "total": 16384},
    }
}

# Use dot separator instead of underscore
flattened_dots = flatten_dict(data, sep=".")
print("Dot-separated keys:")
for key, value in flattened_dots.items():
    print(f" {key}: {value}")
# Output:
# metrics.cpu.usage: 75.5
# metrics.cpu.cores: 4
# metrics.memory.used: 8192
# metrics.memory.total: 16384
# metrics.memory.total: 16384

from chdb.utils import flatten_dict
# Mixed content: a scalar/dict mixed list, a list of dicts, and a plain list.
complex_data = {
    "items": [10, 20, {"nested": "value"}],
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
    ],
    "simple_list": ["a", "b", "c"],
}

flattened = flatten_dict(complex_data)
print("Complex flattening:")
for key, value in flattened.items():
    print(f" {key}: {value}")
# Output shows:
# - Individual list items with index suffixes
# - JSON serialization for lists of dictionaries
# - Nested object expansion within lists

from chdb.utils import convert_to_columnar
# List of user records with varying fields
users = [
    {"id": 1, "name": "Alice", "email": "alice@example.com", "age": 30},
    {"id": 2, "name": "Bob", "age": 25},  # Missing email
    {"id": 3, "name": "Charlie", "email": "charlie@example.com"},  # Missing age
    {"id": 4, "name": "Diana", "email": "diana@example.com", "age": 28},
]

# Convert to columnar format; fields absent from a record become None.
columnar = convert_to_columnar(users)
print("Columnar data:")
for column, values in columnar.items():
    print(f" {column}: {values}")
# Output:
# id: [1, 2, 3, 4]
# name: ['Alice', 'Bob', 'Charlie', 'Diana']
# email: ['alice@example.com', None, 'charlie@example.com', 'diana@example.com']
# age: [30, 25, None, 28]

from chdb.utils import convert_to_columnar
# Records with nested structures
events = [
    {
        "event_id": "evt_001",
        "user": {"id": 123, "name": "Alice"},
        "metadata": {"source": "web", "version": "1.2"},
    },
    {
        "event_id": "evt_002",
        "user": {"id": 456, "name": "Bob"},
        "metadata": {"source": "mobile"},  # Missing version
    },
]

# Nested dicts are flattened before the columnar pivot.
columnar = convert_to_columnar(events)
print("Nested data converted to columnar:")
for column, values in columnar.items():
    print(f" {column}: {values}")
# Automatically flattens nested structures:
# event_id: ['evt_001', 'evt_002']
# user_id: [123, 456]
# user_name: ['Alice', 'Bob']
# metadata_source: ['web', 'mobile']
# metadata_version: ['1.2', None]

from chdb.utils import infer_data_type
# Sample inputs covering the main inference outcomes.
integer_data = [1, 2, 3, 100, -50]
float_data = [1.5, 2.7, 3.14, 100.0]
string_data = ["hello", "world", "test"]
mixed_data = [1, "hello", 3.14]
large_int_data = [2**32, 2**33, 2**34]
print("Data type inference:")
print(f"Integer data: {infer_data_type(integer_data)}") # int8 — smallest signed type fitting the range -50..100
print(f"Float data: {infer_data_type(float_data)}") # float32 or float64
print(f"String data: {infer_data_type(string_data)}") # string
print(f"Mixed data: {infer_data_type(mixed_data)}") # string — any non-numeric value forces string
print(f"Large int data: {infer_data_type(large_int_data)}") # int64, int128, etc.

from chdb.utils import infer_data_type
# Boundary-value samples that exercise the smallest-fitting-type rule.
small_positive = [1, 2, 3, 255] # uint8: all >= 0, max fits 8 unsigned bits
small_negative = [-128, -1, 0, 127] # int8: exactly the signed 8-bit range
medium_values = [1000, 2000, 32767] # int16 or uint16: max is the int16 upper bound
large_values = [2**31, 2**32] # int64 or larger: exceeds the 32-bit range
print("Integer type optimization:")
print(f"Small positive [0-255]: {infer_data_type(small_positive)}")
print(f"Small range [-128,127]: {infer_data_type(small_negative)}")
print(f"Medium values: {infer_data_type(medium_values)}")
print(f"Large values: {infer_data_type(large_values)}")

from chdb.utils import convert_to_columnar, infer_data_types
# Sample dataset
records = [
    {"id": 1, "name": "Alice", "score": 95.5, "active": True},
    {"id": 2, "name": "Bob", "score": 87.2, "active": False},
    {"id": 3, "name": "Charlie", "score": 92.0, "active": True},
]

# Convert to columnar and infer types
columnar_data = convert_to_columnar(records)
data_types = infer_data_types(columnar_data)
print("Column data types:")
for column_name, data_type in data_types:
    print(f" {column_name}: {data_type}")
# Typical output:
# id: uint8 (small non-negative integers; unsigned preferred when all values >= 0)
# name: string (text data)
# score: float32 or float64 (decimal numbers)
# active: string (boolean values treated as strings)

from chdb.utils import infer_data_types
import random

# Simulate a large dataset; name the row count once instead of repeating it.
n = 1_000_000
large_dataset = {
    "transaction_id": list(range(n)),
    "amount": [random.uniform(10.0, 1000.0) for _ in range(n)],
    "category": [random.choice(["food", "transport", "entertainment"]) for _ in range(n)],
}

# Infer types using sampling (first 5000 rows)
sampled_types = infer_data_types(large_dataset, n_rows=5000)
print("Data types from large dataset sample:")
for column, dtype in sampled_types:
    print(f" {column}: {dtype}")
# Much faster than analyzing all 1M rows

from chdb.utils import convert_to_columnar, infer_data_types
import chdb

# Prepare raw data
raw_events = [
    {"timestamp": "2024-01-01 10:00:00", "user_id": 123, "action": "login", "duration": 45.2},
    {"timestamp": "2024-01-01 10:05:00", "user_id": 456, "action": "view_page", "duration": 120.5},
    {"timestamp": "2024-01-01 10:10:00", "user_id": 123, "action": "logout", "duration": 5.1},
]

# Pivot to columns, then infer one type per column.
columnar_events = convert_to_columnar(raw_events)
event_types = infer_data_types(columnar_events)
print("Event data structure:")
for col, dtype in event_types:
    print(f" {col}: {dtype}")
# Now the data structure is understood and can be efficiently queried
# (This would typically be saved to a file format that chDB can read)

from chdb.utils import flatten_dict, convert_to_columnar, infer_data_type
# Handle edge cases
try:
# Empty data
empty_result = convert_to_columnar([])
print(f"Empty list result: {empty_result}") # Returns {}
# All None values
none_type = infer_data_type([None, None, None])
print(f"All None values: {none_type}") # Returns "string"
# Invalid dictionary
invalid_dict = "not a dictionary"
# flatten_dict will raise TypeError for non-dict input
except Exception as e:
print(f"Error handling: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-chdb