chDB is an in-process SQL OLAP Engine powered by ClickHouse
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Data type utilities for converting between formats, inferring data types, and handling nested data structures in analytical workflows. These functions support data preparation and type management for chDB operations.
Convert nested dictionary structures into flat key-value pairs for tabular data processing.
def flatten_dict(d: dict, parent_key: str = "", sep: str = "_") -> dict:
"""
Flatten nested dictionary structure with configurable key separation.
Parameters:
- d: Dictionary to flatten (may contain nested dicts and lists)
- parent_key: Base key to prepend to flattened keys
- sep: Separator for concatenating nested keys (default: "_")
Returns:
dict: Flattened dictionary with composite keys
Notes:
- Nested dictionaries become flattened with sep-separated keys
- Lists of dictionaries are serialized to JSON strings
- Mixed lists have individual items flattened with index suffixes
- Non-dict list items become indexed keys
"""Transform lists of dictionaries into columnar format for analytical processing.
def convert_to_columnar(items: list) -> dict:
"""
Convert list of dictionaries to columnar format.
Parameters:
- items: List of dictionaries to convert
Returns:
dict: Dictionary with column names as keys and lists of values
Notes:
- Missing values represented as None
- Automatically flattens nested dictionaries in input
- Handles variable dictionary schemas across list items
- Optimized for analytical query operations
"""Analyze data values to determine appropriate SQL data types for columns.
def infer_data_type(values: list) -> str:
"""
Infer most suitable SQL data type for list of values.
Parameters:
- values: List of values to analyze (any types)
Returns:
str: Inferred data type name
Possible return values:
- Integer types: "int8", "int16", "int32", "int64", "int128", "int256"
- Unsigned types: "uint8", "uint16", "uint32", "uint64", "uint128", "uint256"
- Decimal types: "decimal128", "decimal256"
- Float types: "float32", "float64"
- Default: "string"
Notes:
- Returns "string" if any non-numeric values found
- Returns "string" if all values are None
- Chooses smallest integer type that fits value range
- Prefers unsigned types when all values >= 0
"""
def infer_data_types(column_data: dict, n_rows: int = 10000) -> list:
"""
Infer data types for all columns in columnar data structure.
Parameters:
- column_data: Dictionary with column names as keys, value lists as values
- n_rows: Number of rows to sample for inference (default: 10000)
Returns:
list: List of (column_name, data_type) tuples
Notes:
- Samples first n_rows for performance on large datasets
- Uses infer_data_type() for individual column analysis
- Maintains column ordering from input dictionary
"""from chdb.utils import flatten_dict
# Simple nested dictionary: three levels of nesting under "user".
nested_data = {
    "user": {
        "id": 123,
        "profile": {
            "name": "Alice",
            "settings": {
                "theme": "dark",
                "notifications": True,
            },
        },
    },
    "timestamp": 1642612345,
}

# Flatten with the default "_" separator; keys become path-like composites.
flattened = flatten_dict(nested_data)
print("Flattened dictionary:")
for key, value in flattened.items():
    print(f" {key}: {value}")
# Output:
# user_id: 123
# user_profile_name: Alice
# user_profile_settings_theme: dark
# user_profile_settings_notifications: True
# timestamp: 1642612345
# timestamp: 1642612345

from chdb.utils import flatten_dict
data = {
    "metrics": {
        "cpu": {"usage": 75.5, "cores": 4},
        "memory": {"used": 8192, "total": 16384},
    }
}

# Use dot separator instead of underscore
flattened_dots = flatten_dict(data, sep=".")
print("Dot-separated keys:")
for key, value in flattened_dots.items():
    print(f" {key}: {value}")
# Output:
# metrics.cpu.usage: 75.5
# metrics.cpu.cores: 4
# metrics.memory.used: 8192
# metrics.memory.total: 16384
# metrics.memory.total: 16384

from chdb.utils import flatten_dict
# Mixed content: a scalar/dict mixed list, a list of dicts, and a plain list.
complex_data = {
    "items": [10, 20, {"nested": "value"}],
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"},
    ],
    "simple_list": ["a", "b", "c"],
}

flattened = flatten_dict(complex_data)
print("Complex flattening:")
for key, value in flattened.items():
    print(f" {key}: {value}")
# Output shows:
# - Individual list items with index suffixes
# - JSON serialization for lists of dictionaries
# - Nested object expansion within lists

from chdb.utils import convert_to_columnar
# List of user records with varying fields
users = [
    {"id": 1, "name": "Alice", "email": "alice@example.com", "age": 30},
    {"id": 2, "name": "Bob", "age": 25},  # Missing email
    {"id": 3, "name": "Charlie", "email": "charlie@example.com"},  # Missing age
    {"id": 4, "name": "Diana", "email": "diana@example.com", "age": 28},
]

# Convert to columnar format; fields absent from a record become None.
columnar = convert_to_columnar(users)
print("Columnar data:")
for column, values in columnar.items():
    print(f" {column}: {values}")
# Output:
# id: [1, 2, 3, 4]
# name: ['Alice', 'Bob', 'Charlie', 'Diana']
# email: ['alice@example.com', None, 'charlie@example.com', 'diana@example.com']
# age: [30, 25, None, 28]

from chdb.utils import convert_to_columnar
# Records with nested structures
events = [
    {
        "event_id": "evt_001",
        "user": {"id": 123, "name": "Alice"},
        "metadata": {"source": "web", "version": "1.2"},
    },
    {
        "event_id": "evt_002",
        "user": {"id": 456, "name": "Bob"},
        "metadata": {"source": "mobile"},  # Missing version
    },
]

# Nested dicts are flattened before the columnar pivot.
columnar = convert_to_columnar(events)
print("Nested data converted to columnar:")
for column, values in columnar.items():
    print(f" {column}: {values}")
# Automatically flattens nested structures:
# event_id: ['evt_001', 'evt_002']
# user_id: [123, 456]
# user_name: ['Alice', 'Bob']
# metadata_source: ['web', 'mobile']
# metadata_version: ['1.2', None]

from chdb.utils import infer_data_type
# Sample inputs covering the main inference outcomes.
integer_data = [1, 2, 3, 100, -50]
float_data = [1.5, 2.7, 3.14, 100.0]
string_data = ["hello", "world", "test"]
mixed_data = [1, "hello", 3.14]
large_int_data = [2**32, 2**33, 2**34]
print("Data type inference:")
print(f"Integer data: {infer_data_type(integer_data)}") # int8 — smallest signed type fitting the range -50..100
print(f"Float data: {infer_data_type(float_data)}") # float32 or float64
print(f"String data: {infer_data_type(string_data)}") # string
print(f"Mixed data: {infer_data_type(mixed_data)}") # string — any non-numeric value forces string
print(f"Large int data: {infer_data_type(large_int_data)}") # int64, int128, etc.

from chdb.utils import infer_data_type
# Boundary-value samples that exercise the smallest-fitting-type rule.
small_positive = [1, 2, 3, 255] # uint8: all >= 0, max fits 8 unsigned bits
small_negative = [-128, -1, 0, 127] # int8: exactly the signed 8-bit range
medium_values = [1000, 2000, 32767] # int16 or uint16: max is the int16 upper bound
large_values = [2**31, 2**32] # int64 or larger: exceeds the 32-bit range
print("Integer type optimization:")
print(f"Small positive [0-255]: {infer_data_type(small_positive)}")
print(f"Small range [-128,127]: {infer_data_type(small_negative)}")
print(f"Medium values: {infer_data_type(medium_values)}")
print(f"Large values: {infer_data_type(large_values)}")

from chdb.utils import convert_to_columnar, infer_data_types
# Sample dataset
records = [
    {"id": 1, "name": "Alice", "score": 95.5, "active": True},
    {"id": 2, "name": "Bob", "score": 87.2, "active": False},
    {"id": 3, "name": "Charlie", "score": 92.0, "active": True},
]

# Convert to columnar and infer types
columnar_data = convert_to_columnar(records)
data_types = infer_data_types(columnar_data)
print("Column data types:")
for column_name, data_type in data_types:
    print(f" {column_name}: {data_type}")
# Typical output:
# id: uint8 (small non-negative integers; unsigned preferred when all values >= 0)
# name: string (text data)
# score: float32 or float64 (decimal numbers)
# active: string (boolean values treated as strings)

from chdb.utils import infer_data_types
import random

# Simulate a large dataset; name the row count once instead of repeating it.
n = 1_000_000
large_dataset = {
    "transaction_id": list(range(n)),
    "amount": [random.uniform(10.0, 1000.0) for _ in range(n)],
    "category": [random.choice(["food", "transport", "entertainment"]) for _ in range(n)],
}

# Infer types using sampling (first 5000 rows)
sampled_types = infer_data_types(large_dataset, n_rows=5000)
print("Data types from large dataset sample:")
for column, dtype in sampled_types:
    print(f" {column}: {dtype}")
# Much faster than analyzing all 1M rows

from chdb.utils import convert_to_columnar, infer_data_types
import chdb

# Prepare raw data
raw_events = [
    {"timestamp": "2024-01-01 10:00:00", "user_id": 123, "action": "login", "duration": 45.2},
    {"timestamp": "2024-01-01 10:05:00", "user_id": 456, "action": "view_page", "duration": 120.5},
    {"timestamp": "2024-01-01 10:10:00", "user_id": 123, "action": "logout", "duration": 5.1},
]

# Pivot to columns, then infer one type per column.
columnar_events = convert_to_columnar(raw_events)
event_types = infer_data_types(columnar_events)
print("Event data structure:")
for col, dtype in event_types:
    print(f" {col}: {dtype}")
# Now the data structure is understood and can be efficiently queried
# (This would typically be saved to a file format that chDB can read)

from chdb.utils import flatten_dict, convert_to_columnar, infer_data_type
# Handle edge cases
try:
# Empty data
empty_result = convert_to_columnar([])
print(f"Empty list result: {empty_result}") # Returns {}
# All None values
none_type = infer_data_type([None, None, None])
print(f"All None values: {none_type}") # Returns "string"
# Invalid dictionary
invalid_dict = "not a dictionary"
# flatten_dict will raise TypeError for non-dict input
except Exception as e:
print(f"Error handling: {e}")Install with Tessl CLI
npx tessl i tessl/pypi-chdb