# Utility Functions

Data type utilities for converting between formats, inferring data types, and handling nested data structures in analytical workflows. These functions support data preparation and type management for chDB operations.

## Capabilities

### Dictionary Flattening

Convert nested dictionary structures into flat key-value pairs for tabular data processing.

```python { .api }
def flatten_dict(d: dict, parent_key: str = "", sep: str = "_") -> dict:
    """
    Flatten a nested dictionary structure with configurable key separation.

    Parameters:
    - d: Dictionary to flatten (may contain nested dicts and lists)
    - parent_key: Base key to prepend to flattened keys
    - sep: Separator for concatenating nested keys (default: "_")

    Returns:
    dict: Flattened dictionary with composite keys

    Notes:
    - Nested dictionaries are flattened with sep-separated composite keys
    - Lists of dictionaries are serialized to JSON strings
    - Mixed lists have individual items flattened with index suffixes
    - Non-dict list items become indexed keys
    """
```

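The notes above describe the flattening rules precisely enough to sketch them in plain Python. The following is an illustrative re-implementation of the described behavior, not chdb's actual source (`flatten_dict_sketch` is a hypothetical name):

```python
import json

def flatten_dict_sketch(d: dict, parent_key: str = "", sep: str = "_") -> dict:
    """Illustrative sketch of the flattening rules described above."""
    items = {}
    for key, value in d.items():
        new_key = f"{parent_key}{sep}{key}" if parent_key else key
        if isinstance(value, dict):
            # Nested dictionaries: recurse with the composite key
            items.update(flatten_dict_sketch(value, new_key, sep))
        elif isinstance(value, list) and value and all(isinstance(v, dict) for v in value):
            # Lists of dictionaries: serialize to a JSON string
            items[new_key] = json.dumps(value)
        elif isinstance(value, list):
            # Mixed lists: index suffix per item, flattening any dict items
            for i, v in enumerate(value):
                if isinstance(v, dict):
                    items.update(flatten_dict_sketch(v, f"{new_key}{sep}{i}", sep))
                else:
                    items[f"{new_key}{sep}{i}"] = v
        else:
            items[new_key] = value
    return items

print(flatten_dict_sketch({"a": {"b": 1}, "c": [1, 2]}))
# {'a_b': 1, 'c_0': 1, 'c_1': 2}
```
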
### Columnar Data Conversion

Transform lists of dictionaries into columnar format for analytical processing.

```python { .api }
def convert_to_columnar(items: list) -> dict:
    """
    Convert a list of dictionaries to columnar format.

    Parameters:
    - items: List of dictionaries to convert

    Returns:
    dict: Dictionary with column names as keys and lists of values

    Notes:
    - Missing values are represented as None
    - Automatically flattens nested dictionaries in input
    - Handles variable dictionary schemas across list items
    - Optimized for analytical query operations
    """
```

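The row-to-column pivot described in the notes can be sketched as below. `convert_to_columnar_sketch` is an illustrative stand-in, not chdb's implementation, and assumes rows have already been flattened:

```python
def convert_to_columnar_sketch(items: list) -> dict:
    """Pivot a list of flat dicts into column -> values, padding missing fields with None."""
    columns: dict = {}
    for row_index, row in enumerate(items):
        for key, value in row.items():
            if key not in columns:
                # New column: backfill earlier rows with None
                columns[key] = [None] * row_index
            columns[key].append(value)
        # Pad columns that were absent from this row
        for key in columns:
            if len(columns[key]) <= row_index:
                columns[key].append(None)
    return columns

rows = [{"id": 1, "name": "Alice"}, {"id": 2}]
print(convert_to_columnar_sketch(rows))
# {'id': [1, 2], 'name': ['Alice', None]}
```
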
### Data Type Inference

Analyze data values to determine appropriate SQL data types for columns.

```python { .api }
def infer_data_type(values: list) -> str:
    """
    Infer the most suitable SQL data type for a list of values.

    Parameters:
    - values: List of values to analyze (any types)

    Returns:
    str: Inferred data type name

    Possible return values:
    - Integer types: "int8", "int16", "int32", "int64", "int128", "int256"
    - Unsigned types: "uint8", "uint16", "uint32", "uint64", "uint128", "uint256"
    - Decimal types: "decimal128", "decimal256"
    - Float types: "float32", "float64"
    - Default: "string"

    Notes:
    - Returns "string" if any non-numeric values are found
    - Returns "string" if all values are None
    - Chooses the smallest integer type that fits the value range
    - Prefers unsigned types when all values >= 0
    """

def infer_data_types(column_data: dict, n_rows: int = 10000) -> list:
    """
    Infer data types for all columns in a columnar data structure.

    Parameters:
    - column_data: Dictionary with column names as keys, value lists as values
    - n_rows: Number of rows to sample for inference (default: 10000)

    Returns:
    list: List of (column_name, data_type) tuples

    Notes:
    - Samples the first n_rows for performance on large datasets
    - Uses infer_data_type() for individual column analysis
    - Maintains column ordering from the input dictionary
    """
```

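The "smallest fitting type" and "prefer unsigned" rules amount to a range check over the widths listed above. A minimal sketch under those stated rules (illustrative only; `pick_int_type_sketch` is not part of chdb, and the real function also handles floats, decimals, and non-numeric fallbacks):

```python
def pick_int_type_sketch(values: list) -> str:
    """Pick the narrowest integer type covering min/max; unsigned preferred for non-negative data."""
    lo, hi = min(values), max(values)
    if lo >= 0:
        for bits in (8, 16, 32, 64, 128, 256):
            if hi < 2 ** bits:
                return f"uint{bits}"
    else:
        for bits in (8, 16, 32, 64, 128, 256):
            if -(2 ** (bits - 1)) <= lo and hi < 2 ** (bits - 1):
                return f"int{bits}"
    return "string"  # out of range of every listed integer type

print(pick_int_type_sketch([1, 2, 255]))   # uint8
print(pick_int_type_sketch([-128, 127]))   # int8
print(pick_int_type_sketch([2 ** 32]))     # uint64
```
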
## Usage Examples

### Dictionary Flattening

```python
from chdb.utils import flatten_dict

# Simple nested dictionary
nested_data = {
    "user": {
        "id": 123,
        "profile": {
            "name": "Alice",
            "settings": {
                "theme": "dark",
                "notifications": True
            }
        }
    },
    "timestamp": 1642612345
}

flattened = flatten_dict(nested_data)
print("Flattened dictionary:")
for key, value in flattened.items():
    print(f" {key}: {value}")

# Output:
# user_id: 123
# user_profile_name: Alice
# user_profile_settings_theme: dark
# user_profile_settings_notifications: True
# timestamp: 1642612345
```

### Custom Separator for Flattening

```python
from chdb.utils import flatten_dict

data = {
    "metrics": {
        "cpu": {"usage": 75.5, "cores": 4},
        "memory": {"used": 8192, "total": 16384}
    }
}

# Use dot separator instead of underscore
flattened_dots = flatten_dict(data, sep=".")
print("Dot-separated keys:")
for key, value in flattened_dots.items():
    print(f" {key}: {value}")

# Output:
# metrics.cpu.usage: 75.5
# metrics.cpu.cores: 4
# metrics.memory.used: 8192
# metrics.memory.total: 16384
```

### Handling Lists in Flattening

```python
from chdb.utils import flatten_dict

complex_data = {
    "items": [10, 20, {"nested": "value"}],
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"}
    ],
    "simple_list": ["a", "b", "c"]
}

flattened = flatten_dict(complex_data)
print("Complex flattening:")
for key, value in flattened.items():
    print(f" {key}: {value}")

# Output shows:
# - Individual list items with index suffixes
# - JSON serialization for lists of dictionaries
# - Nested object expansion within lists
```

### Columnar Data Conversion

```python
from chdb.utils import convert_to_columnar

# List of user records with varying fields
users = [
    {"id": 1, "name": "Alice", "email": "alice@example.com", "age": 30},
    {"id": 2, "name": "Bob", "age": 25},  # Missing email
    {"id": 3, "name": "Charlie", "email": "charlie@example.com"},  # Missing age
    {"id": 4, "name": "Diana", "email": "diana@example.com", "age": 28}
]

# Convert to columnar format
columnar = convert_to_columnar(users)

print("Columnar data:")
for column, values in columnar.items():
    print(f" {column}: {values}")

# Output:
# id: [1, 2, 3, 4]
# name: ['Alice', 'Bob', 'Charlie', 'Diana']
# email: ['alice@example.com', None, 'charlie@example.com', 'diana@example.com']
# age: [30, 25, None, 28]
```

### Working with Nested Data in Columnar Conversion

```python
from chdb.utils import convert_to_columnar

# Records with nested structures
events = [
    {
        "event_id": "evt_001",
        "user": {"id": 123, "name": "Alice"},
        "metadata": {"source": "web", "version": "1.2"}
    },
    {
        "event_id": "evt_002",
        "user": {"id": 456, "name": "Bob"},
        "metadata": {"source": "mobile"}  # Missing version
    }
]

columnar = convert_to_columnar(events)

print("Nested data converted to columnar:")
for column, values in columnar.items():
    print(f" {column}: {values}")

# Automatically flattens nested structures:
# event_id: ['evt_001', 'evt_002']
# user_id: [123, 456]
# user_name: ['Alice', 'Bob']
# metadata_source: ['web', 'mobile']
# metadata_version: ['1.2', None]
```

### Data Type Inference

```python
from chdb.utils import infer_data_type

# Test different data types
integer_data = [1, 2, 3, 100, -50]
float_data = [1.5, 2.7, 3.14, 100.0]
string_data = ["hello", "world", "test"]
mixed_data = [1, "hello", 3.14]
large_int_data = [2**32, 2**33, 2**34]

print("Data type inference:")
print(f"Integer data: {infer_data_type(integer_data)}")      # int8 (smallest signed type that fits)
print(f"Float data: {infer_data_type(float_data)}")          # float32 or float64
print(f"String data: {infer_data_type(string_data)}")        # string
print(f"Mixed data: {infer_data_type(mixed_data)}")          # string
print(f"Large int data: {infer_data_type(large_int_data)}")  # uint64 (values exceed the 32-bit range)
```

### Integer Range Detection

```python
from chdb.utils import infer_data_type

# Test integer ranges for optimal type selection
small_positive = [1, 2, 3, 255]       # Should be uint8
small_negative = [-128, -1, 0, 127]   # Should be int8
medium_values = [1000, 2000, 32767]   # Should be uint16 (non-negative) or int16
large_values = [2**31, 2**32]         # Should be uint64 (doesn't fit in 32 bits)

print("Integer type optimization:")
print(f"Small positive [0-255]: {infer_data_type(small_positive)}")
print(f"Small range [-128,127]: {infer_data_type(small_negative)}")
print(f"Medium values: {infer_data_type(medium_values)}")
print(f"Large values: {infer_data_type(large_values)}")
```

### Columnar Data Type Inference

```python
from chdb.utils import convert_to_columnar, infer_data_types

# Sample dataset
records = [
    {"id": 1, "name": "Alice", "score": 95.5, "active": True},
    {"id": 2, "name": "Bob", "score": 87.2, "active": False},
    {"id": 3, "name": "Charlie", "score": 92.0, "active": True},
]

# Convert to columnar and infer types
columnar_data = convert_to_columnar(records)
data_types = infer_data_types(columnar_data)

print("Column data types:")
for column_name, data_type in data_types:
    print(f" {column_name}: {data_type}")

# Typical output:
# id: uint8 (small non-negative integers)
# name: string (text data)
# score: float32 or float64 (decimal numbers)
# active: string (boolean values treated as strings)
```

### Processing Large Datasets with Sampling

```python
from chdb.utils import infer_data_types
import random

# Simulate a large dataset
large_dataset = {
    "transaction_id": list(range(1000000)),
    "amount": [random.uniform(10.0, 1000.0) for _ in range(1000000)],
    "category": [random.choice(["food", "transport", "entertainment"]) for _ in range(1000000)]
}

# Infer types using sampling (first 5000 rows)
sampled_types = infer_data_types(large_dataset, n_rows=5000)

print("Data types from large dataset sample:")
for column, dtype in sampled_types:
    print(f" {column}: {dtype}")

# Much faster than analyzing all 1M rows
```

### Integration with chDB Queries

```python
from chdb.utils import convert_to_columnar, infer_data_types
import chdb

# Prepare raw data
raw_events = [
    {"timestamp": "2024-01-01 10:00:00", "user_id": 123, "action": "login", "duration": 45.2},
    {"timestamp": "2024-01-01 10:05:00", "user_id": 456, "action": "view_page", "duration": 120.5},
    {"timestamp": "2024-01-01 10:10:00", "user_id": 123, "action": "logout", "duration": 5.1}
]

# Convert and analyze
columnar_events = convert_to_columnar(raw_events)
event_types = infer_data_types(columnar_events)

print("Event data structure:")
for col, dtype in event_types:
    print(f" {col}: {dtype}")

# Now the data structure is understood and can be efficiently queried
# (This would typically be saved to a file format that chDB can read)
```

### Error Handling

```python
from chdb.utils import flatten_dict, convert_to_columnar, infer_data_type

# Handle edge cases
try:
    # Empty data
    empty_result = convert_to_columnar([])
    print(f"Empty list result: {empty_result}")  # Returns {}

    # All None values
    none_type = infer_data_type([None, None, None])
    print(f"All None values: {none_type}")  # Returns "string"

    # Invalid dictionary
    invalid_dict = "not a dictionary"
    # flatten_dict will raise TypeError for non-dict input

except Exception as e:
    print(f"Error handling: {e}")
```