or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

dataframe.md · dbapi.md · index.md · query-functions.md · sessions.md · udf.md · utils.md

docs/utils.md

0

# Utility Functions

1

2

Data type utilities for converting between formats, inferring data types, and handling nested data structures in analytical workflows. These functions support data preparation and type management for chDB operations.

3

4

## Capabilities

5

6

### Dictionary Flattening

7

8

Convert nested dictionary structures into flat key-value pairs for tabular data processing.

9

10

```python { .api }

11

def flatten_dict(d: dict, parent_key: str = "", sep: str = "_") -> dict:

12

"""

13

Flatten nested dictionary structure with configurable key separation.

14

15

Parameters:

16

- d: Dictionary to flatten (may contain nested dicts and lists)

17

- parent_key: Base key to prepend to flattened keys

18

- sep: Separator for concatenating nested keys (default: "_")

19

20

Returns:

21

dict: Flattened dictionary with composite keys

22

23

Notes:

24

- Nested dictionaries become flattened with sep-separated keys

25

- Lists of dictionaries are serialized to JSON strings

26

- Mixed lists have individual items flattened with index suffixes

27

- Non-dict list items become indexed keys

28

"""

29

```

30

31

### Columnar Data Conversion

32

33

Transform lists of dictionaries into columnar format for analytical processing.

34

35

```python { .api }

36

def convert_to_columnar(items: list) -> dict:

37

"""

38

Convert list of dictionaries to columnar format.

39

40

Parameters:

41

- items: List of dictionaries to convert

42

43

Returns:

44

dict: Dictionary with column names as keys and lists of values

45

46

Notes:

47

- Missing values represented as None

48

- Automatically flattens nested dictionaries in input

49

- Handles variable dictionary schemas across list items

50

- Optimized for analytical query operations

51

"""

52

```

53

54

### Data Type Inference

55

56

Analyze data values to determine appropriate SQL data types for columns.

57

58

```python { .api }

59

def infer_data_type(values: list) -> str:

60

"""

61

Infer most suitable SQL data type for list of values.

62

63

Parameters:

64

- values: List of values to analyze (any types)

65

66

Returns:

67

str: Inferred data type name

68

69

Possible return values:

70

- Integer types: "int8", "int16", "int32", "int64", "int128", "int256"

71

- Unsigned types: "uint8", "uint16", "uint32", "uint64", "uint128", "uint256"

72

- Decimal types: "decimal128", "decimal256"

73

- Float types: "float32", "float64"

74

- Default: "string"

75

76

Notes:

77

- Returns "string" if any non-numeric values found

78

- Returns "string" if all values are None

79

- Chooses smallest integer type that fits value range

80

- Prefers unsigned types when all values >= 0

81

"""

82

83

def infer_data_types(column_data: dict, n_rows: int = 10000) -> list:

84

"""

85

Infer data types for all columns in columnar data structure.

86

87

Parameters:

88

- column_data: Dictionary with column names as keys, value lists as values

89

- n_rows: Number of rows to sample for inference (default: 10000)

90

91

Returns:

92

list: List of (column_name, data_type) tuples

93

94

Notes:

95

- Samples first n_rows for performance on large datasets

96

- Uses infer_data_type() for individual column analysis

97

- Maintains column ordering from input dictionary

98

"""

99

```

100

101

## Usage Examples

102

103

### Dictionary Flattening

104

105

```python

106

from chdb.utils import flatten_dict

107

108

# Simple nested dictionary

109

nested_data = {
    "user": {
        "id": 123,
        "profile": {
            "name": "Alice",
            "settings": {
                "theme": "dark",
                "notifications": True
            }
        }
    },
    "timestamp": 1642612345
}

122

123

flattened = flatten_dict(nested_data)

124

print("Flattened dictionary:")

125

for key, value in flattened.items():
    print(f" {key}: {value}")

127

128

# Output:

129

# user_id: 123

130

# user_profile_name: Alice

131

# user_profile_settings_theme: dark

132

# user_profile_settings_notifications: True

133

# timestamp: 1642612345

134

```

135

136

### Custom Separator for Flattening

137

138

```python

139

from chdb.utils import flatten_dict

140

141

data = {
    "metrics": {
        "cpu": {"usage": 75.5, "cores": 4},
        "memory": {"used": 8192, "total": 16384}
    }
}

147

148

# Use dot separator instead of underscore

149

flattened_dots = flatten_dict(data, sep=".")

150

print("Dot-separated keys:")

151

for key, value in flattened_dots.items():
    print(f" {key}: {value}")

153

154

# Output:

155

# metrics.cpu.usage: 75.5

156

# metrics.cpu.cores: 4

157

# metrics.memory.used: 8192

158

# metrics.memory.total: 16384

159

```

160

161

### Handling Lists in Flattening

162

163

```python

164

from chdb.utils import flatten_dict

165

166

complex_data = {
    "items": [10, 20, {"nested": "value"}],
    "users": [
        {"id": 1, "name": "Alice"},
        {"id": 2, "name": "Bob"}
    ],
    "simple_list": ["a", "b", "c"]
}

174

175

flattened = flatten_dict(complex_data)

176

print("Complex flattening:")

177

for key, value in flattened.items():
    print(f" {key}: {value}")

179

180

# Output shows:

181

# - Individual list items with index suffixes

182

# - JSON serialization for lists of dictionaries

183

# - Nested object expansion within lists

184

```

185

186

### Columnar Data Conversion

187

188

```python

189

from chdb.utils import convert_to_columnar

190

191

# List of user records with varying fields

192

users = [
    {"id": 1, "name": "Alice", "email": "alice@example.com", "age": 30},
    {"id": 2, "name": "Bob", "age": 25},  # Missing email
    {"id": 3, "name": "Charlie", "email": "charlie@example.com"},  # Missing age
    {"id": 4, "name": "Diana", "email": "diana@example.com", "age": 28}
]

198

199

# Convert to columnar format

200

columnar = convert_to_columnar(users)

201

202

print("Columnar data:")

203

for column, values in columnar.items():
    print(f" {column}: {values}")

205

206

# Output:

207

# id: [1, 2, 3, 4]

208

# name: ['Alice', 'Bob', 'Charlie', 'Diana']

209

# email: ['alice@example.com', None, 'charlie@example.com', 'diana@example.com']

210

# age: [30, 25, None, 28]

211

```

212

213

### Working with Nested Data in Columnar Conversion

214

215

```python

216

from chdb.utils import convert_to_columnar

217

218

# Records with nested structures

219

events = [
    {
        "event_id": "evt_001",
        "user": {"id": 123, "name": "Alice"},
        "metadata": {"source": "web", "version": "1.2"}
    },
    {
        "event_id": "evt_002",
        "user": {"id": 456, "name": "Bob"},
        "metadata": {"source": "mobile"}  # Missing version
    }
]

231

232

columnar = convert_to_columnar(events)

233

234

print("Nested data converted to columnar:")

235

for column, values in columnar.items():
    print(f" {column}: {values}")

237

238

# Automatically flattens nested structures:

239

# event_id: ['evt_001', 'evt_002']

240

# user_id: [123, 456]

241

# user_name: ['Alice', 'Bob']

242

# metadata_source: ['web', 'mobile']

243

# metadata_version: ['1.2', None]

244

```

245

246

### Data Type Inference

247

248

```python

249

from chdb.utils import infer_data_type

250

251

# Test different data types

252

integer_data = [1, 2, 3, 100, -50]

253

float_data = [1.5, 2.7, 3.14, 100.0]

254

string_data = ["hello", "world", "test"]

255

mixed_data = [1, "hello", 3.14]

256

large_int_data = [2**32, 2**33, 2**34]

257

258

print("Data type inference:")

259

print(f"Integer data: {infer_data_type(integer_data)}")  # int8 (smallest signed type fitting -50..100)

260

print(f"Float data: {infer_data_type(float_data)}") # float32 or float64

261

print(f"String data: {infer_data_type(string_data)}") # string

262

print(f"Mixed data: {infer_data_type(mixed_data)}") # string

263

print(f"Large int data: {infer_data_type(large_int_data)}")  # uint64 (non-negative values up to 2**34)

264

```

265

266

### Integer Range Detection

267

268

```python

269

from chdb.utils import infer_data_type

270

271

# Test integer ranges for optimal type selection

272

small_positive = [1, 2, 3, 255] # Should be uint8

273

small_negative = [-128, -1, 0, 127] # Should be int8

274

medium_values = [1000, 2000, 32767] # Should be int16 or uint16

275

large_values = [2**31, 2**32]  # Should be uint64 (2**32 exceeds the uint32 range)

276

277

print("Integer type optimization:")

278

print(f"Small positive [0-255]: {infer_data_type(small_positive)}")

279

print(f"Small range [-128,127]: {infer_data_type(small_negative)}")

280

print(f"Medium values: {infer_data_type(medium_values)}")

281

print(f"Large values: {infer_data_type(large_values)}")

282

```

283

284

### Columnar Data Type Inference

285

286

```python

287

from chdb.utils import convert_to_columnar, infer_data_types

288

289

# Sample dataset

290

records = [
    {"id": 1, "name": "Alice", "score": 95.5, "active": True},
    {"id": 2, "name": "Bob", "score": 87.2, "active": False},
    {"id": 3, "name": "Charlie", "score": 92.0, "active": True},
]

295

296

# Convert to columnar and infer types

297

columnar_data = convert_to_columnar(records)

298

data_types = infer_data_types(columnar_data)

299

300

print("Column data types:")

301

for column_name, data_type in data_types:
    print(f" {column_name}: {data_type}")

303

304

# Typical output:

305

# id: uint8 (small positive integers; unsigned preferred when all values >= 0)

306

# name: string (text data)

307

# score: float32 or float64 (decimal numbers)

308

# active: string (boolean values treated as strings)

309

```

310

311

### Processing Large Datasets with Sampling

312

313

```python

314

from chdb.utils import infer_data_types

315

import random

316

317

# Simulate large dataset

318

large_dataset = {
    "transaction_id": list(range(1000000)),
    "amount": [random.uniform(10.0, 1000.0) for _ in range(1000000)],
    "category": [random.choice(["food", "transport", "entertainment"]) for _ in range(1000000)]
}

323

324

# Infer types using sampling (first 5000 rows)

325

sampled_types = infer_data_types(large_dataset, n_rows=5000)

326

327

print("Data types from large dataset sample:")

328

for column, dtype in sampled_types:
    print(f" {column}: {dtype}")

330

331

# Much faster than analyzing all 1M rows

332

```

333

334

### Integration with chDB Queries

335

336

```python

337

from chdb.utils import convert_to_columnar, infer_data_types

338

import chdb

339

340

# Prepare raw data

341

raw_events = [
    {"timestamp": "2024-01-01 10:00:00", "user_id": 123, "action": "login", "duration": 45.2},
    {"timestamp": "2024-01-01 10:05:00", "user_id": 456, "action": "view_page", "duration": 120.5},
    {"timestamp": "2024-01-01 10:10:00", "user_id": 123, "action": "logout", "duration": 5.1}
]

346

347

# Convert and analyze

348

columnar_events = convert_to_columnar(raw_events)

349

event_types = infer_data_types(columnar_events)

350

351

print("Event data structure:")

352

for col, dtype in event_types:

353

print(f" {col}: {dtype}")

354

355

# Now the data structure is understood and can be efficiently queried

356

# (This would typically be saved to a file format that chDB can read)

357

```

358

359

### Error Handling

360

361

```python

362

from chdb.utils import flatten_dict, convert_to_columnar, infer_data_type

363

364

# Handle edge cases

365

try:
    # Empty data
    empty_result = convert_to_columnar([])
    print(f"Empty list result: {empty_result}")  # Returns {}

    # All None values
    none_type = infer_data_type([None, None, None])
    print(f"All None values: {none_type}")  # Returns "string"

    # Invalid dictionary
    invalid_dict = "not a dictionary"
    # flatten_dict will raise TypeError for non-dict input

except Exception as e:
    print(f"Error handling: {e}")

380

```