
# Schema and Types

Tools for working with parquet schemas, type conversions, and metadata management to ensure proper data representation and compatibility between pandas and parquet formats.

## Capabilities

### Schema Management

#### SchemaHelper Class

Utility class for working with parquet schema structures and navigating complex nested schemas.

```python { .api }
class SchemaHelper:
    """
    Helper class for parquet schema navigation and analysis.

    Provides methods to understand and work with parquet schema structures,
    including nested types and logical type conversions.
    """

    def __init__(self, schema_elements):
        """
        Initialize SchemaHelper with parquet schema elements.

        Parameters:
        - schema_elements: list, parquet schema element list
        """

    def schema_element(self, name):
        """
        Get schema element by name or path.

        Parameters:
        - name: str or list, column name or path in schema

        Returns:
        SchemaElement: The schema element for the specified path
        """

    def is_required(self, name):
        """
        Check if field is required (not optional).

        Parameters:
        - name: str, column name to check

        Returns:
        bool: True if field is required, False if optional
        """

    def max_repetition_level(self, parts):
        """
        Calculate maximum repetition level for column path.

        Parameters:
        - parts: list, column path components

        Returns:
        int: Maximum repetition level
        """

    def max_definition_level(self, parts):
        """
        Calculate maximum definition level for column path.

        Parameters:
        - parts: list, column path components

        Returns:
        int: Maximum definition level
        """

    @property
    def text(self):
        """Human-readable schema representation."""
```
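As a quick illustration of these methods, the sketch below walks a file's columns and queries the helper for requiredness and definition levels. It assumes `ParquetFile.schema` exposes the `SchemaHelper` described above, and `'data.parquet'` is a placeholder path.

```python
from fastparquet import ParquetFile

# Sketch: assumes ParquetFile.schema is a SchemaHelper instance
pf = ParquetFile('data.parquet')
helper = pf.schema

for col in pf.columns:
    print(col,
          'required' if helper.is_required(col) else 'optional',
          'max_def_level:', helper.max_definition_level([col]))
```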

#### Schema Structure Functions

Functions for building and analyzing parquet schema structures.

```python { .api }
def schema_tree(schema, i=0):
    """
    Build tree structure from flat schema list.

    Parameters:
    - schema: list, flat list of schema elements
    - i: int, starting index in schema list

    Returns:
    int: Final index after processing tree
    """

def schema_to_text(root, indent=[]):
    """
    Convert schema to human-readable text representation.

    Parameters:
    - root: SchemaElement, root schema element
    - indent: list, indentation tracking for nested elements

    Returns:
    str: Human-readable schema representation
    """

def flatten(schema_helper, path=None):
    """
    Flatten nested schema into list of column paths.

    Parameters:
    - schema_helper: SchemaHelper, schema navigation helper
    - path: list, current path for recursion

    Returns:
    list: List of flattened column paths
    """

def _is_list_like(helper, name):
    """
    Check if column represents a list-like structure.

    Parameters:
    - helper: SchemaHelper, schema navigation helper
    - name: str, column name to check

    Returns:
    bool: True if column is list-like, False otherwise
    """

def _is_map_like(helper, name):
    """
    Check if column represents a map-like structure.

    Parameters:
    - helper: SchemaHelper, schema navigation helper
    - name: str, column name to check

    Returns:
    bool: True if column is map-like, False otherwise
    """
```
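These helpers presumably back the `SchemaHelper.text` property shown earlier; since their exact module path is not pinned down above, the safer route is to render the tree through the property rather than calling `schema_to_text` directly. A minimal sketch (with `'data.parquet'` as a placeholder path):

```python
from fastparquet import ParquetFile

pf = ParquetFile('data.parquet')

# Renders the flat schema list as an indented, human-readable tree,
# the same view that schema_to_text() produces from the root element
print(pf.schema.text)
```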

### Type Conversion System

#### Pandas to Parquet Conversion

Convert pandas data types to appropriate parquet representations.

```python { .api }
def find_type(data, fixed_text=None, object_encoding=None,
              times='int64', is_index=None):
    """
    Determine appropriate parquet type codes for pandas Series.

    Parameters:
    - data: pandas.Series, input data to analyze
    - fixed_text: int, fixed-length string size for string/bytes columns
    - object_encoding: str, encoding method for object dtype columns
    - times: str, timestamp encoding format ('int64' or 'int96')
    - is_index: bool, whether data represents DataFrame index

    Returns:
    tuple: (SchemaElement, parquet_type_code)
    """

def convert(data, se):
    """
    Convert pandas data according to schema element specification.

    Parameters:
    - data: pandas.Series, input data to convert
    - se: SchemaElement, target parquet schema element

    Returns:
    numpy.ndarray: Converted data ready for parquet encoding
    """
```

#### Object Encoding Functions

Handle encoding of Python object types to parquet-compatible formats.

```python { .api }
def infer_object_encoding(data):
    """
    Automatically infer appropriate encoding for object dtype column.

    Parameters:
    - data: pandas.Series, object dtype data to analyze

    Returns:
    str: Inferred encoding type ('utf8', 'bytes', 'json', 'bool', etc.)
    """
```
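A short sketch of the inference behaviour, assuming the function is importable from `fastparquet.writer` alongside `find_type`; the expected outputs in the comments are illustrative, not guaranteed across versions:

```python
import pandas as pd
from fastparquet.writer import infer_object_encoding  # import path assumed

print(infer_object_encoding(pd.Series(['a', 'b'])))    # expect 'utf8' for text
print(infer_object_encoding(pd.Series([b'a', b'b'])))  # expect 'bytes' for binary
print(infer_object_encoding(pd.Series([{'k': 1}])))    # expect 'json' for dicts/lists
```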

### Parquet to Pandas Conversion

#### Type Mapping and Conversion

Convert parquet data back to appropriate pandas types.

```python { .api }
def convert(data, schema_element, metadata=None):
    """
    Convert raw parquet data to appropriate pandas types.

    Parameters:
    - data: numpy.ndarray, raw parquet data
    - schema_element: SchemaElement, parquet schema information
    - metadata: dict, additional pandas metadata for conversion

    Returns:
    numpy.ndarray: Converted data suitable for pandas
    """

# Type mapping constants
simple = {
    # Mapping from parquet primitive types to numpy dtypes
    'INT32': 'int32',
    'INT64': 'int64',
    'FLOAT': 'float32',
    'DOUBLE': 'float64',
    'BOOLEAN': 'bool',
    'BYTE_ARRAY': 'object',
    'FIXED_LEN_BYTE_ARRAY': 'object'
}

complex = {
    # Mapping from parquet logical types to numpy dtypes
    'UTF8': 'object',
    'JSON': 'object',
    'BSON': 'object',
    'DECIMAL': 'float64',
    'TIMESTAMP_MILLIS': 'datetime64[ms]',
    'TIMESTAMP_MICROS': 'datetime64[us]',
    'TIME_MICROS': 'timedelta64[us]'
}
```
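To make the lookup order concrete, here is a self-contained sketch of how a reader can combine the two tables: the logical (converted) type takes precedence, and the primitive type is the fallback. The tables are inlined, and `complex` renamed to avoid shadowing the builtin, purely for illustration.

```python
import numpy as np

simple_types = {'INT32': 'int32', 'INT64': 'int64', 'FLOAT': 'float32',
                'DOUBLE': 'float64', 'BOOLEAN': 'bool',
                'BYTE_ARRAY': 'object', 'FIXED_LEN_BYTE_ARRAY': 'object'}
logical_types = {'UTF8': 'object', 'JSON': 'object', 'BSON': 'object',
                 'DECIMAL': 'float64',
                 'TIMESTAMP_MILLIS': 'datetime64[ms]',
                 'TIMESTAMP_MICROS': 'datetime64[us]',
                 'TIME_MICROS': 'timedelta64[us]'}

def target_dtype(primitive, logical=None):
    # Logical type wins when present; otherwise fall back to the primitive
    return np.dtype(logical_types.get(logical) or simple_types[primitive])

print(target_dtype('INT64'))                      # int64
print(target_dtype('INT64', 'TIMESTAMP_MICROS'))  # datetime64[us]
```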

### Binary Data Handling

#### BSON Support

Functions for working with Binary JSON (BSON) encoded data.

```python { .api }
def tobson(obj):
    """
    Convert Python object to BSON binary format.

    Parameters:
    - obj: Any, Python object to encode

    Returns:
    bytes: BSON-encoded binary data
    """

def unbson(data):
    """
    Convert BSON binary data back to Python object.

    Parameters:
    - data: bytes, BSON-encoded binary data

    Returns:
    Any: Decoded Python object
    """
```
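The two functions are inverses, so a round trip should return the original object. The import path below is an assumption (the section above does not pin down the module), and BSON support typically requires a BSON library to be installed:

```python
from fastparquet.writer import tobson, unbson  # import path assumed

obj = {'name': 'sensor-1', 'readings': [1.5, 2.5]}
blob = tobson(obj)          # bytes, suitable for a BYTE_ARRAY/BSON column
assert unbson(blob) == obj  # round trip recovers the object
```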

### Column Metadata Generation

#### Pandas Metadata Creation

Generate pandas-compatible metadata for columns during the writing process.

```python { .api }
def get_column_metadata(column, name, object_dtype=None):
    """
    Generate pandas column metadata for parquet storage.

    Parameters:
    - column: pandas.Series, source column data
    - name: str, column name
    - object_dtype: str, specific object encoding type

    Returns:
    dict: Pandas metadata dictionary with type and encoding info
    """

def get_numpy_type(dtype):
    """
    Get numpy type string representation for pandas dtype.

    Parameters:
    - dtype: pandas.dtype, input pandas data type

    Returns:
    str: String representation of equivalent numpy type
    """
```
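`get_column_metadata` is exercised under Usage Examples below; for `get_numpy_type`, a minimal sketch, assuming it lives in `fastparquet.util` next to `get_column_metadata` (outputs in the comments are illustrative):

```python
import pandas as pd
from fastparquet.util import get_numpy_type  # import path assumed

print(get_numpy_type(pd.Series([1, 2, 3]).dtype))        # e.g. 'int64'
print(get_numpy_type(pd.Series([1.0, 2.0]).dtype))       # e.g. 'float64'
print(get_numpy_type(pd.Categorical(['a', 'b']).dtype))  # categorical dtypes are special-cased
```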

### Type Inference and Validation

#### Automatic Type Detection

Utilities for inferring and validating data types during conversion.

```python { .api }
def infer_dtype(column):
    """
    Infer pandas dtype of column data.

    Parameters:
    - column: pandas.Series, data to analyze

    Returns:
    str: Inferred pandas dtype string
    """

def groupby_types(iterable):
    """
    Group objects by their Python type.

    Parameters:
    - iterable: Iterable, collection of objects to group

    Returns:
    dict: Mapping from type to list of objects of that type
    """
```
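A sketch of `groupby_types`, assuming it is importable from `fastparquet.util`; it partitions a heterogeneous object column by Python type, which is what drives encoding inference:

```python
from fastparquet.util import groupby_types  # import path assumed

values = [1, 'a', 2.5, 'b', 3]
groups = groupby_types(values)
for typ, items in groups.items():
    print(typ.__name__, items)  # e.g. int [1, 3] / str ['a', 'b'] / float [2.5]
```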

## Usage Examples

### Working with Schemas

```python
from fastparquet import ParquetFile

# Read schema information
pf = ParquetFile('data.parquet')
schema = pf.schema

# Print schema structure
print(schema.text)  # Human-readable schema representation

# Access individual schema elements
for column_name in pf.columns:
    element = schema.schema_element([column_name])
    print(f"{column_name}: {element.type}")
```

### Type Conversion Examples

```python
import pandas as pd
from fastparquet.writer import find_type, convert

# Analyze pandas data for parquet conversion
data = pd.Series([1, 2, 3, 4, 5], name='numbers')
schema_element, type_code = find_type(data)

print(f"Parquet type: {schema_element.type}")
print(f"Converted type: {schema_element.converted_type}")

# Convert data for writing
converted_data = convert(data, schema_element)
print(f"Converted shape: {converted_data.shape}")
```

### Object Encoding Examples

```python
import pandas as pd
from fastparquet.writer import find_type

# String data (automatic UTF-8 encoding)
text_data = pd.Series(['hello', 'world', 'test'])
se, _ = find_type(text_data, object_encoding='utf8')

# JSON data
json_data = pd.Series([{'a': 1}, {'b': 2}, {'c': 3}])
se, _ = find_type(json_data, object_encoding='json')

# Binary data
binary_data = pd.Series([b'data1', b'data2', b'data3'])
se, _ = find_type(binary_data, object_encoding='bytes')

# Automatic inference
mixed_data = pd.Series(['text1', 'text2', 'text3'])
se, _ = find_type(mixed_data, object_encoding='infer')
```

### Custom Type Handling

```python
import pandas as pd
from decimal import Decimal
from fastparquet.writer import find_type

# Fixed-length strings (object columns also need an object encoding)
fixed_text_data = pd.Series(['ABC', 'DEF', 'GHI'])
se, _ = find_type(fixed_text_data, fixed_text=3, object_encoding='utf8')

# Decimal data (object column of decimal.Decimal values)
decimal_data = pd.Series([Decimal('1.234'), Decimal('5.678'), Decimal('9.012')])
se, _ = find_type(decimal_data, object_encoding='decimal')

# Timestamps with different encodings
timestamp_data = pd.Series(pd.date_range('2023-01-01', periods=3))

# 64-bit integer timestamps (default)
se_int64, _ = find_type(timestamp_data, times='int64')

# 96-bit timestamps (legacy compatibility)
se_int96, _ = find_type(timestamp_data, times='int96')
```

### Metadata Generation

```python
import pandas as pd
from fastparquet.util import get_column_metadata

# Generate metadata for different column types
df = pd.DataFrame({
    'int_col': [1, 2, 3],
    'float_col': [1.1, 2.2, 3.3],
    'str_col': ['a', 'b', 'c'],
    'cat_col': pd.Categorical(['X', 'Y', 'Z']),
    'date_col': pd.date_range('2023-01-01', periods=3)
})

for col_name, col_data in df.items():
    metadata = get_column_metadata(col_data, col_name)
    print(f"{col_name}: {metadata['pandas_type']} -> {metadata['numpy_type']}")
```

## Type Definitions

```python { .api }
from typing import Any, Dict, Literal, Union

# Object encoding options
ObjectEncoding = Literal[
    'infer',    # Automatically detect encoding
    'utf8',     # UTF-8 text encoding
    'bytes',    # Raw binary data
    'json',     # JSON serialization
    'bson',     # Binary JSON encoding
    'bool',     # Boolean values
    'int',      # Integer values (64-bit)
    'int32',    # Integer values (32-bit)
    'float',    # Floating point values
    'decimal'   # Decimal number handling
]

# Timestamp encoding formats
TimeEncoding = Literal[
    'int64',  # 64-bit integer timestamps (default)
    'int96'   # 96-bit format (legacy compatibility)
]

# Parquet primitive types
ParquetPrimitiveType = Literal[
    'BOOLEAN',
    'INT32',
    'INT64',
    'FLOAT',
    'DOUBLE',
    'BYTE_ARRAY',
    'FIXED_LEN_BYTE_ARRAY',
    'INT96'
]

# Parquet logical types
ParquetLogicalType = Literal[
    'UTF8',
    'JSON',
    'BSON',
    'DECIMAL',
    'TIMESTAMP_MILLIS',
    'TIMESTAMP_MICROS',
    'TIME_MICROS',
    'INT_8',
    'INT_16',
    'INT_32',
    'INT_64',
    'UINT_8',
    'UINT_16',
    'UINT_32',
    'UINT_64'
]

# Schema element structure
SchemaElement = Any  # parquet_thrift.SchemaElement

# Column metadata structure
ColumnMetadata = Dict[str, Union[str, Dict[str, Any]]]
```
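These literals correspond to the `object_encoding` and `times` arguments accepted by `fastparquet.write`. A minimal sketch, assuming a DataFrame with one text and one binary column:

```python
import pandas as pd
import fastparquet

df = pd.DataFrame({'txt': ['a', 'b'], 'payload': [b'x', b'y']})

# Per-column object encodings drawn from the ObjectEncoding options above;
# times selects the timestamp representation (TimeEncoding)
fastparquet.write('data.parquet', df,
                  object_encoding={'txt': 'utf8', 'payload': 'bytes'},
                  times='int64')
```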