# Format Utilities

Specialized utilities for working with the Parquet, ORC, and HFile formats, covering metadata reading, schema conversion, and format-specific optimizations. Together they provide comprehensive support for the file formats commonly used in Hadoop ecosystems.

## Capabilities

### Parquet Utilities

Comprehensive utilities for working with Parquet files, including metadata access and row key filtering.

```java { .api }
/**
 * Parquet format utilities extending FileFormatUtils.
 * Provides Parquet-specific operations and optimizations.
 */
public class ParquetUtils extends FileFormatUtils {

  /**
   * Read Parquet file metadata.
   * @param storage HoodieStorage instance for file access
   * @param parquetFilePath path to the Parquet file
   * @return ParquetMetadata containing file metadata
   */
  public static ParquetMetadata readMetadata(HoodieStorage storage, StoragePath parquetFilePath);

  /**
   * Filter row keys from a Parquet file.
   * Efficiently reads only the row keys that match the filter set.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @param filter set of keys to filter for
   * @return set of Pair<String, Long> containing matching keys and row numbers
   */
  public Set<Pair<String, Long>> filterRowKeys(HoodieStorage storage, StoragePath filePath,
                                               Set<String> filter);

  /**
   * Get the compression codec name from a string.
   * @param codecName string name of the codec
   * @return CompressionCodecName enum value
   */
  public static CompressionCodecName getCompressionCodecName(String codecName);

  /**
   * Fetch record keys with their positions.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @return ClosableIterator of Pair<HoodieKey, Long> containing keys and positions
   */
  public ClosableIterator<Pair<HoodieKey, Long>> fetchRecordKeysWithPositions(HoodieStorage storage,
                                                                              StoragePath filePath);

  /**
   * Get a HoodieKey iterator for the records in a file.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @return ClosableIterator of HoodieKey records
   */
  public ClosableIterator<HoodieKey> getHoodieKeyIterator(HoodieStorage storage, StoragePath filePath);

  /**
   * Read the Parquet schema as a MessageType.
   * @param storage HoodieStorage instance for file access
   * @param parquetFilePath path to the Parquet file
   * @return MessageType schema
   */
  public MessageType readSchema(HoodieStorage storage, StoragePath parquetFilePath);

  /**
   * Read the Avro schema from a Parquet file.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @return Avro Schema
   */
  public Schema readAvroSchema(HoodieStorage storage, StoragePath filePath);

  /**
   * Read column statistics from Parquet metadata.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @param columnNames list of column names to read stats for
   * @return list of HoodieColumnRangeMetadata with statistics
   */
  public List<HoodieColumnRangeMetadata<Comparable>> readColumnStatsFromMetadata(HoodieStorage storage,
                                                                                 StoragePath filePath,
                                                                                 List<String> columnNames);

  /**
   * Read all Avro records from a Parquet file.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @return list of GenericRecord objects
   */
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath);

  /**
   * Read Avro records from a Parquet file with a specific schema.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @param schema Avro schema to use for reading
   * @return list of GenericRecord objects
   */
  public List<GenericRecord> readAvroRecords(HoodieStorage storage, StoragePath filePath, Schema schema);

  /**
   * Get the row count of a Parquet file.
   * @param storage HoodieStorage instance for file access
   * @param filePath path to the Parquet file
   * @return number of rows in the file
   */
  public long getRowCount(HoodieStorage storage, StoragePath filePath);

  /**
   * Get the file format.
   * @return HoodieFileFormat.PARQUET
   */
  public HoodieFileFormat getFormat();
}
```
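
Several of these readers hand back `ClosableIterator`s, which should be closed after use. Below is a minimal usage sketch, assuming `storage` and `parquetFile` are initialized as in the usage examples later in this section and that the file was written by Hudi (so record keys are present):

```java
import org.apache.hudi.common.model.HoodieKey;
import org.apache.hudi.common.util.collection.ClosableIterator;

// Hedged sketch: stream record keys out of a Parquet file and close the iterator.
// ClosableIterator extends AutoCloseable, so try-with-resources handles cleanup.
ParquetUtils parquetUtils = new ParquetUtils();
try (ClosableIterator<HoodieKey> keys = parquetUtils.getHoodieKeyIterator(storage, parquetFile)) {
  while (keys.hasNext()) {
    HoodieKey key = keys.next();
    System.out.println(key.getRecordKey() + " in partition " + key.getPartitionPath());
  }
}
```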

### ORC Utilities

Utilities for working with ORC (Optimized Row Columnar) format files, with Hadoop integration.

```java { .api }
/**
 * ORC format utilities extending FileFormatUtils.
 * Provides ORC-specific operations and optimizations.
 */
public class OrcUtils extends FileFormatUtils {
  // ORC-specific file format utility methods.
  // Includes metadata reading, schema extraction, and optimization hints
  // for working with ORC files in Hadoop environments.
}
```
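
Because `OrcUtils` extends `FileFormatUtils`, the shared read surface shown above for `ParquetUtils` should be available for ORC files as well; the same reasoning applies to `HFileUtils` below. A minimal sketch under that assumption (`storage` as in the usage examples later in this section):

```java
// Hedged sketch: exercising the inherited FileFormatUtils read surface on an ORC
// file, assuming OrcUtils provides readAvroSchema/getRowCount like ParquetUtils.
OrcUtils orcUtils = new OrcUtils();
StoragePath orcFile = new StoragePath("/data/table/partition/file.orc");
Schema orcAvroSchema = orcUtils.readAvroSchema(storage, orcFile);
long orcRowCount = orcUtils.getRowCount(storage, orcFile);
System.out.println("ORC rows: " + orcRowCount + ", schema: " + orcAvroSchema.getFullName());
```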

### HFile Utilities

Utilities for working with the HBase HFile format, providing integration with HBase storage systems.

```java { .api }
/**
 * HFile format utilities extending FileFormatUtils.
 * Provides HFile-specific operations for HBase integration.
 */
public class HFileUtils extends FileFormatUtils {
  // HFile-specific file format utility methods.
  // Includes HBase integration, key-value operations, and
  // specialized access patterns for the HFile format.
}
```

### Avro-ORC Conversion Utilities

Comprehensive utilities for converting between Avro and ORC schemas and data formats.

```java { .api }
/**
 * Utilities for Avro-ORC conversions.
 * Handles schema conversion and type mapping between the formats.
 */
public class AvroOrcUtils {

  /**
   * Create an ORC schema from an Avro schema.
   * @param avroSchema Avro Schema to convert
   * @return equivalent ORC TypeDescription
   */
  public static TypeDescription createOrcSchema(Schema avroSchema);

  /**
   * Create an Avro schema from an ORC TypeDescription.
   * @param orcSchema ORC TypeDescription to convert
   * @return equivalent Avro Schema
   */
  public static Schema createAvroSchema(TypeDescription orcSchema);

  /**
   * Create an Avro schema with default values from an ORC schema.
   * @param orcSchema ORC TypeDescription to convert
   * @param recordName name for the record
   * @param namespace namespace for the schema
   * @param nullable whether fields should be nullable
   * @return Avro Schema with default values
   */
  public static Schema createAvroSchemaWithDefaultValue(TypeDescription orcSchema, String recordName,
                                                        String namespace, boolean nullable);

  /**
   * Add a value to an ORC column vector.
   * @param type ORC type description
   * @param colVector column vector to add to
   * @param avroSchema Avro schema for the value
   * @param value value to add
   * @param vectorPos position in the vector
   */
  public static void addToVector(TypeDescription type, ColumnVector colVector, Schema avroSchema,
                                 Object value, int vectorPos);

  /**
   * Read a value from an ORC column vector.
   * @param type ORC type description
   * @param colVector column vector to read from
   * @param avroSchema Avro schema for the value
   * @param vectorPos position in the vector
   * @return value read from the vector
   */
  public static Object readFromVector(TypeDescription type, ColumnVector colVector, Schema avroSchema,
                                      int vectorPos);

  /**
   * Get the column names from an ORC schema.
   * @param orcSchema ORC TypeDescription
   * @return list of column names in order
   */
  public static List<String> getOrcColumnNames(TypeDescription orcSchema);

  /**
   * Get the field mapping from an ORC schema.
   * @param orcSchema ORC TypeDescription
   * @return map of field names to TypeDescription
   */
  public static Map<String, TypeDescription> getOrcFields(TypeDescription orcSchema);
}
```
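
The `addToVector`/`readFromVector` pair is what row-at-a-time ORC writers and readers build on. A minimal sketch of filling one row of an ORC `VectorizedRowBatch` from an Avro record, where `avroSchema` and `record` are assumed inputs with matching schemas:

```java
import java.util.List;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;

// Hedged sketch: copy one GenericRecord into an ORC row batch, column by column.
TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema);
VectorizedRowBatch batch = orcSchema.createRowBatch();
int row = batch.size++;                        // claim the next row slot in the batch
List<TypeDescription> children = orcSchema.getChildren();
for (int c = 0; c < children.size(); c++) {
  Object value = record.get(c);                // field value from the Avro record
  AvroOrcUtils.addToVector(children.get(c), batch.cols[c],
      avroSchema.getFields().get(c).schema(), value, row);
}
// The filled batch can then be handed to an org.apache.orc.Writer via addRowBatch(batch).
```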

### Hadoop Configuration Utilities

Utilities for optimizing Hadoop configuration for different file formats and operations.

```java { .api }
/**
 * Hadoop configuration utilities.
 * Provides optimized configurations for different scenarios.
 */
public class HadoopConfigUtils {

  /**
   * Get an optimized configuration for read operations.
   * @param conf base Hadoop configuration
   * @return configuration optimized for reading
   */
  public static Configuration getReaderConf(Configuration conf);

  /**
   * Add a shutdown hook for configuration cleanup.
   * @param conf Hadoop configuration
   * @return configuration with the shutdown hook added
   */
  public static Configuration addShutdownHook(Configuration conf);
}
```

252

253

### Schema Conversion Examples

254

255

Common patterns for converting between different schema formats:

256

257

#### Avro to ORC Schema Conversion

258

259

```java { .api }
// Example Avro schema
{
  "type": "record",
  "name": "User",
  "fields": [
    {"name": "id", "type": "long"},
    {"name": "name", "type": "string"},
    {"name": "email", "type": ["null", "string"], "default": null},
    {"name": "scores", "type": {"type": "array", "items": "int"}},
    {"name": "metadata", "type": {"type": "map", "values": "string"}}
  ]
}

// Converts to the ORC TypeDescription:
// struct<id:bigint,name:string,email:string,scores:array<int>,metadata:map<string,string>>
```

#### ORC to Avro Schema Conversion

```java { .api }
// ORC TypeDescription: struct<product_id:bigint,product_name:string,price:decimal(10,2)>
//
// Converts to the Avro schema:
{
  "type": "record",
  "name": "Product",
  "fields": [
    {"name": "product_id", "type": "long"},
    {"name": "product_name", "type": "string"},
    {"name": "price", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}}
  ]
}
```
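
The same round trip can be reproduced programmatically; a small sketch, building the `Product` struct with ORC's `TypeDescription.fromString`:

```java
import org.apache.avro.Schema;
import org.apache.orc.TypeDescription;

// Hedged sketch: reproduce the ORC-to-Avro conversion above from code.
TypeDescription product = TypeDescription.fromString(
    "struct<product_id:bigint,product_name:string,price:decimal(10,2)>");
Schema productAvro = AvroOrcUtils.createAvroSchema(product);
System.out.println(productAvro.toString(true)); // expected: the Avro schema shown above
```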

### Type Mapping Reference

Common type mappings between Avro and ORC formats:

| Avro Type | ORC Type | Notes |
|-----------|----------|-------|
| `boolean` | `boolean` | Direct mapping |
| `int` | `int` | Direct mapping |
| `long` | `bigint` | Direct mapping |
| `float` | `float` | Direct mapping |
| `double` | `double` | Direct mapping |
| `string` | `string` | Direct mapping |
| `bytes` | `binary` | Direct mapping |
| `array<T>` | `array<T>` | Recursive type mapping |
| `map<string,T>` | `map<string,T>` | Recursive type mapping |
| `record` | `struct` | Field-by-field mapping |
| `union` | Complex | Requires special handling; nullable unions flatten (see the sketch below) |
| `enum` | `string` | Converted to string representation |
| `fixed` | `binary` | Fixed-length binary |
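
For unions, the main special case is the common nullable pattern: as the `User` example above shows, a `["null", T]` union flattens to the ORC type for `T` (ORC columns are nullable by default). A small sketch, with the expected output inferred from that example:

```java
import org.apache.avro.Schema;
import org.apache.orc.TypeDescription;

// Hedged sketch: a nullable union collapses to its non-null branch.
Schema nullableEmail = new Schema.Parser().parse(
    "{\"type\": \"record\", \"name\": \"U\", \"fields\": ["
        + "{\"name\": \"email\", \"type\": [\"null\", \"string\"], \"default\": null}]}");
TypeDescription orc = AvroOrcUtils.createOrcSchema(nullableEmail);
System.out.println(orc); // expected (per the User example above): struct<email:string>
```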

313

314

### Bootstrap Index Integration

315

316

Integration utilities for HFile-based bootstrap indexing with HBase compatibility.

317

318

```java { .api }
/**
 * HFile-based bootstrap index utilities.
 * Provides integration with HBase for efficient indexing.
 */
public class HFileBootstrapIndex {

  /**
   * Key-value comparator for HBase integration.
   * Provides proper ordering for HFile operations.
   */
  public static class HoodieKVComparator extends CellComparatorImpl {
    // Specialized comparator for Hudi key-value pairs.
    // Ensures proper ordering in HFile structures.
  }
}
```

335

336

### Bootstrap Index Writers and Readers

337

338

Specialized classes for reading and writing HFile-based bootstrap indexes.

339

340

```java { .api }
/**
 * HBase HFile bootstrap index reader.
 * Provides efficient reading of bootstrap index data.
 */
public class HBaseHFileBootstrapIndexReader {
  // Methods for reading bootstrap index data from HFiles,
  // optimized for HBase storage patterns.
}

/**
 * HBase HFile bootstrap index writer.
 * Provides efficient writing of bootstrap index data.
 */
public class HBaseHFileBootstrapIndexWriter {
  // Methods for writing bootstrap index data to HFiles,
  // optimized for HBase storage patterns and bulk loading.
}
```

359

360

**Usage Examples:**

361

362

```java
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.avro.Schema;
import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.util.*;
import org.apache.hudi.common.util.collection.Pair;
import org.apache.hudi.storage.HoodieStorage;
import org.apache.hudi.storage.StoragePath;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.orc.TypeDescription;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;

// Working with Parquet files
// (storagePath and storageConf are assumed to be initialized elsewhere)
HoodieStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
StoragePath parquetFile = new StoragePath("/data/table/partition/file.parquet");

// Read Parquet metadata; row counts are per row group, so sum across the blocks
ParquetMetadata metadata = ParquetUtils.readMetadata(storage, parquetFile);
long totalRows = metadata.getBlocks().stream().mapToLong(b -> b.getRowCount()).sum();
System.out.println("Number of rows: " + totalRows);
System.out.println("Number of columns: " + metadata.getFileMetaData().getSchema().getColumns().size());

// Filter row keys from a Parquet file (filterRowKeys is an instance method)
ParquetUtils parquetUtils = new ParquetUtils();
Set<String> keysToFind = Set.of("key1", "key2", "key3");
Set<Pair<String, Long>> foundKeys = parquetUtils.filterRowKeys(storage, parquetFile, keysToFind);
for (Pair<String, Long> keyRow : foundKeys) {
  System.out.println("Found key: " + keyRow.getLeft() + " at row: " + keyRow.getRight());
}

// Schema conversion between Avro and ORC
String avroSchemaJson = """
    {
      "type": "record",
      "name": "Customer",
      "fields": [
        {"name": "customer_id", "type": "long"},
        {"name": "name", "type": "string"},
        {"name": "email", "type": ["null", "string"], "default": null},
        {"name": "orders", "type": {"type": "array", "items": "string"}},
        {"name": "preferences", "type": {"type": "map", "values": "string"}}
      ]
    }
    """;

Schema avroSchema = new Schema.Parser().parse(avroSchemaJson);

// Convert Avro to ORC
TypeDescription orcSchema = AvroOrcUtils.createOrcSchema(avroSchema);
System.out.println("ORC Schema: " + orcSchema.toString());
// Output: struct<customer_id:bigint,name:string,email:string,orders:array<string>,preferences:map<string,string>>

// Convert back to Avro
Schema convertedBackSchema = AvroOrcUtils.createAvroSchema(orcSchema);
System.out.println("Converted back to Avro: " + convertedBackSchema.toString(true));

// Get ORC column information
List<String> columnNames = AvroOrcUtils.getOrcColumnNames(orcSchema);
System.out.println("Column names: " + columnNames);
// Output: [customer_id, name, email, orders, preferences]

Map<String, TypeDescription> fields = AvroOrcUtils.getOrcFields(orcSchema);
for (Map.Entry<String, TypeDescription> field : fields.entrySet()) {
  System.out.println("Field: " + field.getKey() + ", Type: " + field.getValue().getCategory());
}

// Optimized Hadoop configuration for reading
Configuration baseConf = new Configuration();
Configuration readerConf = HadoopConfigUtils.getReaderConf(baseConf);

// Add a shutdown hook for cleanup
Configuration confWithHook = HadoopConfigUtils.addShutdownHook(readerConf);

// Working with the other file formats
StoragePath orcFile = new StoragePath("/data/table/partition/file.orc");
StoragePath hfile = new StoragePath("/data/index/region/family/hfile");

// ORC operations use OrcUtils and HFile operations use HFileUtils
// (the available methods depend on the respective implementations)

// Complex schema conversion example
String complexAvroSchema = """
    {
      "type": "record",
      "name": "Transaction",
      "fields": [
        {"name": "id", "type": "string"},
        {"name": "amount", "type": {"type": "bytes", "logicalType": "decimal", "precision": 10, "scale": 2}},
        {"name": "timestamp", "type": {"type": "long", "logicalType": "timestamp-millis"}},
        {"name": "metadata", "type": {
          "type": "record",
          "name": "TransactionMetadata",
          "fields": [
            {"name": "source", "type": "string"},
            {"name": "tags", "type": {"type": "array", "items": "string"}}
          ]
        }}
      ]
    }
    """;

Schema complexSchema = new Schema.Parser().parse(complexAvroSchema);
TypeDescription complexOrcSchema = AvroOrcUtils.createOrcSchema(complexSchema);
System.out.println("Complex ORC Schema: " + complexOrcSchema.toString());
```

### Performance Considerations

1. **Parquet Metadata Caching**: Cache `ParquetMetadata` objects for frequently accessed files
2. **Schema Conversion Caching**: Cache converted schemas to avoid repeated conversion overhead (see the sketch after this list)
3. **Row Key Filtering**: Use bloom filters or other pre-filtering when possible before row key filtering
4. **Configuration Optimization**: Use `HadoopConfigUtils.getReaderConf()` for read-heavy workloads
5. **Format Selection**: Choose appropriate formats based on access patterns (Parquet for analytics, ORC for ACID operations, HFile for key-value access)
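
As referenced in item 2, here is a minimal sketch of a schema-conversion cache. It assumes one schema version per Avro full name; if schemas evolve, key the cache on a fingerprint (e.g. Avro's `SchemaNormalization.parsingFingerprint64`) instead.

```java
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import org.apache.avro.Schema;
import org.apache.orc.TypeDescription;

// Hedged sketch: memoize Avro -> ORC schema conversions.
public final class OrcSchemaCache {
  // Keyed by Avro full name; assumes a name maps to a single schema version.
  private static final ConcurrentMap<String, TypeDescription> CACHE = new ConcurrentHashMap<>();

  private OrcSchemaCache() {}

  public static TypeDescription orcSchemaFor(Schema avroSchema) {
    return CACHE.computeIfAbsent(avroSchema.getFullName(),
        ignored -> AvroOrcUtils.createOrcSchema(avroSchema));
  }
}
```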