<!-- source: docs/io-operations.md -->
# I/O Operations

Factory pattern for creating format-specific file readers and writers with support for Avro, Parquet, and ORC formats in Hadoop environments. Provides comprehensive I/O capabilities for reading and writing structured data files.

## Capabilities

### HoodieHadoopIOFactory

Primary I/O factory for creating Hadoop-based file readers and writers with format-specific optimizations.

```java { .api }
/**
 * Factory for creating Hadoop-based file readers and writers.
 * Supports multiple record types and file formats.
 */
public class HoodieHadoopIOFactory implements HoodieIOFactory {

  /** Create I/O factory with storage backend */
  public HoodieHadoopIOFactory(HoodieStorage storage);

  /** Get reader factory for specific record type */
  public HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType);

  /** Get writer factory for specific record type */
  public HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType);

  /** Get format utilities for specific file format */
  public FileFormatUtils getFileFormatUtils(HoodieFileFormat fileFormat);

  /** Get storage instance for path */
  public HoodieStorage getStorage(StoragePath storagePath);

  /** Get storage instance with retry configuration */
  public HoodieStorage getStorage(StoragePath path, boolean enableRetry,
                                  long maxRetryIntervalMs, int maxRetryNumbers,
                                  long initialRetryIntervalMs, String retryExceptions,
                                  ConsistencyGuard consistencyGuard);
}
```
### Avro File Reader Factory

Factory for creating Avro-based file readers supporting Parquet format with Avro serialization.

```java { .api }
/**
 * Factory for creating Avro file readers.
 * Specialized for Parquet files with Avro schema.
 */
public class HoodieAvroFileReaderFactory implements HoodieFileReaderFactory {

  /** Create reader factory with storage backend */
  public HoodieAvroFileReaderFactory(HoodieStorage storage);

  /** Create Parquet file reader for Avro records */
  public HoodieAvroFileReader newParquetFileReader(HoodieStorage storage, StoragePath path);
}
```

### Avro File Writer Factory

Factory for creating Avro-based file writers supporting multiple output formats.

```java { .api }
/**
 * Factory for creating Avro file writers.
 * Supports Parquet and ORC output formats.
 */
public class HoodieAvroFileWriterFactory implements HoodieFileWriterFactory {

  /** Create writer factory with storage backend */
  public HoodieAvroFileWriterFactory(HoodieStorage storage);
}
```
### Avro Parquet Reader

Avro-based Parquet file reader providing schema evolution and efficient columnar access.

```java { .api }
/**
 * Avro-based Parquet file reader.
 * Supports schema evolution and columnar data access.
 */
public class HoodieAvroParquetReader implements HoodieFileReader {

  /** Create reader for Parquet file with Avro schema */
  public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath);

  /** Create reader with explicit writer schema */
  public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath,
                                 Option<Schema> writerSchemaOpt);

  /** Get the schema of the file */
  public Schema getSchema();

  /** Get iterator for records with custom reader schema */
  public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);

  /** Get iterator for records with file schema */
  public ClosableIterator<IndexedRecord> getRecordIterator();

  /** Close the reader and release resources */
  public void close();
}
```
### Avro Parquet Writer

Avro-based Parquet file writer with bloom filter integration and metadata support.

```java { .api }
/**
 * Avro-based Parquet file writer.
 * Supports bloom filters and custom metadata.
 */
public class HoodieAvroParquetWriter implements HoodieFileWriter {

  /** Create writer with configuration and schema */
  public HoodieAvroParquetWriter(StoragePath file, HoodieConfig config, Schema schema,
                                 Task task, Option<BloomFilter> bloomFilterOpt,
                                 boolean populateMetaFields);

  /** Check if writer can accept more data */
  public boolean canWrite();

  /** Write Avro record with Hudi metadata */
  public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);

  /** Write Avro record with record key */
  public void writeAvro(String recordKey, IndexedRecord record);

  /** Close writer and return status */
  public WriteStatus close();

  /** Get current write status */
  public WriteStatus getWriteStatus();

  /** Get number of bytes written */
  public long getBytesWritten();
}
```
### Avro ORC Reader

Avro-based ORC file reader supporting schema evolution and efficient columnar access.

```java { .api }
/**
 * Avro-based ORC file reader.
 * Supports schema evolution and columnar data access.
 */
public class HoodieAvroOrcReader implements HoodieFileReader {

  /** Create reader with explicit writer schema */
  public HoodieAvroOrcReader(HoodieStorage storage, StoragePath filePath,
                             Option<Schema> writerSchemaOpt);

  /** Get iterator for records with custom reader schema */
  public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);

  /** Get iterator for records with file schema */
  public ClosableIterator<IndexedRecord> getRecordIterator();

  /** Close the reader and release resources */
  public void close();

  /** Get the schema of the file */
  public Schema getSchema();
}
```
### Avro ORC Writer

Avro-based ORC file writer with bloom filter integration and metadata support.

```java { .api }
/**
 * Avro-based ORC file writer.
 * Supports bloom filters and custom metadata.
 */
public class HoodieAvroOrcWriter implements HoodieFileWriter {

  /** Create writer with configuration and schema */
  public HoodieAvroOrcWriter(StoragePath filePath, HoodieConfig config, Schema schema,
                             Task task, boolean populateMetaFields,
                             Option<BloomFilter> bloomFilterOpt);

  /** Check if writer can accept more data */
  public boolean canWrite();

  /** Write Avro record with Hudi metadata */
  public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);

  /** Write Avro record with record key */
  public void writeAvro(String recordKey, IndexedRecord record);

  /** Close writer and return status */
  public WriteStatus close();

  /** Get current write status */
  public WriteStatus getWriteStatus();

  /** Get number of bytes written */
  public long getBytesWritten();
}
```
### HFile Utilities

Utilities for working with HBase HFile format in Hadoop environments.

```java { .api }
/**
 * Utilities for working with HFile format.
 * Provides HBase integration capabilities.
 */
public class HoodieHFileUtils {

  /** Create HFile reader with configuration */
  public static HFile.Reader createHFileReader(FileSystem fs, Path path,
                                               CacheConfig cacheConf, Configuration conf);

  /** Get optimized configuration for HFile reading */
  public static Configuration getHFileReaderConfiguration(Configuration conf);

  /** Check if path points to an HFile */
  public static boolean isHFile(StoragePath path);
}
```
### Parquet Write Support

Parquet write support for Avro records with bloom filter integration.

```java { .api }
/**
 * Parquet write support for Avro with bloom filter integration.
 * Extends standard Parquet writing with Hudi-specific features.
 */
public class HoodieAvroWriteSupport extends AvroWriteSupport<IndexedRecord> {

  /** Create write support with schema and bloom filter */
  public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema,
                                Option<BloomFilter> bloomFilterOpt, Properties properties);

  /** Finalize write context with metadata */
  public WriteSupport.FinalizedWriteContext finalizeWrite();

  /** Add record key to bloom filter */
  public void add(String recordKey);

  /** Add custom footer metadata */
  public void addFooterMetadata(String key, String value);
}
```
### Parquet Reader Builder

Builder pattern for creating Hoodie-specific Avro Parquet readers.

```java { .api }
/**
 * Builder for Hoodie Avro Parquet readers.
 * Provides configuration and customization options.
 */
public class HoodieAvroParquetReaderBuilder<T> extends ParquetReaderBuilder<T> {

  /** Create builder with file path */
  public HoodieAvroParquetReaderBuilder(Path path);

  /** Create builder with input file */
  public HoodieAvroParquetReaderBuilder(InputFile file);

  /** Set Hadoop configuration */
  public HoodieAvroParquetReaderBuilder<T> withConf(Configuration conf);

  /** Build configured ParquetReader */
  public ParquetReader<T> build();
}
```
### Parquet Read Support

Parquet read support for Avro records with Hudi-specific optimizations.

```java { .api }
/**
 * Parquet read support for Avro records.
 * Extends standard AvroReadSupport with Hudi optimizations.
 */
public class HoodieAvroReadSupport extends AvroReadSupport {
  // Extends AvroReadSupport with Hoodie-specific functionality
  // for reading Parquet files with Avro schema
}
```
**Usage Examples:**

```java
import org.apache.hudi.io.hadoop.*;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;

// Set up I/O factory
HoodieHadoopStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
HoodieHadoopIOFactory ioFactory = new HoodieHadoopIOFactory(storage);

// Reading Parquet files
StoragePath parquetFile = new StoragePath("/data/table1/file.parquet");

// Create Avro Parquet reader
HoodieAvroParquetReader reader = new HoodieAvroParquetReader(storage, parquetFile);
Schema schema = reader.getSchema();

// Read records
try (ClosableIterator<IndexedRecord> iterator = reader.getRecordIterator()) {
    while (iterator.hasNext()) {
        IndexedRecord record = iterator.next();
        // Process record
    }
}
reader.close();

// Writing Parquet files
Schema writeSchema = new Schema.Parser().parse(schemaString);
StoragePath outputFile = new StoragePath("/data/table1/output.parquet");

HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter(
    outputFile,
    hoodieConfig,
    writeSchema,
    task,
    Option.empty(), // No bloom filter
    true // Populate meta fields
);

// Write records
for (IndexedRecord record : records) {
    writer.writeAvro("key", record);
}

// getBytesWritten() is exposed on the writer (not WriteStatus) — read it before closing
long bytesWritten = writer.getBytesWritten();
WriteStatus status = writer.close();
System.out.println("Bytes written: " + bytesWritten);

// Working with ORC files
HoodieAvroOrcReader orcReader = new HoodieAvroOrcReader(storage, orcFilePath, Option.empty());
try (ClosableIterator<IndexedRecord> iterator = orcReader.getRecordIterator()) {
    while (iterator.hasNext()) {
        IndexedRecord record = iterator.next();
        // Process ORC record
    }
}
orcReader.close();

// HFile operations
boolean isHFile = HoodieHFileUtils.isHFile(somePath);
if (isHFile) {
    Configuration hfileConf = HoodieHFileUtils.getHFileReaderConfiguration(hadoopConf);
    // Work with HFile using optimized configuration
}
```