# I/O Operations

Factory pattern for creating format-specific file readers and writers with support for Avro, Parquet, and ORC formats in Hadoop environments. Provides comprehensive I/O capabilities for reading and writing structured data files.

## Capabilities

### HoodieHadoopIOFactory

Primary I/O factory for creating Hadoop-based file readers and writers with format-specific optimizations.

```java { .api }
/**
 * Factory for creating Hadoop-based file readers and writers
 * Supports multiple record types and file formats
 */
public class HoodieHadoopIOFactory implements HoodieIOFactory {

  /** Create I/O factory with storage backend */
  public HoodieHadoopIOFactory(HoodieStorage storage);

  /** Get reader factory for specific record type */
  public HoodieFileReaderFactory getReaderFactory(HoodieRecord.HoodieRecordType recordType);

  /** Get writer factory for specific record type */
  public HoodieFileWriterFactory getWriterFactory(HoodieRecord.HoodieRecordType recordType);

  /** Get format utilities for specific file format */
  public FileFormatUtils getFileFormatUtils(HoodieFileFormat fileFormat);

  /** Get storage instance for path */
  public HoodieStorage getStorage(StoragePath storagePath);

  /** Get storage instance with retry configuration */
  public HoodieStorage getStorage(StoragePath path, boolean enableRetry,
                                  long maxRetryIntervalMs, int maxRetryNumbers,
                                  long initialRetryIntervalMs, String retryExceptions,
                                  ConsistencyGuard consistencyGuard);
}
```

### Avro File Reader Factory

Factory for creating Avro-based file readers supporting Parquet format with Avro serialization.

```java { .api }
/**
 * Factory for creating Avro file readers
 * Specialized for Parquet files with Avro schema
 */
public class HoodieAvroFileReaderFactory implements HoodieFileReaderFactory {

  /** Create reader factory with storage backend */
  public HoodieAvroFileReaderFactory(HoodieStorage storage);

  /** Create Parquet file reader for Avro records */
  public HoodieAvroFileReader newParquetFileReader(HoodieStorage storage, StoragePath path);
}
```

### Avro File Writer Factory

Factory for creating Avro-based file writers supporting multiple output formats.

```java { .api }
/**
 * Factory for creating Avro file writers
 * Supports Parquet and ORC output formats
 */
public class HoodieAvroFileWriterFactory implements HoodieFileWriterFactory {

  /** Create writer factory with storage backend */
  public HoodieAvroFileWriterFactory(HoodieStorage storage);
}
```

### Avro Parquet Reader

Avro-based Parquet file reader providing schema evolution and efficient columnar access.

```java { .api }
/**
 * Avro-based Parquet file reader
 * Supports schema evolution and columnar data access
 */
public class HoodieAvroParquetReader implements HoodieFileReader {

  /** Create reader for Parquet file with Avro schema */
  public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath);

  /** Create reader with explicit writer schema */
  public HoodieAvroParquetReader(HoodieStorage storage, StoragePath filePath,
                                 Option<Schema> writerSchemaOpt);

  /** Get the schema of the file */
  public Schema getSchema();

  /** Get iterator for records with custom reader schema */
  public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);

  /** Get iterator for records with file schema */
  public ClosableIterator<IndexedRecord> getRecordIterator();

  /** Close the reader and release resources */
  public void close();
}
```

### Avro Parquet Writer

Avro-based Parquet file writer with bloom filter integration and metadata support.

```java { .api }
/**
 * Avro-based Parquet file writer
 * Supports bloom filters and custom metadata
 */
public class HoodieAvroParquetWriter implements HoodieFileWriter {

  /** Create writer with configuration and schema */
  public HoodieAvroParquetWriter(StoragePath file, HoodieConfig config, Schema schema,
                                 Task task, Option<BloomFilter> bloomFilterOpt,
                                 boolean populateMetaFields);

  /** Check if writer can accept more data */
  public boolean canWrite();

  /** Write Avro record with Hudi metadata */
  public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);

  /** Write Avro record with record key */
  public void writeAvro(String recordKey, IndexedRecord record);

  /** Close writer and return status */
  public WriteStatus close();

  /** Get current write status */
  public WriteStatus getWriteStatus();

  /** Get number of bytes written */
  public long getBytesWritten();
}
```

### Avro ORC Reader

Avro-based ORC file reader supporting schema evolution and efficient columnar access.

```java { .api }
/**
 * Avro-based ORC file reader
 * Supports schema evolution and columnar data access
 */
public class HoodieAvroOrcReader implements HoodieFileReader {

  /** Create reader with explicit writer schema */
  public HoodieAvroOrcReader(HoodieStorage storage, StoragePath filePath,
                             Option<Schema> writerSchemaOpt);

  /** Get iterator for records with custom reader schema */
  public ClosableIterator<IndexedRecord> getRecordIterator(Schema readerSchema);

  /** Get iterator for records with file schema */
  public ClosableIterator<IndexedRecord> getRecordIterator();

  /** Close the reader and release resources */
  public void close();

  /** Get the schema of the file */
  public Schema getSchema();
}
```

### Avro ORC Writer

Avro-based ORC file writer with bloom filter integration and metadata support.

```java { .api }
/**
 * Avro-based ORC file writer
 * Supports bloom filters and custom metadata
 */
public class HoodieAvroOrcWriter implements HoodieFileWriter {

  /** Create writer with configuration and schema */
  public HoodieAvroOrcWriter(StoragePath filePath, HoodieConfig config, Schema schema,
                             Task task, boolean populateMetaFields,
                             Option<BloomFilter> bloomFilterOpt);

  /** Check if writer can accept more data */
  public boolean canWrite();

  /** Write Avro record with Hudi metadata */
  public void writeAvroWithMetadata(HoodieKey key, IndexedRecord avroRecord);

  /** Write Avro record with record key */
  public void writeAvro(String recordKey, IndexedRecord record);

  /** Close writer and return status */
  public WriteStatus close();

  /** Get current write status */
  public WriteStatus getWriteStatus();

  /** Get number of bytes written */
  public long getBytesWritten();
}
```

### HFile Utilities

Utilities for working with HBase HFile format in Hadoop environments.

```java { .api }
/**
 * Utilities for working with HFile format
 * Provides HBase integration capabilities
 */
public class HoodieHFileUtils {

  /** Create HFile reader with configuration */
  public static HFile.Reader createHFileReader(FileSystem fs, Path path,
                                               CacheConfig cacheConf, Configuration conf);

  /** Get optimized configuration for HFile reading */
  public static Configuration getHFileReaderConfiguration(Configuration conf);

  /** Check if path points to an HFile */
  public static boolean isHFile(StoragePath path);
}
```

### Parquet Write Support

Parquet write support for Avro records with bloom filter integration.

```java { .api }
/**
 * Parquet write support for Avro with bloom filter integration
 * Extends standard Parquet writing with Hudi-specific features
 */
public class HoodieAvroWriteSupport extends AvroWriteSupport<IndexedRecord> {

  /** Create write support with schema and bloom filter */
  public HoodieAvroWriteSupport(MessageType schema, Schema avroSchema,
                                Option<BloomFilter> bloomFilterOpt, Properties properties);

  /** Finalize write context with metadata */
  public WriteSupport.FinalizedWriteContext finalizeWrite();

  /** Add record key to bloom filter */
  public void add(String recordKey);

  /** Add custom footer metadata */
  public void addFooterMetadata(String key, String value);
}
```

### Parquet Reader Builder

Builder pattern for creating Hoodie-specific Avro Parquet readers.

```java { .api }
/**
 * Builder for Hoodie Avro Parquet readers
 * Provides configuration and customization options
 */
public class HoodieAvroParquetReaderBuilder<T> extends ParquetReaderBuilder<T> {

  /** Create builder with file path */
  public HoodieAvroParquetReaderBuilder(Path path);

  /** Create builder with input file */
  public HoodieAvroParquetReaderBuilder(InputFile file);

  /** Set Hadoop configuration */
  public HoodieAvroParquetReaderBuilder<T> withConf(Configuration conf);

  /** Build configured ParquetReader */
  public ParquetReader<T> build();
}
```

### Parquet Read Support

Parquet read support for Avro records with Hudi-specific optimizations.

```java { .api }
/**
 * Parquet read support for Avro records
 * Extends standard AvroReadSupport with Hudi optimizations
 */
public class HoodieAvroReadSupport extends AvroReadSupport {
  // Extends AvroReadSupport with Hoodie-specific functionality
  // for reading Parquet files with Avro schema
}
```

**Usage Examples:**

```java
import org.apache.hudi.io.hadoop.*;
import org.apache.hudi.storage.hadoop.HoodieHadoopStorage;
import org.apache.avro.Schema;
import org.apache.avro.generic.IndexedRecord;

// Set up I/O factory
HoodieHadoopStorage storage = new HoodieHadoopStorage(storagePath, storageConf);
HoodieHadoopIOFactory ioFactory = new HoodieHadoopIOFactory(storage);

// Reading Parquet files
StoragePath parquetFile = new StoragePath("/data/table1/file.parquet");

// Create Avro Parquet reader
HoodieAvroParquetReader reader = new HoodieAvroParquetReader(storage, parquetFile);
Schema schema = reader.getSchema();

// Read records
try (ClosableIterator<IndexedRecord> iterator = reader.getRecordIterator()) {
  while (iterator.hasNext()) {
    IndexedRecord record = iterator.next();
    // Process record
  }
}
reader.close();

// Writing Parquet files
Schema writeSchema = new Schema.Parser().parse(schemaString);
StoragePath outputFile = new StoragePath("/data/table1/output.parquet");

HoodieAvroParquetWriter writer = new HoodieAvroParquetWriter(
    outputFile,
    hoodieConfig,
    writeSchema,
    task,
    Option.empty(), // No bloom filter
    true // Populate meta fields
);

// Write records
for (IndexedRecord record : records) {
  writer.writeAvro("key", record);
}

WriteStatus status = writer.close();
System.out.println("Bytes written: " + status.getBytesWritten());

// Working with ORC files
HoodieAvroOrcReader orcReader = new HoodieAvroOrcReader(storage, orcFilePath, Option.empty());
try (ClosableIterator<IndexedRecord> iterator = orcReader.getRecordIterator()) {
  while (iterator.hasNext()) {
    IndexedRecord record = iterator.next();
    // Process ORC record
  }
}
orcReader.close();

// HFile operations
boolean isHFile = HoodieHFileUtils.isHFile(somePath);
if (isHFile) {
  Configuration hfileConf = HoodieHFileUtils.getHFileReaderConfiguration(hadoopConf);
  // Work with HFile using optimized configuration
}
```