# File Operations
1
2
Avro file operations provide comprehensive file-based data storage and retrieval with embedded schemas, metadata, and compression support. Avro data files are self-describing, splittable, and compressible, making them ideal for big data processing and long-term storage.
3
4
## Capabilities
5
6
### File Reading
7
8
Read Avro data files with random access and streaming capabilities.
9
10
```java { .api }
11
public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
12
public DataFileReader(File file, DatumReader<D> reader) throws IOException;
13
public DataFileReader(SeekableInput sin, DatumReader<D> reader) throws IOException;
14
15
// Random access operations
16
public void seek(long position) throws IOException;
17
public void sync(long position) throws IOException;
18
public long tell() throws IOException;
19
public long previousSync() throws IOException;
20
21
// Metadata access
22
public String getMetaString(String key);
23
public byte[] getMeta(String key);
24
public long getBlockCount();
25
public long getBlockSize();
26
}
27
28
public interface FileReader<D> extends Iterator<D>, Iterable<D>, Closeable {
29
public Schema getSchema();
30
public void sync(long position) throws IOException;
31
public boolean pastSync(long position) throws IOException;
32
public long tell() throws IOException;
33
}
34
```
35
36
**Usage Examples:**
37
38
```java
39
// Read from file with generic records
40
File avroFile = new File("users.avro");
41
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
42
DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader);
43
44
// Read all records
45
while (fileReader.hasNext()) {
46
GenericRecord user = fileReader.next();
47
System.out.println("Name: " + user.get("name"));
48
System.out.println("Age: " + user.get("age"));
49
}
50
fileReader.close();
51
52
// Random access reading
53
DataFileReader<GenericRecord> randomReader = new DataFileReader<>(avroFile, datumReader);
54
Schema schema = randomReader.getSchema();
55
56
// Position at the first record after a byte offset. sync() scans
// forward to the next sync marker; plain seek() is only valid with a
// position previously returned by tell() or previousSync().
long position = 1024;
randomReader.sync(position);
if (randomReader.hasNext()) {
    GenericRecord record = randomReader.next();
    processRecord(record);
}
63
64
// Access file metadata
65
String codec = randomReader.getMetaString("avro.codec");
66
byte[] schemaBytes = randomReader.getMeta("avro.schema");
67
System.out.println("Compression codec: " + codec);
68
randomReader.close();
69
```
70
71
### File Writing
72
73
Write Avro data files with compression and metadata support.
74
75
```java { .api }
public class DataFileWriter<D> implements Closeable, Flushable {
    public DataFileWriter(DatumWriter<D> dout);

    // File creation (configure codec and metadata BEFORE calling create)
    public DataFileWriter<D> create(Schema schema, OutputStream outs) throws IOException;
    public DataFileWriter<D> create(Schema schema, File file) throws IOException;
    public DataFileWriter<D> create(Schema schema, OutputStream outs, byte[] sync) throws IOException;

    // Compression and metadata operations
    public DataFileWriter<D> setCodec(CodecFactory c);
    public DataFileWriter<D> setMeta(String key, byte[] value);
    public DataFileWriter<D> setMeta(String key, String value);
    public DataFileWriter<D> setMeta(String key, long value);
    public DataFileWriter<D> setSyncInterval(int syncInterval);

    // Writing operations
    public void append(D datum) throws IOException;
    public DataFileWriter<D> appendTo(File file) throws IOException;

    // Synchronization and flushing
    public long sync() throws IOException;
    public void flush() throws IOException;
    public void fSync() throws IOException;

    // Closing
    public void close() throws IOException;
}
```
102
103
**Usage Examples:**
104
105
```java
106
// Write generic records to file
107
Schema schema = new Schema.Parser().parse(schemaJson);
108
File outputFile = new File("output.avro");
109
110
DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
111
DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter);
112
113
// Create file with compression
114
fileWriter.setCodec(CodecFactory.deflateCodec(6));
115
fileWriter.setMeta("created.by", "MyApplication");
116
fileWriter.create(schema, outputFile);
117
118
// Write multiple records
119
List<GenericRecord> users = createUsers();
120
for (GenericRecord user : users) {
121
fileWriter.append(user);
122
}
123
124
// Force sync periodically for large files
125
fileWriter.sync();
126
127
fileWriter.close();
128
129
// Write to output stream
130
OutputStream outputStream = new BufferedOutputStream(new FileOutputStream("stream.avro"));
131
DataFileWriter<GenericRecord> streamWriter = new DataFileWriter<>(datumWriter);
132
streamWriter.create(schema, outputStream);
133
134
for (GenericRecord user : users) {
135
streamWriter.append(user);
136
}
137
streamWriter.close();
138
```
139
140
### Stream Reading
141
142
Read Avro data from input streams without random access.
143
144
```java { .api }
145
public class DataFileStream<D> implements Iterator<D>, Iterable<D>, Closeable {
146
public DataFileStream(InputStream in, DatumReader<D> reader) throws IOException;
147
148
// Schema and metadata access
149
public Schema getSchema();
150
public String getMetaString(String key);
151
public byte[] getMeta(String key);
152
public List<String> getMetaKeys();
153
154
// Iterator operations
155
public boolean hasNext();
156
public D next();
157
public D next(D reuse);
158
159
// Stream operations
160
public void close() throws IOException;
161
}
162
```
163
164
**Usage Examples:**
165
166
```java
167
// Read from input stream
168
InputStream inputStream = new FileInputStream("data.avro");
169
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
170
DataFileStream<GenericRecord> stream = new DataFileStream<>(inputStream, datumReader);
171
172
// Get schema from stream
173
Schema schema = stream.getSchema();
174
System.out.println("Schema: " + schema.toString(true));
175
176
// Read all records
177
GenericRecord reusedRecord = null;
178
while (stream.hasNext()) {
179
reusedRecord = stream.next(reusedRecord); // Reuse object for performance
180
processRecord(reusedRecord);
181
}
182
stream.close();
183
184
// Read an Avro file that was additionally gzip-compressed externally.
// Note: this is distinct from Avro's internal per-block codec, and such
// externally wrapped files lose the splittability of plain Avro files.
185
InputStream gzipStream = new GZIPInputStream(new FileInputStream("compressed.avro.gz"));
186
DataFileStream<GenericRecord> compressedStream = new DataFileStream<>(gzipStream, datumReader);
187
188
// Process compressed data
189
for (GenericRecord record : compressedStream) {
190
System.out.println("Record: " + record);
191
}
192
compressedStream.close();
193
```
194
195
### Seekable Input Interface
196
197
Interface for random access input operations.
198
199
```java { .api }
200
public interface SeekableInput extends Closeable {
201
void seek(long p) throws IOException;
202
long tell() throws IOException;
203
long length() throws IOException;
204
int read(byte[] b, int off, int len) throws IOException;
205
}
206
```
207
208
**Usage Examples:**
209
210
```java
211
// Implement custom seekable input
212
public class CustomSeekableInput implements SeekableInput {
213
private final RandomAccessFile file;
214
215
public CustomSeekableInput(File file) throws IOException {
216
this.file = new RandomAccessFile(file, "r");
217
}
218
219
@Override
220
public void seek(long p) throws IOException {
221
file.seek(p);
222
}
223
224
@Override
225
public long tell() throws IOException {
226
return file.getFilePointer();
227
}
228
229
@Override
230
public long length() throws IOException {
231
return file.length();
232
}
233
234
@Override
235
public int read(byte[] b, int off, int len) throws IOException {
236
return file.read(b, off, len);
237
}
238
239
@Override
240
public void close() throws IOException {
241
file.close();
242
}
243
}
244
245
// Use custom seekable input
246
SeekableInput seekableInput = new CustomSeekableInput(new File("data.avro"));
247
DatumReader<GenericRecord> reader = new GenericDatumReader<>();
248
DataFileReader<GenericRecord> fileReader = new DataFileReader<>(seekableInput, reader);
249
250
// Use random access capabilities: sync() positions the reader at the
// next sync marker at or after the given byte offset (seek() would
// require an exact position from tell() or previousSync())
fileReader.sync(1000);
if (fileReader.hasNext()) {
    GenericRecord record = fileReader.next();
}
fileReader.close();
254
```
255
256
### Compression Codecs
257
258
Support for various compression algorithms to reduce file size.
259
260
```java { .api }
261
public abstract class Codec {
262
public abstract String getName();
263
public abstract ByteBuffer compress(ByteBuffer uncompressedData) throws IOException;
264
public abstract ByteBuffer decompress(ByteBuffer compressedData) throws IOException;
265
public abstract int hashCode();
266
public abstract boolean equals(Object obj);
267
}
268
269
public class CodecFactory {
    // Note: factory methods return CodecFactory (the handle accepted by
    // DataFileWriter.setCodec), not Codec instances directly.
    public static CodecFactory nullCodec();
    public static CodecFactory deflateCodec(int compressionLevel);
    public static CodecFactory snappyCodec();
    public static CodecFactory bzip2Codec();
    public static CodecFactory xzCodec(int compressionLevel);
    public static CodecFactory zstandardCodec(int level);
    public static CodecFactory zstandardCodec(int level, boolean useChecksum);

    public static CodecFactory fromString(String s);
}
280
```
281
282
**Usage Examples:**
283
284
```java
285
// Use different compression codecs
286
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(datumWriter);
287
288
// Deflate compression (good balance of speed and compression)
289
writer.setCodec(CodecFactory.deflateCodec(6));
290
291
// Snappy compression (very fast, moderate compression)
292
writer.setCodec(CodecFactory.snappyCodec());
293
294
// BZip2 compression (slower, better compression)
295
writer.setCodec(CodecFactory.bzip2Codec());
296
297
// XZ compression (slowest, best compression)
298
writer.setCodec(CodecFactory.xzCodec(6));
299
300
// Zstandard compression (good speed and compression)
301
writer.setCodec(CodecFactory.zstandardCodec(3, true));
302
303
// No compression
304
writer.setCodec(CodecFactory.nullCodec());
305
306
writer.create(schema, outputFile);
307
308
// Check codec used in file
309
DataFileReader<GenericRecord> reader = new DataFileReader<>(file, datumReader);
310
String codecName = reader.getMetaString("avro.codec");
311
System.out.println("File uses codec: " + codecName);
312
reader.close();
313
```
314
315
### File Metadata
316
317
Access and manipulate file-level metadata stored in Avro data files.
318
319
```java { .api }
320
// Writer metadata operations
321
public DataFileWriter<D> setMeta(String key, byte[] value);
322
public DataFileWriter<D> setMeta(String key, String value);
323
public DataFileWriter<D> setMeta(String key, long value);
324
325
// Reader metadata operations
326
public String getMetaString(String key);
327
public byte[] getMeta(String key);
328
public long getMetaLong(String key);
329
public List<String> getMetaKeys();
330
```
331
332
**Usage Examples:**
333
334
```java
335
// Set metadata when writing
336
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(datumWriter);
337
writer.setMeta("created.by", "MyApplication v1.2.3");
338
writer.setMeta("created.time", System.currentTimeMillis());
339
writer.setMeta("source.system", "production");
340
writer.setMeta("record.count.estimate", 1000000L);
341
342
byte[] customData = "custom metadata".getBytes();
343
writer.setMeta("custom.data", customData);
344
345
writer.create(schema, outputFile);
346
347
// Read metadata from file
348
DataFileReader<GenericRecord> reader = new DataFileReader<>(file, datumReader);
349
350
// Get all metadata keys
351
List<String> metaKeys = reader.getMetaKeys();
352
System.out.println("Metadata keys: " + metaKeys);
353
354
// Read specific metadata
355
String createdBy = reader.getMetaString("created.by");
356
long createdTime = reader.getMetaLong("created.time");
357
byte[] customData = reader.getMeta("custom.data");
358
359
System.out.println("Created by: " + createdBy);
360
System.out.println("Created time: " + new Date(createdTime));
361
System.out.println("Custom data: " + new String(customData));
362
363
// Standard Avro metadata
364
String codecName = reader.getMetaString("avro.codec");
365
String schemaJson = reader.getMetaString("avro.schema");
366
System.out.println("Codec: " + codecName);
367
368
reader.close();
369
```
370
371
## Types
372
373
```java { .api }
374
public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
375
// File-based reader with random access
376
}
377
378
public class DataFileWriter<D> implements Closeable, Flushable {
    // File writer with compression and metadata support
}
381
382
public class DataFileStream<D> implements Iterator<D>, Iterable<D>, Closeable {
383
// Stream-based reader for sequential access
384
}
385
386
public interface FileReader<D> extends Iterator<D>, Iterable<D>, Closeable {
387
Schema getSchema();
388
void sync(long position) throws IOException;
389
boolean pastSync(long position) throws IOException;
390
long tell() throws IOException;
391
}
392
393
public interface SeekableInput extends Closeable {
394
void seek(long p) throws IOException;
395
long tell() throws IOException;
396
long length() throws IOException;
397
int read(byte[] b, int off, int len) throws IOException;
398
}
399
400
public interface Syncable {
401
void sync() throws IOException;
402
}
403
404
public abstract class Codec {
405
public abstract String getName();
406
public abstract ByteBuffer compress(ByteBuffer data) throws IOException;
407
public abstract ByteBuffer decompress(ByteBuffer data) throws IOException;
408
}
409
410
public class CodecFactory {
411
// Factory for creating compression codecs
412
}
413
414
// Specific codec implementations
public class DeflateCodec extends Codec { /* deflate (zlib) compression */ }
public class SnappyCodec extends Codec { /* snappy compression */ }
public class BZip2Codec extends Codec { /* bzip2 compression */ }
public class XZCodec extends Codec { /* xz (LZMA2) compression */ }
public class ZstandardCodec extends Codec { /* zstandard compression */ }
public class NullCodec extends Codec { /* no compression */ }
421
```