Apache Avro core components for data serialization with rich data structures, compact binary format, and schema evolution support
—
Avro file operations provide comprehensive file-based data storage and retrieval with embedded schemas, metadata, and compression support. Avro data files are self-describing, splittable, and compressible, making them ideal for big data processing and long-term storage.
Read Avro data files with random access and streaming capabilities.
// Random-access reader for Avro container files. Extends the streaming
// reader (DataFileStream) with positioning support over a seekable input.
public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
public DataFileReader(File file, DatumReader<D> reader) throws IOException;
public DataFileReader(SeekableInput sin, DatumReader<D> reader) throws IOException;
// Random access operations
// seek() expects a known block-boundary position (one returned by tell(),
// previousSync(), or DataFileWriter.sync()); sync() scans forward from an
// arbitrary byte offset to the next sync marker.
public void seek(long position) throws IOException;
public void sync(long position) throws IOException;
public long tell() throws IOException;
public long previousSync() throws IOException;
// Metadata access
public String getMetaString(String key);
public byte[] getMeta(String key);
public long getBlockCount();
public long getBlockSize();
}
// Contract for file-based Avro readers: record iteration plus
// sync-marker navigation for splittable processing.
public interface FileReader<D> extends Iterator<D>, Iterable<D>, Closeable {
// Schema of the records stored in the file (read from the file header).
public Schema getSchema();
// Position the reader at the first sync marker at or after `position`.
public void sync(long position) throws IOException;
// True once the reader has passed the sync point at or after `position`.
public boolean pastSync(long position) throws IOException;
// Current byte position in the underlying input.
public long tell() throws IOException;
}

Usage Examples:
// Read from file with generic records
File avroFile = new File("users.avro");
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
DataFileReader<GenericRecord> fileReader = new DataFileReader<>(avroFile, datumReader);
// Read all records
while (fileReader.hasNext()) {
GenericRecord user = fileReader.next();
System.out.println("Name: " + user.get("name"));
System.out.println("Age: " + user.get("age"));
}
fileReader.close();
// Random access reading
DataFileReader<GenericRecord> randomReader = new DataFileReader<>(avroFile, datumReader);
Schema schema = randomReader.getSchema();
// Jump to the next record after an arbitrary byte offset.
// Note: seek() requires a block-start position previously returned by
// tell(); for arbitrary offsets use sync(), which scans forward to the
// next sync marker.
long position = 1024;
randomReader.sync(position);
if (randomReader.hasNext()) {
GenericRecord record = randomReader.next();
processRecord(record);
}
// Access file metadata
String codec = randomReader.getMetaString("avro.codec");
byte[] schemaBytes = randomReader.getMeta("avro.schema");
System.out.println("Compression codec: " + codec);
randomReader.close();

Write Avro data files with compression and metadata support.
public class DataFileWriter<D> implements Closeable, Flushable {
public DataFileWriter(DatumWriter<D> dout);
// File creation
public DataFileWriter<D> create(Schema schema, OutputStream outs) throws IOException;
public DataFileWriter<D> create(Schema schema, File file) throws IOException;
public DataFileWriter<D> create(Schema schema, OutputStream outs, Codec codec) throws IOException;
// Metadata operations
public DataFileWriter<D> setMeta(String key, byte[] value);
public DataFileWriter<D> setMeta(String key, String value);
public DataFileWriter<D> setSyncInterval(int syncInterval);
// Writing operations
public void append(D datum) throws IOException;
public void appendTo(DataFileWriter<D> writer) throws IOException;
// Synchronization and flushing
public long sync() throws IOException;
public void flush() throws IOException;
public void flusher() throws IOException;
// Closing
public void close() throws IOException;
}

Usage Examples:
// Write generic records to file
Schema schema = new Schema.Parser().parse(schemaJson);
File outputFile = new File("output.avro");
DatumWriter<GenericRecord> datumWriter = new GenericDatumWriter<>(schema);
DataFileWriter<GenericRecord> fileWriter = new DataFileWriter<>(datumWriter);
// Create file with compression
fileWriter.setCodec(CodecFactory.deflateCodec(6));
fileWriter.setMeta("created.by", "MyApplication");
fileWriter.create(schema, outputFile);
// Write multiple records
List<GenericRecord> users = createUsers();
for (GenericRecord user : users) {
fileWriter.append(user);
}
// Force sync periodically for large files
fileWriter.sync();
fileWriter.close();
// Write to output stream
OutputStream outputStream = new BufferedOutputStream(new FileOutputStream("stream.avro"));
DataFileWriter<GenericRecord> streamWriter = new DataFileWriter<>(datumWriter);
streamWriter.create(schema, outputStream);
for (GenericRecord user : users) {
streamWriter.append(user);
}
streamWriter.close();

Read Avro data from input streams without random access.
// Sequential reader for Avro container data from any InputStream.
// Files are self-describing: the writer's schema and metadata are read
// from the header before the first record is returned.
public class DataFileStream<D> implements Iterator<D>, Iterable<D>, Closeable {
public DataFileStream(InputStream in, DatumReader<D> reader) throws IOException;
// Schema and metadata access
public Schema getSchema();
public String getMetaString(String key);
public byte[] getMeta(String key);
public List<String> getMetaKeys();
// Iterator operations
public boolean hasNext();
public D next();
// Decodes into `reuse` when possible, reducing per-record allocation.
public D next(D reuse);
// Stream operations
public void close() throws IOException;
}

Usage Examples:
// Read from input stream
InputStream inputStream = new FileInputStream("data.avro");
DatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
DataFileStream<GenericRecord> stream = new DataFileStream<>(inputStream, datumReader);
// Get schema from stream
Schema schema = stream.getSchema();
System.out.println("Schema: " + schema.toString(true));
// Read all records
GenericRecord reusedRecord = null;
while (stream.hasNext()) {
reusedRecord = stream.next(reusedRecord); // Reuse object for performance
processRecord(reusedRecord);
}
stream.close();
// Read from compressed stream
InputStream gzipStream = new GZIPInputStream(new FileInputStream("compressed.avro.gz"));
DataFileStream<GenericRecord> compressedStream = new DataFileStream<>(gzipStream, datumReader);
// Process compressed data
for (GenericRecord record : compressedStream) {
System.out.println("Record: " + record);
}
compressedStream.close();

Interface for random access input operations.
// Minimal random-access input abstraction consumed by DataFileReader.
public interface SeekableInput extends Closeable {
// Move the read position to absolute byte offset p.
void seek(long p) throws IOException;
// Current absolute read position in bytes.
long tell() throws IOException;
// Total length of the input in bytes.
long length() throws IOException;
// Read up to len bytes into b starting at off; returns the number of
// bytes read, or -1 at end of input.
int read(byte[] b, int off, int len) throws IOException;
}

Usage Examples:
// Implement custom seekable input
// SeekableInput backed by a RandomAccessFile; every method delegates 1:1.
public class CustomSeekableInput implements SeekableInput {
private final RandomAccessFile file;
// Opens the file in read-only mode ("r").
public CustomSeekableInput(File file) throws IOException {
this.file = new RandomAccessFile(file, "r");
}
@Override
public void seek(long p) throws IOException {
file.seek(p);
}
@Override
public long tell() throws IOException {
// getFilePointer() is RandomAccessFile's current-offset accessor.
return file.getFilePointer();
}
@Override
public long length() throws IOException {
return file.length();
}
@Override
public int read(byte[] b, int off, int len) throws IOException {
// Returns bytes read, or -1 at end of file, matching the interface contract.
return file.read(b, off, len);
}
@Override
public void close() throws IOException {
file.close();
}
}
// Use custom seekable input
SeekableInput seekableInput = new CustomSeekableInput(new File("data.avro"));
DatumReader<GenericRecord> reader = new GenericDatumReader<>();
DataFileReader<GenericRecord> fileReader = new DataFileReader<>(seekableInput, reader);
// Use random access capabilities
fileReader.seek(1000);
GenericRecord record = fileReader.next();
fileReader.close();

Support for various compression algorithms to reduce file size.
// Strategy base class for block compression inside Avro container files.
public abstract class Codec {
// Codec name as recorded in the file's "avro.codec" metadata entry.
public abstract String getName();
public abstract ByteBuffer compress(ByteBuffer uncompressedData) throws IOException;
public abstract ByteBuffer decompress(ByteBuffer compressedData) throws IOException;
// Codecs must define equality so implementations can detect codec changes.
public abstract int hashCode();
public abstract boolean equals(Object obj);
}
// Factory for compression codec configurations. Note: these static
// factories return CodecFactory (not Codec) — a CodecFactory is the
// value accepted by DataFileWriter.setCodec().
public class CodecFactory {
public static CodecFactory nullCodec();
// Deflate level 1-9, or -1 for the default level.
public static CodecFactory deflateCodec(int level);
public static CodecFactory snappyCodec();
public static CodecFactory bzip2Codec();
public static CodecFactory xzCodec(int level);
public static CodecFactory zstandardCodec(int level);
public static CodecFactory zstandardCodec(int level, boolean useChecksum);
// Look up by name, e.g. "null", "deflate", "snappy", "bzip2", "zstandard".
public static CodecFactory fromString(String codecName);
}

Usage Examples:
// Use different compression codecs
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(datumWriter);
// Deflate compression (good balance of speed and compression)
writer.setCodec(CodecFactory.deflateCodec(6));
// Snappy compression (very fast, moderate compression)
writer.setCodec(CodecFactory.snappyCodec());
// BZip2 compression (slower, better compression)
writer.setCodec(CodecFactory.bzip2Codec());
// XZ compression (slowest, best compression)
writer.setCodec(CodecFactory.xzCodec(6));
// Zstandard compression (good speed and compression)
writer.setCodec(CodecFactory.zstandardCodec(3, true));
// No compression
writer.setCodec(CodecFactory.nullCodec());
writer.create(schema, outputFile);
// Check codec used in file
DataFileReader<GenericRecord> reader = new DataFileReader<>(file, datumReader);
String codecName = reader.getMetaString("avro.codec");
System.out.println("File uses codec: " + codecName);
reader.close();

Access and manipulate file-level metadata stored in Avro data files.
// Writer metadata operations
public DataFileWriter<D> setMeta(String key, byte[] value);
public DataFileWriter<D> setMeta(String key, String value);
public DataFileWriter<D> setMeta(String key, long value);
// Reader metadata operations
public String getMetaString(String key);
public byte[] getMeta(String key);
public long getMetaLong(String key);
public List<String> getMetaKeys();

Usage Examples:
// Set metadata when writing
DataFileWriter<GenericRecord> writer = new DataFileWriter<>(datumWriter);
writer.setMeta("created.by", "MyApplication v1.2.3");
writer.setMeta("created.time", System.currentTimeMillis());
writer.setMeta("source.system", "production");
writer.setMeta("record.count.estimate", 1000000L);
byte[] customData = "custom metadata".getBytes();
writer.setMeta("custom.data", customData);
writer.create(schema, outputFile);
// Read metadata from file
DataFileReader<GenericRecord> reader = new DataFileReader<>(file, datumReader);
// Get all metadata keys
List<String> metaKeys = reader.getMetaKeys();
System.out.println("Metadata keys: " + metaKeys);
// Read specific metadata
String createdBy = reader.getMetaString("created.by");
long createdTime = reader.getMetaLong("created.time");
byte[] customData = reader.getMeta("custom.data");
System.out.println("Created by: " + createdBy);
System.out.println("Created time: " + new Date(createdTime));
System.out.println("Custom data: " + new String(customData));
// Standard Avro metadata
String codecName = reader.getMetaString("avro.codec");
String schemaJson = reader.getMetaString("avro.schema");
System.out.println("Codec: " + codecName);
reader.close();

public class DataFileReader<D> extends DataFileStream<D> implements FileReader<D> {
// File-based reader with random access
}
public class DataFileWriter<D> implements Closeable, Flushable, Syncable {
// File writer with compression and metadata support
}
public class DataFileStream<D> implements Iterator<D>, Iterable<D>, Closeable {
// Stream-based reader for sequential access
}
public interface FileReader<D> extends Iterator<D>, Iterable<D>, Closeable {
Schema getSchema();
void sync(long position) throws IOException;
boolean pastSync(long position) throws IOException;
long tell() throws IOException;
}
public interface SeekableInput extends Closeable {
void seek(long p) throws IOException;
long tell() throws IOException;
long length() throws IOException;
int read(byte[] b, int off, int len) throws IOException;
}
public interface Syncable {
void sync() throws IOException;
}
public abstract class Codec {
public abstract String getName();
public abstract ByteBuffer compress(ByteBuffer data) throws IOException;
public abstract ByteBuffer decompress(ByteBuffer data) throws IOException;
}
public class CodecFactory {
// Factory for creating compression codecs
}
// Specific codec implementations
public class DeflateCodec extends Codec;
public class SnappyCodec extends Codec;
public class BZip2Codec extends Codec;
public class XZCodec extends Codec;
public class ZstandardCodec extends Codec;
public class NullCodec extends Codec;

Install with Tessl CLI
npx tessl i tessl/maven-org-apache-avro--avro