Core data management capabilities for CDAP including dataset operations, metadata management, lineage tracking, audit functionality, and data registry services for Hadoop-based applications.
npx @tessl/cli install tessl/maven-co-cask-cdap--cdap-data-fabric@5.1.0

CDAP Data Fabric is a core component of the Cask Data Application Platform that provides essential data management and infrastructure services for Hadoop-based applications. It handles dataset metrics reporting and monitoring, comprehensive metadata management with indexing capabilities for efficient search and discovery, data lineage tracking to understand data flow and transformations, audit trail functionality for compliance and debugging, and a centralized data registry for managing dataset definitions and configurations.
<dependency>
<groupId>co.cask.cdap</groupId>
<artifactId>cdap-data-fabric</artifactId>
<version>5.1.2</version>
</dependency>

import co.cask.cdap.data2.dataset2.DatasetFramework;
import co.cask.cdap.data2.metadata.dataset.MetadataDataset;
import co.cask.cdap.data2.registry.UsageRegistry;
import co.cask.cdap.store.NamespaceStore;
import co.cask.cdap.data2.audit.AuditPublisher;

// Dataset management example
// Obtain the framework from the DI container (placeholder: null keeps the example compilable).
DatasetFramework datasetFramework = null; // obtain from DI container
DatasetId datasetId = NamespaceId.DEFAULT.dataset("myDataset");
DatasetProperties datasetProperties = DatasetProperties.builder().build();
// Create a dataset instance; the final argument is the Kerberos owner principal (none here).
datasetFramework.addInstance("keyValueTable", datasetId, datasetProperties, null);
// Access the dataset with read/write access; the null arguments are the runtime
// arguments, class loader, class loader provider, and owners.
KeyValueTable dataset = datasetFramework.getDataset(
    datasetId, null, null, null, null, AccessType.READ_WRITE);

// Metadata management example
MetadataDataset metadataDataset = null; // obtain instance
MetadataEntity entity = null; // define entity
// Named metadataProperties (not "properties") to avoid redeclaring the
// DatasetProperties local above in the same scope.
Map<String, String> metadataProperties = Map.of("environment", "production", "owner", "team-alpha");
// Set metadata properties one key/value pair at a time, matching the declared
// signature setProperty(MetadataEntity, String key, String value).
metadataProperties.forEach((key, value) -> metadataDataset.setProperty(entity, key, value));
// Add tags
Set<String> tags = Set.of("production", "critical", "team-alpha");
metadataDataset.addTags(entity, tags);
// Search metadata
SearchRequest searchRequest = SearchRequest.of("production").build();
SearchResults results = metadataDataset.search(searchRequest);

The CDAP Data Fabric follows a layered architecture that abstracts complex data operations:
This architecture enables developers to build scalable data applications without dealing directly with underlying Hadoop complexities while maintaining full transactional guarantees and comprehensive metadata management.
Comprehensive dataset lifecycle management including creation, configuration, access, and administration across multiple storage backends with transaction support and lineage tracking.
/**
 * Manages the dataset instance lifecycle: creation, typed access, deletion,
 * and enumeration within a namespace.
 */
public interface DatasetFramework {
/**
 * Creates a new dataset instance of the given type.
 *
 * @param datasetTypeName name of the dataset type to instantiate
 * @param datasetInstanceId id of the instance to create
 * @param props configuration properties for the new instance
 * @param ownerPrincipal Kerberos principal to own the instance; the usage
 *        example in this file passes {@code null}, so presumably optional — confirm
 * @throws DatasetManagementException on dataset-level failure
 * @throws IOException on underlying storage failure
 */
void addInstance(String datasetTypeName, DatasetId datasetInstanceId,
DatasetProperties props, KerberosPrincipalId ownerPrincipal)
throws DatasetManagementException, IOException;
/**
 * Obtains a typed handle to an existing dataset instance.
 *
 * @param datasetInstanceId id of the instance to access
 * @param arguments runtime arguments; nullable per the usage example in this file
 * @param classLoader class loader for loading the dataset class; nullable
 * @param classLoaderProvider provider of dataset class loaders; nullable
 * @param owners entities to associate with this access; nullable
 * @param accessType requested access mode (READ, WRITE, ADMIN, READ_WRITE)
 * @return the dataset instance, typed as requested by the caller
 */
<T extends Dataset> T getDataset(DatasetId datasetInstanceId, Map<String, String> arguments,
ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider,
Iterable<? extends EntityId> owners, AccessType accessType)
throws DatasetManagementException, IOException;
/** Deletes the given dataset instance. */
void deleteInstance(DatasetId datasetInstanceId) throws DatasetManagementException, IOException;
/** Returns summaries of all dataset instances in the given namespace. */
Collection<DatasetSpecificationSummary> getInstances(NamespaceId namespaceId)
throws DatasetManagementException;
}Complete metadata management system for properties, tags, search, and indexing with support for custom indexing strategies and historical snapshots.
/**
 * Dataset that stores metadata (properties and tags) for entities, with search
 * and historical snapshot retrieval.
 * NOTE(review): setProperty here takes a single key/value pair; a Map-taking
 * overload is not declared in this listing — confirm which overloads exist.
 */
public class MetadataDataset extends AbstractDataset {
/** Sets one property key/value on the entity; returns the resulting change. */
public MetadataChange setProperty(MetadataEntity metadataEntity, String key, String value);
/** Adds the given tags to the entity; returns the resulting change. */
public MetadataChange addTags(MetadataEntity metadataEntity, Set<String> tagsToAdd);
/** Returns the metadata currently recorded for the entity. */
public Metadata getMetadata(MetadataEntity metadataEntity);
/**
 * Executes a metadata search.
 *
 * @throws BadRequestException if the request is invalid
 */
public SearchResults search(SearchRequest request) throws BadRequestException;
/** Returns, for each given entity, its metadata snapshot before the given time (millis). */
public Set<Metadata> getSnapshotBeforeTime(Set<MetadataEntity> metadataEntitys, long timeMillis);
}Program-dataset relationship tracking for governance, lineage analysis, and impact assessment with comprehensive query capabilities.
/**
 * Tracks which programs use which datasets and streams, for lineage and
 * impact analysis queries.
 */
public interface UsageRegistry extends UsageWriter {
/** Removes all usage registrations recorded for the given application. */
void unregister(ApplicationId applicationId);
/** Returns the datasets registered as used by the given application. */
Set<DatasetId> getDatasets(ApplicationId id);
/** Returns the programs registered as users of the given dataset. */
Set<ProgramId> getPrograms(DatasetId id);
/** Returns the streams registered as used by the given program. */
Set<StreamId> getStreams(ProgramId id);
}Namespace lifecycle management for multi-tenancy support with metadata persistence and comprehensive administrative operations.
/**
 * CRUD store for namespace metadata.
 */
public interface NamespaceStore {
/** Persists the given namespace metadata; returns the stored value. */
NamespaceMeta create(NamespaceMeta metadata);
/** Updates existing namespace metadata. */
void update(NamespaceMeta metadata);
/** Returns the metadata for the given namespace — presumably null if absent; confirm. */
NamespaceMeta get(NamespaceId id);
/** Deletes the namespace, returning the metadata that was removed. */
NamespaceMeta delete(NamespaceId id);
/** Returns metadata for all namespaces. */
List<NamespaceMeta> list();
}Comprehensive audit logging system for compliance, monitoring, and debugging with pluggable publishers and structured payload builders.
/**
 * Publishes audit events; overloads accept either an {@code EntityId} or a
 * {@code MetadataEntity} as the subject.
 */
public interface AuditPublisher {
/** Publishes an audit event of the given type and payload for the entity id. */
void publish(EntityId entityId, AuditType auditType, AuditPayload auditPayload);
/** Publishes an audit event of the given type and payload for the metadata entity. */
void publish(MetadataEntity metadataEntity, AuditType auditType, AuditPayload auditPayload);
}Distributed transaction support with retry logic, consumer state management, and integration with Apache Tephra for ACID guarantees.
/**
 * Extends Apache Tephra's executor factory; no additional members are declared
 * in this listing.
 */
public interface TransactionExecutorFactory extends org.apache.tephra.TransactionExecutorFactory {
// Transaction executor creation with custom configuration
}
/** Client interface to the transaction system; members are elided in this listing. */
public interface TransactionSystemClient {
// Transaction system client operations with distributed coordination
}Real-time stream processing capabilities with coordination, file management, partitioning, and multiple decoder support for various data formats.
/** Stream administration interface; members are elided in this listing. */
public interface StreamAdmin {
// Stream administration and lifecycle operations
}
/**
 * Transactional stream consumer; extends Closeable, so instances must be
 * closed when no longer needed. Members are elided in this listing.
 */
public interface StreamConsumer extends Closeable, TransactionAware {
// Stream consumption with transaction support and state management
}// Core entity identifiers
/** Identifies a dataset within a namespace. */
public final class DatasetId extends EntityId {
/** Creates a DatasetId from a namespace name and a dataset name. */
public static DatasetId of(String namespace, String dataset);
}
/** Identifies a namespace. */
public final class NamespaceId extends EntityId {
/** The built-in "default" namespace. */
public static final NamespaceId DEFAULT = new NamespaceId("default");
/** Creates a NamespaceId from a namespace name. */
public static NamespaceId of(String namespace);
}
/** Identifies a program; members are elided in this listing. */
public final class ProgramId extends EntityId {
// Program identification with application and program type context
}
/** Identifies an application; members are elided in this listing. */
public final class ApplicationId extends EntityId {
// Application identification within namespace context
}
// Metadata entities
/** Represents an entity that can carry metadata; members are elided in this listing. */
public interface MetadataEntity {
// Metadata entity representation for flexible entity types
}
// Dataset properties and specifications
/** Configuration properties for a dataset instance, constructed via {@link #builder()}. */
public final class DatasetProperties {
/** Returns a builder for constructing DatasetProperties. */
public static Builder builder();
/** Returns the configured properties as a string map. */
public Map<String, String> getProperties();
}
/** Describes a dataset instance: its name, type, and configuration properties. */
public interface DatasetSpecification {
/** Returns the dataset instance name. */
String getName();
/** Returns the dataset type name. */
String getType();
/** Returns the dataset's configuration properties. */
DatasetProperties getProperties();
}
// Access and security
/** Access modes that can be requested when obtaining a dataset. */
public enum AccessType {
READ, WRITE, ADMIN, READ_WRITE
}
/** Identifies a Kerberos principal used for dataset ownership. */
public final class KerberosPrincipalId {
/** Creates a KerberosPrincipalId from a principal string. */
public static KerberosPrincipalId of(String principal);
}
// Metadata types
/** Scope of a metadata record: user-supplied or system-generated. */
public enum MetadataScope {
USER, SYSTEM
}
/** A record of an entity's metadata (properties and tags) within a single scope. */
public final class MetadataRecordV2 {
/** Returns the entity this record describes. */
public MetadataEntity getMetadataEntity();
/** Returns the metadata property key/value pairs. */
public Map<String, String> getProperties();
/** Returns the metadata tags. */
public Set<String> getTags();
/** Returns whether this record is USER or SYSTEM scoped. */
public MetadataScope getScope();
}
/** Configuration of a stream view: data format, schema, and settings. */
public final class ViewSpecification {
// Stream view configuration specification
/** Returns the name of the view's data format. */
public String getFormat();
/** Returns the view's schema. */
public Schema getSchema();
/** Returns additional format/view settings. */
public Map<String, String> getSettings();
}
/** Complete information about a stream view: id, specification, and properties. */
public final class ViewDetail {
// Complete view information including metadata
/** Returns the id of the view. */
public StreamViewId getId();
/** Returns the view's configuration specification. */
public ViewSpecification getSpec();
/** Returns additional view properties. */
public Map<String, String> getProperties();
}
/** Identifies a view of a stream within a namespace. */
public final class StreamViewId extends EntityId {
/** Creates a StreamViewId from namespace, stream, and view names. */
public static StreamViewId of(String namespace, String stream, String view);
/** Returns the id of the stream this view belongs to. */
public StreamId getParent();
/** Returns the view name. */
public String getView();
}
/** Configurable retry policy for operations. */
public final class RetryStrategy {
// Configurable retry policies for operations
/** Returns a strategy that never retries. */
public static RetryStrategy noRetry();
/**
 * Returns an exponential-backoff strategy bounded by maxDelay and maxAttempts.
 * NOTE(review): delay units are not stated in this listing — presumably
 * milliseconds; confirm against the implementation.
 */
public static RetryStrategy exponentialDelay(long initialDelay, long maxDelay, int maxAttempts);
}
/** Factory for creating transaction contexts; members are elided in this listing. */
public final class TransactionContextFactory {
// Factory for creating transaction contexts
}
// Exceptions
/** Checked exception thrown when a dataset operation fails. */
public class DatasetManagementException extends Exception {
// Dataset operation failures with detailed error context
}
/** Checked exception thrown when a request carries invalid parameters. */
public class BadRequestException extends Exception {
// Invalid request parameter handling
}
/** Checked exception thrown when a requested resource does not exist. */
public class NotFoundException extends Exception {
// Resource not found handling
}