CDAP Java Client library providing programmatic APIs for interacting with the CDAP platform
---
The DatasetClient provides comprehensive dataset management including creation, configuration, property management, and data operations like truncation. Datasets are persistent storage abstractions in CDAP that provide type-safe access to data.
/**
 * Client for CDAP dataset administration: list, inspect, create, update,
 * truncate, and delete dataset instances within a namespace.
 */
public class DatasetClient {
  // Constructors
  public DatasetClient(ClientConfig config);
  public DatasetClient(ClientConfig config, RESTClient restClient);

  // Dataset management methods
  public List<DatasetSpecificationSummary> list(NamespaceId namespace);
  public DatasetMeta get(DatasetId instance);
  public void create(DatasetId instance, DatasetInstanceConfiguration properties);
  public void create(DatasetId instance, String typeName);
  public void update(DatasetId instance, Map<String, String> properties);
  public void updateExisting(DatasetId instance, Map<String, String> properties);
  public void delete(DatasetId instance);
  public boolean exists(DatasetId instance);
  public void truncate(DatasetId instance);
  public Map<String, String> getProperties(DatasetId instance);
}

/** Summary view of a dataset instance, as returned by DatasetClient#list. */
public class DatasetSpecificationSummary {
  public String getName();
  public String getType();
  public String getDescription();
  public Map<String, String> getProperties();
}
/** Detailed metadata for a single dataset instance, as returned by DatasetClient#get. */
public class DatasetMeta {
public DatasetSpecification getSpec(); // full specification, including the dataset name
public String getType(); // dataset type name, e.g. "table" or "partitionedFileSet"
public long getCreationTime(); // creation time; appears to be epoch millis (used with new Date(...) in the examples below) — confirm
public String getOwnerPrincipal(); // owner principal; NOTE(review): may be null for unowned datasets — confirm
public Map<String, String> getProperties();
public String getHiveTableName(); // backing Explore/Hive table name — TODO confirm semantics against the CDAP API
}
/** Identifier of a dataset instance: a namespace plus a dataset name. */
public class DatasetId {
public static DatasetId of(NamespaceId namespace, String dataset); // static factory
public NamespaceId getNamespace();
public String getDataset();
}
/**
 * Immutable configuration used when creating a dataset instance: a type name,
 * a property map, and an optional human-readable description.
 */
public class DatasetInstanceConfiguration {
  public DatasetInstanceConfiguration(String typeName, Map<String, String> properties);
  public DatasetInstanceConfiguration(String typeName, Map<String, String> properties, String description);
  public String getTypeName();
  public Map<String, String> getProperties();
  public String getDescription();
}

// List all datasets in namespace
// List every dataset in the namespace and print a short summary of each.
// Renamed from "datasets": the same local name is declared again further down
// in this script (bulk-creation example), which would not compile in one scope.
List<DatasetSpecificationSummary> summaries = datasetClient.list(namespace);
System.out.println("Found " + summaries.size() + " datasets:");
for (DatasetSpecificationSummary dataset : summaries) {
    System.out.println("- " + dataset.getName() + " (type: " + dataset.getType() + ")");
    System.out.println(" Description: " + dataset.getDescription());
    System.out.println(" Properties: " + dataset.getProperties());
}

// Get detailed dataset information
// Fetch full metadata for one dataset and print it.
DatasetId datasetId = DatasetId.of(namespace, "user-profiles");
DatasetMeta meta = datasetClient.get(datasetId);
System.out.println("Dataset: " + datasetId.getDataset());
System.out.println("Type: " + meta.getType());
System.out.println("Owner: " + meta.getOwnerPrincipal());
// Prefer java.time over the legacy java.util.Date API.
// getCreationTime() appears to be epoch millis — confirm against the CDAP API.
System.out.println("Created: " + java.time.Instant.ofEpochMilli(meta.getCreationTime()));
System.out.println("Properties: " + meta.getProperties());
System.out.println("Hive table: " + meta.getHiveTableName());

// Check if dataset exists
boolean exists = datasetClient.exists(datasetId);
System.out.println("Dataset exists: " + exists);

// Create dataset with type name only
// Simplest form: create a dataset by type name only, with default properties.
DatasetId simpleDataset = DatasetId.of(namespace, "simple-table");
datasetClient.create(simpleDataset, "table");

// Full form: create with an explicit configuration (type + properties + description).
Map<String, String> properties = Map.of(
    "schema", "user_id:STRING,name:STRING,email:STRING,created_at:LONG",
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|"
);
DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
    "table",
    properties,
    "User profile data table"
);
DatasetId configuredDataset = DatasetId.of(namespace, "user-profiles");
datasetClient.create(configuredDataset, config);

// Create partitioned dataset
// Partitioned file set for event data; partition spec and Explore table name
// are supplied as dataset properties.
Map<String, String> eventProps = Map.of(
    "schema", "timestamp:LONG,event_type:STRING,user_id:STRING,data:STRING",
    "partitioning", "HASH(user_id, 10)",
    "partition.key", "event_date",
    "explore.table.name", "events"
);
DatasetInstanceConfiguration eventConfig =
    new DatasetInstanceConfiguration("partitionedFileSet", eventProps, "Partitioned event data");
DatasetId eventsDataset = DatasetId.of(namespace, "events");
datasetClient.create(eventsDataset, eventConfig);
// Create a time-partitioned dataset (hourly partition format).
// Renamed locals: "timePartitionedProperties"/"timePartitionedConfig" are
// declared a second time later in this script, which would not compile
// within a single scope.
Map<String, String> hourlyActionProperties = Map.of(
    "schema", "user_id:STRING,action:STRING,timestamp:LONG,metadata:STRING",
    "basePath", "/data/user-actions",
    "partitioning.time.format", "yyyy-MM-dd-HH",
    "explore.enabled", "true"
);
DatasetInstanceConfiguration hourlyActionConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet",
    hourlyActionProperties,
    "Time-partitioned user actions"
);
DatasetId actionsDataset = DatasetId.of(namespace, "user-actions");
datasetClient.create(actionsDataset, hourlyActionConfig);

// Create dataset with comprehensive configuration
// Comprehensive configuration. NOTE: Map.of(...) only has overloads for up to
// 10 key/value pairs — with 11 entries the original code did not compile, so
// Map.ofEntries is required here.
Map<String, String> advancedProperties = Map.ofEntries(
    // Schema definition
    Map.entry("schema", "id:STRING,name:STRING,age:INT,email:STRING,created_at:LONG,updated_at:LONG"),
    // Table configuration
    Map.entry("table.rowkey.template", "%s"),
    Map.entry("table.rowkey.separator", "|"),
    Map.entry("table.name.template", "users_%s"),
    // Storage configuration
    Map.entry("table.compress.type", "SNAPPY"),
    Map.entry("table.block.size", "65536"),
    Map.entry("table.bloom.filter", "ROW"),
    // Indexing / Explore configuration
    Map.entry("explore.enabled", "true"),
    Map.entry("explore.table.name", "users"),
    Map.entry("explore.format", "parquet"),
    // TTL configuration: 90 days
    Map.entry("table.ttl.seconds", "7776000")
);
DatasetInstanceConfiguration advancedConfig = new DatasetInstanceConfiguration(
    "table",
    advancedProperties,
    "User database with advanced configuration"
);
DatasetId advancedDataset = DatasetId.of(namespace, "users");
datasetClient.create(advancedDataset, advancedConfig);

// Update dataset properties
// update() may both change existing keys and add new ones.
Map<String, String> updatedProperties = Map.of(
    "table.ttl.seconds", "15552000", // Extended to 180 days
    "table.compress.type", "LZ4", // Changed compression
    "new.property", "new-value" // Added new property
);
datasetClient.update(datasetId, updatedProperties);
System.out.println("Updated dataset properties");

// updateExisting() only touches keys already present — it will not add new ones.
Map<String, String> existingUpdates = Map.of(
    "table.ttl.seconds", "31104000" // 360 days
);
datasetClient.updateExisting(datasetId, existingUpdates);

// Get current properties
// Read the dataset's current properties from the server.
Map<String, String> currentProperties = datasetClient.getProperties(datasetId);
System.out.println("Current properties: " + currentProperties);

// Merge the server-side properties with new audit entries, then push the
// complete map back (update() receives the full, merged property set).
Map<String, String> mergedProperties = new HashMap<>(currentProperties);
mergedProperties.putAll(Map.of(
    "updated.by", "admin",
    "updated.timestamp", String.valueOf(System.currentTimeMillis())
));
datasetClient.update(datasetId, mergedProperties);

// Truncate dataset (remove all data but keep structure)
// Truncation removes all data but keeps the dataset definition itself.
try {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated successfully");
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (UnsupportedOperationException e) {
    System.err.println("Truncation not supported for this dataset type");
}

// Truncate behind an interactive confirmation prompt.
String confirmation = getUserConfirmation("Truncate dataset " + datasetId.getDataset() + "? (yes/no): ");
if ("yes".equalsIgnoreCase(confirmation)) {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated");
} else {
    System.out.println("Truncation cancelled");
}

// Delete dataset
// Deletion removes both the data and the dataset definition.
try {
    datasetClient.delete(datasetId);
    System.out.println("Dataset deleted: " + datasetId.getDataset());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (DatasetInUseException e) {
    System.err.println("Cannot delete dataset - it's being used: " + e.getMessage());
}

// Safe deletion: check existence before, verify absence after.
if (datasetClient.exists(datasetId)) {
    try {
        // Renamed from "meta": an earlier snippet in this script already declares
        // a local named "meta", and Java does not allow shadowing it here.
        DatasetMeta doomedMeta = datasetClient.get(datasetId);
        System.out.println("Deleting dataset: " + doomedMeta.getSpec().getName());
        datasetClient.delete(datasetId);
        System.out.println("Dataset deleted successfully");
        // Verify deletion
        if (!datasetClient.exists(datasetId)) {
            System.out.println("Deletion confirmed");
        }
    } catch (Exception e) {
        System.err.println("Error deleting dataset: " + e.getMessage());
    }
} else {
    System.out.println("Dataset does not exist: " + datasetId.getDataset());
}

// Basic table dataset
// Minimal table configuration: schema only.
Map<String, String> tableProperties = Map.of(
    "schema", "key:STRING,value:STRING,timestamp:LONG"
);
DatasetInstanceConfiguration tableConfig = new DatasetInstanceConfiguration(
    "table", tableProperties, "Key-value table"
);

// Table whose row key is composed from two fields via a template.
Map<String, String> complexTableProperties = Map.of(
    "schema", "user_id:STRING,session_id:STRING,event_type:STRING,data:STRING",
    "table.rowkey.template", "%s:%s", // user_id:session_id
    "table.rowkey.separator", ":"
);

// Basic file set
// Plain file set rooted at a base path, queryable via Explore.
Map<String, String> fileSetProperties = Map.of(
    "basePath", "/data/files",
    "explore.enabled", "true"
);
DatasetInstanceConfiguration fileSetConfig = new DatasetInstanceConfiguration(
    "fileSet", fileSetProperties, "File storage"
);

// Partitioned file set with explicit partition fields, stored as parquet.
Map<String, String> partitionedFileSetProperties = Map.of(
    "basePath", "/data/partitioned",
    "partitioning", "field:year INT, field:month INT, field:day INT",
    "explore.enabled", "true",
    "explore.format", "parquet"
);
DatasetInstanceConfiguration partitionedFileSetConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet", partitionedFileSetProperties, "Partitioned data files"
);

// Time-partitioned file set with hourly partitions
// Renamed locals: "timePartitionedProperties"/"timePartitionedConfig" are
// already declared by an earlier snippet in this script; re-declaring them
// here would not compile in a single scope.
Map<String, String> sensorSeriesProperties = Map.of(
    "basePath", "/data/time-series",
    "partitioning.time.format", "yyyy-MM-dd/HH", // hourly partitions
    "explore.enabled", "true",
    "explore.format", "avro",
    "schema", "timestamp:LONG,sensor_id:STRING,value:DOUBLE,quality:STRING"
);
DatasetInstanceConfiguration sensorSeriesConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet", sensorSeriesProperties, "Time-series sensor data"
);

// Create multiple datasets
// Bulk-create several datasets; a failure is reported per dataset and does not
// abort the remaining creations.
// Renamed from "datasets": that local name is already used by the listing example.
List<DatasetCreationRequest> creationRequests = List.of(
    new DatasetCreationRequest("logs", "table", Map.of("schema", "timestamp:LONG,level:STRING,message:STRING")),
    new DatasetCreationRequest("metrics", "table", Map.of("schema", "time:LONG,name:STRING,value:DOUBLE")),
    new DatasetCreationRequest("events", "partitionedFileSet", Map.of("basePath", "/data/events"))
);
for (DatasetCreationRequest request : creationRequests) {
    try {
        DatasetId id = DatasetId.of(namespace, request.name);
        // Renamed from "config" to avoid shadowing the earlier "config" local.
        DatasetInstanceConfiguration requestConfig = new DatasetInstanceConfiguration(
            request.type, request.properties, "Auto-created dataset"
        );
        datasetClient.create(id, requestConfig);
        System.out.println("Created dataset: " + request.name);
    } catch (Exception e) {
        System.err.println("Failed to create dataset " + request.name + ": " + e.getMessage());
    }
}

// Simple immutable value holder for bulk creation requests.
private static class DatasetCreationRequest {
    final String name;
    final String type;
    final Map<String, String> properties;

    DatasetCreationRequest(String name, String type, Map<String, String> properties) {
        this.name = name;
        this.type = type;
        this.properties = properties;
    }
}

// Validate dataset configuration
/**
 * Validates that a dataset exists and that its configuration looks plausible:
 * a non-blank schema when one is declared, and type-specific required properties.
 *
 * @param datasetId the dataset to validate
 * @return true if the dataset exists and passes all checks; false otherwise
 */
public boolean validateDataset(DatasetId datasetId) {
    try {
        if (!datasetClient.exists(datasetId)) {
            System.err.println("Dataset does not exist: " + datasetId.getDataset());
            return false;
        }
        DatasetMeta meta = datasetClient.get(datasetId);
        Map<String, String> properties = meta.getProperties();
        // Only validate the schema when one is declared.
        if (properties.containsKey("schema")) {
            String schema = properties.get("schema");
            if (schema == null || schema.trim().isEmpty()) {
                System.err.println("Invalid schema for dataset: " + datasetId.getDataset());
                return false;
            }
        }
        // Type-specific required properties.
        String type = meta.getType();
        if ("partitionedFileSet".equals(type)) {
            if (!properties.containsKey("basePath")) {
                System.err.println("Missing basePath for partitioned dataset: " + datasetId.getDataset());
                return false;
            }
        }
        System.out.println("Dataset validation passed: " + datasetId.getDataset());
        return true;
    } catch (Exception e) {
        System.err.println("Error validating dataset: " + e.getMessage());
        return false;
    }
}

// Migrate dataset configuration
/**
 * Copies a dataset's configuration to a new dataset, applying property
 * overrides. Only the type and properties are migrated — the data is NOT copied.
 *
 * @param sourceId      dataset to read the configuration from
 * @param targetId      dataset to create
 * @param newProperties overrides merged on top of the source properties
 */
public void migrateDataset(DatasetId sourceId, DatasetId targetId, Map<String, String> newProperties) {
    try {
        // Start from the source configuration...
        DatasetMeta sourceMeta = datasetClient.get(sourceId);
        Map<String, String> sourceProperties = new HashMap<>(sourceMeta.getProperties());
        // ...and let the caller's overrides win on key collisions.
        sourceProperties.putAll(newProperties);
        DatasetInstanceConfiguration targetConfig = new DatasetInstanceConfiguration(
            sourceMeta.getType(),
            sourceProperties,
            "Migrated from " + sourceId.getDataset()
        );
        datasetClient.create(targetId, targetConfig);
        System.out.println("Migrated dataset from " + sourceId.getDataset() + " to " + targetId.getDataset());
        // Optionally truncate or delete the source afterwards:
        // datasetClient.truncate(sourceId);
    } catch (Exception e) {
        System.err.println("Error migrating dataset: " + e.getMessage());
    }
}

Dataset operations may throw these exceptions:
try {
    // Renamed from "meta": an earlier snippet in this script already declares
    // that local name, and Java does not allow shadowing it here.
    DatasetMeta datasetMeta = datasetClient.get(datasetId);
    System.out.println("Dataset type: " + datasetMeta.getType());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId.getDataset());
} catch (UnauthorizedException e) {
    System.err.println("No permission to access dataset: " + e.getMessage());
} catch (IOException e) {
    System.err.println("Network error: " + e.getMessage());
}

// Good: Comprehensive dataset creation with proper configuration
/**
 * Creates a table dataset with sensible defaults, idempotently: if the dataset
 * already exists it is returned unchanged.
 *
 * @param name             dataset name within the namespace
 * @param schema           schema string, e.g. "id:STRING,age:INT"
 * @param customProperties optional overrides merged over the defaults (may be null)
 * @return the id of the existing or newly created dataset
 * @throws RuntimeException if creation fails (original cause preserved)
 */
public DatasetId createDatasetWithBestPractices(String name, String schema, Map<String, String> customProperties) {
    DatasetId datasetId = DatasetId.of(namespace, name);
    // Idempotence: don't attempt to re-create an existing dataset.
    if (datasetClient.exists(datasetId)) {
        System.out.println("Dataset already exists: " + name);
        return datasetId;
    }
    // Defaults first, then custom overrides. NOTE: Map.of rejects null values
    // and System.getProperty("user.name") can return null, so a fallback
    // default is required to avoid a NullPointerException.
    Map<String, String> properties = new HashMap<>(Map.of(
        "schema", schema,
        "table.compress.type", "SNAPPY",
        "explore.enabled", "true",
        "created.by", System.getProperty("user.name", "unknown"),
        "created.timestamp", String.valueOf(System.currentTimeMillis())
    ));
    // Custom properties win over the defaults.
    if (customProperties != null) {
        properties.putAll(customProperties);
    }
    try {
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            "table",
            properties,
            "Dataset created with best practices: " + name
        );
        datasetClient.create(datasetId, config);
        System.out.println("Successfully created dataset: " + name);
        // Post-creation sanity check.
        if (datasetClient.exists(datasetId)) {
            System.out.println("Dataset creation confirmed");
        }
        return datasetId;
    } catch (Exception e) {
        System.err.println("Error creating dataset " + name + ": " + e.getMessage());
        throw new RuntimeException("Failed to create dataset", e);
    }
}

Install with Tessl CLI
npx tessl i tessl/maven-io-cdap-cdap--cdap-client