CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/maven-io-cdap-cdap--cdap-client

CDAP Java Client library providing programmatic APIs for interacting with the CDAP platform

Pending
Overview
Eval results
Files

dataset-operations.mddocs/

Dataset Operations

The DatasetClient provides comprehensive dataset management including creation, configuration, property management, and data operations like truncation. Datasets are persistent storage abstractions in CDAP that provide type-safe access to data.

DatasetClient

public class DatasetClient {
    // Constructors
    public DatasetClient(ClientConfig config);
    public DatasetClient(ClientConfig config, RESTClient restClient); // supply an explicit RESTClient instead of the default
    
    // Dataset management methods
    public List<DatasetSpecificationSummary> list(NamespaceId namespace); // all datasets in the namespace
    public DatasetMeta get(DatasetId instance); // detailed metadata for one dataset
    public void create(DatasetId instance, DatasetInstanceConfiguration properties); // create with type, properties, and description
    public void create(DatasetId instance, String typeName); // create with a type name and default properties
    public void update(DatasetId instance, Map<String, String> properties); // update properties; may add new keys
    public void updateExisting(DatasetId instance, Map<String, String> properties); // update only keys that already exist
    public void delete(DatasetId instance);
    public boolean exists(DatasetId instance);
    public void truncate(DatasetId instance); // remove all data but keep the dataset definition
    public Map<String, String> getProperties(DatasetId instance); // current property map
}

Dataset Types and Metadata

// Lightweight per-dataset summary returned by DatasetClient.list(...).
public class DatasetSpecificationSummary {
    public String getName(); // dataset name within its namespace
    public String getType(); // dataset type name, e.g. "table"
    public String getDescription();
    public Map<String, String> getProperties(); // configuration properties set at creation/update
}

// Detailed metadata returned by DatasetClient.get(...).
public class DatasetMeta {
    public DatasetSpecification getSpec(); // full specification; spec.getName() used in examples below
    public String getType();
    public long getCreationTime(); // passed to new java.util.Date(...) in examples — presumably epoch millis; confirm against server docs
    public String getOwnerPrincipal(); // NOTE(review): may be null when no owner is set — confirm
    public Map<String, String> getProperties();
    public String getHiveTableName(); // associated Hive table name, if any — confirm availability per dataset type
}

// Fully-qualified dataset identifier: namespace + dataset name.
public class DatasetId {
    public static DatasetId of(NamespaceId namespace, String dataset); // static factory
    public NamespaceId getNamespace();
    public String getDataset(); // dataset name only, without the namespace
}

// Creation-time configuration: type name, properties, optional description.
public class DatasetInstanceConfiguration {
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties);
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties, String description);
    public String getTypeName();
    public Map<String, String> getProperties();
    public String getDescription();
}

Dataset Management

Listing Datasets

// List all datasets in namespace
List<DatasetSpecificationSummary> datasets = datasetClient.list(namespace);
System.out.println("Found " + datasets.size() + " datasets:");

for (DatasetSpecificationSummary dataset : datasets) {
    System.out.println("- " + dataset.getName() + " (type: " + dataset.getType() + ")");
    System.out.println("  Description: " + dataset.getDescription());
    System.out.println("  Properties: " + dataset.getProperties());
}

Dataset Information

// Get detailed dataset information
DatasetId datasetId = DatasetId.of(namespace, "user-profiles");
DatasetMeta meta = datasetClient.get(datasetId);

System.out.println("Dataset: " + datasetId.getDataset());
System.out.println("Type: " + meta.getType());
System.out.println("Owner: " + meta.getOwnerPrincipal());
System.out.println("Created: " + new Date(meta.getCreationTime()));
System.out.println("Properties: " + meta.getProperties());
System.out.println("Hive table: " + meta.getHiveTableName());

// Check if dataset exists
boolean exists = datasetClient.exists(datasetId);
System.out.println("Dataset exists: " + exists);

Dataset Creation

Basic Dataset Creation

// Create dataset with type name only
DatasetId simpleDataset = DatasetId.of(namespace, "simple-table");
datasetClient.create(simpleDataset, "table");

// Create dataset with configuration
Map<String, String> properties = Map.of(
    "schema", "user_id:STRING,name:STRING,email:STRING,created_at:LONG",
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|"
);

DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
    "table",
    properties,
    "User profile data table"
);

DatasetId configuredDataset = DatasetId.of(namespace, "user-profiles");
datasetClient.create(configuredDataset, config);

Advanced Dataset Creation

// Create partitioned dataset
Map<String, String> partitionedProperties = Map.of(
    "schema", "timestamp:LONG,event_type:STRING,user_id:STRING,data:STRING",
    "partitioning", "HASH(user_id, 10)",
    "partition.key", "event_date",
    "explore.table.name", "events"
);

DatasetInstanceConfiguration partitionedConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet",
    partitionedProperties,
    "Partitioned event data"
);

DatasetId eventsDataset = DatasetId.of(namespace, "events");
datasetClient.create(eventsDataset, partitionedConfig);

// Create time-partitioned dataset
Map<String, String> timePartitionedProperties = Map.of(
    "schema", "user_id:STRING,action:STRING,timestamp:LONG,metadata:STRING",
    "basePath", "/data/user-actions",
    "partitioning.time.format", "yyyy-MM-dd-HH",
    "explore.enabled", "true"
);

DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet",
    timePartitionedProperties,
    "Time-partitioned user actions"
);

DatasetId actionsDataset = DatasetId.of(namespace, "user-actions");
datasetClient.create(actionsDataset, timePartitionedConfig);

Dataset with Custom Properties

// Create dataset with comprehensive configuration
Map<String, String> advancedProperties = Map.of(
    // Schema definition
    "schema", "id:STRING,name:STRING,age:INT,email:STRING,created_at:LONG,updated_at:LONG",
    
    // Table configuration
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|",
    "table.name.template", "users_%s",
    
    // Storage configuration  
    "table.compress.type", "SNAPPY",
    "table.block.size", "65536",
    "table.bloom.filter", "ROW",
    
    // Indexing configuration
    "explore.enabled", "true",
    "explore.table.name", "users",
    "explore.format", "parquet",
    
    // TTL configuration
    "table.ttl.seconds", "7776000" // 90 days
);

DatasetInstanceConfiguration advancedConfig = new DatasetInstanceConfiguration(
    "table",
    advancedProperties,
    "User database with advanced configuration"
);

DatasetId advancedDataset = DatasetId.of(namespace, "users");
datasetClient.create(advancedDataset, advancedConfig);

Dataset Updates

Property Updates

// Update dataset properties
Map<String, String> updatedProperties = Map.of(
    "table.ttl.seconds", "15552000", // Extended to 180 days
    "table.compress.type", "LZ4",     // Changed compression
    "new.property", "new-value"       // Added new property
);

datasetClient.update(datasetId, updatedProperties);
System.out.println("Updated dataset properties");

// Update only existing properties (won't add new ones)
Map<String, String> existingUpdates = Map.of(
    "table.ttl.seconds", "31104000" // 360 days
);
datasetClient.updateExisting(datasetId, existingUpdates);

Property Management

// Get current properties
Map<String, String> currentProperties = datasetClient.getProperties(datasetId);
System.out.println("Current properties: " + currentProperties);

// Merge with new properties
Map<String, String> mergedProperties = new HashMap<>(currentProperties);
mergedProperties.putAll(Map.of(
    "updated.by", "admin",
    "updated.timestamp", String.valueOf(System.currentTimeMillis())
));

datasetClient.update(datasetId, mergedProperties);

Data Operations

Dataset Truncation

// Truncate dataset (remove all data but keep structure)
try {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated successfully");
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (UnsupportedOperationException e) {
    System.err.println("Truncation not supported for this dataset type");
}

// Truncate with confirmation
String confirmation = getUserConfirmation("Truncate dataset " + datasetId.getDataset() + "? (yes/no): ");
if ("yes".equalsIgnoreCase(confirmation)) {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated");
} else {
    System.out.println("Truncation cancelled");
}

Dataset Deletion

// Delete dataset
try {
    datasetClient.delete(datasetId);
    System.out.println("Dataset deleted: " + datasetId.getDataset());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (DatasetInUseException e) {
    System.err.println("Cannot delete dataset - it's being used: " + e.getMessage());
}

// Safe deletion with checks
if (datasetClient.exists(datasetId)) {
    try {
        // Optional: Check if dataset is empty before deletion
        DatasetMeta meta = datasetClient.get(datasetId);
        System.out.println("Deleting dataset: " + meta.getSpec().getName());
        
        datasetClient.delete(datasetId);
        System.out.println("Dataset deleted successfully");
        
        // Verify deletion
        if (!datasetClient.exists(datasetId)) {
            System.out.println("Deletion confirmed");
        }
    } catch (Exception e) {
        System.err.println("Error deleting dataset: " + e.getMessage());
    }
} else {
    System.out.println("Dataset does not exist: " + datasetId.getDataset());
}

Dataset Types and Common Configurations

Table Dataset

// Basic table dataset
Map<String, String> tableProperties = Map.of(
    "schema", "key:STRING,value:STRING,timestamp:LONG"
);
DatasetInstanceConfiguration tableConfig = new DatasetInstanceConfiguration(
    "table", tableProperties, "Key-value table"
);

// Table with row key template
Map<String, String> complexTableProperties = Map.of(
    "schema", "user_id:STRING,session_id:STRING,event_type:STRING,data:STRING",
    "table.rowkey.template", "%s:%s", // user_id:session_id
    "table.rowkey.separator", ":"
);

FileSet Dataset

// Basic file set
Map<String, String> fileSetProperties = Map.of(
    "basePath", "/data/files",
    "explore.enabled", "true"
);
DatasetInstanceConfiguration fileSetConfig = new DatasetInstanceConfiguration(
    "fileSet", fileSetProperties, "File storage"
);

// Partitioned file set
Map<String, String> partitionedFileSetProperties = Map.of(
    "basePath", "/data/partitioned",
    "partitioning", "field:year INT, field:month INT, field:day INT",
    "explore.enabled", "true",
    "explore.format", "parquet"
);
DatasetInstanceConfiguration partitionedFileSetConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet", partitionedFileSetProperties, "Partitioned data files"
);

Time-Partitioned FileSet

// Time-partitioned file set with hourly partitions
Map<String, String> timePartitionedProperties = Map.of(
    "basePath", "/data/time-series",
    "partitioning.time.format", "yyyy-MM-dd/HH",
    "explore.enabled", "true",
    "explore.format", "avro",
    "schema", "timestamp:LONG,sensor_id:STRING,value:DOUBLE,quality:STRING"
);
DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet", timePartitionedProperties, "Time-series sensor data"
);

Advanced Operations

Bulk Dataset Operations

// Create multiple datasets
List<DatasetCreationRequest> datasets = List.of(
    new DatasetCreationRequest("logs", "table", Map.of("schema", "timestamp:LONG,level:STRING,message:STRING")),
    new DatasetCreationRequest("metrics", "table", Map.of("schema", "time:LONG,name:STRING,value:DOUBLE")),
    new DatasetCreationRequest("events", "partitionedFileSet", Map.of("basePath", "/data/events"))
);

for (DatasetCreationRequest request : datasets) {
    try {
        DatasetId id = DatasetId.of(namespace, request.name);
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            request.type, request.properties, "Auto-created dataset"
        );
        datasetClient.create(id, config);
        System.out.println("Created dataset: " + request.name);
    } catch (Exception e) {
        System.err.println("Failed to create dataset " + request.name + ": " + e.getMessage());
    }
}

// Helper class for bulk operations
// Immutable value holder describing one dataset to create in a bulk operation.
// Fields are final: instances are shared read-only across the creation loop.
private static class DatasetCreationRequest {
    final String name;                    // dataset name within the namespace
    final String type;                    // dataset type, e.g. "table" or "partitionedFileSet"
    final Map<String, String> properties; // type-specific configuration properties
    
    DatasetCreationRequest(String name, String type, Map<String, String> properties) {
        this.name = name;
        this.type = type;
        this.properties = properties;
    }
}

Dataset Validation and Health Checks

/**
 * Validates a dataset's configuration: existence, non-blank schema (when a
 * schema property is present), and type-specific required properties.
 * Prints a diagnostic for each failure and returns false; returns true only
 * when every check passes.
 */
public boolean validateDataset(DatasetId datasetId) {
    try {
        String name = datasetId.getDataset();

        if (!datasetClient.exists(datasetId)) {
            System.err.println("Dataset does not exist: " + name);
            return false;
        }

        DatasetMeta meta = datasetClient.get(datasetId);
        Map<String, String> props = meta.getProperties();

        // A schema key that is present but null/blank is invalid.
        if (props.containsKey("schema")) {
            String schemaValue = props.get("schema");
            boolean blank = schemaValue == null || schemaValue.trim().isEmpty();
            if (blank) {
                System.err.println("Invalid schema for dataset: " + name);
                return false;
            }
        }

        // Type-specific required properties: partitioned file sets need a basePath.
        boolean partitioned = "partitionedFileSet".equals(meta.getType());
        if (partitioned && !props.containsKey("basePath")) {
            System.err.println("Missing basePath for partitioned dataset: " + name);
            return false;
        }

        System.out.println("Dataset validation passed: " + name);
        return true;
    } catch (Exception e) {
        System.err.println("Error validating dataset: " + e.getMessage());
        return false;
    }
}

Dataset Migration

/**
 * Creates a new dataset at {@code targetId} using the source dataset's type
 * and properties, with {@code newProperties} overriding any matching keys.
 * Data is not copied; only configuration is migrated. Errors are reported to
 * stderr rather than rethrown.
 */
public void migrateDataset(DatasetId sourceId, DatasetId targetId, Map<String, String> newProperties) {
    try {
        DatasetMeta meta = datasetClient.get(sourceId);

        // Start from the source's properties; overrides win on key collisions.
        Map<String, String> merged = new HashMap<>(meta.getProperties());
        merged.putAll(newProperties);

        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            meta.getType(),
            merged,
            "Migrated from " + sourceId.getDataset()
        );
        datasetClient.create(targetId, config);
        System.out.println("Migrated dataset from " + sourceId.getDataset() + " to " + targetId.getDataset());

        // Optionally truncate or delete source
        // datasetClient.truncate(sourceId);
    } catch (Exception e) {
        System.err.println("Error migrating dataset: " + e.getMessage());
    }
}

Error Handling

Dataset operations may throw these exceptions:

  • DatasetNotFoundException: Dataset does not exist
  • DatasetAlreadyExistsException: Dataset already exists during creation
  • DatasetTypeNotFoundException: Specified dataset type is not available
  • DatasetInUseException: Cannot delete or modify dataset that's being used
  • UnsupportedOperationException: Operation not supported for dataset type
  • BadRequestException: Invalid dataset configuration or parameters
try {
    DatasetMeta meta = datasetClient.get(datasetId);
    System.out.println("Dataset type: " + meta.getType());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId.getDataset());
} catch (UnauthorizedException e) {
    System.err.println("No permission to access dataset: " + e.getMessage());
} catch (IOException e) {
    System.err.println("Network error: " + e.getMessage());
}

Best Practices

  1. Schema Management: Define clear, evolvable schemas for your datasets
  2. Naming Conventions: Use consistent naming conventions for datasets
  3. Property Management: Document dataset properties and their purposes
  4. Lifecycle Management: Implement proper dataset lifecycle management
  5. Performance: Configure appropriate compression and storage settings
  6. Monitoring: Regularly check dataset health and usage patterns
// Good: Comprehensive dataset creation with proper configuration
/**
 * Creates a "table" dataset with sensible default properties, merged with
 * optional custom overrides. Idempotent: if the dataset already exists its id
 * is returned without modification.
 *
 * @param name             dataset name within the configured namespace
 * @param schema           schema string stored under the "schema" property
 * @param customProperties optional overrides applied on top of the defaults (may be null)
 * @return the id of the existing or newly created dataset
 * @throws RuntimeException if creation fails
 */
public DatasetId createDatasetWithBestPractices(String name, String schema, Map<String, String> customProperties) {
    DatasetId datasetId = DatasetId.of(namespace, name);
    
    // Check if dataset already exists
    if (datasetClient.exists(datasetId)) {
        System.out.println("Dataset already exists: " + name);
        return datasetId;
    }
    
    // Build properties with defaults and custom overrides.
    // "user.name" can be unset in restricted environments and Map.of rejects
    // null values, so fall back to "unknown" via the two-arg overload.
    Map<String, String> properties = new HashMap<>(Map.of(
        "schema", schema,
        "table.compress.type", "SNAPPY",
        "explore.enabled", "true",
        "created.by", System.getProperty("user.name", "unknown"),
        "created.timestamp", String.valueOf(System.currentTimeMillis())
    ));
    
    // Custom properties override the defaults on key collisions
    if (customProperties != null) {
        properties.putAll(customProperties);
    }
    
    try {
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            "table",
            properties,
            "Dataset created with best practices: " + name
        );
        
        datasetClient.create(datasetId, config);
        System.out.println("Successfully created dataset: " + name);
        
        // Validate creation
        if (datasetClient.exists(datasetId)) {
            System.out.println("Dataset creation confirmed");
        }
        
        return datasetId;
        
    } catch (Exception e) {
        System.err.println("Error creating dataset " + name + ": " + e.getMessage());
        throw new RuntimeException("Failed to create dataset", e);
    }
}

Install with Tessl CLI

npx tessl i tessl/maven-io-cdap-cdap--cdap-client

docs

application-management.md

artifact-management.md

configuration.md

data-operations.md

dataset-operations.md

index.md

metrics-monitoring.md

program-control.md

schedule-management.md

security-administration.md

service-management.md

tile.json