# Dataset Operations

The DatasetClient provides comprehensive dataset management including creation, configuration, property management, and data operations like truncation. Datasets are persistent storage abstractions in CDAP that provide type-safe access to data.

## DatasetClient

```java { .api }
public class DatasetClient {
    // Constructors
    public DatasetClient(ClientConfig config);
    public DatasetClient(ClientConfig config, RESTClient restClient);

    // Dataset management methods
    public List<DatasetSpecificationSummary> list(NamespaceId namespace);
    public DatasetMeta get(DatasetId instance);
    public void create(DatasetId instance, DatasetInstanceConfiguration properties);
    public void create(DatasetId instance, String typeName);
    public void update(DatasetId instance, Map<String, String> properties);
    public void updateExisting(DatasetId instance, Map<String, String> properties);
    public void delete(DatasetId instance);
    public boolean exists(DatasetId instance);
    public void truncate(DatasetId instance);
    public Map<String, String> getProperties(DatasetId instance);
}
```

## Dataset Types and Metadata

```java { .api }
public class DatasetSpecificationSummary {
    public String getName();
    public String getType();
    public String getDescription();
    public Map<String, String> getProperties();
}

public class DatasetMeta {
    public DatasetSpecification getSpec();
    public String getType();
    public long getCreationTime();
    public String getOwnerPrincipal();
    public Map<String, String> getProperties();
    public String getHiveTableName();
}

public class DatasetId {
    public static DatasetId of(NamespaceId namespace, String dataset);
    public NamespaceId getNamespace();
    public String getDataset();
}

public class DatasetInstanceConfiguration {
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties);
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties, String description);
    public String getTypeName();
    public Map<String, String> getProperties();
    public String getDescription();
}
```

## Dataset Management

### Listing Datasets

```java
// List all datasets in namespace
List<DatasetSpecificationSummary> datasets = datasetClient.list(namespace);
System.out.println("Found " + datasets.size() + " datasets:");

for (DatasetSpecificationSummary dataset : datasets) {
    System.out.println("- " + dataset.getName() + " (type: " + dataset.getType() + ")");
    System.out.println("  Description: " + dataset.getDescription());
    System.out.println("  Properties: " + dataset.getProperties());
}
```

### Dataset Information

```java
// Get detailed dataset information
DatasetId datasetId = DatasetId.of(namespace, "user-profiles");
DatasetMeta meta = datasetClient.get(datasetId);

System.out.println("Dataset: " + datasetId.getDataset());
System.out.println("Type: " + meta.getType());
System.out.println("Owner: " + meta.getOwnerPrincipal());
System.out.println("Created: " + new Date(meta.getCreationTime()));
System.out.println("Properties: " + meta.getProperties());
System.out.println("Hive table: " + meta.getHiveTableName());

// Check if dataset exists
boolean exists = datasetClient.exists(datasetId);
System.out.println("Dataset exists: " + exists);
```

## Dataset Creation

### Basic Dataset Creation

```java
// Create dataset with type name only
DatasetId simpleDataset = DatasetId.of(namespace, "simple-table");
datasetClient.create(simpleDataset, "table");

// Create dataset with configuration
Map<String, String> properties = Map.of(
    "schema", "user_id:STRING,name:STRING,email:STRING,created_at:LONG",
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|"
);

DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
    "table",
    properties,
    "User profile data table"
);

DatasetId configuredDataset = DatasetId.of(namespace, "user-profiles");
datasetClient.create(configuredDataset, config);
```

### Advanced Dataset Creation

```java
// Create partitioned dataset
Map<String, String> partitionedProperties = Map.of(
    "schema", "timestamp:LONG,event_type:STRING,user_id:STRING,data:STRING",
    "partitioning", "HASH(user_id, 10)",
    "partition.key", "event_date",
    "explore.table.name", "events"
);

DatasetInstanceConfiguration partitionedConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet",
    partitionedProperties,
    "Partitioned event data"
);

DatasetId eventsDataset = DatasetId.of(namespace, "events");
datasetClient.create(eventsDataset, partitionedConfig);

// Create time-partitioned dataset
Map<String, String> timePartitionedProperties = Map.of(
    "schema", "user_id:STRING,action:STRING,timestamp:LONG,metadata:STRING",
    "basePath", "/data/user-actions",
    "partitioning.time.format", "yyyy-MM-dd-HH",
    "explore.enabled", "true"
);

DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet",
    timePartitionedProperties,
    "Time-partitioned user actions"
);

DatasetId actionsDataset = DatasetId.of(namespace, "user-actions");
datasetClient.create(actionsDataset, timePartitionedConfig);
```

### Dataset with Custom Properties

```java
// Create dataset with comprehensive configuration
Map<String, String> advancedProperties = Map.of(
    // Schema definition
    "schema", "id:STRING,name:STRING,age:INT,email:STRING,created_at:LONG,updated_at:LONG",

    // Table configuration
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|",
    "table.name.template", "users_%s",

    // Storage configuration
    "table.compress.type", "SNAPPY",
    "table.block.size", "65536",
    "table.bloom.filter", "ROW",

    // Indexing configuration
    "explore.enabled", "true",
    "explore.table.name", "users",
    "explore.format", "parquet",

    // TTL configuration
    "table.ttl.seconds", "7776000" // 90 days
);

DatasetInstanceConfiguration advancedConfig = new DatasetInstanceConfiguration(
    "table",
    advancedProperties,
    "User database with advanced configuration"
);

DatasetId advancedDataset = DatasetId.of(namespace, "users");
datasetClient.create(advancedDataset, advancedConfig);
```

## Dataset Updates

### Property Updates

```java
// Update dataset properties
Map<String, String> updatedProperties = Map.of(
    "table.ttl.seconds", "15552000",  // Extended to 180 days
    "table.compress.type", "LZ4",     // Changed compression
    "new.property", "new-value"       // Added new property
);

datasetClient.update(datasetId, updatedProperties);
System.out.println("Updated dataset properties");

// Update only existing properties (won't add new ones)
Map<String, String> existingUpdates = Map.of(
    "table.ttl.seconds", "31104000" // 360 days
);
datasetClient.updateExisting(datasetId, existingUpdates);
```

### Property Management

```java
// Get current properties
Map<String, String> currentProperties = datasetClient.getProperties(datasetId);
System.out.println("Current properties: " + currentProperties);

// Merge with new properties
Map<String, String> mergedProperties = new HashMap<>(currentProperties);
mergedProperties.putAll(Map.of(
    "updated.by", "admin",
    "updated.timestamp", String.valueOf(System.currentTimeMillis())
));

datasetClient.update(datasetId, mergedProperties);
```

## Data Operations

### Dataset Truncation

```java
// Truncate dataset (remove all data but keep structure)
try {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated successfully");
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (UnsupportedOperationException e) {
    System.err.println("Truncation not supported for this dataset type");
}

// Truncate with confirmation
String confirmation = getUserConfirmation("Truncate dataset " + datasetId.getDataset() + "? (yes/no): ");
if ("yes".equalsIgnoreCase(confirmation)) {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated");
} else {
    System.out.println("Truncation cancelled");
}
```

### Dataset Deletion

```java
// Delete dataset
try {
    datasetClient.delete(datasetId);
    System.out.println("Dataset deleted: " + datasetId.getDataset());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (DatasetInUseException e) {
    System.err.println("Cannot delete dataset - it's being used: " + e.getMessage());
}

// Safe deletion with checks
if (datasetClient.exists(datasetId)) {
    try {
        // Optional: Check if dataset is empty before deletion
        DatasetMeta meta = datasetClient.get(datasetId);
        System.out.println("Deleting dataset: " + meta.getSpec().getName());

        datasetClient.delete(datasetId);
        System.out.println("Dataset deleted successfully");

        // Verify deletion
        if (!datasetClient.exists(datasetId)) {
            System.out.println("Deletion confirmed");
        }
    } catch (Exception e) {
        System.err.println("Error deleting dataset: " + e.getMessage());
    }
} else {
    System.out.println("Dataset does not exist: " + datasetId.getDataset());
}
```

## Dataset Types and Common Configurations

### Table Dataset

```java
// Basic table dataset
Map<String, String> tableProperties = Map.of(
    "schema", "key:STRING,value:STRING,timestamp:LONG"
);
DatasetInstanceConfiguration tableConfig = new DatasetInstanceConfiguration(
    "table", tableProperties, "Key-value table"
);

// Table with row key template
Map<String, String> complexTableProperties = Map.of(
    "schema", "user_id:STRING,session_id:STRING,event_type:STRING,data:STRING",
    "table.rowkey.template", "%s:%s", // user_id:session_id
    "table.rowkey.separator", ":"
);
```

### FileSet Dataset

```java
// Basic file set
Map<String, String> fileSetProperties = Map.of(
    "basePath", "/data/files",
    "explore.enabled", "true"
);
DatasetInstanceConfiguration fileSetConfig = new DatasetInstanceConfiguration(
    "fileSet", fileSetProperties, "File storage"
);

// Partitioned file set
Map<String, String> partitionedFileSetProperties = Map.of(
    "basePath", "/data/partitioned",
    "partitioning", "field:year INT, field:month INT, field:day INT",
    "explore.enabled", "true",
    "explore.format", "parquet"
);
DatasetInstanceConfiguration partitionedFileSetConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet", partitionedFileSetProperties, "Partitioned data files"
);
```

### Time-Partitioned FileSet

```java
// Time-partitioned file set with hourly partitions
Map<String, String> timePartitionedProperties = Map.of(
    "basePath", "/data/time-series",
    "partitioning.time.format", "yyyy-MM-dd/HH",
    "explore.enabled", "true",
    "explore.format", "avro",
    "schema", "timestamp:LONG,sensor_id:STRING,value:DOUBLE,quality:STRING"
);
DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet", timePartitionedProperties, "Time-series sensor data"
);
```

## Advanced Operations

### Bulk Dataset Operations

```java
// Create multiple datasets
List<DatasetCreationRequest> datasets = List.of(
    new DatasetCreationRequest("logs", "table", Map.of("schema", "timestamp:LONG,level:STRING,message:STRING")),
    new DatasetCreationRequest("metrics", "table", Map.of("schema", "time:LONG,name:STRING,value:DOUBLE")),
    new DatasetCreationRequest("events", "partitionedFileSet", Map.of("basePath", "/data/events"))
);

for (DatasetCreationRequest request : datasets) {
    try {
        DatasetId id = DatasetId.of(namespace, request.name);
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            request.type, request.properties, "Auto-created dataset"
        );
        datasetClient.create(id, config);
        System.out.println("Created dataset: " + request.name);
    } catch (Exception e) {
        System.err.println("Failed to create dataset " + request.name + ": " + e.getMessage());
    }
}

// Helper class for bulk operations
private static class DatasetCreationRequest {
    String name, type;
    Map<String, String> properties;

    DatasetCreationRequest(String name, String type, Map<String, String> properties) {
        this.name = name;
        this.type = type;
        this.properties = properties;
    }
}
```

### Dataset Validation and Health Checks

```java
// Validate dataset configuration
public boolean validateDataset(DatasetId datasetId) {
    try {
        if (!datasetClient.exists(datasetId)) {
            System.err.println("Dataset does not exist: " + datasetId.getDataset());
            return false;
        }

        DatasetMeta meta = datasetClient.get(datasetId);
        Map<String, String> properties = meta.getProperties();

        // Validate schema if present
        if (properties.containsKey("schema")) {
            String schema = properties.get("schema");
            if (schema == null || schema.trim().isEmpty()) {
                System.err.println("Invalid schema for dataset: " + datasetId.getDataset());
                return false;
            }
        }

        // Validate required properties based on type
        String type = meta.getType();
        if ("partitionedFileSet".equals(type)) {
            if (!properties.containsKey("basePath")) {
                System.err.println("Missing basePath for partitioned dataset: " + datasetId.getDataset());
                return false;
            }
        }

        System.out.println("Dataset validation passed: " + datasetId.getDataset());
        return true;

    } catch (Exception e) {
        System.err.println("Error validating dataset: " + e.getMessage());
        return false;
    }
}
```

### Dataset Migration

```java
// Migrate dataset configuration
public void migrateDataset(DatasetId sourceId, DatasetId targetId, Map<String, String> newProperties) {
    try {
        // Get source dataset configuration
        DatasetMeta sourceMeta = datasetClient.get(sourceId);
        Map<String, String> sourceProperties = new HashMap<>(sourceMeta.getProperties());

        // Merge with new properties
        sourceProperties.putAll(newProperties);

        // Create target dataset
        DatasetInstanceConfiguration targetConfig = new DatasetInstanceConfiguration(
            sourceMeta.getType(),
            sourceProperties,
            "Migrated from " + sourceId.getDataset()
        );

        datasetClient.create(targetId, targetConfig);
        System.out.println("Migrated dataset from " + sourceId.getDataset() + " to " + targetId.getDataset());

        // Optionally truncate or delete source
        // datasetClient.truncate(sourceId);

    } catch (Exception e) {
        System.err.println("Error migrating dataset: " + e.getMessage());
    }
}
```

## Error Handling

Dataset operations may throw these exceptions:

- **DatasetNotFoundException**: Dataset does not exist
- **DatasetAlreadyExistsException**: Dataset already exists during creation
- **DatasetTypeNotFoundException**: Specified dataset type is not available
- **DatasetInUseException**: Cannot delete or modify dataset that's being used
- **UnsupportedOperationException**: Operation not supported for dataset type
- **BadRequestException**: Invalid dataset configuration or parameters

```java
try {
    DatasetMeta meta = datasetClient.get(datasetId);
    System.out.println("Dataset type: " + meta.getType());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId.getDataset());
} catch (UnauthorizedException e) {
    System.err.println("No permission to access dataset: " + e.getMessage());
} catch (IOException e) {
    System.err.println("Network error: " + e.getMessage());
}
```

## Best Practices

1. **Schema Management**: Define clear, evolvable schemas for your datasets
2. **Naming Conventions**: Use consistent naming conventions for datasets
3. **Property Management**: Document dataset properties and their purposes
4. **Lifecycle Management**: Implement proper dataset lifecycle management
5. **Performance**: Configure appropriate compression and storage settings
6. **Monitoring**: Regularly check dataset health and usage patterns

```java
// Good: Comprehensive dataset creation with proper configuration
public DatasetId createDatasetWithBestPractices(String name, String schema, Map<String, String> customProperties) {
    DatasetId datasetId = DatasetId.of(namespace, name);

    // Check if dataset already exists
    if (datasetClient.exists(datasetId)) {
        System.out.println("Dataset already exists: " + name);
        return datasetId;
    }

    // Build properties with defaults and custom overrides
    Map<String, String> properties = new HashMap<>(Map.of(
        "schema", schema,
        "table.compress.type", "SNAPPY",
        "explore.enabled", "true",
        "created.by", System.getProperty("user.name"),
        "created.timestamp", String.valueOf(System.currentTimeMillis())
    ));

    // Add custom properties
    if (customProperties != null) {
        properties.putAll(customProperties);
    }

    try {
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            "table",
            properties,
            "Dataset created with best practices: " + name
        );

        datasetClient.create(datasetId, config);
        System.out.println("Successfully created dataset: " + name);

        // Validate creation
        if (datasetClient.exists(datasetId)) {
            System.out.println("Dataset creation confirmed");
        }

        return datasetId;

    } catch (Exception e) {
        System.err.println("Error creating dataset " + name + ": " + e.getMessage());
        throw new RuntimeException("Failed to create dataset", e);
    }
}
```