# Dataset Operations

The DatasetClient provides comprehensive dataset management including creation, configuration, property management, and data operations like truncation. Datasets are persistent storage abstractions in CDAP that provide type-safe access to data.

## DatasetClient

```java { .api }
public class DatasetClient {
    // Constructors
    public DatasetClient(ClientConfig config);
    public DatasetClient(ClientConfig config, RESTClient restClient);

    // Dataset management methods
    public List<DatasetSpecificationSummary> list(NamespaceId namespace);
    public DatasetMeta get(DatasetId instance);
    public void create(DatasetId instance, DatasetInstanceConfiguration properties);
    public void create(DatasetId instance, String typeName);
    public void update(DatasetId instance, Map<String, String> properties);
    public void updateExisting(DatasetId instance, Map<String, String> properties);
    public void delete(DatasetId instance);
    public boolean exists(DatasetId instance);
    public void truncate(DatasetId instance);
    public Map<String, String> getProperties(DatasetId instance);
}
```

## Dataset Types and Metadata

```java { .api }
public class DatasetSpecificationSummary {
    public String getName();
    public String getType();
    public String getDescription();
    public Map<String, String> getProperties();
}

public class DatasetMeta {
    public DatasetSpecification getSpec();
    public String getType();
    public long getCreationTime();
    public String getOwnerPrincipal();
    public Map<String, String> getProperties();
    public String getHiveTableName();
}

public class DatasetId {
    public static DatasetId of(NamespaceId namespace, String dataset);
    public NamespaceId getNamespace();
    public String getDataset();
}

public class DatasetInstanceConfiguration {
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties);
    public DatasetInstanceConfiguration(String typeName, Map<String, String> properties, String description);
    public String getTypeName();
    public Map<String, String> getProperties();
    public String getDescription();
}
```

## Dataset Management

### Listing Datasets

```java
// List all datasets in namespace
List<DatasetSpecificationSummary> datasets = datasetClient.list(namespace);
System.out.println("Found " + datasets.size() + " datasets:");

for (DatasetSpecificationSummary dataset : datasets) {
    System.out.println("- " + dataset.getName() + " (type: " + dataset.getType() + ")");
    System.out.println("  Description: " + dataset.getDescription());
    System.out.println("  Properties: " + dataset.getProperties());
}
```

### Dataset Information

```java
// Get detailed dataset information
DatasetId datasetId = DatasetId.of(namespace, "user-profiles");
DatasetMeta meta = datasetClient.get(datasetId);

System.out.println("Dataset: " + datasetId.getDataset());
System.out.println("Type: " + meta.getType());
System.out.println("Owner: " + meta.getOwnerPrincipal());
System.out.println("Created: " + new Date(meta.getCreationTime()));
System.out.println("Properties: " + meta.getProperties());
System.out.println("Hive table: " + meta.getHiveTableName());

// Check if dataset exists
boolean exists = datasetClient.exists(datasetId);
System.out.println("Dataset exists: " + exists);
```

## Dataset Creation

### Basic Dataset Creation

```java
// Create dataset with type name only
DatasetId simpleDataset = DatasetId.of(namespace, "simple-table");
datasetClient.create(simpleDataset, "table");

// Create dataset with configuration
Map<String, String> properties = Map.of(
    "schema", "user_id:STRING,name:STRING,email:STRING,created_at:LONG",
    "table.rowkey.template", "%s",
    "table.rowkey.separator", "|"
);

DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
    "table",
    properties,
    "User profile data table"
);

DatasetId configuredDataset = DatasetId.of(namespace, "user-profiles");
datasetClient.create(configuredDataset, config);
```

### Advanced Dataset Creation

```java
// Create partitioned dataset
Map<String, String> partitionedProperties = Map.of(
    "schema", "timestamp:LONG,event_type:STRING,user_id:STRING,data:STRING",
    "partitioning", "HASH(user_id, 10)",
    "partition.key", "event_date",
    "explore.table.name", "events"
);

DatasetInstanceConfiguration partitionedConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet",
    partitionedProperties,
    "Partitioned event data"
);

DatasetId eventsDataset = DatasetId.of(namespace, "events");
datasetClient.create(eventsDataset, partitionedConfig);

// Create time-partitioned dataset
Map<String, String> timePartitionedProperties = Map.of(
    "schema", "user_id:STRING,action:STRING,timestamp:LONG,metadata:STRING",
    "basePath", "/data/user-actions",
    "partitioning.time.format", "yyyy-MM-dd-HH",
    "explore.enabled", "true"
);

DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet",
    timePartitionedProperties,
    "Time-partitioned user actions"
);

DatasetId actionsDataset = DatasetId.of(namespace, "user-actions");
datasetClient.create(actionsDataset, timePartitionedConfig);
```

### Dataset with Custom Properties

```java
// Create dataset with comprehensive configuration.
// NOTE: Map.of is limited to 10 key-value pairs; for larger property sets
// like this one, use Map.ofEntries or build a HashMap instead.
Map<String, String> advancedProperties = Map.ofEntries(
    // Schema definition
    Map.entry("schema", "id:STRING,name:STRING,age:INT,email:STRING,created_at:LONG,updated_at:LONG"),

    // Table configuration
    Map.entry("table.rowkey.template", "%s"),
    Map.entry("table.rowkey.separator", "|"),
    Map.entry("table.name.template", "users_%s"),

    // Storage configuration
    Map.entry("table.compress.type", "SNAPPY"),
    Map.entry("table.block.size", "65536"),
    Map.entry("table.bloom.filter", "ROW"),

    // Indexing configuration
    Map.entry("explore.enabled", "true"),
    Map.entry("explore.table.name", "users"),
    Map.entry("explore.format", "parquet"),

    // TTL configuration
    Map.entry("table.ttl.seconds", "7776000") // 90 days
);

DatasetInstanceConfiguration advancedConfig = new DatasetInstanceConfiguration(
    "table",
    advancedProperties,
    "User database with advanced configuration"
);

DatasetId advancedDataset = DatasetId.of(namespace, "users");
datasetClient.create(advancedDataset, advancedConfig);
```

## Dataset Updates

### Property Updates

```java
// Update dataset properties
Map<String, String> updatedProperties = Map.of(
    "table.ttl.seconds", "15552000",  // Extended to 180 days
    "table.compress.type", "LZ4",     // Changed compression
    "new.property", "new-value"       // Added new property
);

datasetClient.update(datasetId, updatedProperties);
System.out.println("Updated dataset properties");

// Update only existing properties (won't add new ones)
Map<String, String> existingUpdates = Map.of(
    "table.ttl.seconds", "31104000" // 360 days
);
datasetClient.updateExisting(datasetId, existingUpdates);
```

### Property Management

```java
// Get current properties
Map<String, String> currentProperties = datasetClient.getProperties(datasetId);
System.out.println("Current properties: " + currentProperties);

// Merge with new properties
Map<String, String> mergedProperties = new HashMap<>(currentProperties);
mergedProperties.putAll(Map.of(
    "updated.by", "admin",
    "updated.timestamp", String.valueOf(System.currentTimeMillis())
));

datasetClient.update(datasetId, mergedProperties);
```

## Data Operations

### Dataset Truncation

```java
// Truncate dataset (remove all data but keep structure)
try {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated successfully");
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (UnsupportedOperationException e) {
    System.err.println("Truncation not supported for this dataset type");
}

// Truncate with confirmation
String confirmation = getUserConfirmation("Truncate dataset " + datasetId.getDataset() + "? (yes/no): ");
if ("yes".equalsIgnoreCase(confirmation)) {
    datasetClient.truncate(datasetId);
    System.out.println("Dataset truncated");
} else {
    System.out.println("Truncation cancelled");
}
```

### Dataset Deletion

```java
// Delete dataset
try {
    datasetClient.delete(datasetId);
    System.out.println("Dataset deleted: " + datasetId.getDataset());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId);
} catch (DatasetInUseException e) {
    System.err.println("Cannot delete dataset - it's being used: " + e.getMessage());
}

// Safe deletion with checks
if (datasetClient.exists(datasetId)) {
    try {
        // Optional: Check if dataset is empty before deletion
        DatasetMeta meta = datasetClient.get(datasetId);
        System.out.println("Deleting dataset: " + meta.getSpec().getName());

        datasetClient.delete(datasetId);
        System.out.println("Dataset deleted successfully");

        // Verify deletion
        if (!datasetClient.exists(datasetId)) {
            System.out.println("Deletion confirmed");
        }
    } catch (Exception e) {
        System.err.println("Error deleting dataset: " + e.getMessage());
    }
} else {
    System.out.println("Dataset does not exist: " + datasetId.getDataset());
}
```

## Dataset Types and Common Configurations

### Table Dataset

```java
// Basic table dataset
Map<String, String> tableProperties = Map.of(
    "schema", "key:STRING,value:STRING,timestamp:LONG"
);
DatasetInstanceConfiguration tableConfig = new DatasetInstanceConfiguration(
    "table", tableProperties, "Key-value table"
);

// Table with row key template
Map<String, String> complexTableProperties = Map.of(
    "schema", "user_id:STRING,session_id:STRING,event_type:STRING,data:STRING",
    "table.rowkey.template", "%s:%s", // user_id:session_id
    "table.rowkey.separator", ":"
);
```

### FileSet Dataset

```java
// Basic file set
Map<String, String> fileSetProperties = Map.of(
    "basePath", "/data/files",
    "explore.enabled", "true"
);
DatasetInstanceConfiguration fileSetConfig = new DatasetInstanceConfiguration(
    "fileSet", fileSetProperties, "File storage"
);

// Partitioned file set
Map<String, String> partitionedFileSetProperties = Map.of(
    "basePath", "/data/partitioned",
    "partitioning", "field:year INT, field:month INT, field:day INT",
    "explore.enabled", "true",
    "explore.format", "parquet"
);
DatasetInstanceConfiguration partitionedFileSetConfig = new DatasetInstanceConfiguration(
    "partitionedFileSet", partitionedFileSetProperties, "Partitioned data files"
);
```

### Time-Partitioned FileSet

```java
// Time-partitioned file set with hourly partitions
Map<String, String> timePartitionedProperties = Map.of(
    "basePath", "/data/time-series",
    "partitioning.time.format", "yyyy-MM-dd/HH",
    "explore.enabled", "true",
    "explore.format", "avro",
    "schema", "timestamp:LONG,sensor_id:STRING,value:DOUBLE,quality:STRING"
);
DatasetInstanceConfiguration timePartitionedConfig = new DatasetInstanceConfiguration(
    "timePartitionedFileSet", timePartitionedProperties, "Time-series sensor data"
);
```

## Advanced Operations

### Bulk Dataset Operations

```java
// Create multiple datasets
List<DatasetCreationRequest> datasets = List.of(
    new DatasetCreationRequest("logs", "table", Map.of("schema", "timestamp:LONG,level:STRING,message:STRING")),
    new DatasetCreationRequest("metrics", "table", Map.of("schema", "time:LONG,name:STRING,value:DOUBLE")),
    new DatasetCreationRequest("events", "partitionedFileSet", Map.of("basePath", "/data/events"))
);

for (DatasetCreationRequest request : datasets) {
    try {
        DatasetId id = DatasetId.of(namespace, request.name);
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            request.type, request.properties, "Auto-created dataset"
        );
        datasetClient.create(id, config);
        System.out.println("Created dataset: " + request.name);
    } catch (Exception e) {
        System.err.println("Failed to create dataset " + request.name + ": " + e.getMessage());
    }
}

// Helper class for bulk operations
private static class DatasetCreationRequest {
    String name, type;
    Map<String, String> properties;

    DatasetCreationRequest(String name, String type, Map<String, String> properties) {
        this.name = name;
        this.type = type;
        this.properties = properties;
    }
}
```

### Dataset Validation and Health Checks

```java
// Validate dataset configuration
public boolean validateDataset(DatasetId datasetId) {
    try {
        if (!datasetClient.exists(datasetId)) {
            System.err.println("Dataset does not exist: " + datasetId.getDataset());
            return false;
        }

        DatasetMeta meta = datasetClient.get(datasetId);
        Map<String, String> properties = meta.getProperties();

        // Validate schema if present
        if (properties.containsKey("schema")) {
            String schema = properties.get("schema");
            if (schema == null || schema.trim().isEmpty()) {
                System.err.println("Invalid schema for dataset: " + datasetId.getDataset());
                return false;
            }
        }

        // Validate required properties based on type
        String type = meta.getType();
        if ("partitionedFileSet".equals(type)) {
            if (!properties.containsKey("basePath")) {
                System.err.println("Missing basePath for partitioned dataset: " + datasetId.getDataset());
                return false;
            }
        }

        System.out.println("Dataset validation passed: " + datasetId.getDataset());
        return true;

    } catch (Exception e) {
        System.err.println("Error validating dataset: " + e.getMessage());
        return false;
    }
}
```

### Dataset Migration

```java
// Migrate dataset configuration
public void migrateDataset(DatasetId sourceId, DatasetId targetId, Map<String, String> newProperties) {
    try {
        // Get source dataset configuration
        DatasetMeta sourceMeta = datasetClient.get(sourceId);
        Map<String, String> sourceProperties = new HashMap<>(sourceMeta.getProperties());

        // Merge with new properties
        sourceProperties.putAll(newProperties);

        // Create target dataset
        DatasetInstanceConfiguration targetConfig = new DatasetInstanceConfiguration(
            sourceMeta.getType(),
            sourceProperties,
            "Migrated from " + sourceId.getDataset()
        );

        datasetClient.create(targetId, targetConfig);
        System.out.println("Migrated dataset from " + sourceId.getDataset() + " to " + targetId.getDataset());

        // Optionally truncate or delete source
        // datasetClient.truncate(sourceId);

    } catch (Exception e) {
        System.err.println("Error migrating dataset: " + e.getMessage());
    }
}
```

## Error Handling

Dataset operations may throw these exceptions:

- **DatasetNotFoundException**: Dataset does not exist
- **DatasetAlreadyExistsException**: Dataset already exists during creation
- **DatasetTypeNotFoundException**: Specified dataset type is not available
- **DatasetInUseException**: Cannot delete or modify a dataset that's being used
- **UnsupportedOperationException**: Operation not supported for dataset type
- **BadRequestException**: Invalid dataset configuration or parameters

```java
try {
    DatasetMeta meta = datasetClient.get(datasetId);
    System.out.println("Dataset type: " + meta.getType());
} catch (DatasetNotFoundException e) {
    System.err.println("Dataset not found: " + datasetId.getDataset());
} catch (UnauthorizedException e) {
    System.err.println("No permission to access dataset: " + e.getMessage());
} catch (IOException e) {
    System.err.println("Network error: " + e.getMessage());
}
```

## Best Practices

1. **Schema Management**: Define clear, evolvable schemas for your datasets
2. **Naming Conventions**: Use consistent naming conventions for datasets
3. **Property Management**: Document dataset properties and their purposes
4. **Lifecycle Management**: Implement proper dataset lifecycle management
5. **Performance**: Configure appropriate compression and storage settings
6. **Monitoring**: Regularly check dataset health and usage patterns

```java
// Good: Comprehensive dataset creation with proper configuration
public DatasetId createDatasetWithBestPractices(String name, String schema, Map<String, String> customProperties) {
    DatasetId datasetId = DatasetId.of(namespace, name);

    // Check if dataset already exists
    if (datasetClient.exists(datasetId)) {
        System.out.println("Dataset already exists: " + name);
        return datasetId;
    }

    // Build properties with defaults and custom overrides
    Map<String, String> properties = new HashMap<>(Map.of(
        "schema", schema,
        "table.compress.type", "SNAPPY",
        "explore.enabled", "true",
        "created.by", System.getProperty("user.name"),
        "created.timestamp", String.valueOf(System.currentTimeMillis())
    ));

    // Add custom properties
    if (customProperties != null) {
        properties.putAll(customProperties);
    }

    try {
        DatasetInstanceConfiguration config = new DatasetInstanceConfiguration(
            "table",
            properties,
            "Dataset created with best practices: " + name
        );

        datasetClient.create(datasetId, config);
        System.out.println("Successfully created dataset: " + name);

        // Validate creation
        if (datasetClient.exists(datasetId)) {
            System.out.println("Dataset creation confirmed");
        }

        return datasetId;

    } catch (Exception e) {
        System.err.println("Error creating dataset " + name + ": " + e.getMessage());
        throw new RuntimeException("Failed to create dataset", e);
    }
}
```