
# Dataset Management

The Dataset Framework provides comprehensive dataset lifecycle management, covering creation, configuration, access, and administration across multiple storage backends, with transaction support and lineage tracking. It offers a unified API for managing datasets regardless of the underlying storage technology.

## Capabilities

### Dataset Framework Operations

`DatasetFramework` is the primary interface for all dataset management operations, providing lifecycle management across different storage backends.

```java { .api }
public interface DatasetFramework {
  // Module Management
  void addModule(DatasetModuleId moduleId, DatasetModule module) throws DatasetManagementException;
  void deleteModule(DatasetModuleId moduleId) throws DatasetManagementException;
  void deleteAllModules(NamespaceId namespaceId) throws DatasetManagementException;

  // Instance Management
  void addInstance(String datasetTypeName, DatasetId datasetInstanceId, DatasetProperties props,
                   KerberosPrincipalId ownerPrincipal) throws DatasetManagementException, IOException;
  void updateInstance(DatasetId datasetInstanceId, DatasetProperties props)
      throws DatasetManagementException, IOException;
  void deleteInstance(DatasetId datasetInstanceId) throws DatasetManagementException, IOException;
  void deleteAllInstances(NamespaceId namespaceId) throws DatasetManagementException, IOException;

  // Instance Queries
  Collection<DatasetSpecificationSummary> getInstances(NamespaceId namespaceId)
      throws DatasetManagementException;
  DatasetSpecification getDatasetSpec(DatasetId datasetInstanceId) throws DatasetManagementException;
  boolean hasInstance(DatasetId datasetInstanceId) throws DatasetManagementException;
  boolean hasType(DatasetTypeId datasetTypeId) throws DatasetManagementException;
  DatasetTypeMeta getTypeInfo(DatasetTypeId datasetTypeId) throws DatasetManagementException;

  // Dataset Access
  <T extends Dataset> T getDataset(DatasetId datasetInstanceId, Map<String, String> arguments,
                                   ClassLoader classLoader, DatasetClassLoaderProvider classLoaderProvider,
                                   Iterable<? extends EntityId> owners, AccessType accessType)
      throws DatasetManagementException, IOException;
  <T extends DatasetAdmin> T getAdmin(DatasetId datasetInstanceId, ClassLoader classLoader)
      throws DatasetManagementException, IOException;

  // Operations
  void truncateInstance(DatasetId datasetInstanceId) throws DatasetManagementException, IOException;
  void writeLineage(DatasetId datasetInstanceId, AccessType accessType);
}
```
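Administrative operations that the usage examples below do not cover go through `getAdmin`. A minimal sketch, assuming `datasetFramework` and `datasetId` are obtained as in the usage examples; passing `null` for the classloader follows the same convention those examples use:

```java
// Sketch: manage the physical storage of an existing instance through its admin.
DatasetAdmin admin = datasetFramework.getAdmin(datasetId, null); // null: default classloader
try {
  admin.truncate(); // remove all data but keep the instance definition
  admin.upgrade();  // apply any pending storage-format upgrades
} finally {
  admin.close();
}
```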

### Dataset Framework Implementations

Different implementations of the DatasetFramework for various deployment scenarios and requirements.

```java { .api }
// In-memory implementation for testing
public class InMemoryDatasetFramework implements DatasetFramework {
  // Fast in-memory dataset operations for unit testing
}

// Delegation wrapper for cross-cutting concerns
public class ForwardingDatasetFramework implements DatasetFramework {
  protected DatasetFramework delegate();
  // Forwarding implementation allowing decoration of dataset operations
}

// Static configuration-based framework
public class StaticDatasetFramework implements DatasetFramework {
  // Pre-configured dataset framework for static environments
}
```
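The forwarding implementation is useful for layering concerns such as auditing or metrics onto another framework. A minimal sketch of such a decorator, assuming a delegate-accepting constructor (only `delegate()` is shown in the excerpt above):

```java
// Hypothetical decorator that logs deletions before forwarding them.
public class AuditingDatasetFramework extends ForwardingDatasetFramework {
  public AuditingDatasetFramework(DatasetFramework delegate) {
    super(delegate); // assumed constructor
  }

  @Override
  public void deleteInstance(DatasetId datasetInstanceId)
      throws DatasetManagementException, IOException {
    System.out.println("Deleting dataset: " + datasetInstanceId.getDataset());
    delegate().deleteInstance(datasetInstanceId);
  }
}
```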

### Dataset Caching

Dataset instance caching for performance optimization with both single-threaded and multi-threaded implementations.

```java { .api }
// Single-threaded dataset caching
public class SingleThreadDatasetCache implements Closeable {
  public <T extends Dataset> T getDataset(DatasetId datasetId, Map<String, String> arguments,
                                          ClassLoader classLoader, Iterable<? extends EntityId> owners,
                                          AccessType accessType) throws IOException, DatasetInstantiationException;
  public void invalidate();
}

// Multi-threaded dataset caching with concurrent access
public class MultiThreadDatasetCache implements Closeable {
  public <T extends Dataset> T getDataset(DatasetId datasetId, Map<String, String> arguments,
                                          ClassLoader classLoader, Iterable<? extends EntityId> owners,
                                          AccessType accessType, @Nullable ProgramContext programContext)
      throws IOException, DatasetInstantiationException;
  public void invalidateAll();
}
```
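On hot paths, a cache stands in for repeated `DatasetFramework.getDataset` calls, returning the same instance for the same identifier and arguments. A minimal sketch, assuming a `SingleThreadDatasetCache` named `datasetCache` has already been constructed (its constructor is not part of the excerpt above):

```java
// Repeated lookups with the same id/arguments reuse the cached instance.
KeyValueTable table = datasetCache.getDataset(
    datasetId, Collections.emptyMap(), null, Collections.emptyList(), AccessType.READ);
byte[] value = table.read(Bytes.toBytes("user123"));

// Drop cached instances, e.g. after a dataset's properties were updated.
datasetCache.invalidate();
```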

### Dataset Modules and Types

Dataset modules provide different storage backend implementations with consistent APIs.

```java { .api }
// HBase-based dataset modules
public class HBaseTableModule implements DatasetModule {
  // HBase table dataset implementation
}

public class HBaseMetricsTableModule implements DatasetModule {
  // Specialized HBase metrics table implementation
}

// LevelDB-based dataset modules
public class LevelDBTableModule implements DatasetModule {
  // LevelDB table dataset implementation for local storage
}

// In-memory dataset modules
public class InMemoryTableModule implements DatasetModule {
  // In-memory table implementation for testing and caching
}

// File-based dataset modules
public class FileSetModule implements DatasetModule {
  // File-based dataset operations
}

public class PartitionedFileSetModule implements DatasetModule {
  // Partitioned file dataset for large-scale data processing
}

public class TimePartitionedFileSetModule implements DatasetModule {
  // Time-based partitioned datasets for time-series data
}
```
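Because every module implements the same `DatasetModule` contract, the storage backend is chosen at registration time while calling code stays unchanged. A sketch, assuming no-arg module constructors and the `datasetFramework` instance from the usage examples below:

```java
// Local development/testing: back table types with LevelDB...
datasetFramework.addModule(
    NamespaceId.DEFAULT.datasetModule("tableModule"), new LevelDBTableModule());

// ...or, in a distributed deployment, register the HBase module instead.
// Instances created from the registered types are used through the same API.
datasetFramework.addModule(
    NamespaceId.DEFAULT.datasetModule("tableModule"), new HBaseTableModule());
```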

### Dataset Library Components

Core dataset implementations providing specific data access patterns.

```java { .api }
// File-based datasets
public class FileSetDefinition implements DatasetDefinition<FileSet, FileSetAdmin>, Reconfigurable {
  public FileSet getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                            Map<String, String> arguments, ClassLoader classLoader) throws IOException;
}

public class FileSetAdmin implements DatasetAdmin, Updatable {
  public void create() throws IOException;
  public void drop() throws IOException;
  public void truncate() throws IOException;
  public void upgrade() throws IOException;
}

public class PartitionedFileSetDataset extends AbstractDataset implements PartitionedFileSet {
  public void addPartition(PartitionKey key, String path);
  public void addPartition(PartitionKey key, String path, Map<String, String> metadata);
  public PartitionDetail getPartition(PartitionKey key);
  public Set<PartitionDetail> getPartitions(PartitionFilter filter);
  public void dropPartition(PartitionKey key);
}

public class TimePartitionedFileSetDataset extends PartitionedFileSetDataset
    implements TimePartitionedFileSet {
  public void addPartition(long time, String path);
  public void addPartition(long time, String path, Map<String, String> metadata);
  public PartitionDetail getPartitionByTime(long time);
  public Set<PartitionDetail> getPartitionsByTime(long startTime, long endTime);
}

// Key-value tables
public class NoTxKeyValueTable extends AbstractDataset implements KeyValueTable {
  public void write(byte[] key, byte[] value);
  public byte[] read(byte[] key);
  public void delete(byte[] key);
  public CloseableIterator<KeyValue<byte[], byte[]>> scan(byte[] startRow, byte[] stopRow);
}

// Multi-dimensional data cubes
public class CubeDataset extends AbstractDataset implements Cube {
  public void add(CubeFact fact);
  public void add(Collection<? extends CubeFact> facts);
  public Collection<TimeSeries> query(CubeQuery query);
  public Collection<DimensionValue> findDimensionValues(CubeExploreQuery query);
  public Collection<String> findMeasureNames(CubeExploreQuery query);
}

public class CubeDatasetDefinition implements DatasetDefinition<CubeDataset, CubeDatasetAdmin> {
  public CubeDataset getDataset(DatasetContext datasetContext, DatasetSpecification spec,
                                Map<String, String> arguments, ClassLoader classLoader) throws IOException;
}

// Object storage
public class ObjectStoreDataset<T> extends AbstractDataset implements ObjectStore<T> {
  public void write(byte[] key, T object) throws IOException;
  public T read(byte[] key) throws IOException;
  public void delete(byte[] key);
  public CloseableIterator<KeyValue<byte[], T>> scan(byte[] startKey, byte[] stopKey) throws IOException;
}

public class ObjectMappedTableDataset<T> extends AbstractDataset implements ObjectMappedTable<T> {
  public void write(String key, T object) throws IOException;
  public T read(String key) throws IOException;
  public void delete(String key);
  public CloseableIterator<KeyValue<String, T>> scan(String startKey, String stopKey) throws IOException;
}

// Metrics tables
public interface MetricsTable {
  void put(SortedMap<byte[], SortedMap<byte[], Long>> updates);
  boolean swap(byte[] row, byte[] column, byte[] oldValue, byte[] newValue);
  void increment(byte[] row, Map<byte[], Long> increments);
  void increment(SortedMap<byte[], SortedMap<byte[], Long>> updates);
  Scanner scan(byte[] startRow, byte[] stopRow);
}
```
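Time-partitioned file sets follow the same pattern as the key-based partitions in the usage examples below, but index partitions by timestamp. A minimal sketch, assuming a `TimePartitionedFileSetDataset` named `tpfs` obtained through the framework:

```java
// Register one day of data under its timestamp (epoch millis; example value assumed).
long partitionTime = 1673740800000L; // 2023-01-15T00:00:00Z
tpfs.addPartition(partitionTime, "/data/2023/01/15", Map.of("format", "parquet"));

// Retrieve all partitions that fall within a 24-hour window.
Set<PartitionDetail> day = tpfs.getPartitionsByTime(
    partitionTime, partitionTime + 24 * 60 * 60 * 1000L);
for (PartitionDetail partition : day) {
  System.out.println(partition.getRelativePath());
}
```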

## Usage Examples

### Creating and Using a Dataset

```java
// Create dataset framework instance (typically injected)
DatasetFramework datasetFramework = // ... obtain from dependency injection

// Define dataset properties
DatasetProperties properties = DatasetProperties.builder()
    .add("hbase.splits", "10")
    .add("hbase.compression", "SNAPPY")
    .build();

// Create dataset instance
DatasetId datasetId = NamespaceId.DEFAULT.dataset("userProfiles");
datasetFramework.addInstance("keyValueTable", datasetId, properties, null);

// Access dataset for operations
KeyValueTable dataset = datasetFramework.getDataset(
    datasetId,
    Collections.emptyMap(),  // runtime arguments
    null,                    // classloader
    null,                    // classloader provider
    Collections.emptyList(), // owners
    AccessType.READ_WRITE
);

// Use the dataset, closing it when done (Dataset extends Closeable)
try {
  dataset.write(Bytes.toBytes("user123"), Bytes.toBytes("profile_data"));
  byte[] data = dataset.read(Bytes.toBytes("user123"));
} finally {
  dataset.close();
}
```

### Working with Partitioned Datasets

```java
// Access partitioned file set
PartitionedFileSetDataset partitionedDataset = datasetFramework.getDataset(
    datasetId, Collections.emptyMap(), null, null, Collections.emptyList(),
    AccessType.READ_WRITE);

// Add partitions with metadata
PartitionKey key = PartitionKey.builder()
    .addStringField("year", "2023")
    .addStringField("month", "01")
    .addStringField("day", "15")
    .build();

Map<String, String> metadata = Map.of(
    "format", "parquet",
    "compression", "snappy",
    "records", "1000000"
);

partitionedDataset.addPartition(key, "/data/2023/01/15", metadata);

// Query partitions
PartitionFilter filter = PartitionFilter.builder()
    .addRangeCondition("year", "2023", "2023")
    .addRangeCondition("month", "01", "03")
    .build();

Set<PartitionDetail> partitions = partitionedDataset.getPartitions(filter);
for (PartitionDetail partition : partitions) {
  System.out.println("Partition: " + partition.getRelativePath());
  System.out.println("Metadata: " + partition.getMetadata());
}
```

### Managing Dataset Modules

```java
// Add custom dataset module
DatasetModuleId moduleId = NamespaceId.DEFAULT.datasetModule("customModule");
DatasetModule customModule = new CustomDatasetModule();
datasetFramework.addModule(moduleId, customModule);

// Create instance of custom dataset type
datasetFramework.addInstance("customType", datasetId, properties, null);

// Check if dataset type exists
DatasetTypeId typeId = NamespaceId.DEFAULT.datasetType("customType");
boolean exists = datasetFramework.hasType(typeId);

// Get type information
DatasetTypeMeta typeMeta = datasetFramework.getTypeInfo(typeId);
System.out.println("Type: " + typeMeta.getName());
System.out.println("Modules: " + typeMeta.getModules());
```

## Types

```java { .api }
// Dataset identifiers and specifications
public final class DatasetId extends EntityId {
  public static DatasetId of(String namespace, String dataset);
  public String getDataset();
  public NamespaceId getParent();
}

public final class DatasetModuleId extends EntityId {
  public static DatasetModuleId of(String namespace, String module);
  public String getModule();
}

public final class DatasetTypeId extends EntityId {
  public static DatasetTypeId of(String namespace, String type);
  public String getType();
}

public interface DatasetSpecification {
  String getName();
  String getType();
  DatasetProperties getProperties();
  Map<String, DatasetSpecification> getSpecifications();
}

public final class DatasetSpecificationSummary {
  public String getName();
  public String getType();
  public String getDescription();
}

// Dataset properties and configuration
public final class DatasetProperties {
  public static Builder builder();
  public Map<String, String> getProperties();

  public static class Builder {
    public Builder add(String key, String value);
    public Builder addAll(Map<String, String> properties);
    public DatasetProperties build();
  }
}

// Dataset type metadata
public final class DatasetTypeMeta {
  public String getName();
  public List<DatasetModuleMeta> getModules();
}

public final class DatasetModuleMeta {
  public String getName();
  public String getClassName();
  public String getJarLocation();
  public List<String> getTypes();
}

// Dataset interfaces
public interface Dataset extends Closeable {
  // Base dataset interface
}

public interface DatasetAdmin {
  void create() throws IOException;
  void drop() throws IOException;
  void truncate() throws IOException;
  void upgrade() throws IOException;
  void close() throws IOException;
}

// Access and security types
public enum AccessType {
  READ, WRITE, ADMIN, READ_WRITE, UNKNOWN
}

public final class KerberosPrincipalId {
  public static KerberosPrincipalId of(String principal);
  public String getPrincipal();
}

// Exception types
public class DatasetManagementException extends Exception {
  public DatasetManagementException(String message);
  public DatasetManagementException(String message, Throwable cause);
}

public class DatasetInstantiationException extends Exception {
  public DatasetInstantiationException(String message);
  public DatasetInstantiationException(String message, Throwable cause);
}
```
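Most framework methods can raise both exception types shown above, so callers usually separate management failures from storage I/O failures. A short sketch using the operations from the `DatasetFramework` interface:

```java
try {
  datasetFramework.truncateInstance(datasetId);
} catch (DatasetManagementException e) {
  // The framework rejected the operation (e.g. the instance does not exist).
  System.err.println("Management error: " + e.getMessage());
} catch (IOException e) {
  // The underlying storage backend failed.
  System.err.println("Storage error: " + e.getMessage());
}
```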