or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

data-types.md · image-processing.md · index.md · input-sources.md · record-readers.md · transforms.md

docs/transforms.md

0

# Data Transforms and Processing

1

2

DataVec provides a comprehensive transformation system for preprocessing and cleaning data before feeding it to machine learning models. The transformation API enables complex data pipelines with column-level operations, mathematical transformations, and data quality improvements.

3

4

## Capabilities

5

6

### Transform Process

7

8

The core transformation workflow manager that orchestrates multiple transformation steps in a pipeline.

9

10

```java { .api }

11

public class TransformProcess {

12

public static Builder builder(Schema initialSchema);

13

public Schema getInitialSchema();

14

public Schema getFinalSchema();

15

public List<DataAction> getActionList();

16

public List<Writable> execute(List<Writable> input);

17

public List<List<Writable>> execute(List<List<Writable>> input);

18

}

19

20

public static class TransformProcess.Builder {

21

public Builder removeColumns(String... columnNames);

22

public Builder removeColumns(int... columnIndices);

23

public Builder renameColumn(String oldName, String newName);

24

public Builder filter(Condition condition);

25

public Builder transform(Transform transform);

26

public Builder convertToString(String columnName);

27

public Builder convertToDouble(String columnName);

28

public Builder convertToInteger(String columnName);

29

public Builder normalize(String columnName, Normalize normalization);

30

public Builder standardize(String columnName);

31

public Builder categoricalToOneHot(String columnName);

32

public Builder categoricalToInteger(String columnName);

33

public Builder stringToTimeTransform(String columnName, String dateTimeFormat, DateTimeZone dateTimeZone);

34

public Builder conditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);

35

public Builder appendStringColumnTransform(String columnName, String stringToAppend);

36

public Builder replaceStringTransform(String columnName, Map<String, String> mapping);

37

public TransformProcess build();

38

}

39

```

40

41

**Usage Example:**

42

43

```java

44

import org.datavec.api.transform.TransformProcess;

45

import org.datavec.api.transform.schema.Schema;

46

import org.datavec.api.transform.condition.ConditionOp;

47

import org.datavec.api.transform.condition.column.DoubleColumnCondition;

48

49

// Define input schema

50

Schema inputSchema = new Schema.Builder()

51

.addColumnString("name")

52

.addColumnInteger("age")

53

.addColumnDouble("income")

54

.addColumnCategorical("category", Arrays.asList("A", "B", "C"))

55

.build();

56

57

// Build transformation process

58

TransformProcess tp = new TransformProcess.Builder(inputSchema)

59

.removeColumns("name") // Remove name column

60

.filter(new DoubleColumnCondition("income", ConditionOp.GreaterThan, 0.0)) // Filter positive income

61

.normalize("income", Normalize.MinMax) // Min-max normalize income

62

.categoricalToOneHot("category") // One-hot encode category

63

.build();

64

65

// Apply transformations

66

List<Writable> input = Arrays.asList(

67

new Text("John"),

68

new IntWritable(25),

69

new DoubleWritable(50000.0),

70

new Text("A")

71

);

72

73

List<Writable> transformed = tp.execute(input);

74

// Result: [IntWritable(25), DoubleWritable(normalized_income), IntWritable(1), IntWritable(0), IntWritable(0)]

75

```

76

77

### Core Transform Interface

78

79

Base interface for all data transformations.

80

81

```java { .api }

82

public interface Transform {

83

List<Writable> map(List<Writable> writables);

84

String[] outputColumnNames();

85

ColumnType[] outputColumnTypes();

86

String transform(String input);

87

}

88

```

89

90

### Column Operations

91

92

Operations that work on individual columns of data.

93

94

```java { .api }

95

public enum ColumnType {

96

String,

97

Integer,

98

Long,

99

Double,

100

Categorical,

101

Time,

102

Bytes,

103

Boolean,

104

NDArray

105

}

106

107

public abstract class ColumnOp {

108

public abstract ColumnType getColumnType();

109

public abstract String[] columnNames();

110

}

111

112

public class ConvertToString extends ColumnOp {

113

public ConvertToString(String columnName);

114

}

115

116

public class ConvertToDouble extends ColumnOp {

117

public ConvertToDouble(String columnName);

118

}

119

120

public class ConvertToInteger extends ColumnOp {

121

public ConvertToInteger(String columnName);

122

}

123

```

124

125

**Usage Example:**

126

127

```java

128

// Convert string column to double

129

Transform convertTransform = new ConvertToDouble("price_string");

130

131

List<Writable> input = Arrays.asList(

132

new Text("Product A"),

133

new Text("29.99") // String representation of price

134

);

135

136

List<Writable> output = convertTransform.map(input);

137

// Result: [Text("Product A"), DoubleWritable(29.99)]

138

```

139

140

### Mathematical Operations

141

142

Mathematical transformations and calculations on numeric columns.

143

144

```java { .api }

145

public enum MathOp {

146

Add,

147

Subtract,

148

Multiply,

149

Divide,

150

Modulus,

151

ReverseSubtract,

152

ReverseDivide,

153

ScalarMin,

154

ScalarMax,

155

Abs,

156

Ceil,

157

Floor,

158

Round,

159

Sqrt,

160

Square,

161

Log,

162

Log10,

163

Exp,

164

Pow,

165

Sin,

166

Cos,

167

Tan,

168

ASin,

169

ACos,

170

ATan,

171

Sinh,

172

Cosh,

173

Tanh

174

}

175

176

public class MathFunction implements Transform {

177

public MathFunction(String columnName, MathOp operation);

178

public MathFunction(String columnName, MathOp operation, double scalar);

179

}

180

181

public class AddConstantColumnTransform implements Transform {

182

public AddConstantColumnTransform(String columnName, double value);

183

}

184

185

public class MultiplyConstantColumnTransform implements Transform {

186

public MultiplyConstantColumnTransform(String columnName, double value);

187

}

188

```

189

190

**Usage Examples:**

191

192

```java

193

// Square all values in a column

194

Transform squareTransform = new MathFunction("values", MathOp.Square);

195

196

// Add constant to column

197

Transform addConstant = new AddConstantColumnTransform("salary", 5000.0);

198

199

// Multiply by constant

200

Transform multiplyConstant = new MultiplyConstantColumnTransform("price", 1.1); // 10% increase

201

202

List<Writable> input = Arrays.asList(new DoubleWritable(100.0));

203

List<Writable> squared = squareTransform.map(input);

204

// Result: [DoubleWritable(10000.0)]

205

```

206

207

### Reduction Operations

208

209

Operations that reduce multiple rows to summary statistics.

210

211

```java { .api }

212

public enum ReduceOp {

213

Min,

214

Max,

215

Range,

216

Sum,

217

Mean,

218

Prod,

219

Stdev,

220

UncorrectedStdDev,

221

Variance,

222

PopulationVariance,

223

Count,

224

CountUnique

225

}

226

227

public class Reducer {

228

public static Builder builder(ReduceOp op, String column);

229

public IAggregableReduceOp<List<Writable>, List<Writable>> getReduction();

230

}

231

232

public enum StringReduceOp {

233

Merge,

234

Append,

235

Prepend,

236

Replace

237

}

238

239

public class StringReducer {

240

public static Builder builder(StringReduceOp op, String column);

241

public static StringReducer merge(String column, String delimiter);

242

public static StringReducer append(String column, String stringToAppend);

243

}

244

```

245

246

**Usage Examples:**

247

248

```java

249

// Calculate mean of a numeric column

250

IAggregableReduceOp<List<Writable>, List<Writable>> meanReduction =

251

Reducer.builder(ReduceOp.Mean, "values").build().getReduction();

252

253

// Merge string values with delimiter

254

StringReducer merger = StringReducer.merge("names", ",");

255

256

List<List<Writable>> data = Arrays.asList(

257

Arrays.asList(new DoubleWritable(10.0), new Text("Alice")),

258

Arrays.asList(new DoubleWritable(20.0), new Text("Bob")),

259

Arrays.asList(new DoubleWritable(30.0), new Text("Charlie"))

260

);

261

262

// Apply reduction operations

263

List<Writable> meanResult = meanReduction.aggregate(data);

264

// Result: [DoubleWritable(20.0)] - mean of 10, 20, 30

265

```

266

267

### Normalization and Standardization

268

269

Statistical normalization techniques for numeric data.

270

271

```java { .api }

272

public enum Normalize {

273

MinMax,

274

Standardize,

275

Normalize,

276

Log2,

277

Log10

278

}

279

280

public class NormalizeTransform implements Transform {

281

public NormalizeTransform(String columnName, Normalize normalization);

282

public NormalizeTransform(String columnName, Normalize normalization,

283

double minValue, double maxValue);

284

}

285

286

public class StandardizeTransform implements Transform {

287

public StandardizeTransform(String columnName);

288

public StandardizeTransform(String columnName, double mean, double stdev);

289

}

290

```

291

292

**Usage Examples:**

293

294

```java

295

// Min-max normalization to [0, 1]

296

Transform minMaxNorm = new NormalizeTransform("values", Normalize.MinMax);

297

298

// Z-score standardization

299

Transform standardize = new StandardizeTransform("values");

300

301

// Custom min-max range [0, 100]

302

Transform customRange = new NormalizeTransform("values", Normalize.MinMax, 0.0, 100.0);

303

304

List<Writable> input = Arrays.asList(new DoubleWritable(75.0));

305

List<Writable> normalized = minMaxNorm.map(input);

306

// Result depends on previously calculated min/max values from data

307

```

308

309

### Categorical Data Handling

310

311

Transformations for categorical and string data.

312

313

```java { .api }

314

public class CategoricalToIntegerTransform implements Transform {

315

public CategoricalToIntegerTransform(String columnName, List<String> categoryList);

316

}

317

318

public class CategoricalToOneHotTransform implements Transform {

319

public CategoricalToOneHotTransform(String columnName, List<String> categoryList);

320

}

321

322

public class StringToCategoricalTransform implements Transform {

323

public StringToCategoricalTransform(String columnName, List<String> categoryList);

324

}

325

326

public class ReplaceStringTransform implements Transform {

327

public ReplaceStringTransform(String columnName, Map<String, String> mapping);

328

}

329

```

330

331

**Usage Examples:**

332

333

```java

334

// Convert categories to integers

335

List<String> categories = Arrays.asList("small", "medium", "large");

336

Transform catToInt = new CategoricalToIntegerTransform("size", categories);

337

338

// Convert categories to one-hot encoding

339

Transform catToOneHot = new CategoricalToOneHotTransform("size", categories);

340

341

// String replacement mapping

342

Map<String, String> replacements = new HashMap<>();

343

replacements.put("yes", "1");

344

replacements.put("no", "0");

345

Transform stringReplace = new ReplaceStringTransform("response", replacements);

346

347

List<Writable> input = Arrays.asList(new Text("medium"));

348

List<Writable> intResult = catToInt.map(input);

349

// Result: [IntWritable(1)] - "medium" is index 1

350

351

List<Writable> oneHotResult = catToOneHot.map(input);

352

// Result: [IntWritable(0), IntWritable(1), IntWritable(0)] - one-hot for "medium"

353

```

354

355

### Conditional Transformations

356

357

Conditional logic for data transformations.

358

359

```java { .api }

360

public interface Condition {

361

boolean condition(List<Writable> list);

362

boolean condition(Object input);

363

String[] getColumnNames();

364

}

365

366

public enum ConditionOp {

367

Equal,

368

NotEqual,

369

LessThan,

370

LessOrEqual,

371

GreaterThan,

372

GreaterOrEqual

373

}

374

375

public class DoubleColumnCondition implements Condition {

376

public DoubleColumnCondition(String columnName, ConditionOp op, double value);

377

}

378

379

public class StringColumnCondition implements Condition {

380

public StringColumnCondition(String columnName, ConditionOp op, String value);

381

}

382

383

public class ConditionalReplaceValueTransform implements Transform {

384

public ConditionalReplaceValueTransform(String columnName, Condition condition, Writable newValue);

385

}

386

```

387

388

**Usage Examples:**

389

390

```java

391

// Replace negative values with zero

392

Condition negativeCondition = new DoubleColumnCondition("salary", ConditionOp.LessThan, 0.0);

393

Transform replaceNegative = new ConditionalReplaceValueTransform("salary", negativeCondition, new DoubleWritable(0.0));

394

395

// Replace specific string values

396

Condition invalidString = new StringColumnCondition("status", ConditionOp.Equal, "INVALID");

397

Transform replaceInvalid = new ConditionalReplaceValueTransform("status", invalidString, new Text("UNKNOWN"));

398

399

List<Writable> input = Arrays.asList(new DoubleWritable(-1000.0));

400

List<Writable> result = replaceNegative.map(input);

401

// Result: [DoubleWritable(0.0)] - negative value replaced with zero

402

```

403

404

## Integration Patterns

405

406

### With RecordReader

407

408

```java

409

// Apply transformations to record reader output

410

RecordReader reader = new CSVRecordReader();

411

reader.initialize(new FileSplit(new File("data.csv")));

412

413

TransformProcess tp = new TransformProcess.Builder(schema)

414

.normalize("feature1", Normalize.MinMax)

415

.categoricalToOneHot("category")

416

.build();

417

418

List<List<Writable>> transformedData = new ArrayList<>();

419

while (reader.hasNext()) {

420

List<Writable> record = reader.next();

421

List<Writable> transformed = tp.execute(record);

422

transformedData.add(transformed);

423

}

424

```

425

426

### With DataSetIterator

427

428

```java

429

// Transform data before creating DataSet

430

TransformProcessRecordReader transformReader = new TransformProcessRecordReader(baseReader, transformProcess);

431

432

DataSetIterator iterator = new RecordReaderDataSetIterator(

433

transformReader,

434

batchSize,

435

labelIndex,

436

numClasses

437

);

438

```

439

440

### Batch Processing

441

442

```java

443

// Process data in batches with transformations

444

List<List<Writable>> batch = new ArrayList<>();

445

// ... populate batch

446

447

// Apply transformation to entire batch

448

List<List<Writable>> transformedBatch = transformProcess.execute(batch);

449

450

// Process transformed batch

451

for (List<Writable> record : transformedBatch) {

452

// Handle transformed record

453

}

454

```

455

456

## Types

457

458

### Core Interfaces

459

460

```java { .api }

461

public interface Transform {

462

List<Writable> map(List<Writable> writables);

463

String[] outputColumnNames();

464

ColumnType[] outputColumnTypes();

465

String transform(String input);

466

}

467

468

public interface Condition {

469

boolean condition(List<Writable> list);

470

boolean condition(Object input);

471

String[] getColumnNames();

472

}

473

```

474

475

### Transform Process Classes

476

477

```java { .api }

478

public class TransformProcess;

479

public class TransformProcess.Builder;

480

public class TransformProcessRecordReader implements RecordReader;

481

```

482

483

### Column Operations

484

485

```java { .api }

486

public enum ColumnType;

487

public abstract class ColumnOp;

488

public class ConvertToString extends ColumnOp;

489

public class ConvertToDouble extends ColumnOp;

490

public class ConvertToInteger extends ColumnOp;

491

```

492

493

### Mathematical Operations

494

495

```java { .api }

496

public enum MathOp;

497

public class MathFunction implements Transform;

498

public class AddConstantColumnTransform implements Transform;

499

public class MultiplyConstantColumnTransform implements Transform;

500

```

501

502

### Reduction Operations

503

504

```java { .api }

505

public enum ReduceOp;

506

public enum StringReduceOp;

507

public class Reducer;

508

public class StringReducer;

509

```

510

511

### Normalization

512

513

```java { .api }

514

public enum Normalize;

515

public class NormalizeTransform implements Transform;

516

public class StandardizeTransform implements Transform;

517

```

518

519

### Categorical Transforms

520

521

```java { .api }

522

public class CategoricalToIntegerTransform implements Transform;

523

public class CategoricalToOneHotTransform implements Transform;

524

public class StringToCategoricalTransform implements Transform;

525

public class ReplaceStringTransform implements Transform;

526

```

527

528

### Conditional Operations

529

530

```java { .api }

531

public enum ConditionOp;

532

public class DoubleColumnCondition implements Condition;

533

public class StringColumnCondition implements Condition;

534

public class ConditionalReplaceValueTransform implements Transform;

535

```