or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

catalog.mdcolumns-functions.mddata-io.mddataset-dataframe.mdindex.mdsession-management.mdstreaming.mdtypes-encoders.mdudfs.md

columns-functions.mddocs/

0

# Column Operations and Functions

1

2

Column expressions and built-in SQL functions for data transformation. Provides both operator overloading and function-based APIs for creating complex expressions, with support for all SQL operations and advanced analytics functions.

3

4

## Capabilities

5

6

### Column Class

7

8

Represents a column in a DataFrame that can be composed into complex expressions.

9

10

```scala { .api }

11

/**

12

* A column in a DataFrame that can be composed into complex expressions

13

*/

14

class Column {

15

/** Equality comparison (SQL-safe null handling) */

16

def ===(other: Any): Column

17

def =!=(other: Any): Column

18

19

/** Numeric comparison */

20

def >(other: Any): Column

21

def >=(other: Any): Column

22

def <(other: Any): Column

23

def <=(other: Any): Column

24

25

/** Arithmetic operations */

26

def +(other: Any): Column

27

def -(other: Any): Column

28

def *(other: Any): Column

29

def /(other: Any): Column

30

def %(other: Any): Column

31

32

/** Logical operations */

33

def &&(other: Column): Column

34

def ||(other: Column): Column

35

def unary_!(): Column

36

37

/** Null handling */

38

def isNull: Column

39

def isNotNull: Column

40

def isNaN: Column

41

42

/** String operations */

43

def startsWith(other: Column): Column

44

def startsWith(literal: String): Column

45

def endsWith(other: Column): Column

46

def endsWith(literal: String): Column

47

def contains(other: Column): Column

48

def contains(literal: String): Column

49

def like(literal: String): Column

50

def rlike(literal: String): Column

51

52

/** Type operations */

53

def cast(to: DataType): Column

54

def cast(to: String): Column

55

56

/** Alias operations */

57

def as(alias: String): Column

58

def as(alias: Symbol): Column

59

def alias(alias: String): Column

60

61

/** Sorting */

62

def asc: Column

63

def asc_nulls_first: Column

64

def asc_nulls_last: Column

65

def desc: Column

66

def desc_nulls_first: Column

67

def desc_nulls_last: Column

68

69

/** Window operations */

70

def over(window: WindowSpec): Column

71

72

/** Collection operations */

73

def getItem(key: Any): Column

74

def getField(fieldName: String): Column

75

76

/** Boolean operations for arrays */

77

def isin(list: Any*): Column

78

}

79

```

80

81

### TypedColumn

82

83

Column with type information for input and output types.

84

85

```scala { .api }

86

/**

87

* Column with type information

88

* @tparam T Input type

89

* @tparam U Output type

90

*/

91

class TypedColumn[T, U] extends Column {

92

/** Apply to Dataset */

93

def apply(dataset: Dataset[T]): Dataset[U]

94

}

95

```

96

97

### Core Functions

98

99

Essential functions for column creation and manipulation.

100

101

```scala { .api }

102

object functions {

103

/** Create column from column name */

104

def col(colName: String): Column

105

def column(colName: String): Column

106

107

/** Create literal column */

108

def lit(literal: Any): Column

109

def typedLit[T : TypeTag](literal: T): Column

110

111

/** Conditional expressions */

112

def when(condition: Column, value: Any): Column

113

def coalesce(e: Column*): Column

114

def greatest(e: Column*): Column

115

def least(e: Column*): Column

116

def isnull(e: Column): Column

117

def isnan(e: Column): Column

118

def nanvl(col1: Column, col2: Column): Column

119

}

120

```

121

122

**Usage Examples:**

123

124

```scala

125

import org.apache.spark.sql.functions._

126

127

val df = spark.table("employees")

128

129

// Column operations

130

val filtered = df.filter(col("age") > 25 && col("department") === "Engineering")

131

val calculated = df.withColumn("bonus", col("salary") * 0.1)

132

val sorted = df.orderBy(col("salary").desc, col("name").asc)

133

134

// Conditional logic

135

val categorized = df.withColumn("category",

136

when(col("age") < 30, "Young")

137

.when(col("age") < 50, "Middle")

138

.otherwise("Senior")

139

)

140

141

// Null handling

142

val cleaned = df.filter(col("email").isNotNull && col("phone").isNotNull)

143

val filled = df.withColumn("phone", coalesce(col("phone"), lit("N/A")))

144

```

145

146

### Aggregate Functions

147

148

Statistical and aggregate functions for data analysis.

149

150

```scala { .api }

151

object functions {

152

/** Count functions */

153

def count(e: Column): Column

154

def countDistinct(expr: Column, exprs: Column*): Column

155

def approx_count_distinct(e: Column): Column

156

def approx_count_distinct(e: Column, rsd: Double): Column

157

158

/** Basic aggregates */

159

def sum(e: Column): Column

160

def sumDistinct(e: Column): Column

161

def avg(e: Column): Column

162

def mean(e: Column): Column

163

def min(e: Column): Column

164

def max(e: Column): Column

165

def first(e: Column): Column

166

def first(e: Column, ignoreNulls: Boolean): Column

167

def last(e: Column): Column

168

def last(e: Column, ignoreNulls: Boolean): Column

169

170

/** Statistical functions */

171

def stddev(e: Column): Column

172

def stddev_pop(e: Column): Column

173

def stddev_samp(e: Column): Column

174

def variance(e: Column): Column

175

def var_pop(e: Column): Column

176

def var_samp(e: Column): Column

177

def skewness(e: Column): Column

178

def kurtosis(e: Column): Column

179

def corr(column1: Column, column2: Column): Column

180

def covar_pop(column1: Column, column2: Column): Column

181

def covar_samp(column1: Column, column2: Column): Column

182

183

/** Collection aggregates */

184

def collect_list(e: Column): Column

185

def collect_set(e: Column): Column

186

}

187

```

188

189

### Mathematical Functions

190

191

Mathematical operations and functions.

192

193

```scala { .api }

194

object functions {

195

/** Basic math */

196

def abs(e: Column): Column

197

def negate(e: Column): Column

198

def sqrt(e: Column): Column

199

def cbrt(e: Column): Column

200

201

/** Power and logarithmic */

202

def pow(l: Column, r: Column): Column

203

def pow(l: Column, r: Double): Column

204

def exp(e: Column): Column

205

def expm1(e: Column): Column

206

def log(e: Column): Column

207

def log(base: Double, e: Column): Column

208

def log10(e: Column): Column

209

def log1p(e: Column): Column

210

def log2(e: Column): Column

211

212

/** Trigonometric */

213

def sin(e: Column): Column

214

def cos(e: Column): Column

215

def tan(e: Column): Column

216

def asin(e: Column): Column

217

def acos(e: Column): Column

218

def atan(e: Column): Column

219

def atan2(y: Column, x: Column): Column

220

def sinh(e: Column): Column

221

def cosh(e: Column): Column

222

def tanh(e: Column): Column

223

224

/** Rounding */

225

def round(e: Column): Column

226

def round(e: Column, scale: Int): Column

227

def bround(e: Column): Column

228

def bround(e: Column, scale: Int): Column

229

def ceil(e: Column): Column

230

def floor(e: Column): Column

231

232

/** Random */

233

def rand(): Column

234

def rand(seed: Long): Column

235

def randn(): Column

236

def randn(seed: Long): Column

237

}

238

```

239

240

### String Functions

241

242

String manipulation and processing functions.

243

244

```scala { .api }

245

object functions {

246

/** String operations */

247

def length(e: Column): Column

248

def char_length(e: Column): Column

249

def bit_length(e: Column): Column

250

def octet_length(e: Column): Column

251

252

/** Case conversion */

253

def upper(e: Column): Column

254

def lower(e: Column): Column

255

def initcap(e: Column): Column

256

257

/** Trimming */

258

def trim(e: Column): Column

259

def ltrim(e: Column): Column

260

def rtrim(e: Column): Column

261

262

/** Padding */

263

def lpad(str: Column, len: Int, pad: String): Column

264

def rpad(str: Column, len: Int, pad: String): Column

265

266

/** Concatenation */

267

def concat(exprs: Column*): Column

268

def concat_ws(sep: String, exprs: Column*): Column

269

270

/** Substring operations */

271

def substring(str: Column, pos: Int, len: Int): Column

272

def substring_index(str: Column, delim: String, count: Int): Column

273

def left(str: Column, len: Int): Column

274

def right(str: Column, len: Int): Column

275

276

/** Pattern matching */

277

def regexp_extract(e: Column, exp: String, groupIdx: Int): Column

278

def regexp_replace(e: Column, pattern: String, replacement: String): Column

279

def split(str: Column, pattern: String): Column

280

def split(str: Column, pattern: String, limit: Int): Column

281

282

/** String search */

283

def instr(str: Column, substring: String): Column

284

def locate(substr: String, str: Column): Column

285

def locate(substr: String, str: Column, pos: Int): Column

286

287

/** Encoding */

288

def ascii(e: Column): Column

289

def base64(e: Column): Column

290

def unbase64(e: Column): Column

291

def encode(value: Column, charset: String): Column

292

def decode(value: Column, charset: String): Column

293

294

/** Format */

295

def format_number(x: Column, d: Int): Column

296

def format_string(format: String, arguments: Column*): Column

297

}

298

```

299

300

### Date and Time Functions

301

302

Date, time, and timestamp manipulation functions.

303

304

```scala { .api }

305

object functions {

306

/** Current date/time */

307

def current_date(): Column

308

def current_timestamp(): Column

309

def now(): Column

310

def unix_timestamp(): Column

311

def unix_timestamp(s: Column): Column

312

def unix_timestamp(s: Column, p: String): Column

313

314

/** Date/time conversion */

315

def to_date(e: Column): Column

316

def to_date(e: Column, fmt: String): Column

317

def to_timestamp(s: Column): Column

318

def to_timestamp(s: Column, fmt: String): Column

319

def from_unixtime(ut: Column): Column

320

def from_unixtime(ut: Column, f: String): Column

321

322

/** Date arithmetic */

323

def date_add(start: Column, days: Int): Column

324

def date_sub(start: Column, days: Int): Column

325

def datediff(end: Column, start: Column): Column

326

def months_between(end: Column, start: Column): Column

327

def months_between(end: Column, start: Column, roundOff: Boolean): Column

328

def add_months(start: Column, months: Int): Column

329

def next_day(date: Column, dayOfWeek: String): Column

330

def last_day(e: Column): Column

331

332

/** Date/time extraction */

333

def year(e: Column): Column

334

def quarter(e: Column): Column

335

def month(e: Column): Column

336

def dayofmonth(e: Column): Column

337

def dayofweek(e: Column): Column

338

def dayofyear(e: Column): Column

339

def hour(e: Column): Column

340

def minute(e: Column): Column

341

def second(e: Column): Column

342

def weekofyear(e: Column): Column

343

344

/** Date/time formatting */

345

def date_format(dateExpr: Column, format: String): Column

346

def from_utc_timestamp(ts: Column, tz: String): Column

347

def to_utc_timestamp(ts: Column, tz: String): Column

348

349

/** Time window */

350

def window(timeColumn: Column, windowDuration: String): Column

351

def window(timeColumn: Column, windowDuration: String, slideDuration: String): Column

352

def window(timeColumn: Column, windowDuration: String, slideDuration: String, startTime: String): Column

353

}

354

```

355

356

### Array and Map Functions

357

358

Functions for working with complex data types.

359

360

```scala { .api }

361

object functions {

362

/** Array creation */

363

def array(cols: Column*): Column

364

def array(colName: String, colNames: String*): Column

365

def split(str: Column, pattern: String): Column

366

367

/** Array operations */

368

def array_contains(column: Column, value: Any): Column

369

def array_distinct(e: Column): Column

370

def array_except(col1: Column, col2: Column): Column

371

def array_intersect(col1: Column, col2: Column): Column

372

def array_join(column: Column, delimiter: String): Column

373

def array_join(column: Column, delimiter: String, nullReplacement: String): Column

374

def array_max(e: Column): Column

375

def array_min(e: Column): Column

376

def array_position(column: Column, value: Any): Column

377

def array_remove(column: Column, element: Any): Column

378

def array_repeat(e: Column, count: Int): Column

379

def array_sort(e: Column): Column

380

def array_union(col1: Column, col2: Column): Column

381

def arrays_overlap(a1: Column, a2: Column): Column

382

def arrays_zip(e: Column*): Column

383

def concat(exprs: Column*): Column

384

def element_at(column: Column, extraction: Any): Column

385

def explode(e: Column): Column

386

def explode_outer(e: Column): Column

387

def flatten(e: Column): Column

388

def posexplode(e: Column): Column

389

def posexplode_outer(e: Column): Column

390

def reverse(e: Column): Column

391

def sequence(start: Column, stop: Column): Column

392

def sequence(start: Column, stop: Column, step: Column): Column

393

def shuffle(e: Column): Column

394

def size(e: Column): Column

395

def slice(x: Column, start: Int, length: Int): Column

396

def sort_array(e: Column): Column

397

def sort_array(e: Column, asc: Boolean): Column

398

399

/** Map creation */

400

def map(cols: Column*): Column

401

def map_from_arrays(keys: Column, values: Column): Column

402

def map_from_entries(e: Column): Column

403

404

/** Map operations */

405

def map_concat(cols: Column*): Column

406

def map_entries(e: Column): Column

407

def map_filter(expr: Column, f: (Column, Column) => Column): Column

408

def map_keys(e: Column): Column

409

def map_values(e: Column): Column

410

def map_zip_with(left: Column, right: Column, f: (Column, Column, Column) => Column): Column

411

412

/** Struct operations */

413

def struct(cols: Column*): Column

414

def struct(colName: String, colNames: String*): Column

415

}

416

```

417

418

### Window Functions

419

420

Analytics functions for window-based calculations.

421

422

```scala { .api }

423

object functions {

424

/** Ranking functions */

425

def row_number(): Column

426

def rank(): Column

427

def dense_rank(): Column

428

def percent_rank(): Column

429

def ntile(n: Int): Column

430

def cume_dist(): Column

431

432

/** Analytic functions */

433

def lag(e: Column, offset: Int): Column

434

def lag(e: Column, offset: Int, defaultValue: Any): Column

435

def lead(e: Column, offset: Int): Column

436

def lead(e: Column, offset: Int, defaultValue: Any): Column

437

def first_value(e: Column): Column

438

def first_value(e: Column, ignoreNulls: Boolean): Column

439

def last_value(e: Column): Column

440

def last_value(e: Column, ignoreNulls: Boolean): Column

441

def nth_value(e: Column, n: Int): Column

442

def nth_value(e: Column, n: Int, ignoreNulls: Boolean): Column

443

}

444

445

/**

446

* Window specification for window functions

447

*/

448

class WindowSpec {

449

def partitionBy(colNames: String*): WindowSpec

450

def partitionBy(cols: Column*): WindowSpec

451

def orderBy(colNames: String*): WindowSpec

452

def orderBy(cols: Column*): WindowSpec

453

def rowsBetween(start: Long, end: Long): WindowSpec

454

def rangeBetween(start: Long, end: Long): WindowSpec

455

}

456

457

object Window {

458

/** Create unbounded window */

459

def partitionBy(colNames: String*): WindowSpec

460

def partitionBy(cols: Column*): WindowSpec

461

def orderBy(colNames: String*): WindowSpec

462

def orderBy(cols: Column*): WindowSpec

463

464

/** Window frame boundaries */

465

val unboundedPreceding: Long

466

val unboundedFollowing: Long

467

val currentRow: Long

468

}

469

```

470

471

**Usage Examples:**

472

473

```scala

474

import org.apache.spark.sql.functions._

475

import org.apache.spark.sql.expressions.Window

476

477

val sales = spark.table("sales")

478

479

// Window functions

480

val windowSpec = Window.partitionBy("region").orderBy(col("date"))

481

482

val salesWithRank = sales.withColumn("rank",

483

row_number().over(windowSpec)

484

)

485

486

val salesWithLag = sales.withColumn("prev_amount",

487

lag(col("amount"), 1).over(windowSpec)

488

)

489

490

// Running totals

491

val runningTotal = sales.withColumn("running_total",

492

sum(col("amount")).over(windowSpec.rowsBetween(Window.unboundedPreceding, Window.currentRow))

493

)

494

495

// Moving averages

496

val movingAvg = sales.withColumn("moving_avg_3",

497

avg(col("amount")).over(windowSpec.rowsBetween(-2, 0))

498

)

499

```