# DataFrame Operations

Core DataFrame functionality for distributed data processing. DataFrames are the primary data structure in Daft, providing lazy evaluation, distributed processing, and rich transformation capabilities.

## Capabilities

### DataFrame Creation

Create DataFrames from various Python data structures and external sources.

```python { .api }
def from_pydict(data: Dict[str, List[Any]]) -> DataFrame:
    """
    Create DataFrame from Python dictionary.

    Parameters:
    - data: Dictionary with column names as keys and lists of values

    Returns:
        DataFrame: New DataFrame instance
    """

def from_pylist(data: List[Dict[str, Any]]) -> DataFrame:
    """
    Create DataFrame from list of dictionaries.

    Parameters:
    - data: List of dictionaries representing rows

    Returns:
        DataFrame: New DataFrame instance
    """

def from_pandas(df: "pandas.DataFrame") -> DataFrame:
    """
    Create DataFrame from pandas DataFrame.

    Parameters:
    - df: pandas DataFrame to convert

    Returns:
        DataFrame: New DataFrame instance
    """

def from_arrow(table: "pyarrow.Table") -> DataFrame:
    """
    Create DataFrame from Apache Arrow table.

    Parameters:
    - table: pyarrow Table to convert

    Returns:
        DataFrame: New DataFrame instance
    """

def from_ray_dataset(ds: "ray.data.Dataset") -> DataFrame:
    """
    Create DataFrame from Ray dataset.

    Parameters:
    - ds: Ray dataset to convert

    Returns:
        DataFrame: New DataFrame instance
    """

def from_dask_dataframe(ddf: "dask.DataFrame") -> DataFrame:
    """
    Create DataFrame from Dask DataFrame.

    Parameters:
    - ddf: Dask DataFrame to convert

    Returns:
        DataFrame: New DataFrame instance
    """
```

### Selection and Projection

Select, rename, and transform columns in DataFrames.

```python { .api }
class DataFrame:
    def select(*columns: ColumnInputType, **projections: Expression) -> DataFrame:
        """
        Select columns and create new projections.

        Parameters:
        - columns: Column names or expressions to select
        - projections: Named expressions for new columns

        Returns:
            DataFrame: New DataFrame with selected columns
        """

    def exclude(*names: str) -> DataFrame:
        """
        Exclude columns by name.

        Parameters:
        - names: Column names to exclude

        Returns:
            DataFrame: New DataFrame without excluded columns
        """

    def with_column_renamed(existing: str, new: str) -> DataFrame:
        """
        Rename a single column.

        Parameters:
        - existing: Current column name
        - new: New column name

        Returns:
            DataFrame: New DataFrame with renamed column
        """

    def with_columns_renamed(cols_map: Dict[str, str]) -> DataFrame:
        """
        Rename multiple columns.

        Parameters:
        - cols_map: Dictionary mapping old names to new names

        Returns:
            DataFrame: New DataFrame with renamed columns
        """
```

### Filtering and Slicing

Filter rows based on conditions and slice DataFrames.

```python { .api }
class DataFrame:
    def filter(predicate: Union[Expression, str]) -> DataFrame:
        """
        Filter rows by condition.

        Parameters:
        - predicate: Boolean expression or SQL WHERE clause

        Returns:
            DataFrame: New DataFrame with filtered rows
        """

    def where(predicate: Union[Expression, str]) -> DataFrame:
        """
        Alias for filter().

        Parameters:
        - predicate: Boolean expression or SQL WHERE clause

        Returns:
            DataFrame: New DataFrame with filtered rows
        """

    def limit(num: int) -> DataFrame:
        """
        Limit to first N rows.

        Parameters:
        - num: Maximum number of rows to return

        Returns:
            DataFrame: New DataFrame with limited rows
        """

    def offset(num: int) -> DataFrame:
        """
        Skip first N rows.

        Parameters:
        - num: Number of rows to skip

        Returns:
            DataFrame: New DataFrame starting from offset
        """
```

### Data Cleaning

Remove duplicates, null values, and NaN values.

```python { .api }
class DataFrame:
    def drop_duplicates(*subset: ColumnInputType) -> DataFrame:
        """
        Remove duplicate rows.

        Parameters:
        - subset: Column names to consider for duplicates (all columns if empty)

        Returns:
            DataFrame: New DataFrame without duplicates
        """

    def distinct(*on: ColumnInputType) -> DataFrame:
        """
        Get distinct rows.

        Parameters:
        - on: Column names to consider for distinctness (all columns if empty)

        Returns:
            DataFrame: New DataFrame with distinct rows
        """

    def drop_null(*cols: ColumnInputType) -> DataFrame:
        """
        Drop rows with null values.

        Parameters:
        - cols: Column names to check for nulls (all columns if empty)

        Returns:
            DataFrame: New DataFrame without null rows
        """

    def drop_nan(*cols: ColumnInputType) -> DataFrame:
        """
        Drop rows with NaN values.

        Parameters:
        - cols: Column names to check for NaN (all columns if empty)

        Returns:
            DataFrame: New DataFrame without NaN rows
        """
```

### Grouping and Aggregation

Group data and perform aggregation operations.

```python { .api }
class DataFrame:
    def groupby(*group_by: ManyColumnsInputType) -> GroupedDataFrame:
        """
        Group DataFrame by columns.

        Parameters:
        - group_by: Column names or expressions to group by

        Returns:
            GroupedDataFrame: Grouped DataFrame for aggregation
        """

    def sum(*cols: ColumnInputType) -> DataFrame:
        """
        Sum numeric columns.

        Parameters:
        - cols: Column names to sum (all numeric columns if empty)

        Returns:
            DataFrame: DataFrame with sum aggregation
        """

    def mean(*cols: ColumnInputType) -> DataFrame:
        """
        Calculate mean of numeric columns.

        Parameters:
        - cols: Column names to average (all numeric columns if empty)

        Returns:
            DataFrame: DataFrame with mean aggregation
        """

    def count(*cols: ColumnInputType) -> DataFrame:
        """
        Count non-null values.

        Parameters:
        - cols: Column names to count (all columns if empty)

        Returns:
            DataFrame: DataFrame with count aggregation
        """

    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
        """
        General aggregation with expressions.

        Parameters:
        - to_agg: Aggregation expressions

        Returns:
            DataFrame: DataFrame with custom aggregations
        """

class GroupedDataFrame:
    def sum(*cols: ColumnInputType) -> DataFrame:
        """Sum within groups."""

    def mean(*cols: ColumnInputType) -> DataFrame:
        """Mean within groups."""

    def count(*cols: ColumnInputType) -> DataFrame:
        """Count within groups."""

    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
        """Custom aggregation within groups."""
```

### Set Operations

Combine DataFrames using set operations.

```python { .api }
class DataFrame:
    def union(other: DataFrame) -> DataFrame:
        """
        Union with another DataFrame (removes duplicates).

        Parameters:
        - other: DataFrame to union with

        Returns:
            DataFrame: Combined DataFrame without duplicates
        """

    def union_all(other: DataFrame) -> DataFrame:
        """
        Union all rows with another DataFrame (keeps duplicates).

        Parameters:
        - other: DataFrame to union with

        Returns:
            DataFrame: Combined DataFrame with all rows
        """

    def intersect(other: DataFrame) -> DataFrame:
        """
        Intersection with another DataFrame.

        Parameters:
        - other: DataFrame to intersect with

        Returns:
            DataFrame: DataFrame with common rows
        """

    def except_distinct(other: DataFrame) -> DataFrame:
        """
        Rows in this DataFrame but not in other (distinct).

        Parameters:
        - other: DataFrame to subtract

        Returns:
            DataFrame: DataFrame with difference
        """
```

### Transformations

Apply complex transformations and manipulations.

```python { .api }
class DataFrame:
    def explode(*columns: ColumnInputType) -> DataFrame:
        """
        Explode array/list columns into separate rows.

        Parameters:
        - columns: Array/list column names to explode

        Returns:
            DataFrame: DataFrame with exploded columns
        """

    def transform(func: Callable[..., DataFrame], *args: Any, **kwargs: Any) -> DataFrame:
        """
        Apply transformation function to DataFrame.

        Parameters:
        - func: Function that takes DataFrame and returns DataFrame
        - args: Positional arguments to pass to function
        - kwargs: Keyword arguments to pass to function

        Returns:
            DataFrame: Transformed DataFrame
        """
```

### Execution and Materialization

Execute lazy operations and materialize results.

```python { .api }
class DataFrame:
    def collect(num_preview_rows: Optional[int] = 8) -> DataFrame:
        """
        Execute lazy operations and collect results.

        Parameters:
        - num_preview_rows: Number of rows to preview (for display)

        Returns:
            DataFrame: Materialized DataFrame
        """

    def show(n: int = 8) -> None:
        """
        Display first N rows of DataFrame.

        Parameters:
        - n: Number of rows to display
        """

    def count_rows() -> int:
        """
        Count total number of rows (materializes data).

        Returns:
            int: Total row count
        """
```

### Partitioning

Control data distribution and partitioning.

```python { .api }
class DataFrame:
    def repartition(num: Optional[int], *partition_by: ColumnInputType) -> DataFrame:
        """
        Repartition DataFrame.

        Parameters:
        - num: Target number of partitions
        - partition_by: Columns to partition by

        Returns:
            DataFrame: Repartitioned DataFrame
        """

    def into_partitions(num: int) -> DataFrame:
        """
        Distribute into specified number of partitions.

        Parameters:
        - num: Number of partitions

        Returns:
            DataFrame: DataFrame with specified partitions
        """
```

### Data Export

Convert DataFrames to other formats.

```python { .api }
class DataFrame:
    def to_pandas(coerce_temporal_nanoseconds: bool = False) -> "pandas.DataFrame":
        """
        Convert to pandas DataFrame.

        Parameters:
        - coerce_temporal_nanoseconds: Handle nanosecond precision

        Returns:
            pandas.DataFrame: Converted DataFrame
        """

    def to_arrow() -> "pyarrow.Table":
        """
        Convert to Apache Arrow table.

        Returns:
            pyarrow.Table: Arrow representation
        """

    def to_pydict() -> Dict[str, List[Any]]:
        """
        Convert to Python dictionary.

        Returns:
            Dict: Dictionary with column names as keys
        """

    def to_pylist() -> List[Dict[str, Any]]:
        """
        Convert to list of dictionaries.

        Returns:
            List: List of row dictionaries
        """
```

## Usage Examples

### Basic DataFrame Operations

```python
import daft
from daft import col

# Create DataFrame
df = daft.from_pydict({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 25],
    "salary": [50000, 75000, 85000, 60000],
    "department": ["Engineering", "Sales", "Engineering", "Marketing"]
})

# Filter and select
result = (df
    .filter(col("age") >= 30)
    .select("name", "department", (col("salary") * 1.1).alias("new_salary"))
    .collect()
)

# Group and aggregate
dept_stats = (df
    .groupby("department")
    .agg(
        col("salary").mean().alias("avg_salary"),
        col("age").max().alias("max_age"),
        col("name").count().alias("employee_count")
    )
    .collect()
)
```

### Data Cleaning Pipeline

```python
# Remove duplicates and null values, then transform
cleaned_df = (df
    .drop_duplicates("name", "age")
    .drop_null("salary")
    .with_column_renamed("department", "dept")
    .filter(col("salary") > 0)
    .collect()
)
```

## Types

```python { .api }
ColumnInputType = Union[str, Expression]
ManyColumnsInputType = Union[ColumnInputType, Iterable[ColumnInputType]]
```