# DataFrame Operations

Core DataFrame functionality for distributed data processing. DataFrames are the primary data structure in Daft, providing lazy evaluation, distributed processing, and rich transformation capabilities.

## Capabilities

### DataFrame Creation

Create DataFrames from various Python data structures and external sources.

```python { .api }
def from_pydict(data: Dict[str, List[Any]]) -> DataFrame:
    """
    Create DataFrame from Python dictionary.

    Parameters:
    - data: Dictionary with column names as keys and lists of values

    Returns:
    DataFrame: New DataFrame instance
    """

def from_pylist(data: List[Dict[str, Any]]) -> DataFrame:
    """
    Create DataFrame from list of dictionaries.

    Parameters:
    - data: List of dictionaries representing rows

    Returns:
    DataFrame: New DataFrame instance
    """

def from_pandas(df: "pandas.DataFrame") -> DataFrame:
    """
    Create DataFrame from pandas DataFrame.

    Parameters:
    - df: pandas DataFrame to convert

    Returns:
    DataFrame: New DataFrame instance
    """

def from_arrow(table: "pyarrow.Table") -> DataFrame:
    """
    Create DataFrame from Apache Arrow table.

    Parameters:
    - table: pyarrow Table to convert

    Returns:
    DataFrame: New DataFrame instance
    """

def from_ray_dataset(ds: "ray.data.Dataset") -> DataFrame:
    """
    Create DataFrame from Ray dataset.

    Parameters:
    - ds: Ray dataset to convert

    Returns:
    DataFrame: New DataFrame instance
    """

def from_dask_dataframe(ddf: "dask.DataFrame") -> DataFrame:
    """
    Create DataFrame from Dask DataFrame.

    Parameters:
    - ddf: Dask DataFrame to convert

    Returns:
    DataFrame: New DataFrame instance
    """
```

### Selection and Projection

Select, rename, and transform columns in DataFrames.

```python { .api }
class DataFrame:
    def select(*columns: ColumnInputType, **projections: Expression) -> DataFrame:
        """
        Select columns and create new projections.

        Parameters:
        - columns: Column names or expressions to select
        - projections: Named expressions for new columns

        Returns:
        DataFrame: New DataFrame with selected columns
        """

    def exclude(*names: str) -> DataFrame:
        """
        Exclude columns by name.

        Parameters:
        - names: Column names to exclude

        Returns:
        DataFrame: New DataFrame without excluded columns
        """

    def with_column_renamed(existing: str, new: str) -> DataFrame:
        """
        Rename a single column.

        Parameters:
        - existing: Current column name
        - new: New column name

        Returns:
        DataFrame: New DataFrame with renamed column
        """

    def with_columns_renamed(cols_map: Dict[str, str]) -> DataFrame:
        """
        Rename multiple columns.

        Parameters:
        - cols_map: Dictionary mapping old names to new names

        Returns:
        DataFrame: New DataFrame with renamed columns
        """
```

### Filtering and Slicing

Filter rows based on conditions and slice DataFrames.

```python { .api }
class DataFrame:
    def filter(predicate: Union[Expression, str]) -> DataFrame:
        """
        Filter rows by condition.

        Parameters:
        - predicate: Boolean expression or SQL WHERE clause

        Returns:
        DataFrame: New DataFrame with filtered rows
        """

    def where(predicate: Union[Expression, str]) -> DataFrame:
        """
        Alias for filter().

        Parameters:
        - predicate: Boolean expression or SQL WHERE clause

        Returns:
        DataFrame: New DataFrame with filtered rows
        """

    def limit(num: int) -> DataFrame:
        """
        Limit to first N rows.

        Parameters:
        - num: Maximum number of rows to return

        Returns:
        DataFrame: New DataFrame with limited rows
        """

    def offset(num: int) -> DataFrame:
        """
        Skip first N rows.

        Parameters:
        - num: Number of rows to skip

        Returns:
        DataFrame: New DataFrame starting from offset
        """
```

### Data Cleaning

Remove duplicates, null values, and NaN values.

```python { .api }
class DataFrame:
    def drop_duplicates(*subset: ColumnInputType) -> DataFrame:
        """
        Remove duplicate rows.

        Parameters:
        - subset: Column names to consider for duplicates (all columns if empty)

        Returns:
        DataFrame: New DataFrame without duplicates
        """

    def distinct(*on: ColumnInputType) -> DataFrame:
        """
        Get distinct rows.

        Parameters:
        - on: Column names to consider for distinctness (all columns if empty)

        Returns:
        DataFrame: New DataFrame with distinct rows
        """

    def drop_null(*cols: ColumnInputType) -> DataFrame:
        """
        Drop rows with null values.

        Parameters:
        - cols: Column names to check for nulls (all columns if empty)

        Returns:
        DataFrame: New DataFrame without null rows
        """

    def drop_nan(*cols: ColumnInputType) -> DataFrame:
        """
        Drop rows with NaN values.

        Parameters:
        - cols: Column names to check for NaN (all columns if empty)

        Returns:
        DataFrame: New DataFrame without NaN rows
        """
```

### Grouping and Aggregation

Group data and perform aggregation operations.

```python { .api }
class DataFrame:
    def groupby(*group_by: ManyColumnsInputType) -> GroupedDataFrame:
        """
        Group DataFrame by columns.

        Parameters:
        - group_by: Column names or expressions to group by

        Returns:
        GroupedDataFrame: Grouped DataFrame for aggregation
        """

    def sum(*cols: ColumnInputType) -> DataFrame:
        """
        Sum numeric columns.

        Parameters:
        - cols: Column names to sum (all numeric columns if empty)

        Returns:
        DataFrame: DataFrame with sum aggregation
        """

    def mean(*cols: ColumnInputType) -> DataFrame:
        """
        Calculate mean of numeric columns.

        Parameters:
        - cols: Column names to average (all numeric columns if empty)

        Returns:
        DataFrame: DataFrame with mean aggregation
        """

    def count(*cols: ColumnInputType) -> DataFrame:
        """
        Count non-null values.

        Parameters:
        - cols: Column names to count (all columns if empty)

        Returns:
        DataFrame: DataFrame with count aggregation
        """

    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
        """
        General aggregation with expressions.

        Parameters:
        - to_agg: Aggregation expressions

        Returns:
        DataFrame: DataFrame with custom aggregations
        """

class GroupedDataFrame:
    def sum(*cols: ColumnInputType) -> DataFrame:
        """Sum within groups."""

    def mean(*cols: ColumnInputType) -> DataFrame:
        """Mean within groups."""

    def count(*cols: ColumnInputType) -> DataFrame:
        """Count within groups."""

    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
        """Custom aggregation within groups."""
```

### Set Operations

Combine DataFrames using set operations.

```python { .api }
class DataFrame:
    def union(other: DataFrame) -> DataFrame:
        """
        Union with another DataFrame (removes duplicates).

        Parameters:
        - other: DataFrame to union with

        Returns:
        DataFrame: Combined DataFrame without duplicates
        """

    def union_all(other: DataFrame) -> DataFrame:
        """
        Union all rows with another DataFrame (keeps duplicates).

        Parameters:
        - other: DataFrame to union with

        Returns:
        DataFrame: Combined DataFrame with all rows
        """

    def intersect(other: DataFrame) -> DataFrame:
        """
        Intersection with another DataFrame.

        Parameters:
        - other: DataFrame to intersect with

        Returns:
        DataFrame: DataFrame with common rows
        """

    def except_distinct(other: DataFrame) -> DataFrame:
        """
        Rows in this DataFrame but not in other (distinct).

        Parameters:
        - other: DataFrame to subtract

        Returns:
        DataFrame: DataFrame with difference
        """
```

### Transformations

Apply complex transformations and manipulations.

```python { .api }
class DataFrame:
    def explode(*columns: ColumnInputType) -> DataFrame:
        """
        Explode array/list columns into separate rows.

        Parameters:
        - columns: Array/list column names to explode

        Returns:
        DataFrame: DataFrame with exploded columns
        """

    def transform(func: Callable[..., DataFrame], *args: Any, **kwargs: Any) -> DataFrame:
        """
        Apply transformation function to DataFrame.

        Parameters:
        - func: Function that takes DataFrame and returns DataFrame
        - args: Positional arguments to pass to function
        - kwargs: Keyword arguments to pass to function

        Returns:
        DataFrame: Transformed DataFrame
        """
```

### Execution and Materialization

Execute lazy operations and materialize results.

```python { .api }
class DataFrame:
    def collect(num_preview_rows: Optional[int] = 8) -> DataFrame:
        """
        Execute lazy operations and collect results.

        Parameters:
        - num_preview_rows: Number of rows to preview (for display)

        Returns:
        DataFrame: Materialized DataFrame
        """

    def show(n: int = 8) -> None:
        """
        Display first N rows of DataFrame.

        Parameters:
        - n: Number of rows to display
        """

    def count_rows() -> int:
        """
        Count total number of rows (materializes data).

        Returns:
        int: Total row count
        """
```

### Partitioning

Control data distribution and partitioning.

```python { .api }
class DataFrame:
    def repartition(num: Optional[int], *partition_by: ColumnInputType) -> DataFrame:
        """
        Repartition DataFrame.

        Parameters:
        - num: Target number of partitions
        - partition_by: Columns to partition by

        Returns:
        DataFrame: Repartitioned DataFrame
        """

    def into_partitions(num: int) -> DataFrame:
        """
        Distribute into specified number of partitions.

        Parameters:
        - num: Number of partitions

        Returns:
        DataFrame: DataFrame with specified partitions
        """
```

### Data Export

Convert DataFrames to other formats.

```python { .api }
class DataFrame:
    def to_pandas(coerce_temporal_nanoseconds: bool = False) -> "pandas.DataFrame":
        """
        Convert to pandas DataFrame.

        Parameters:
        - coerce_temporal_nanoseconds: Whether to coerce temporal columns to nanosecond precision

        Returns:
        pandas.DataFrame: Converted DataFrame
        """

    def to_arrow() -> "pyarrow.Table":
        """
        Convert to Apache Arrow table.

        Returns:
        pyarrow.Table: Arrow representation
        """

    def to_pydict() -> Dict[str, List[Any]]:
        """
        Convert to Python dictionary.

        Returns:
        Dict: Dictionary with column names as keys
        """

    def to_pylist() -> List[Dict[str, Any]]:
        """
        Convert to list of dictionaries.

        Returns:
        List: List of row dictionaries
        """
```

## Usage Examples

### Basic DataFrame Operations

```python
import daft
from daft import col

# Create DataFrame
df = daft.from_pydict({
    "name": ["Alice", "Bob", "Charlie", "Diana"],
    "age": [25, 30, 35, 25],
    "salary": [50000, 75000, 85000, 60000],
    "department": ["Engineering", "Sales", "Engineering", "Marketing"]
})

# Filter and select
result = (df
    .filter(col("age") >= 30)
    .select("name", "department", (col("salary") * 1.1).alias("new_salary"))
    .collect()
)

# Group and aggregate
dept_stats = (df
    .groupby("department")
    .agg(
        col("salary").mean().alias("avg_salary"),
        col("age").max().alias("max_age"),
        col("name").count().alias("employee_count")
    )
    .collect()
)
```

### Data Cleaning Pipeline

```python
# Reuses `df` and `col` from the Basic DataFrame Operations example above.
# Remove duplicates and null values, then transform
cleaned_df = (df
    .drop_duplicates("name", "age")
    .drop_null("salary")
    .with_column_renamed("department", "dept")
    .filter(col("salary") > 0)
    .collect()
)
```

## Types

```python { .api }
ColumnInputType = Union[str, Expression]
ManyColumnsInputType = Union[ColumnInputType, Iterable[ColumnInputType]]
```