Tessl Tile for pypi/daft@0.6.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

ai-ml.md catalog.md data-io.md dataframe-operations.md expressions.md index.md session.md sql.md udf.md

docs/dataframe-operations.md

0
# DataFrame Operations
1

2
Core DataFrame functionality for distributed data processing. DataFrames are the primary data structure in Daft, providing lazy evaluation, distributed processing, and rich transformation capabilities.
3

4
## Capabilities
5

6
### DataFrame Creation
7

8
Create DataFrames from various Python data structures and external sources.
9

10
```python { .api }
11
def from_pydict(data: Dict[str, List[Any]]) -> DataFrame:
12
    """
13
    Create DataFrame from Python dictionary.
14
    
15
    Parameters:
16
    - data: Dictionary with column names as keys and lists of values
17
    
18
    Returns:
19
    DataFrame: New DataFrame instance
20
    """
21

22
def from_pylist(data: List[Dict[str, Any]]) -> DataFrame:
23
    """
24
    Create DataFrame from list of dictionaries.
25
    
26
    Parameters:
27
    - data: List of dictionaries representing rows
28
    
29
    Returns:
30
    DataFrame: New DataFrame instance
31
    """
32

33
def from_pandas(df: "pandas.DataFrame") -> DataFrame:
34
    """
35
    Create DataFrame from pandas DataFrame.
36
    
37
    Parameters:
38
    - df: pandas DataFrame to convert
39
    
40
    Returns:
41
    DataFrame: New DataFrame instance
42
    """
43

44
def from_arrow(table: "pyarrow.Table") -> DataFrame:
45
    """
46
    Create DataFrame from Apache Arrow table.
47
    
48
    Parameters:
49
    - table: pyarrow Table to convert
50
    
51
    Returns:
52
    DataFrame: New DataFrame instance
53
    """
54

55
def from_ray_dataset(ds: "ray.data.Dataset") -> DataFrame:
56
    """
57
    Create DataFrame from Ray dataset.
58
    
59
    Parameters:
60
    - ds: Ray dataset to convert
61
    
62
    Returns:
63
    DataFrame: New DataFrame instance
64
    """
65

66
def from_dask_dataframe(ddf: "dask.DataFrame") -> DataFrame:
67
    """
68
    Create DataFrame from Dask DataFrame.
69
    
70
    Parameters:
71
    - ddf: Dask DataFrame to convert
72
    
73
    Returns:
74
    DataFrame: New DataFrame instance
75
    """
76
```
77

78
### Selection and Projection
79

80
Select, rename, and transform columns in DataFrames.
81

82
```python { .api }
83
class DataFrame:
84
    def select(*columns: ColumnInputType, **projections: Expression) -> DataFrame:
85
        """
86
        Select columns and create new projections.
87
        
88
        Parameters:
89
        - columns: Column names or expressions to select
90
        - projections: Named expressions for new columns
91
        
92
        Returns:
93
        DataFrame: New DataFrame with selected columns
94
        """
95
    
96
    def exclude(*names: str) -> DataFrame:
97
        """
98
        Exclude columns by name.
99
        
100
        Parameters:
101
        - names: Column names to exclude
102
        
103
        Returns:
104
        DataFrame: New DataFrame without excluded columns
105
        """
106
    
107
    def with_column_renamed(existing: str, new: str) -> DataFrame:
108
        """
109
        Rename a single column.
110
        
111
        Parameters:
112
        - existing: Current column name
113
        - new: New column name
114
        
115
        Returns:
116
        DataFrame: New DataFrame with renamed column
117
        """
118
    
119
    def with_columns_renamed(cols_map: Dict[str, str]) -> DataFrame:
120
        """
121
        Rename multiple columns.
122
        
123
        Parameters:
124
        - cols_map: Dictionary mapping old names to new names
125
        
126
        Returns:
127
        DataFrame: New DataFrame with renamed columns
128
        """
129
```
130

131
### Filtering and Slicing
132

133
Filter rows based on conditions and slice DataFrames.
134

135
```python { .api }
136
class DataFrame:
137
    def filter(predicate: Union[Expression, str]) -> DataFrame:
138
        """
139
        Filter rows by condition.
140
        
141
        Parameters:
142
        - predicate: Boolean expression or SQL WHERE clause
143
        
144
        Returns:
145
        DataFrame: New DataFrame with filtered rows
146
        """
147
    
148
    def where(predicate: Union[Expression, str]) -> DataFrame:
149
        """
150
        Alias for filter().
151
        
152
        Parameters:
153
        - predicate: Boolean expression or SQL WHERE clause
154
        
155
        Returns:
156
        DataFrame: New DataFrame with filtered rows
157
        """
158
    
159
    def limit(num: int) -> DataFrame:
160
        """
161
        Limit to first N rows.
162
        
163
        Parameters:
164
        - num: Maximum number of rows to return
165
        
166
        Returns:
167
        DataFrame: New DataFrame with limited rows
168
        """
169
    
170
    def offset(num: int) -> DataFrame:
171
        """
172
        Skip first N rows.
173
        
174
        Parameters:
175
        - num: Number of rows to skip
176
        
177
        Returns:
178
        DataFrame: New DataFrame starting from offset
179
        """
180
```
181

182
### Data Cleaning
183

184
Remove duplicate rows and rows containing null or NaN values.
185

186
```python { .api }
187
class DataFrame:
188
    def drop_duplicates(*subset: ColumnInputType) -> DataFrame:
189
        """
190
        Remove duplicate rows.
191
        
192
        Parameters:
193
        - subset: Column names to consider for duplicates (all columns if empty)
194
        
195
        Returns:
196
        DataFrame: New DataFrame without duplicates
197
        """
198
    
199
    def distinct(*on: ColumnInputType) -> DataFrame:
200
        """
201
        Get distinct rows.
202
        
203
        Parameters:
204
        - on: Column names to consider for distinctness (all columns if empty)
205
        
206
        Returns:
207
        DataFrame: New DataFrame with distinct rows
208
        """
209
    
210
    def drop_null(*cols: ColumnInputType) -> DataFrame:
211
        """
212
        Drop rows with null values.
213
        
214
        Parameters:
215
        - cols: Column names to check for nulls (all columns if empty)
216
        
217
        Returns:
218
        DataFrame: New DataFrame without null rows
219
        """
220
    
221
    def drop_nan(*cols: ColumnInputType) -> DataFrame:
222
        """
223
        Drop rows with NaN values.
224
        
225
        Parameters:
226
        - cols: Column names to check for NaN (all columns if empty)
227
        
228
        Returns:
229
        DataFrame: New DataFrame without NaN rows
230
        """
231
```
232

233
### Grouping and Aggregation
234

235
Group data and perform aggregation operations.
236

237
```python { .api }
238
class DataFrame:
239
    def groupby(*group_by: ManyColumnsInputType) -> GroupedDataFrame:
240
        """
241
        Group DataFrame by columns.
242
        
243
        Parameters:
244
        - group_by: Column names or expressions to group by
245
        
246
        Returns:
247
        GroupedDataFrame: Grouped DataFrame for aggregation
248
        """
249
    
250
    def sum(*cols: ColumnInputType) -> DataFrame:
251
        """
252
        Sum numeric columns.
253
        
254
        Parameters:
255
        - cols: Column names to sum (all numeric columns if empty)
256
        
257
        Returns:
258
        DataFrame: DataFrame with sum aggregation
259
        """
260
    
261
    def mean(*cols: ColumnInputType) -> DataFrame:
262
        """
263
        Calculate mean of numeric columns.
264
        
265
        Parameters:
266
        - cols: Column names to average (all numeric columns if empty)
267
        
268
        Returns:
269
        DataFrame: DataFrame with mean aggregation
270
        """
271
    
272
    def count(*cols: ColumnInputType) -> DataFrame:
273
        """
274
        Count non-null values.
275
        
276
        Parameters:
277
        - cols: Column names to count (all columns if empty)
278
        
279
        Returns:
280
        DataFrame: DataFrame with count aggregation
281
        """
282
    
283
    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
284
        """
285
        General aggregation with expressions.
286
        
287
        Parameters:
288
        - to_agg: Aggregation expressions
289
        
290
        Returns:
291
        DataFrame: DataFrame with custom aggregations
292
        """
293

294
class GroupedDataFrame:
295
    def sum(*cols: ColumnInputType) -> DataFrame:
296
        """Sum within groups."""
297
    
298
    def mean(*cols: ColumnInputType) -> DataFrame:
299
        """Mean within groups."""
300
    
301
    def count(*cols: ColumnInputType) -> DataFrame:
302
        """Count within groups."""
303
    
304
    def agg(*to_agg: Union[Expression, Iterable[Expression]]) -> DataFrame:
305
        """Custom aggregation within groups."""
306
```
307

308
### Set Operations
309

310
Combine DataFrames using set operations.
311

312
```python { .api }
313
class DataFrame:
314
    def union(other: DataFrame) -> DataFrame:
315
        """
316
        Union with another DataFrame (removes duplicates).
317
        
318
        Parameters:
319
        - other: DataFrame to union with
320
        
321
        Returns:
322
        DataFrame: Combined DataFrame without duplicates
323
        """
324
    
325
    def union_all(other: DataFrame) -> DataFrame:
326
        """
327
        Union all rows with another DataFrame (keeps duplicates).
328
        
329
        Parameters:
330
        - other: DataFrame to union with
331
        
332
        Returns:
333
        DataFrame: Combined DataFrame with all rows
334
        """
335
    
336
    def intersect(other: DataFrame) -> DataFrame:
337
        """
338
        Intersection with another DataFrame.
339
        
340
        Parameters:
341
        - other: DataFrame to intersect with
342
        
343
        Returns:
344
        DataFrame: DataFrame with common rows
345
        """
346
    
347
    def except_distinct(other: DataFrame) -> DataFrame:
348
        """
349
        Rows in this DataFrame but not in other (distinct).
350
        
351
        Parameters:
352
        - other: DataFrame to subtract
353
        
354
        Returns:
355
        DataFrame: DataFrame with difference
356
        """
357
```
358

359
### Transformations
360

361
Apply complex transformations and manipulations.
362

363
```python { .api }
364
class DataFrame:
365
    def explode(*columns: ColumnInputType) -> DataFrame:
366
        """
367
        Explode array/list columns into separate rows.
368
        
369
        Parameters:
370
        - columns: Array/list column names to explode
371
        
372
        Returns:
373
        DataFrame: DataFrame with exploded columns
374
        """
375
    
376
    def transform(func: Callable[..., DataFrame], *args: Any, **kwargs: Any) -> DataFrame:
377
        """
378
        Apply transformation function to DataFrame.
379
        
380
        Parameters:
381
        - func: Function that takes DataFrame and returns DataFrame
382
        - args: Positional arguments to pass to function
383
        - kwargs: Keyword arguments to pass to function
384
        
385
        Returns:
386
        DataFrame: Transformed DataFrame
387
        """
388
```
389

390
### Execution and Materialization
391

392
Execute lazy operations and materialize results.
393

394
```python { .api }
395
class DataFrame:
396
    def collect(num_preview_rows: Optional[int] = 8) -> DataFrame:
397
        """
398
        Execute lazy operations and collect results.
399
        
400
        Parameters:
401
        - num_preview_rows: Number of rows to preview (for display)
402
        
403
        Returns:
404
        DataFrame: Materialized DataFrame
405
        """
406
    
407
    def show(n: int = 8) -> None:
408
        """
409
        Display first N rows of DataFrame.
410
        
411
        Parameters:
412
        - n: Number of rows to display
413
        """
414
    
415
    def count_rows() -> int:
416
        """
417
        Count total number of rows (materializes data).
418
        
419
        Returns:
420
        int: Total row count
421
        """
422
```
423

424
### Partitioning
425

426
Control data distribution and partitioning.
427

428
```python { .api }
429
class DataFrame:
430
    def repartition(num: Optional[int], *partition_by: ColumnInputType) -> DataFrame:
431
        """
432
        Repartition DataFrame.
433
        
434
        Parameters:
435
        - num: Target number of partitions
436
        - partition_by: Columns to partition by
437
        
438
        Returns:
439
        DataFrame: Repartitioned DataFrame
440
        """
441
    
442
    def into_partitions(num: int) -> DataFrame:
443
        """
444
        Distribute into specified number of partitions.
445
        
446
        Parameters:
447
        - num: Number of partitions
448
        
449
        Returns:
450
        DataFrame: DataFrame with specified partitions
451
        """
452
```
453

454
### Data Export
455

456
Convert DataFrames to other formats.
457

458
```python { .api }
459
class DataFrame:
460
    def to_pandas(coerce_temporal_nanoseconds: bool = False) -> "pandas.DataFrame":
461
        """
462
        Convert to pandas DataFrame.
463
        
464
        Parameters:
465
        - coerce_temporal_nanoseconds: Whether to coerce temporal columns to nanosecond resolution
466
        
467
        Returns:
468
        pandas.DataFrame: Converted DataFrame
469
        """
470
    
471
    def to_arrow() -> "pyarrow.Table":
472
        """
473
        Convert to Apache Arrow table.
474
        
475
        Returns:
476
        pyarrow.Table: Arrow representation
477
        """
478
    
479
    def to_pydict() -> Dict[str, List[Any]]:
480
        """
481
        Convert to Python dictionary.
482
        
483
        Returns:
484
        Dict: Dictionary with column names as keys
485
        """
486
    
487
    def to_pylist() -> List[Dict[str, Any]]:
488
        """
489
        Convert to list of dictionaries.
490
        
491
        Returns:
492
        List: List of row dictionaries
493
        """
494
```
495

496
## Usage Examples
497

498
### Basic DataFrame Operations
499
```python
500
import daft
501
from daft import col
502

503
# Create DataFrame
504
df = daft.from_pydict({
505
    "name": ["Alice", "Bob", "Charlie", "Diana"],
506
    "age": [25, 30, 35, 25],
507
    "salary": [50000, 75000, 85000, 60000],
508
    "department": ["Engineering", "Sales", "Engineering", "Marketing"]
509
})
510

511
# Filter and select
512
result = (df
513
    .filter(col("age") >= 30)
514
    .select("name", "department", (col("salary") * 1.1).alias("new_salary"))
515
    .collect()
516
)
517

518
# Group and aggregate
519
dept_stats = (df
520
    .groupby("department")
521
    .agg(
522
        col("salary").mean().alias("avg_salary"),
523
        col("age").max().alias("max_age"),
524
        col("name").count().alias("employee_count")
525
    )
526
    .collect()
527
)
528
```
529

530
### Data Cleaning Pipeline
531
```python
532
# Remove duplicates and null values, then transform
533
cleaned_df = (df
534
    .drop_duplicates("name", "age")
535
    .drop_null("salary")
536
    .with_column_renamed("department", "dept")
537
    .filter(col("salary") > 0)
538
    .collect()
539
)
540
```
541

542
## Types
543

544
```python { .api }
545
ColumnInputType = Union[str, Expression]
546
ManyColumnsInputType = Union[ColumnInputType, Iterable[ColumnInputType]]
547
```

Version

Tile

Files

docs/dataframe-operations.md

docs/dataframe-operations.md