# DataFrames

Pandas-compatible distributed DataFrames for larger-than-memory datasets. Dask DataFrames partition data across multiple pandas DataFrames, enabling familiar pandas operations on datasets that don't fit in memory.
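
A minimal sketch of the partitioning model, using `from_pandas` on a small in-memory frame (the column names are illustrative):

```python
import pandas as pd
import dask.dataframe as dd

# One logical DataFrame backed by 4 pandas DataFrames (partitions)
pdf = pd.DataFrame({'x': range(1000), 'y': range(1000)})
ddf = dd.from_pandas(pdf, npartitions=4)

print(ddf.npartitions)        # 4
print(ddf.divisions)          # index boundaries between partitions
print(ddf.x.sum().compute())  # operations stay lazy until .compute()
```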

## Capabilities

### DataFrame Creation

Create Dask DataFrames from various sources including files, pandas objects, and other collections.
```python { .api }
def from_pandas(df, npartitions=None, chunksize=None, sort=True, name=None):
    """
    Create a Dask DataFrame from a pandas DataFrame.

    Parameters:
    - df: pandas DataFrame or Series
    - npartitions: Number of partitions to create
    - chunksize: Approximate number of rows per partition
    - sort: Whether to sort by index
    - name: Custom name for the collection

    Returns:
    dask.dataframe.DataFrame or Series: Dask collection
    """

def from_array(x, columns=None, index=None, meta=None):
    """
    Create a DataFrame from a Dask array.

    Parameters:
    - x: Array to convert (1D yields a Series, 2D a DataFrame)
    - columns: Column names
    - index: Index for the DataFrame
    - meta: Metadata DataFrame

    Returns:
    dask.dataframe.DataFrame: Dask DataFrame
    """

def from_dict(data, npartitions, orient='columns', dtype=None, columns=None):
    """
    Create a DataFrame from a dictionary of sequences.

    Parameters:
    - data: Dictionary of array-like values
    - npartitions: Number of partitions
    - orient: Data orientation ('columns' or 'index')
    - dtype: Data type
    - columns: Column names

    Returns:
    dask.dataframe.DataFrame: Dask DataFrame
    """

def from_delayed(dfs, meta=None, divisions=None, prefix='from-delayed',
                 verify_meta=True):
    """
    Create a DataFrame from delayed objects.

    Parameters:
    - dfs: List of delayed pandas DataFrame objects
    - meta: Metadata DataFrame for type inference
    - divisions: Index divisions between partitions
    - prefix: Name prefix for task keys
    - verify_meta: Check metadata consistency

    Returns:
    dask.dataframe.DataFrame: Dask DataFrame
    """
```
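
A short sketch of the constructors above; the delayed loader and file names are hypothetical stand-ins:

```python
import pandas as pd
import dask.dataframe as dd
from dask import delayed

# From an in-memory pandas object
ddf = dd.from_pandas(pd.DataFrame({'a': range(100)}), npartitions=4)

# From a dict of sequences (from_dict is available in recent Dask releases)
ddf2 = dd.from_dict({'x': range(10), 'y': range(10)}, npartitions=2)

# From delayed objects -- useful when a custom loader yields pandas frames.
# load_chunk and the paths below are hypothetical.
@delayed
def load_chunk(path):
    return pd.read_csv(path)

parts = [load_chunk(p) for p in ['part0.csv', 'part1.csv']]
meta = pd.DataFrame({'a': pd.Series(dtype='int64')})  # schema hint
ddf3 = dd.from_delayed(parts, meta=meta)
```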

### File I/O Operations

Read and write DataFrames in various file formats.

```python { .api }
def read_csv(urlpath, blocksize=None, collection=True, **kwargs):
    """
    Read CSV files into a Dask DataFrame.

    Parameters:
    - urlpath: File path or glob pattern (supports wildcards)
    - blocksize: Size of each partition in bytes
    - collection: Return a DataFrame (True) or delayed objects (False)
    - **kwargs: Additional pandas.read_csv arguments

    Returns:
    dask.dataframe.DataFrame: Dask DataFrame
    """

def read_parquet(path, columns=None, filters=None, categories=None,
                 index=None, storage_options=None, **kwargs):
    """
    Read Parquet files into a Dask DataFrame.

    Parameters:
    - path: File path or directory
    - columns: Columns to read
    - filters: Row filters to apply
    - categories: Columns to treat as categorical
    - index: Column to use as index
    - storage_options: Storage backend options
    - **kwargs: Additional arguments

    Returns:
    dask.dataframe.DataFrame: Dask DataFrame
    """

def read_json(urlpath, orient='records', lines=None, **kwargs):
    """Read JSON files into a Dask DataFrame."""

def read_hdf(pattern, key, start=None, stop=None, **kwargs):
    """Read HDF5 files into a Dask DataFrame."""

def read_sql_table(table, uri, index_col, divisions=None,
                   npartitions=None, **kwargs):
    """Read a SQL table into a Dask DataFrame."""

def read_sql_query(sql, uri, index_col, divisions=None,
                   npartitions=None, **kwargs):
    """Read SQL query results into a Dask DataFrame."""

def to_csv(df, filename, **kwargs):
    """Write a DataFrame to CSV files (one file per partition)."""

def to_parquet(df, path, **kwargs):
    """Write a DataFrame to Parquet format."""

def to_json(df, filename, **kwargs):
    """Write a DataFrame to JSON files."""

def to_hdf(df, path, key, **kwargs):
    """Write a DataFrame to HDF5 format."""

def to_sql(df, name, uri, **kwargs):
    """Write a DataFrame to a SQL database."""
```
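
A sketch of typical reader/writer usage; paths and column names are placeholders:

```python
import dask.dataframe as dd

# One partition per ~64 MB block; glob patterns read many files at once
df = dd.read_csv('data/*.csv', blocksize='64MB')

# Column pruning and row filters push down into the Parquet reader
events = dd.read_parquet('events.parquet',
                         columns=['id', 'value'],
                         filters=[('value', '>', 0)])

# Writers emit one file per partition by default
df.to_parquet('out/')        # out/part.0.parquet, out/part.1.parquet, ...
df.to_csv('out/part-*.csv')  # '*' is replaced by the partition number
```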

### Core DataFrame Class

The main DataFrame class, with a pandas-compatible interface.

```python { .api }
class DataFrame:
    """
    Distributed pandas-like DataFrame.

    Properties:
    - columns: Index - Column names
    - dtypes: Series - Data types of columns
    - index: Index - Row index
    - shape: tuple - Shape (the row count is lazy until computed)
    - ndim: int - Number of dimensions (always 2)
    - npartitions: int - Number of partitions
    - divisions: tuple - Index divisions between partitions
    """

    def compute(self, scheduler=None, **kwargs):
        """
        Compute the DataFrame and return a pandas result.

        Returns:
        pandas.DataFrame: Computed DataFrame
        """

    def persist(self, scheduler=None, **kwargs):
        """
        Persist the DataFrame in memory for reuse.

        Returns:
        dask.dataframe.DataFrame: Persisted DataFrame
        """

    def head(self, n=5, npartitions=1, compute=True):
        """
        Return the first n rows.

        Parameters:
        - n: Number of rows to return
        - npartitions: Number of partitions to search
        - compute: Whether to compute the result

        Returns:
        pandas.DataFrame or dask.dataframe.DataFrame: First n rows
        """

    def tail(self, n=5, compute=True):
        """Return the last n rows."""

    def __getitem__(self, key):
        """Column selection and boolean indexing."""

    def __setitem__(self, key, value):
        """Column assignment."""

    @property
    def loc(self):
        """Label-based indexing; uses divisions to prune partitions."""

    @property
    def iloc(self):
        """Integer position-based indexing (column selection only)."""
```
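
A sketch of the lazy workflow these methods imply (the data is illustrative):

```python
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(
    pd.DataFrame({'x': range(100), 'y': range(100)}), npartitions=4)

print(ddf.head())                # eager: looks at the first partition only
sub = ddf[ddf.x > 10]            # lazy: just extends the task graph
sub = sub.persist()              # run the graph, keep partitions in memory
print(sub.loc[20:40].compute())  # label slicing prunes partitions via divisions
print(sub.compute().shape)       # materialize the full pandas DataFrame
```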

### Series Class

Distributed Series for single-column operations.

```python { .api }
class Series:
    """
    Distributed pandas-like Series.

    Properties:
    - dtype: numpy.dtype - Data type
    - index: Index - Row index
    - name: str - Series name
    - shape: tuple - Shape (the row count is lazy until computed)
    - ndim: int - Number of dimensions (always 1)
    - npartitions: int - Number of partitions
    """

    def compute(self, scheduler=None, **kwargs):
        """
        Compute the Series and return a pandas result.

        Returns:
        pandas.Series: Computed Series
        """

    def persist(self, scheduler=None, **kwargs):
        """
        Persist the Series in memory.

        Returns:
        dask.dataframe.Series: Persisted Series
        """

    def head(self, n=5, npartitions=1, compute=True):
        """Return the first n values."""

    def tail(self, n=5, compute=True):
        """Return the last n values."""

    def value_counts(self, normalize=False, sort=True, ascending=False,
                     split_every=None):
        """Count occurrences of unique values."""

    def unique(self, split_every=None):
        """Return unique values."""
```
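
A small sketch of the Series reductions above (values are illustrative):

```python
import pandas as pd
import dask.dataframe as dd

s = dd.from_pandas(
    pd.Series(['a', 'b', 'a', 'c', 'b', 'a'], name='tag'), npartitions=2)

print(s.value_counts().compute())  # counts per distinct value
print(s.unique().compute())        # distinct values
print(s.dtype, s.npartitions)      # metadata is known without computing
```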

### DataFrame Operations

Data manipulation and transformation functions.

```python { .api }
def concat(dfs, axis=0, join='outer', ignore_index=False,
           interleave_partitions=None):
    """
    Concatenate DataFrames along an axis.

    Parameters:
    - dfs: List of DataFrames to concatenate
    - axis: Axis to concatenate along (0=rows, 1=columns)
    - join: How to handle non-matching columns
    - ignore_index: Reset the index in the result
    - interleave_partitions: Interleave partitions when axis=0

    Returns:
    dask.dataframe.DataFrame: Concatenated DataFrame
    """

def merge(left, right, how='inner', on=None, left_on=None, right_on=None,
          left_index=False, right_index=False, suffixes=('_x', '_y'),
          npartitions=None, shuffle=None):
    """
    Merge DataFrames with database-style joins.

    Parameters:
    - left, right: DataFrames to merge
    - how: Type of join ('inner', 'outer', 'left', 'right')
    - on: Column names to join on
    - left_on, right_on: Columns to join on for each DataFrame
    - left_index, right_index: Use the index as the join key
    - suffixes: Suffixes for overlapping column names
    - npartitions: Number of output partitions
    - shuffle: Shuffling method

    Returns:
    dask.dataframe.DataFrame: Merged DataFrame
    """

def merge_asof(left, right, on=None, left_on=None, right_on=None,
               left_index=False, right_index=False, by=None,
               left_by=None, right_by=None, suffixes=('_x', '_y'),
               tolerance=None, allow_exact_matches=True, direction='backward'):
    """Perform an asof merge (temporal/ordered merge)."""

def pivot_table(df, index=None, columns=None, values=None, aggfunc='mean'):
    """Create a pivot table."""

def melt(df, id_vars=None, value_vars=None, var_name=None, value_name='value'):
    """Unpivot a DataFrame from wide to long format."""

def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False, dtype=None):
    """Convert categorical variables to dummy/indicator variables."""
```
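
A sketch of joining and reshaping with these functions, on tiny illustrative frames:

```python
import pandas as pd
import dask.dataframe as dd

left = dd.from_pandas(
    pd.DataFrame({'key': [1, 2, 3], 'x': ['a', 'b', 'c']}), npartitions=2)
right = dd.from_pandas(
    pd.DataFrame({'key': [2, 3, 4], 'y': [0.1, 0.2, 0.3]}), npartitions=2)

joined = dd.merge(left, right, on='key', how='inner')  # may shuffle partitions
stacked = dd.concat([left, left])                      # row-wise concat is cheap

# get_dummies needs known categories, hence categorize() first
wide = dd.get_dummies(left.categorize(columns=['x']), columns=['x'])

print(joined.compute())
```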

### Groupby Operations

Group-based operations and aggregations.

```python { .api }
class DataFrameGroupBy:
    """DataFrame groupby operations."""

    def aggregate(self, func, **kwargs):
        """Apply aggregation functions."""

    def apply(self, func, **kwargs):
        """Apply a function to each group."""

    def size(self):
        """Size of each group."""

    def count(self):
        """Count non-null values in each group."""

    def mean(self, **kwargs):
        """Mean of each group."""

    def sum(self, **kwargs):
        """Sum of each group."""

    def min(self, **kwargs):
        """Minimum of each group."""

    def max(self, **kwargs):
        """Maximum of each group."""

    def std(self, **kwargs):
        """Standard deviation of each group."""

    def var(self, **kwargs):
        """Variance of each group."""

class SeriesGroupBy:
    """Series groupby operations; supports the same reductions as
    DataFrameGroupBy."""

    def aggregate(self, func, **kwargs):
        """Apply aggregation functions."""

    def apply(self, func, **kwargs):
        """Apply a function to each group."""
```
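
A short groupby sketch (group labels and values are illustrative):

```python
import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(
    pd.DataFrame({'g': ['a', 'b', 'a', 'b'], 'v': [1.0, 2.0, 3.0, 4.0]}),
    npartitions=2)

gb = df.groupby('g')
print(gb.v.mean().compute())                      # single reduction
print(gb.agg({'v': ['sum', 'count']}).compute())  # several aggregations at once
```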

### Data Processing

Data cleaning, transformation, and processing functions.

```python { .api }
def map_partitions(func, *args, meta=None, **kwargs):
    """
    Apply a function to each partition.

    Parameters:
    - func: Function applied to each partition (receives pandas objects)
    - *args: DataFrames and other arguments
    - meta: Metadata describing the output schema
    - **kwargs: Additional arguments passed to func

    Returns:
    dask.dataframe.DataFrame or Series: Result of applying func
    """

def repartition(df, divisions=None, npartitions=None, partition_size=None,
                freq=None, force=False):
    """
    Change DataFrame partitioning.

    Parameters:
    - df: DataFrame to repartition
    - divisions: New index divisions
    - npartitions: Target number of partitions
    - partition_size: Target partition size
    - freq: Frequency for time-based partitioning (requires a datetime index)
    - force: Force repartitioning even if expensive

    Returns:
    dask.dataframe.DataFrame: Repartitioned DataFrame
    """

def to_datetime(arg, **kwargs):
    """Convert an argument to datetime."""

def to_numeric(arg, errors='raise', **kwargs):
    """Convert an argument to a numeric type."""

def to_timedelta(arg, unit=None, **kwargs):
    """Convert an argument to timedelta."""

def isna(df):
    """Detect missing values."""
```
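
A sketch of `map_partitions` with an explicit `meta`; the helper function is hypothetical:

```python
import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(pd.DataFrame({'x': range(10)}), npartitions=5)

# func sees plain pandas DataFrames; meta tells Dask the output schema
# so the graph can be built without running func eagerly
def add_double(part):
    return part.assign(x2=part.x * 2)

out = df.map_partitions(add_double, meta={'x': 'int64', 'x2': 'int64'})

# Coarsen partitioning, e.g. before writing many small files
out = out.repartition(npartitions=2)
print(out.npartitions, len(out.compute()))
```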

### Index Operations

Index and division management for DataFrames.

```python { .api }
class Index:
    """Distributed index for DataFrames and Series."""

    def compute(self, scheduler=None, **kwargs):
        """Compute index values."""

    def persist(self, scheduler=None, **kwargs):
        """Persist the index in memory."""

def set_index(df, other, divisions=None, sorted=None, npartitions=None,
              shuffle=None, compute=None):
    """Set the DataFrame index (shuffles data unless sorted=True)."""

def reset_index(df, drop=False):
    """Reset the DataFrame index."""
```
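
A sketch of index management; note that `set_index` generally sorts and shuffles, which is what makes divisions known:

```python
import pandas as pd
import dask.dataframe as dd

df = dd.from_pandas(
    pd.DataFrame({'t': [3, 1, 2, 4], 'v': [30., 10., 20., 40.]}),
    npartitions=2)

indexed = df.set_index('t')     # sort + shuffle; divisions become known
print(indexed.divisions)        # e.g. (1, 3, 4)
print(indexed.loc[2].compute()) # lookups can now prune partitions

flat = indexed.reset_index()    # move the index back into a column
```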

### Aggregation Specifications

Custom aggregation definitions for groupby operations.

```python { .api }
class Aggregation:
    """
    Specification for custom groupby aggregations.

    Parameters:
    - name: str - Name of the aggregation
    - chunk: callable - Applied to the grouped data in each partition
    - agg: callable - Combines the per-partition results
    - finalize: callable - Produces the final result (optional)
    """

    def __init__(self, name, chunk, agg, finalize=None):
        """Initialize the aggregation specification."""
```
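
A sketch of a custom aggregation built from the three hooks, following the count-and-sum mean pattern (the name `custom_mean` is illustrative):

```python
import pandas as pd
import dask.dataframe as dd

# chunk runs on the grouped data in each partition; agg combines the
# per-partition pieces; finalize computes the final value
custom_mean = dd.Aggregation(
    name='custom_mean',
    chunk=lambda s: (s.count(), s.sum()),
    agg=lambda count, total: (count.sum(), total.sum()),
    finalize=lambda count, total: total / count,
)

df = dd.from_pandas(
    pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1.0, 2.0, 4.0]}), npartitions=2)
print(df.groupby('g').v.agg(custom_mean).compute())
```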

## Usage Examples

### Basic DataFrame Operations

```python
import dask.dataframe as dd

# Read a large CSV file
df = dd.read_csv('large_dataset.csv')

# Basic operations (all lazy until computed)
filtered = df[df.value > 100]
grouped = df.groupby('category').value.mean()
sorted_df = df.sort_values('timestamp')

# Compute results
result = grouped.compute()
```

### Multi-file Processing

```python
import dask.dataframe as dd

# Read multiple files with a glob pattern
df = dd.read_csv('data/year=*/month=*/day=*.csv')

# Process with familiar pandas operations
daily_stats = (df.groupby(['date', 'category'])
                 .agg({'value': ['mean', 'sum', 'count']})
                 .compute())
```

### Custom Aggregations

```python
import dask.dataframe as dd
from dask.dataframe import Aggregation

# Approximate median: take each partition's per-group median, then the
# median of those per-partition medians. chunk and agg both receive
# grouped data, so pandas groupby reductions apply directly.
def chunk_median(grouped):
    return grouped.median()

def combine_medians(grouped):
    return grouped.median()

median_agg = Aggregation(
    name='median',
    chunk=chunk_median,
    agg=combine_medians
)

# Use the custom aggregation
df = dd.read_csv('data.csv')
result = df.groupby('category').value.agg(median_agg).compute()
```

### Memory-Efficient Processing

```python
import dask.dataframe as dd

# Read and process without loading the full dataset
df = dd.read_parquet('large_dataset.parquet')

# Chain operations efficiently; nlargest avoids a full sort and keeps
# the whole chain lazy until the final compute()
result = (df.query('value > 0')
            .groupby('category')
            .value.sum()
            .nlargest(10)
            .compute())

# Persist intermediate results for reuse
df_clean = df.dropna().persist()
stat1 = df_clean.value.mean().compute()
stat2 = df_clean.value.std().compute()
```

### Time Series Operations

```python
import dask.dataframe as dd

# Read time series data
df = dd.read_csv('timeseries.csv', parse_dates=['timestamp'])
df = df.set_index('timestamp').repartition(freq='1D')

# Time-based operations
daily_avg = df.resample('D').mean()
rolling_mean = df.value.rolling('7D').mean()

# Compute both results together
results = dd.compute(daily_avg, rolling_mean)
```