# Core Data Structures

Primary data structures for working with tabular data in Polars, including eager DataFrame/Series for immediate operations and LazyFrame for optimized query execution with the 64-bit index variant supporting datasets exceeding 4.2 billion rows.

## Capabilities

### DataFrame

Two-dimensional labeled data structure with columns of potentially different types. The primary data structure for eager evaluation where operations are executed immediately.

```python { .api }
class DataFrame:
    def __init__(
        self,
        data=None,
        schema=None,
        schema_overrides=None,
        orient=None,
        infer_schema_length=N_INFER_DEFAULT,
        nan_to_null=False
    ):
        """
        Create a DataFrame from various data sources.

        Parameters:
        - data: Data source (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: Column names and types
        - schema_overrides: Override inferred types for specific columns
        - orient: Data orientation ("row" or "col")
        - infer_schema_length: Number of rows to scan for type inference
        - nan_to_null: Convert NaN values to null
        """

    @property
    def shape(self) -> tuple[int, int]:
        """Get the shape (rows, columns) of the DataFrame."""

    @property
    def height(self) -> int:
        """Get the number of rows."""

    @property
    def width(self) -> int:
        """Get the number of columns."""

    @property
    def columns(self) -> list[str]:
        """Get column names."""

    @property
    def dtypes(self) -> list[DataType]:
        """Get data types of all columns."""

    @property
    def schema(self) -> Schema:
        """Get the schema (column names and types)."""

    def select(self, *exprs, **named_exprs) -> DataFrame:
        """
        Select columns using expressions.

        Parameters:
        - exprs: Column expressions to select
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with selected columns
        """

    def filter(self, *predicates, **constraints) -> DataFrame:
        """
        Filter rows based on predicates.

        Parameters:
        - predicates: Boolean expressions for filtering
        - constraints: Named constraints

        Returns:
        Filtered DataFrame
        """

    def with_columns(self, *exprs, **named_exprs) -> DataFrame:
        """
        Add or modify columns.

        Parameters:
        - exprs: Column expressions to add/modify
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with added/modified columns
        """

    def drop(self, *columns, strict=True) -> DataFrame:
        """
        Drop columns from DataFrame.

        Parameters:
        - columns: Column names to drop
        - strict: Whether to raise error if column doesn't exist

        Returns:
        DataFrame without dropped columns
        """

    def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame:
        """
        Rename columns.

        Parameters:
        - mapping: Dictionary mapping old to new names, or function

        Returns:
        DataFrame with renamed columns
        """

    def sort(
        self,
        by,
        *,
        descending=False,
        nulls_last=False,
        multithreaded=True
    ) -> DataFrame:
        """
        Sort DataFrame by columns.

        Parameters:
        - by: Column(s) to sort by
        - descending: Sort in descending order
        - nulls_last: Place nulls at end
        - multithreaded: Use multiple threads

        Returns:
        Sorted DataFrame
        """

    def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy:
        """
        Group DataFrame for aggregation.

        Parameters:
        - by: Columns to group by
        - maintain_order: Maintain order of groups
        - named_by: Named grouping expressions

        Returns:
        GroupBy object for aggregation
        """

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None
    ) -> DataFrame:
        """
        Join with another DataFrame.

        Parameters:
        - other: DataFrame to join with
        - on: Column(s) to join on
        - how: Join type ("inner", "left", "outer", "cross", "anti", "semi")
        - left_on: Left DataFrame join columns
        - right_on: Right DataFrame join columns
        - suffix: Suffix for duplicate column names
        - validate: Join validation ("m:m", "1:m", "m:1", "1:1")
        - join_nulls: Join on null values
        - coalesce: Coalesce join columns

        Returns:
        Joined DataFrame
        """

    def concat(self, other, *, how="vertical", ignore_index=False) -> DataFrame:
        """
        Concatenate with other DataFrame(s).

        Parameters:
        - other: DataFrame(s) to concatenate
        - how: Concatenation method ("vertical", "horizontal", "diagonal")
        - ignore_index: Reset index after concatenation

        Returns:
        Concatenated DataFrame
        """

    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to pandas DataFrame."""

    def to_numpy(self, structured=False, order="c") -> np.ndarray:
        """Convert to NumPy array."""

    def to_arrow(self, *, compat_level=None) -> pa.Table:
        """Convert to PyArrow Table."""

    def to_dict(self, *, as_series=True) -> dict[str, Series | list[Any]]:
        """Convert to dictionary."""

    def write_csv(self, file=None, **kwargs) -> str | None:
        """Write to CSV file."""

    def write_json(self, file=None, **kwargs) -> str | None:
        """Write to JSON file."""

    def write_parquet(self, file, **kwargs) -> None:
        """Write to Parquet file."""

    def write_ipc(self, file, **kwargs) -> None:
        """Write to IPC/Arrow file."""

    def lazy(self) -> LazyFrame:
        """Convert to LazyFrame for optimized operations."""

    def head(self, n=5) -> DataFrame:
        """Get first n rows."""

    def tail(self, n=5) -> DataFrame:
        """Get last n rows."""

    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame:
        """Sample rows from DataFrame."""

    def null_count(self) -> DataFrame:
        """Count null values per column."""

    def is_empty(self) -> bool:
        """Check if DataFrame is empty."""

    def clone(self) -> DataFrame:
        """Create a copy of the DataFrame."""
```

### Series

One-dimensional labeled array with homogeneous data type. Similar to a column in a DataFrame but can exist independently.

```python { .api }
class Series:
    def __init__(
        self,
        name=None,
        values=None,
        dtype=None,
        strict=True,
        nan_to_null=False,
        dtype_if_empty=Null
    ):
        """
        Create a Series.

        Parameters:
        - name: Series name
        - values: Data values
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        - dtype_if_empty: Type when empty
        """

    @property
    def name(self) -> str:
        """Get Series name."""

    @property
    def dtype(self) -> DataType:
        """Get data type."""

    @property
    def shape(self) -> tuple[int]:
        """Get shape (length,)."""

    def len(self) -> int:
        """Get length."""

    def sum(self) -> Any:
        """Sum all values."""

    def mean(self) -> float | None:
        """Calculate mean."""

    def max(self) -> Any:
        """Get maximum value."""

    def min(self) -> Any:
        """Get minimum value."""

    def sort(self, *, descending=False, nulls_last=False) -> Series:
        """Sort Series values."""

    def filter(self, predicate) -> Series:
        """Filter values based on predicate."""

    def to_list(self) -> list[Any]:
        """Convert to Python list."""

    def to_numpy(self) -> np.ndarray:
        """Convert to NumPy array."""

    def to_pandas(self) -> pd.Series:
        """Convert to pandas Series."""

    def to_frame(self, name=None) -> DataFrame:
        """Convert to single-column DataFrame."""
```
312
313
### LazyFrame
314
315
Lazy evaluation version of DataFrame that builds a query plan without executing until `.collect()` is called. Enables query optimization and efficient processing of large datasets.
316
317
```python { .api }
318
class LazyFrame:
319
def select(self, *exprs, **named_exprs) -> LazyFrame:
320
"""Select columns (lazy operation)."""
321
322
def filter(self, *predicates, **constraints) -> LazyFrame:
323
"""Filter rows (lazy operation)."""
324
325
def with_columns(self, *exprs, **named_exprs) -> LazyFrame:
326
"""Add/modify columns (lazy operation)."""
327
328
def drop(self, *columns, strict=True) -> LazyFrame:
329
"""Drop columns (lazy operation)."""
330
331
def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> LazyFrame:
332
"""Sort by columns (lazy operation)."""
333
334
def group_by(self, *by, maintain_order=False, **named_by) -> LazyGroupBy:
335
"""Group for aggregation (lazy operation)."""
336
337
def join(
338
self,
339
other,
340
on=None,
341
how="inner",
342
*,
343
left_on=None,
344
right_on=None,
345
suffix="_right",
346
validate="m:m",
347
join_nulls=False,
348
coalesce=None
349
) -> LazyFrame:
350
"""Join with another LazyFrame (lazy operation)."""
351
352
def collect(
353
self,
354
*,
355
type_coercion=True,
356
predicate_pushdown=True,
357
projection_pushdown=True,
358
simplify_expression=True,
359
slice_pushdown=True,
360
comm_subplan_elim=True,
361
comm_subexpr_elim=True,
362
cluster_with_columns=True,
363
no_optimization=False,
364
streaming=False,
365
background=False,
366
_eager=False
367
) -> DataFrame:
368
"""
369
Execute the lazy query and return DataFrame.
370
371
Parameters:
372
- type_coercion: Apply automatic type coercion
373
- predicate_pushdown: Push filters down to scan level
374
- projection_pushdown: Push column selection down
375
- simplify_expression: Simplify expressions
376
- slice_pushdown: Push limits/offsets down
377
- comm_subplan_elim: Eliminate common subplans
378
- comm_subexpr_elim: Eliminate common subexpressions
379
- cluster_with_columns: Cluster with_columns operations
380
- no_optimization: Disable all optimizations
381
- streaming: Execute in streaming mode
382
- background: Execute in background thread
383
384
Returns:
385
Executed DataFrame
386
"""
387
388
def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False) -> str:
389
"""Get query execution plan."""
390
391
def schema(self) -> Schema:
392
"""Get the expected schema."""
393
394
def dtypes(self) -> list[DataType]:
395
"""Get expected column data types."""
396
397
def columns(self) -> list[str]:
398
"""Get expected column names."""
399
400
def head(self, n=5) -> LazyFrame:
401
"""Get first n rows (lazy operation)."""
402
403
def tail(self, n=5) -> LazyFrame:
404
"""Get last n rows (lazy operation)."""
405
406
def limit(self, n) -> LazyFrame:
407
"""Limit number of rows (lazy operation)."""
408
409
def offset(self, n) -> LazyFrame:
410
"""Skip first n rows (lazy operation)."""
411
412
def slice(self, offset, length=None) -> LazyFrame:
413
"""Slice rows (lazy operation)."""
414
```

### GroupBy Operations

GroupBy objects returned from `group_by()` operations on DataFrame and LazyFrame for aggregation operations.

```python { .api }
class GroupBy:
    def agg(self, *aggs, **named_aggs) -> DataFrame:
        """
        Aggregate grouped data.

        Parameters:
        - aggs: Aggregation expressions
        - named_aggs: Named aggregation expressions

        Returns:
        DataFrame with aggregated results
        """

    def sum(self) -> DataFrame:
        """Sum each group."""

    def mean(self) -> DataFrame:
        """Mean of each group."""

    def max(self) -> DataFrame:
        """Maximum of each group."""

    def min(self) -> DataFrame:
        """Minimum of each group."""

    def count(self) -> DataFrame:
        """Count rows in each group."""

    def first(self) -> DataFrame:
        """First value in each group."""

    def last(self) -> DataFrame:
        """Last value in each group."""

class LazyGroupBy:
    def agg(self, *aggs, **named_aggs) -> LazyFrame:
        """Aggregate grouped data (lazy operation)."""

    def sum(self) -> LazyFrame:
        """Sum each group (lazy operation)."""

    def mean(self) -> LazyFrame:
        """Mean of each group (lazy operation)."""

    def max(self) -> LazyFrame:
        """Maximum of each group (lazy operation)."""

    def min(self) -> LazyFrame:
        """Minimum of each group (lazy operation)."""

    def count(self) -> LazyFrame:
        """Count rows in each group (lazy operation)."""
```
474
475
## Usage Examples
476
477
### Creating DataFrames
478
479
```python
480
import polars as pl
481
482
# From dictionary
483
df = pl.DataFrame({
484
"name": ["Alice", "Bob", "Charlie"],
485
"age": [25, 30, 35],
486
"salary": [50000, 60000, 70000]
487
})
488
489
# From list of dictionaries
490
data = [
491
{"name": "Alice", "age": 25, "salary": 50000},
492
{"name": "Bob", "age": 30, "salary": 60000},
493
{"name": "Charlie", "age": 35, "salary": 70000}
494
]
495
df = pl.DataFrame(data)
496
497
# From NumPy array
498
import numpy as np
499
arr = np.array([[1, 2, 3], [4, 5, 6]])
500
df = pl.DataFrame(arr, schema=["a", "b", "c"])
501
```

### DataFrame Operations

```python
# Basic operations
result = (df
    .filter(pl.col("age") > 28)
    .select([
        pl.col("name"),
        pl.col("age"),
        (pl.col("salary") / 1000).alias("salary_k")
    ])
    .sort("age", descending=True)
)

# Grouping and aggregation
summary = (df
    .group_by("department")
    .agg([
        pl.col("salary").mean().alias("avg_salary"),
        pl.col("name").count().alias("employee_count"),
        pl.col("age").max().alias("max_age")
    ])
)
```

### Lazy Operations

```python
# Build query plan without execution
lazy_query = (pl
    .scan_csv("large_dataset.csv")
    .filter(pl.col("amount") > 1000)
    .group_by("category")
    .agg([
        pl.col("amount").sum().alias("total"),
        pl.col("id").count().alias("count")
    ])
    .sort("total", descending=True)
)

# Execute optimized query
result = lazy_query.collect()

# Check execution plan
print(lazy_query.explain())
```

### Working with Large Datasets (64-bit Index)

```python
# The u64-idx variant handles datasets > 4.2B rows
very_large_df = pl.scan_parquet("huge_dataset.parquet")

# Operations work the same but support more rows
result = (very_large_df
    .filter(pl.col("timestamp") > "2023-01-01")
    .group_by("user_id")
    .agg([
        pl.col("value").sum(),
        pl.col("event").count()
    ])
    .collect(streaming=True)  # Use streaming for memory efficiency
)
```