0
# Core Data Structures
1
2
Polars is built on four core data structures: DataFrame for eager evaluation, LazyFrame for lazy evaluation with query optimization, Series for one-dimensional data, and Expr for building complex column operations and transformations.
3
4
## Capabilities
5
6
### DataFrame
7
8
Primary data structure for eager evaluation providing immediate computation with comprehensive data manipulation methods including filtering, selection, aggregation, joining, and reshaping operations.
9
10
```python { .api }
11
class DataFrame:
12
def __init__(
13
self,
14
data=None,
15
schema=None,
16
*,
17
schema_overrides=None,
18
strict=True,
19
orient=None,
20
infer_schema_length=None,
21
nan_to_null=False
22
):
23
"""
24
Create a DataFrame from various data sources.
25
26
Parameters:
27
- data: Data source (dict, list, arrow table, pandas df, etc.)
28
- schema: Column names and types
29
- schema_overrides: Override specific column types
30
- strict: Strict schema validation
31
- orient: Data orientation ('row' or 'col')
32
- infer_schema_length: Rows to scan for type inference
33
- nan_to_null: Convert NaN to null values
34
"""
35
36
# Selection and Projection
37
def select(self, *exprs, **named_exprs) -> DataFrame: ...
38
def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...
39
def drop(self, *columns) -> DataFrame: ...
40
def rename(self, mapping) -> DataFrame: ...
41
42
# Filtering and Sorting
43
def filter(self, *predicates) -> DataFrame: ...
44
def sort(self, by, *, descending=False, nulls_last=False) -> DataFrame: ...
45
def unique(self, subset=None, *, keep="any", maintain_order=False) -> DataFrame: ...
46
def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame: ...
47
48
# Aggregation and Grouping
49
def group_by(self, *by, maintain_order=False) -> GroupBy: ...
50
def sum(self) -> DataFrame: ...
51
def mean(self) -> DataFrame: ...
52
def max(self) -> DataFrame: ...
53
def min(self) -> DataFrame: ...
54
def std(self, ddof=1) -> DataFrame: ...
55
def var(self, ddof=1) -> DataFrame: ...
56
57
# Reshaping and Transformation
58
def pivot(self, on, *, index=None, values=None, aggregate_function=None, sort_columns=False) -> DataFrame: ...
59
def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> DataFrame: ...
60
def transpose(self, *, include_header=False, header_name="column", column_names=None) -> DataFrame: ...
61
def explode(self, columns, *, schema_overrides=None) -> DataFrame: ...
62
63
# Joining Operations
64
def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> DataFrame: ...
65
def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> DataFrame: ...
66
67
# Window Operations
68
def with_row_index(self, name="index", offset=0) -> DataFrame: ...
69
def rolling(self, index_column, *, period, offset=None, closed="right", by=None, check_sorted=True) -> RollingGroupBy: ...
70
71
# I/O Operations
72
def write_csv(self, file=None, **kwargs) -> str | None: ...
73
def write_parquet(self, file, **kwargs) -> None: ...
74
def write_json(self, file=None, **kwargs) -> str | None: ...
75
def write_excel(self, workbook=None, worksheet=None, **kwargs): ...
76
def write_database(self, table_name, connection, **kwargs) -> int: ...
77
78
# Conversion Methods
79
def to_pandas(self, **kwargs): ...
80
def to_numpy(self, structured=False, **kwargs): ...
81
def to_arrow(self) -> pa.Table: ...
82
def to_dict(self, as_series=True) -> dict: ...
83
def to_dicts(self) -> list[dict]: ...
84
85
# Utility Methods
86
def head(self, n=5) -> DataFrame: ...
87
def tail(self, n=5) -> DataFrame: ...
88
def slice(self, offset, length=None) -> DataFrame: ...
89
def glimpse(self, *, max_items_per_column=10, max_colname_length=50, return_as_string=False) -> str | None: ...
90
def describe(self, percentiles=(0.25, 0.5, 0.75), *, interpolation="nearest") -> DataFrame: ...
91
def is_empty(self) -> bool: ...
92
def lazy(self) -> LazyFrame: ...
93
94
# Properties
95
@property
96
def columns(self) -> list[str]: ...
97
@property
98
def dtypes(self) -> list[DataType]: ...
99
@property
100
def schema(self) -> Schema: ...
101
@property
102
def shape(self) -> tuple[int, int]: ...
103
@property
104
def height(self) -> int: ...
105
@property
106
def width(self) -> int: ...
107
@property
108
def flags(self) -> dict[str, dict[str, bool]]: ...
109
```
110
111
### LazyFrame
112
113
Lazy evaluation data structure that builds a computation graph for query optimization, predicate pushdown, and efficient memory usage with automatic query planning.
114
115
```python { .api }
116
class LazyFrame:
117
# Selection and Projection
118
def select(self, *exprs, **named_exprs) -> LazyFrame: ...
119
def with_columns(self, *exprs, **named_exprs) -> LazyFrame: ...
120
def drop(self, *columns) -> LazyFrame: ...
121
def rename(self, mapping) -> LazyFrame: ...
122
123
# Filtering and Sorting
124
def filter(self, *predicates) -> LazyFrame: ...
125
def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True, maintain_order=False) -> LazyFrame: ...
126
def unique(self, subset=None, *, keep="any", maintain_order=False) -> LazyFrame: ...
127
def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> LazyFrame: ...
128
129
# Aggregation and Grouping
130
def group_by(self, *by, maintain_order=False) -> LazyGroupBy: ...
131
def sum(self) -> LazyFrame: ...
132
def mean(self) -> LazyFrame: ...
133
def max(self) -> LazyFrame: ...
134
def min(self) -> LazyFrame: ...
135
def std(self, ddof=1) -> LazyFrame: ...
136
def var(self, ddof=1) -> LazyFrame: ...
137
138
# Reshaping and Transformation
139
def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> LazyFrame: ...
140
def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> LazyFrame: ...
141
def explode(self, columns, *, schema_overrides=None) -> LazyFrame: ...
142
143
# Joining Operations
144
def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> LazyFrame: ...
145
def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> LazyFrame: ...
146
147
# Window Operations
148
def with_row_index(self, name="index", offset=0) -> LazyFrame: ...
149
def rolling(self, index_column, *, period, offset=None, closed="right", by=None) -> RollingGroupBy: ...
150
151
# Execution and Optimization
152
def collect(self, *, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False, background=False, _eager=False) -> DataFrame: ...
153
def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, format="plain") -> str: ...
154
def show_graph(self, *, optimized=True, show=True, output_path=None, raw_output=False, figsize=(16, 12), type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True) -> str | None: ...
155
156
# Utility Methods
157
def head(self, n=5) -> LazyFrame: ...
158
def tail(self, n=5) -> LazyFrame: ...
159
def slice(self, offset, length=None) -> LazyFrame: ...
160
def first(self) -> LazyFrame: ...
161
def last(self) -> LazyFrame: ...
162
def cache(self) -> LazyFrame: ...
163
164
# Properties
165
@property
166
def columns(self) -> list[str]: ...
167
@property
168
def dtypes(self) -> list[DataType]: ...
169
@property
170
def schema(self) -> Schema: ...
171
@property
172
def width(self) -> int: ...
173
```
174
175
### Series
176
177
One-dimensional data structure with vectorized operations, supporting element-wise transformations, aggregations, and integration with DataFrame operations.
178
179
```python { .api }
180
class Series:
181
def __init__(self, name=None, values=None, dtype=None, *, strict=True, nan_to_null=False):
182
"""
183
Create a Series from values.
184
185
Parameters:
186
- name: Series name
187
- values: Data values (list, array, etc.)
188
- dtype: Data type
189
- strict: Strict type checking
190
- nan_to_null: Convert NaN to null
191
"""
192
193
# Element Access and Slicing
194
def __getitem__(self, item): ...
195
def get(self, index, *, default=None): ...
196
def slice(self, offset, length=None) -> Series: ...
197
def head(self, n=5) -> Series: ...
198
def tail(self, n=5) -> Series: ...
199
def take(self, indices) -> Series: ...
200
def gather(self, indices) -> Series: ...
201
202
# Filtering and Selection
203
def filter(self, predicate) -> Series: ...
204
def unique(self, *, maintain_order=False) -> Series: ...
205
def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> Series: ...
206
def sort(self, *, descending=False, nulls_last=False) -> Series: ...
207
208
# Transformations
209
def map_elements(self, function, return_dtype=None, *, skip_nulls=True) -> Series: ...
210
def cast(self, dtype, *, strict=True) -> Series: ...
211
def alias(self, name) -> Series: ...
212
def rename(self, name) -> Series: ...
213
214
# Aggregations
215
def sum(self) -> int | float: ...
216
def mean(self) -> float | None: ...
217
def median(self) -> float | None: ...
218
def max(self) -> Any: ...
219
def min(self) -> Any: ...
220
def std(self, ddof=1) -> float | None: ...
221
def var(self, ddof=1) -> float | None: ...
222
def count(self) -> int: ...
223
def len(self) -> int: ...
224
225
# String Operations (when dtype is String)
226
@property
227
def str(self) -> StringNameSpace: ...
228
229
# Datetime Operations (when dtype is temporal)
230
@property
231
def dt(self) -> DateTimeNameSpace: ...
232
233
# List Operations (when dtype is List)
234
@property
235
def list(self) -> ListNameSpace: ...
236
237
# Array Operations (when dtype is Array)
238
@property
239
def arr(self) -> ArrayNameSpace: ...
240
241
# Struct Operations (when dtype is Struct)
242
@property
243
def struct(self) -> StructNameSpace: ...
244
245
# Categorical Operations (when dtype is Categorical)
246
@property
247
def cat(self) -> CategoricalNameSpace: ...
248
249
# Binary Operations (when dtype is Binary)
250
@property
251
def bin(self) -> BinaryNameSpace: ...
252
253
# Conversion Methods
254
def to_list(self) -> list: ...
255
def to_numpy(self, *, zero_copy_only=False, writable=False) -> np.ndarray: ...
256
def to_arrow(self) -> pa.Array: ...
257
def to_pandas(self, **kwargs): ...
258
def to_frame(self, name=None) -> DataFrame: ...
259
260
# Utility Methods
261
def is_null(self) -> Series: ...
262
def is_not_null(self) -> Series: ...
263
def is_finite(self) -> Series: ...
264
def is_infinite(self) -> Series: ...
265
def is_nan(self) -> Series: ...
266
def is_not_nan(self) -> Series: ...
267
def is_empty(self) -> bool: ...
268
def describe(self, percentiles=(0.25, 0.5, 0.75), interpolation="nearest") -> DataFrame: ...
269
270
# Properties
271
@property
272
def name(self) -> str: ...
273
@property
274
def dtype(self) -> DataType: ...
275
@property
276
def shape(self) -> tuple[int]: ...
277
@property
278
def flags(self) -> dict[str, bool]: ...
279
```
280
281
### Expr
282
283
Expression builder for column operations, transformations, and aggregations that can be used across DataFrame, LazyFrame, and various contexts for building complex data processing pipelines.
284
285
```python { .api }
286
class Expr:
287
# Aliasing and Naming
288
def alias(self, name: str) -> Expr: ...
289
@property
def name(self) -> ExprNameNameSpace: ...
290
291
# Filtering and Selection
292
def filter(self, predicate) -> Expr: ...
293
def sort(self, *, descending=False, nulls_last=False) -> Expr: ...
294
def sort_by(self, by, *, descending=False, nulls_last=False) -> Expr: ...
295
def unique(self, *, maintain_order=False) -> Expr: ...
296
def slice(self, offset, length=None) -> Expr: ...
297
def head(self, n=5) -> Expr: ...
298
def tail(self, n=5) -> Expr: ...
299
def first(self) -> Expr: ...
300
def last(self) -> Expr: ...
301
def take(self, indices) -> Expr: ...
302
def gather(self, indices) -> Expr: ...
303
304
# Aggregations
305
def sum(self) -> Expr: ...
306
def mean(self) -> Expr: ...
307
def median(self) -> Expr: ...
308
def max(self) -> Expr: ...
309
def min(self) -> Expr: ...
310
def std(self, ddof=1) -> Expr: ...
311
def var(self, ddof=1) -> Expr: ...
312
def count(self) -> Expr: ...
313
def len(self) -> Expr: ...
314
def n_unique(self) -> Expr: ...
315
def null_count(self) -> Expr: ...
316
def quantile(self, quantile, interpolation="nearest") -> Expr: ...
317
318
# Window Functions
319
def over(self, partition_by=None, *, order_by=None, mapping_strategy="group_to_rows") -> Expr: ...
320
def rank(self, method="average", *, descending=False, seed=None) -> Expr: ...
321
def cum_sum(self, *, reverse=False) -> Expr: ...
322
def cum_count(self, *, reverse=False) -> Expr: ...
323
def cum_max(self, *, reverse=False) -> Expr: ...
324
def cum_min(self, *, reverse=False) -> Expr: ...
325
326
# Mathematical Operations
327
def abs(self) -> Expr: ...
328
def sqrt(self) -> Expr: ...
329
def log(self, base=None) -> Expr: ...
330
def log10(self) -> Expr: ...
331
def exp(self) -> Expr: ...
332
def pow(self, exponent) -> Expr: ...
333
def round(self, decimals=0) -> Expr: ...
334
def floor(self) -> Expr: ...
335
def ceil(self) -> Expr: ...
336
337
# Type Operations
338
def cast(self, dtype, *, strict=True) -> Expr: ...
339
def is_null(self) -> Expr: ...
340
def is_not_null(self) -> Expr: ...
341
def is_finite(self) -> Expr: ...
342
def is_infinite(self) -> Expr: ...
343
def is_nan(self) -> Expr: ...
344
def is_not_nan(self) -> Expr: ...
345
def is_duplicated(self) -> Expr: ...
346
def is_unique(self) -> Expr: ...
347
def is_first_distinct(self) -> Expr: ...
348
def is_last_distinct(self) -> Expr: ...
349
350
# Conditional Operations
351
def is_between(self, lower_bound, upper_bound, closed="both") -> Expr: ...
352
def is_in(self, other) -> Expr: ...
353
def when(self, condition) -> When: ...
354
355
# String Operations (when expression evaluates to String)
356
@property
357
def str(self) -> ExprStringNameSpace: ...
358
359
# Datetime Operations (when expression evaluates to temporal type)
360
@property
361
def dt(self) -> ExprDateTimeNameSpace: ...
362
363
# List Operations (when expression evaluates to List)
364
@property
365
def list(self) -> ExprListNameSpace: ...
366
367
# Array Operations (when expression evaluates to Array)
368
@property
369
def arr(self) -> ExprArrayNameSpace: ...
370
371
# Struct Operations (when expression evaluates to Struct)
372
@property
373
def struct(self) -> ExprStructNameSpace: ...
374
375
# Categorical Operations (when expression evaluates to Categorical)
376
@property
377
def cat(self) -> ExprCategoricalNameSpace: ...
378
379
# Binary Operations (when expression evaluates to Binary)
380
@property
381
def bin(self) -> ExprBinaryNameSpace: ...
382
383
# Meta Operations
384
@property
385
def meta(self) -> ExprMetaNameSpace: ...
386
```
387
388
## Usage Examples
389
390
### Basic DataFrame Operations
391
392
```python
393
import polars as pl
394
395
# Create DataFrame
396
df = pl.DataFrame({
397
"product": ["A", "B", "C", "A", "B"],
398
"sales": [100, 200, 150, 80, 250],
399
"region": ["North", "South", "North", "South", "North"]
400
})
401
402
# Chain operations
403
result = (
404
df
405
.filter(pl.col("sales") > 100)
406
.with_columns(
407
pl.col("sales").mul(1.1).alias("sales_with_tax"),
408
pl.col("product").str.to_lowercase().alias("product_lower")
409
)
410
.group_by("region")
411
.agg([
412
pl.col("sales").sum().alias("total_sales"),
413
pl.col("product").count().alias("product_count")
414
])
415
)
416
```
417
418
### Lazy Evaluation with Query Optimization
419
420
```python
421
# Build lazy computation
422
lazy_query = (
423
pl.scan_csv("large_dataset.csv")
424
.filter(pl.col("amount") > 1000)
425
.with_columns(
426
pl.col("date").str.to_date().alias("parsed_date"),
427
pl.col("category").str.to_uppercase()
428
)
429
.group_by(["category", pl.col("parsed_date").dt.month()])
430
.agg([
431
pl.col("amount").sum().alias("monthly_total"),
432
pl.col("transaction_id").count().alias("transaction_count")
433
])
434
.sort("monthly_total", descending=True)
435
)
436
437
# Execute optimized query
438
result = lazy_query.collect()
439
440
# View query plan
441
print(lazy_query.explain(optimized=True))
442
```
443
444
### Advanced Expressions
445
446
```python
447
# Complex expression building (assumes df has a numeric "score" column)
448
complex_expr = (
449
pl.when(pl.col("score") >= 90)
450
.then(pl.lit("A"))
451
.when(pl.col("score") >= 80)
452
.then(pl.lit("B"))
453
.when(pl.col("score") >= 70)
454
.then(pl.lit("C"))
455
.otherwise(pl.lit("F"))
456
.alias("grade")
457
)
458
459
df = df.with_columns(complex_expr)
460
461
# Window functions
462
df = df.with_columns([
463
pl.col("sales").rank().over("region").alias("sales_rank"),
464
pl.col("sales").cum_sum().over("region").alias("running_total")
465
])
466
```