# Core Data Structures

Primary data structures for working with tabular data in Polars, including eager DataFrame/Series for immediate operations and LazyFrame for optimized query execution with the 64-bit index variant supporting datasets exceeding 4.2 billion rows.

## Capabilities

### DataFrame

Two-dimensional labeled data structure with columns of potentially different types. The primary data structure for eager evaluation where operations are executed immediately.

```python { .api }
class DataFrame:
    def __init__(
        self,
        data=None,
        schema=None,
        schema_overrides=None,
        orient=None,
        infer_schema_length=N_INFER_DEFAULT,
        nan_to_null=False
    ):
        """
        Create a DataFrame from various data sources.

        Parameters:
        - data: Data source (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: Column names and types
        - schema_overrides: Override inferred types for specific columns
        - orient: Data orientation ("row" or "col")
        - infer_schema_length: Number of rows to scan for type inference
        - nan_to_null: Convert NaN values to null
        """

    @property
    def shape(self) -> tuple[int, int]:
        """Get the shape (rows, columns) of the DataFrame."""

    @property
    def height(self) -> int:
        """Get the number of rows."""

    @property
    def width(self) -> int:
        """Get the number of columns."""

    @property
    def columns(self) -> list[str]:
        """Get column names."""

    @property
    def dtypes(self) -> list[DataType]:
        """Get data types of all columns."""

    @property
    def schema(self) -> Schema:
        """Get the schema (column names and types)."""

    def select(self, *exprs, **named_exprs) -> DataFrame:
        """
        Select columns using expressions.

        Parameters:
        - exprs: Column expressions to select
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with selected columns
        """

    def filter(self, *predicates, **constraints) -> DataFrame:
        """
        Filter rows based on predicates.

        Parameters:
        - predicates: Boolean expressions for filtering
        - constraints: Named constraints

        Returns:
        Filtered DataFrame
        """

    def with_columns(self, *exprs, **named_exprs) -> DataFrame:
        """
        Add or modify columns.

        Parameters:
        - exprs: Column expressions to add/modify
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with added/modified columns
        """

    def drop(self, *columns, strict=True) -> DataFrame:
        """
        Drop columns from DataFrame.

        Parameters:
        - columns: Column names to drop
        - strict: Whether to raise error if column doesn't exist

        Returns:
        DataFrame without dropped columns
        """

    def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame:
        """
        Rename columns.

        Parameters:
        - mapping: Dictionary mapping old to new names, or function

        Returns:
        DataFrame with renamed columns
        """

    def sort(
        self,
        by,
        *,
        descending=False,
        nulls_last=False,
        multithreaded=True
    ) -> DataFrame:
        """
        Sort DataFrame by columns.

        Parameters:
        - by: Column(s) to sort by
        - descending: Sort in descending order
        - nulls_last: Place nulls at end
        - multithreaded: Use multiple threads

        Returns:
        Sorted DataFrame
        """

    def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy:
        """
        Group DataFrame for aggregation.

        Parameters:
        - by: Columns to group by
        - maintain_order: Maintain order of groups
        - named_by: Named grouping expressions

        Returns:
        GroupBy object for aggregation
        """

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None
    ) -> DataFrame:
        """
        Join with another DataFrame.

        Parameters:
        - other: DataFrame to join with
        - on: Column(s) to join on
        - how: Join type ("inner", "left", "outer", "cross", "anti", "semi")
        - left_on: Left DataFrame join columns
        - right_on: Right DataFrame join columns
        - suffix: Suffix for duplicate column names
        - validate: Join validation ("m:m", "1:m", "m:1", "1:1")
        - join_nulls: Join on null values
        - coalesce: Coalesce join columns

        Returns:
        Joined DataFrame
        """

    def concat(self, other, *, how="vertical", ignore_index=False) -> DataFrame:
        """
        Concatenate with other DataFrame(s).

        Parameters:
        - other: DataFrame(s) to concatenate
        - how: Concatenation method ("vertical", "horizontal", "diagonal")
        - ignore_index: Reset index after concatenation

        Returns:
        Concatenated DataFrame
        """

    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to pandas DataFrame."""

    def to_numpy(self, structured=False, order="c") -> np.ndarray:
        """Convert to NumPy array."""

    def to_arrow(self, *, compat_level=None) -> pa.Table:
        """Convert to PyArrow Table."""

    def to_dict(self, *, as_series=True) -> dict[str, Series | list[Any]]:
        """Convert to dictionary."""

    def write_csv(self, file=None, **kwargs) -> str | None:
        """Write to CSV file."""

    def write_json(self, file=None, **kwargs) -> str | None:
        """Write to JSON file."""

    def write_parquet(self, file, **kwargs) -> None:
        """Write to Parquet file."""

    def write_ipc(self, file, **kwargs) -> None:
        """Write to IPC/Arrow file."""

    def lazy(self) -> LazyFrame:
        """Convert to LazyFrame for optimized operations."""

    def head(self, n=5) -> DataFrame:
        """Get first n rows."""

    def tail(self, n=5) -> DataFrame:
        """Get last n rows."""

    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame:
        """Sample rows from DataFrame."""

    def null_count(self) -> DataFrame:
        """Count null values per column."""

    def is_empty(self) -> bool:
        """Check if DataFrame is empty."""

    def clone(self) -> DataFrame:
        """Create a copy of the DataFrame."""
```

### Series

One-dimensional labeled array with homogeneous data type. Similar to a column in a DataFrame but can exist independently.

```python { .api }
class Series:
    def __init__(
        self,
        name=None,
        values=None,
        dtype=None,
        strict=True,
        nan_to_null=False,
        dtype_if_empty=Null
    ):
        """
        Create a Series.

        Parameters:
        - name: Series name
        - values: Data values
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        - dtype_if_empty: Type when empty
        """

    @property
    def name(self) -> str:
        """Get Series name."""

    @property
    def dtype(self) -> DataType:
        """Get data type."""

    @property
    def shape(self) -> tuple[int]:
        """Get shape (length,)."""

    def len(self) -> int:
        """Get length."""

    def sum(self) -> Any:
        """Sum all values."""

    def mean(self) -> float | None:
        """Calculate mean."""

    def max(self) -> Any:
        """Get maximum value."""

    def min(self) -> Any:
        """Get minimum value."""

    def sort(self, *, descending=False, nulls_last=False) -> Series:
        """Sort Series values."""

    def filter(self, predicate) -> Series:
        """Filter values based on predicate."""

    def to_list(self) -> list[Any]:
        """Convert to Python list."""

    def to_numpy(self) -> np.ndarray:
        """Convert to NumPy array."""

    def to_pandas(self) -> pd.Series:
        """Convert to pandas Series."""

    def to_frame(self, name=None) -> DataFrame:
        """Convert to single-column DataFrame."""
```
312
313
### LazyFrame
314
315
Lazy evaluation version of DataFrame that builds a query plan without executing until `.collect()` is called. Enables query optimization and efficient processing of large datasets.
316
317
```python { .api }
318
class LazyFrame:
319
def select(self, *exprs, **named_exprs) -> LazyFrame:
320
"""Select columns (lazy operation)."""
321
322
def filter(self, *predicates, **constraints) -> LazyFrame:
323
"""Filter rows (lazy operation)."""
324
325
def with_columns(self, *exprs, **named_exprs) -> LazyFrame:
326
"""Add/modify columns (lazy operation)."""
327
328
def drop(self, *columns, strict=True) -> LazyFrame:
329
"""Drop columns (lazy operation)."""
330
331
def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> LazyFrame:
332
"""Sort by columns (lazy operation)."""
333
334
def group_by(self, *by, maintain_order=False, **named_by) -> LazyGroupBy:
335
"""Group for aggregation (lazy operation)."""
336
337
def join(
338
self,
339
other,
340
on=None,
341
how="inner",
342
*,
343
left_on=None,
344
right_on=None,
345
suffix="_right",
346
validate="m:m",
347
join_nulls=False,
348
coalesce=None
349
) -> LazyFrame:
350
"""Join with another LazyFrame (lazy operation)."""
351
352
def collect(
353
self,
354
*,
355
type_coercion=True,
356
predicate_pushdown=True,
357
projection_pushdown=True,
358
simplify_expression=True,
359
slice_pushdown=True,
360
comm_subplan_elim=True,
361
comm_subexpr_elim=True,
362
cluster_with_columns=True,
363
no_optimization=False,
364
streaming=False,
365
background=False,
366
_eager=False
367
) -> DataFrame:
368
"""
369
Execute the lazy query and return DataFrame.
370
371
Parameters:
372
- type_coercion: Apply automatic type coercion
373
- predicate_pushdown: Push filters down to scan level
374
- projection_pushdown: Push column selection down
375
- simplify_expression: Simplify expressions
376
- slice_pushdown: Push limits/offsets down
377
- comm_subplan_elim: Eliminate common subplans
378
- comm_subexpr_elim: Eliminate common subexpressions
379
- cluster_with_columns: Cluster with_columns operations
380
- no_optimization: Disable all optimizations
381
- streaming: Execute in streaming mode
382
- background: Execute in background thread
383
384
Returns:
385
Executed DataFrame
386
"""
387
388
def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False) -> str:
389
"""Get query execution plan."""
390
391
def schema(self) -> Schema:
392
"""Get the expected schema."""
393
394
def dtypes(self) -> list[DataType]:
395
"""Get expected column data types."""
396
397
def columns(self) -> list[str]:
398
"""Get expected column names."""
399
400
def head(self, n=5) -> LazyFrame:
401
"""Get first n rows (lazy operation)."""
402
403
def tail(self, n=5) -> LazyFrame:
404
"""Get last n rows (lazy operation)."""
405
406
def limit(self, n) -> LazyFrame:
407
"""Limit number of rows (lazy operation)."""
408
409
def offset(self, n) -> LazyFrame:
410
"""Skip first n rows (lazy operation)."""
411
412
def slice(self, offset, length=None) -> LazyFrame:
413
"""Slice rows (lazy operation)."""
414
```

### GroupBy Operations

GroupBy objects returned from `group_by()` operations on DataFrame and LazyFrame for aggregation operations.

```python { .api }
class GroupBy:
    def agg(self, *aggs, **named_aggs) -> DataFrame:
        """
        Aggregate grouped data.

        Parameters:
        - aggs: Aggregation expressions
        - named_aggs: Named aggregation expressions

        Returns:
        DataFrame with aggregated results
        """

    def sum(self) -> DataFrame:
        """Sum each group."""

    def mean(self) -> DataFrame:
        """Mean of each group."""

    def max(self) -> DataFrame:
        """Maximum of each group."""

    def min(self) -> DataFrame:
        """Minimum of each group."""

    def count(self) -> DataFrame:
        """Count rows in each group."""

    def first(self) -> DataFrame:
        """First value in each group."""

    def last(self) -> DataFrame:
        """Last value in each group."""

class LazyGroupBy:
    def agg(self, *aggs, **named_aggs) -> LazyFrame:
        """Aggregate grouped data (lazy operation)."""

    def sum(self) -> LazyFrame:
        """Sum each group (lazy operation)."""

    def mean(self) -> LazyFrame:
        """Mean of each group (lazy operation)."""

    def max(self) -> LazyFrame:
        """Maximum of each group (lazy operation)."""

    def min(self) -> LazyFrame:
        """Minimum of each group (lazy operation)."""

    def count(self) -> LazyFrame:
        """Count rows in each group (lazy operation)."""
```
474
475
## Usage Examples
476
477
### Creating DataFrames
478
479
```python
480
import polars as pl
481
482
# From dictionary
483
df = pl.DataFrame({
484
"name": ["Alice", "Bob", "Charlie"],
485
"age": [25, 30, 35],
486
"salary": [50000, 60000, 70000]
487
})
488
489
# From list of dictionaries
490
data = [
491
{"name": "Alice", "age": 25, "salary": 50000},
492
{"name": "Bob", "age": 30, "salary": 60000},
493
{"name": "Charlie", "age": 35, "salary": 70000}
494
]
495
df = pl.DataFrame(data)
496
497
# From NumPy array
498
import numpy as np
499
arr = np.array([[1, 2, 3], [4, 5, 6]])
500
df = pl.DataFrame(arr, schema=["a", "b", "c"])
501
```

### DataFrame Operations

```python
# Basic operations
result = (df
    .filter(pl.col("age") > 28)
    .select([
        pl.col("name"),
        pl.col("age"),
        (pl.col("salary") / 1000).alias("salary_k")
    ])
    .sort("age", descending=True)
)

# Grouping and aggregation
summary = (df
    .group_by("department")
    .agg([
        pl.col("salary").mean().alias("avg_salary"),
        pl.col("name").count().alias("employee_count"),
        pl.col("age").max().alias("max_age")
    ])
)
```

### Lazy Operations

```python
# Build query plan without execution
lazy_query = (pl
    .scan_csv("large_dataset.csv")
    .filter(pl.col("amount") > 1000)
    .group_by("category")
    .agg([
        pl.col("amount").sum().alias("total"),
        pl.col("id").count().alias("count")
    ])
    .sort("total", descending=True)
)

# Execute optimized query
result = lazy_query.collect()

# Check execution plan
print(lazy_query.explain())
```

### Working with Large Datasets (64-bit Index)

```python
# The u64-idx variant handles datasets > 4.2B rows
very_large_df = pl.scan_parquet("huge_dataset.parquet")

# Operations work the same but support more rows
result = (very_large_df
    .filter(pl.col("timestamp") > "2023-01-01")
    .group_by("user_id")
    .agg([
        pl.col("value").sum(),
        pl.col("event").count()
    ])
    .collect(streaming=True)  # Use streaming for memory efficiency
)
```