0
# Functions
1
2
A rich collection of functions for data manipulation, including aggregations, lazy operations, range generation, mathematical operations, and general utilities. These functions are the building blocks for complex data transformations and computations.
3
4
## Capabilities
5
6
### Column Selection and Manipulation
7
8
Core functions for selecting and manipulating columns in DataFrames and LazyFrames.
9
10
```python { .api }
11
def col(name: str | list[str]) -> Expr:
12
"""
13
Select column(s) by name.
14
15
Parameters:
16
- name: Column name(s) to select
17
18
Returns:
19
- Expr: Column selection expression
20
"""
21
22
def lit(value: Any) -> Expr:
23
"""
24
Create literal value expression.
25
26
Parameters:
27
- value: Literal value (int, float, str, bool, etc.)
28
29
Returns:
30
- Expr: Literal expression
31
"""
32
33
def when(predicate: IntoExpr) -> ExprWhenThen:
34
"""
35
Start conditional expression chain.
36
37
Parameters:
38
- predicate: Boolean condition
39
40
Returns:
41
- ExprWhenThen: Conditional expression builder
42
"""
43
44
def exclude(*columns: str | list[str]) -> Expr:
45
"""
46
Exclude specified columns from selection.
47
48
Parameters:
49
- columns: Column names to exclude
50
51
Returns:
52
- Expr: Column exclusion expression
53
"""
54
55
def select(*exprs: IntoExpr) -> Expr:
56
"""
57
Select expressions for DataFrame operations.
58
59
Parameters:
60
- exprs: Expressions to select
61
62
Returns:
63
- Expr: Selection expression
64
"""
65
```
66
67
### Aggregation Functions
68
69
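Functions for computing aggregations over the values within a column (vertical aggregations). For row-wise aggregation across several columns, see the Horizontal Aggregations section.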
70
71
```python { .api }
72
def sum(*args: IntoExpr) -> Expr:
73
"""
74
Sum values.
75
76
Parameters:
77
- args: Expressions to sum
78
79
Returns:
80
- Expr: Sum expression
81
"""
82
83
def mean(*args: IntoExpr) -> Expr:
84
"""
85
Compute mean of values.
86
87
Parameters:
88
- args: Expressions to average
89
90
Returns:
91
- Expr: Mean expression
92
"""
93
94
def max(*args: IntoExpr) -> Expr:
95
"""
96
Find maximum values.
97
98
Parameters:
99
- args: Expressions to find max of
100
101
Returns:
102
- Expr: Maximum expression
103
"""
104
105
def min(*args: IntoExpr) -> Expr:
106
"""
107
Find minimum values.
108
109
Parameters:
110
- args: Expressions to find min of
111
112
Returns:
113
- Expr: Minimum expression
114
"""
115
116
def count(*args: IntoExpr) -> Expr:
117
"""
118
Count values.
119
120
Parameters:
121
- args: Expressions to count
122
123
Returns:
124
- Expr: Count expression
125
"""
126
127
def median(*args: IntoExpr) -> Expr:
128
"""
129
Compute median of values.
130
131
Parameters:
132
- args: Expressions to find median of
133
134
Returns:
135
- Expr: Median expression
136
"""
137
138
def std(*args: IntoExpr, ddof: int = 1) -> Expr:
139
"""
140
Compute standard deviation.
141
142
Parameters:
143
- args: Expressions to compute std of
144
- ddof: Delta degrees of freedom
145
146
Returns:
147
- Expr: Standard deviation expression
148
"""
149
150
def var(*args: IntoExpr, ddof: int = 1) -> Expr:
151
"""
152
Compute variance.
153
154
Parameters:
155
- args: Expressions to compute variance of
156
- ddof: Delta degrees of freedom
157
158
Returns:
159
- Expr: Variance expression
160
"""
161
162
def quantile(*args: IntoExpr, quantile: float, interpolation: str = "nearest") -> Expr:
163
"""
164
Compute quantile.
165
166
Parameters:
167
- args: Expressions to compute quantile of
168
- quantile: Quantile value (0.0 to 1.0)
169
- interpolation: Interpolation method
170
171
Returns:
172
- Expr: Quantile expression
173
"""
174
```
175
176
### Horizontal Aggregations
177
178
Functions for computing row-wise aggregations across multiple columns.
179
180
```python { .api }
181
def sum_horizontal(*exprs: IntoExpr) -> Expr:
182
"""
183
Sum values horizontally across columns.
184
185
Parameters:
186
- exprs: Column expressions to sum
187
188
Returns:
189
- Expr: Horizontal sum expression
190
"""
191
192
def mean_horizontal(*exprs: IntoExpr) -> Expr:
193
"""
194
Compute mean horizontally across columns.
195
196
Parameters:
197
- exprs: Column expressions to average
198
199
Returns:
200
- Expr: Horizontal mean expression
201
"""
202
203
def max_horizontal(*exprs: IntoExpr) -> Expr:
204
"""
205
Find maximum horizontally across columns.
206
207
Parameters:
208
- exprs: Column expressions to find max of
209
210
Returns:
211
- Expr: Horizontal maximum expression
212
"""
213
214
def min_horizontal(*exprs: IntoExpr) -> Expr:
215
"""
216
Find minimum horizontally across columns.
217
218
Parameters:
219
- exprs: Column expressions to find min of
220
221
Returns:
222
- Expr: Horizontal minimum expression
223
"""
224
225
def all_horizontal(*exprs: IntoExpr) -> Expr:
226
"""
227
Logical AND horizontally across columns.
228
229
Parameters:
230
- exprs: Boolean column expressions
231
232
Returns:
233
- Expr: Horizontal all expression
234
"""
235
236
def any_horizontal(*exprs: IntoExpr) -> Expr:
237
"""
238
Logical OR horizontally across columns.
239
240
Parameters:
241
- exprs: Boolean column expressions
242
243
Returns:
244
- Expr: Horizontal any expression
245
"""
246
```
247
248
### Boolean Logic Functions
249
250
Functions for boolean operations across columns and rows.
251
252
```python { .api }
253
def all(*args: IntoExpr) -> Expr:
254
"""
255
Check if all values are true.
256
257
Parameters:
258
- args: Boolean expressions
259
260
Returns:
261
- Expr: All expression
262
"""
263
264
def any(*args: IntoExpr) -> Expr:
265
"""
266
Check if any values are true.
267
268
Parameters:
269
- args: Boolean expressions
270
271
Returns:
272
- Expr: Any expression
273
"""
274
```
275
276
### Cumulative Functions
277
278
Functions for cumulative operations and reductions.
279
280
```python { .api }
281
def cum_sum(*args: IntoExpr, reverse: bool = False) -> Expr:
282
"""
283
Compute cumulative sum.
284
285
Parameters:
286
- args: Expressions to compute cumulative sum of
287
- reverse: Compute in reverse order
288
289
Returns:
290
- Expr: Cumulative sum expression
291
"""
292
293
def cum_sum_horizontal(*exprs: IntoExpr) -> Expr:
294
"""
295
Compute cumulative sum horizontally across columns.
296
297
Parameters:
298
- exprs: Column expressions
299
300
Returns:
301
- Expr: Horizontal cumulative sum expression
302
"""
303
304
def cum_count(*args: IntoExpr, reverse: bool = False) -> Expr:
305
"""
306
Compute cumulative count.
307
308
Parameters:
309
- args: Expressions to count
310
- reverse: Compute in reverse order
311
312
Returns:
313
- Expr: Cumulative count expression
314
"""
315
316
def cum_fold(
317
acc: IntoExpr,
318
function: Callable[[Expr, Expr], Expr],
319
*exprs: IntoExpr,
320
include_init: bool = False
321
) -> Expr:
322
"""
323
Cumulatively fold expressions with a function.
324
325
Parameters:
326
- acc: Initial accumulator value
327
- function: Folding function
328
- exprs: Expressions to fold
329
- include_init: Include initial value in result
330
331
Returns:
332
- Expr: Cumulative fold expression
333
"""
334
335
def cum_reduce(expression: Expr) -> Expr:
336
"""
337
Cumulatively reduce expression.
338
339
Parameters:
340
- expression: Expression to reduce
341
342
Returns:
343
- Expr: Cumulative reduce expression
344
"""
345
```
346
347
### Range Functions
348
349
Functions for generating ranges and sequences of values.
350
351
```python { .api }
352
def arange(
353
start: int | IntoExpr,
354
end: int | IntoExpr,
355
step: int = 1,
356
*,
357
eager: bool = False
358
) -> Expr | Series:
359
"""
360
Create range of integers.
361
362
Parameters:
363
- start: Start value (inclusive)
364
- end: End value (exclusive)
365
- step: Step size
366
- eager: Return Series instead of Expr
367
368
Returns:
369
- Expr | Series: Integer range
370
"""
371
372
def int_range(
373
start: int | IntoExpr,
374
end: int | IntoExpr | None = None,
375
step: int = 1,
376
*,
377
eager: bool = False
378
) -> Expr | Series:
379
"""
380
Create range of integers.
381
382
Parameters:
383
- start: Start value or end if end is None
384
- end: End value (exclusive)
385
- step: Step size
386
- eager: Return Series instead of Expr
387
388
Returns:
389
- Expr | Series: Integer range
390
"""
391
392
def int_ranges(
393
start: int | IntoExpr,
394
end: int | IntoExpr,
395
step: int = 1,
396
*,
397
eager: bool = False
398
) -> Expr | Series:
399
"""
400
Create multiple integer ranges.
401
402
Parameters:
403
- start: Start values
404
- end: End values
405
- step: Step size
406
- eager: Return Series instead of Expr
407
408
Returns:
409
- Expr | Series: List of integer ranges
410
"""
411
412
def date_range(
413
start: date | datetime | IntoExpr,
414
end: date | datetime | IntoExpr,
415
interval: str | timedelta = "1d",
416
*,
417
closed: str = "both",
418
eager: bool = False
419
) -> Expr | Series:
420
"""
421
Create range of dates.
422
423
Parameters:
424
- start: Start date
425
- end: End date
426
- interval: Date interval (e.g., '1d', '1w', '1mo')
427
- closed: Include endpoints ('both', 'left', 'right', 'none')
428
- eager: Return Series instead of Expr
429
430
Returns:
431
- Expr | Series: Date range
432
"""
433
434
def date_ranges(
435
start: date | datetime | IntoExpr,
436
end: date | datetime | IntoExpr,
437
interval: str | timedelta = "1d",
438
*,
439
closed: str = "both",
440
eager: bool = False
441
) -> Expr | Series:
442
"""
443
Create multiple date ranges.
444
445
Returns:
446
- Expr | Series: List of date ranges
447
"""
448
449
def datetime_range(
450
start: datetime | IntoExpr,
451
end: datetime | IntoExpr,
452
interval: str | timedelta = "1d",
453
*,
454
closed: str = "both",
455
time_unit: str | None = None,
456
time_zone: str | None = None,
457
eager: bool = False
458
) -> Expr | Series:
459
"""
460
Create range of datetimes.
461
462
Parameters:
463
- start: Start datetime
464
- end: End datetime
465
- interval: Datetime interval
466
- closed: Which endpoints are inclusive ('both', 'left', 'right', 'none')
467
- time_unit: Time precision ('ns', 'us', 'ms')
468
- time_zone: Timezone
469
- eager: Return Series instead of Expr
470
471
Returns:
472
- Expr | Series: Datetime range
473
"""
474
475
def datetime_ranges(
476
start: datetime | IntoExpr,
477
end: datetime | IntoExpr,
478
interval: str | timedelta = "1d",
479
*,
480
closed: str = "both",
481
time_unit: str | None = None,
482
time_zone: str | None = None,
483
eager: bool = False
484
) -> Expr | Series:
485
"""
486
Create multiple datetime ranges.
487
488
Returns:
489
- Expr | Series: List of datetime ranges
490
"""
491
492
def time_range(
493
start: time | IntoExpr | None = None,
494
end: time | IntoExpr | None = None,
495
interval: str | timedelta = "1h",
496
*,
497
closed: str = "both",
498
eager: bool = False
499
) -> Expr | Series:
500
"""
501
Create range of times.
502
503
Parameters:
504
- start: Start time
505
- end: End time
506
- interval: Time interval
507
- closed: Which endpoints are inclusive ('both', 'left', 'right', 'none')
508
- eager: Return Series instead of Expr
509
510
Returns:
511
- Expr | Series: Time range
512
"""
513
514
def time_ranges(
515
start: time | IntoExpr,
516
end: time | IntoExpr,
517
interval: str | timedelta = "1h",
518
*,
519
closed: str = "both",
520
eager: bool = False
521
) -> Expr | Series:
522
"""
523
Create multiple time ranges.
524
525
Returns:
526
- Expr | Series: List of time ranges
527
"""
528
```
529
530
### Linear Space Functions
531
532
Functions for generating linearly spaced values.
533
534
```python { .api }
535
def linear_space(
536
start: int | float | IntoExpr,
537
end: int | float | IntoExpr,
538
num: int,
539
*,
540
endpoint: bool = True,
541
dtype: type = Float64,
542
eager: bool = False
543
) -> Expr | Series:
544
"""
545
Create linearly spaced values.
546
547
Parameters:
548
- start: Start value
549
- end: End value
550
- num: Number of values
551
- endpoint: Include endpoint
552
- dtype: Data type of result
553
- eager: Return Series instead of Expr
554
555
Returns:
556
- Expr | Series: Linearly spaced values
557
"""
558
559
def linear_spaces(
560
start: int | float | IntoExpr,
561
end: int | float | IntoExpr,
562
num: int,
563
*,
564
endpoint: bool = True,
565
dtype: type = Float64,
566
eager: bool = False
567
) -> Expr | Series:
568
"""
569
Create multiple linear spaces.
570
571
Returns:
572
- Expr | Series: List of linearly spaced values
573
"""
574
```
575
576
### Data Type Constructor Functions
577
578
Functions for creating typed literal values and structures.
579
580
```python { .api }
581
def date(year: int, month: int, day: int) -> date:
582
"""
583
Create date value.
584
585
Parameters:
586
- year: Year
587
- month: Month (1-12)
588
- day: Day of month
589
590
Returns:
591
- date: Date object
592
"""
593
594
def datetime(
595
year: int,
596
month: int,
597
day: int,
598
hour: int = 0,
599
minute: int = 0,
600
second: int = 0,
601
microsecond: int = 0,
602
*,
603
time_unit: str = "us",
604
time_zone: str | None = None
605
) -> datetime:
606
"""
607
Create datetime value.
608
609
Parameters:
610
- year: Year
611
- month: Month
612
- day: Day
613
- hour: Hour
614
- minute: Minute
615
- second: Second
616
- microsecond: Microsecond
617
- time_unit: Time precision
618
- time_zone: Timezone
619
620
Returns:
621
- datetime: Datetime object
622
"""
623
624
def time(
625
hour: int = 0,
626
minute: int = 0,
627
second: int = 0,
628
microsecond: int = 0
629
) -> time:
630
"""
631
Create time value.
632
633
Parameters:
634
- hour: Hour (0-23)
635
- minute: Minute (0-59)
636
- second: Second (0-59)
637
- microsecond: Microsecond
638
639
Returns:
640
- time: Time object
641
"""
642
643
def duration(
644
*,
645
weeks: int | IntoExpr | None = None,
646
days: int | IntoExpr | None = None,
647
hours: int | IntoExpr | None = None,
648
minutes: int | IntoExpr | None = None,
649
seconds: int | IntoExpr | None = None,
650
milliseconds: int | IntoExpr | None = None,
651
microseconds: int | IntoExpr | None = None,
652
nanoseconds: int | IntoExpr | None = None,
653
time_unit: str = "us"
654
) -> Expr:
655
"""
656
Create duration expression.
657
658
Parameters:
659
- weeks: Number of weeks
660
- days: Number of days
661
- hours: Number of hours
662
- minutes: Number of minutes
663
- seconds: Number of seconds
664
- milliseconds: Number of milliseconds
665
- microseconds: Number of microseconds
666
- nanoseconds: Number of nanoseconds
667
- time_unit: Time precision
668
669
Returns:
670
- Expr: Duration expression
671
"""
672
673
def struct(*exprs: IntoExpr, schema: list[str] | None = None, **named_exprs: IntoExpr) -> Expr:
674
"""
675
Create struct expression from fields.
676
677
Parameters:
678
- exprs: Field expressions
679
- schema: Field names
680
- named_exprs: Named field expressions
681
682
Returns:
683
- Expr: Struct expression
684
"""
685
686
def struct_with_fields(fields: Sequence[Expr]) -> Expr:
687
"""
688
Create struct expression with explicit fields.
689
690
Parameters:
691
- fields: Field expressions
692
693
Returns:
694
- Expr: Struct expression
695
"""
696
```
697
698
### String and Concatenation Functions
699
700
Functions for string operations and concatenation.
701
702
```python { .api }
703
def concat_str(*exprs: IntoExpr, separator: str = "", ignore_nulls: bool = False) -> Expr:
704
"""
705
Concatenate string expressions.
706
707
Parameters:
708
- exprs: String expressions to concatenate
709
- separator: Separator between strings
710
- ignore_nulls: Skip null values
711
712
Returns:
713
- Expr: Concatenated string expression
714
"""
715
716
def concat_list(*exprs: IntoExpr) -> Expr:
717
"""
718
Concatenate expressions into list.
719
720
Parameters:
721
- exprs: Expressions to concatenate
722
723
Returns:
724
- Expr: List expression
725
"""
726
727
def concat_arr(*exprs: IntoExpr) -> Expr:
728
"""
729
Concatenate expressions into array.
730
731
Parameters:
732
- exprs: Expressions to concatenate
733
734
Returns:
735
- Expr: Array expression
736
"""
737
738
def format(format_str: str, *args: IntoExpr) -> Expr:
739
"""
740
Format string with expressions.
741
742
Parameters:
743
- format_str: Format string with {} placeholders
744
- args: Expressions to format
745
746
Returns:
747
- Expr: Formatted string expression
748
"""
749
750
def escape_regex(value: str) -> str:
751
"""
752
Escape regex special characters in string.
753
754
Parameters:
755
- value: String to escape
756
757
Returns:
758
- str: Escaped string
759
"""
760
```
761
762
### Mathematical Functions
763
764
Functions for mathematical operations.
765
766
```python { .api }
767
def arctan2(y: str | Expr, x: str | Expr) -> Expr:
768
"""
769
Compute element-wise arc tangent of y/x in radians.
770
771
Parameters:
772
- y: Y coordinates
773
- x: X coordinates
774
775
Returns:
776
- Expr: Arc tangent expression
777
"""
778
779
def arctan2d(y: str | Expr, x: str | Expr) -> Expr:
780
"""
781
Compute element-wise arc tangent of y/x in degrees.
782
783
Parameters:
784
- y: Y coordinates
785
- x: X coordinates
786
787
Returns:
788
- Expr: Arc tangent expression in degrees
789
"""
790
```
791
792
### Statistical Functions
793
794
Functions for correlation and covariance.
795
796
```python { .api }
797
def corr(a: IntoExpr, b: IntoExpr, *, method: str = "pearson", ddof: int = 1) -> Expr:
798
"""
799
Compute correlation between two expressions.
800
801
Parameters:
802
- a: First expression
803
- b: Second expression
804
- method: Correlation method ('pearson', 'spearman')
805
- ddof: Delta degrees of freedom
806
807
Returns:
808
- Expr: Correlation expression
809
"""
810
811
def cov(a: IntoExpr, b: IntoExpr, *, ddof: int = 1) -> Expr:
812
"""
813
Compute covariance between two expressions.
814
815
Parameters:
816
- a: First expression
817
- b: Second expression
818
- ddof: Delta degrees of freedom
819
820
Returns:
821
- Expr: Covariance expression
822
"""
823
824
def rolling_corr(
825
a: IntoExpr,
826
b: IntoExpr,
827
window_size: int,
828
*,
829
ddof: int = 1
830
) -> Expr:
831
"""
832
Compute rolling correlation.
833
834
Parameters:
835
- a: First expression
836
- b: Second expression
837
- window_size: Rolling window size
838
- ddof: Delta degrees of freedom
839
840
Returns:
841
- Expr: Rolling correlation expression
842
"""
843
844
def rolling_cov(
845
a: IntoExpr,
846
b: IntoExpr,
847
window_size: int,
848
*,
849
ddof: int = 1
850
) -> Expr:
851
"""
852
Compute rolling covariance.
853
854
Parameters:
855
- a: First expression
856
- b: Second expression
857
- window_size: Rolling window size
858
- ddof: Delta degrees of freedom
859
860
Returns:
861
- Expr: Rolling covariance expression
862
"""
863
```
864
865
### Utility Functions
866
867
Miscellaneous utility functions for data manipulation.
868
869
```python { .api }
870
def coalesce(*exprs: IntoExpr) -> Expr:
871
"""
872
Return first non-null value from expressions.
873
874
Parameters:
875
- exprs: Expressions to check
876
877
Returns:
878
- Expr: Coalesced expression
879
"""
880
881
def from_epoch(column: IntoExpr, time_unit: str = "s") -> Expr:
882
"""
883
Convert epoch timestamp to datetime.
884
885
Parameters:
886
- column: Epoch timestamp expression
887
- time_unit: Time unit of the input epoch values ('s', 'ms', 'us', 'ns', or 'd' for days)
888
889
Returns:
890
- Expr: Datetime expression
891
"""
892
893
def approx_n_unique(column: IntoExpr) -> Expr:
894
"""
895
Approximate number of unique values.
896
897
Parameters:
898
- column: Column expression
899
900
Returns:
901
- Expr: Approximate unique count expression
902
"""
903
904
def n_unique(column: IntoExpr) -> Expr:
905
"""
906
Count unique values.
907
908
Parameters:
909
- column: Column expression
910
911
Returns:
912
- Expr: Unique count expression
913
"""
914
915
def dtype_of(column: IntoExpr) -> Expr:
916
"""
917
Get data type of expression.
918
919
Parameters:
920
- column: Expression to check
921
922
Returns:
923
- Expr: Data type expression
924
"""
925
926
def self_dtype() -> Expr:
927
"""
928
Get data type of current column context.
929
930
Returns:
931
- Expr: Self data type expression
932
"""
933
```
934
935
### Array Creation Functions
936
937
Functions for creating a Series (or an expression, when `eager=False`) filled with a constant or repeated value.
938
939
```python { .api }
940
def ones(shape: int | tuple[int, ...], *, dtype: type = Float64, eager: bool = False) -> Expr | Series:
941
"""
942
Create array filled with ones.
943
944
Parameters:
945
- shape: Array shape
946
- dtype: Data type
947
- eager: Return Series instead of Expr
948
949
Returns:
950
- Expr | Series: Array of ones
951
"""
952
953
def zeros(shape: int | tuple[int, ...], *, dtype: type = Float64, eager: bool = False) -> Expr | Series:
954
"""
955
Create array filled with zeros.
956
957
Parameters:
958
- shape: Array shape
959
- dtype: Data type
960
- eager: Return Series instead of Expr
961
962
Returns:
963
- Expr | Series: Array of zeros
964
"""
965
966
def repeat(
967
value: IntoExpr,
968
n: int | IntoExpr,
969
*,
970
eager: bool = False
971
) -> Expr | Series:
972
"""
973
Repeat value n times.
974
975
Parameters:
976
- value: Value to repeat
977
- n: Number of repetitions
978
- eager: Return Series instead of Expr
979
980
Returns:
981
- Expr | Series: Repeated values
982
"""
983
```
984
985
### Collection Functions
986
987
Functions for working with multiple DataFrames and LazyFrames.
988
989
```python { .api }
990
def collect_all(
991
lazy_frames: list[LazyFrame],
992
*,
993
type_coercion: bool = True,
994
predicate_pushdown: bool = True,
995
projection_pushdown: bool = True,
996
simplify_expression: bool = True,
997
slice_pushdown: bool = True,
998
comm_subplan_elim: bool = True,
999
comm_subexpr_elim: bool = True,
1000
streaming: bool = False
1001
) -> list[DataFrame]:
1002
"""
1003
Collect multiple LazyFrames with shared optimization.
1004
1005
Parameters:
1006
- lazy_frames: List of LazyFrames to collect
1007
- type_coercion: Enable type coercion optimization
1008
- predicate_pushdown: Enable predicate pushdown
1009
- projection_pushdown: Enable projection pushdown
1010
- simplify_expression: Enable expression simplification
1011
- slice_pushdown: Enable slice pushdown
1012
- comm_subplan_elim: Enable common subplan elimination
1013
- comm_subexpr_elim: Enable common subexpression elimination
1014
- streaming: Enable streaming execution
1015
1016
Returns:
1017
- list[DataFrame]: Collected DataFrames
1018
"""
1019
1020
def collect_all_async(
1021
lazy_frames: list[LazyFrame],
1022
*,
1023
gevent: bool = False,
1024
**kwargs
1025
) -> Awaitable[list[DataFrame]]:
1026
"""
1027
Collect multiple LazyFrames asynchronously.
1028
1029
Parameters:
1030
- lazy_frames: List of LazyFrames to collect
1031
- gevent: Use gevent for async execution
1032
- **kwargs: Same optimization parameters as collect_all
1033
1034
Returns:
1035
- Awaitable[list[DataFrame]]: Async collected DataFrames
1036
"""
1037
1038
def concat(
1039
items: Iterable[DataFrame | LazyFrame | Series],
1040
*,
1041
rechunk: bool = False,
1042
how: str = "vertical",
1043
parallel: bool = True
1044
) -> DataFrame | LazyFrame | Series:
1045
"""
1046
Concatenate DataFrames, LazyFrames, or Series.
1047
1048
Parameters:
1049
- items: Items to concatenate
1050
- rechunk: Rechunk result for better memory layout
1051
- how: Concatenation strategy (e.g. 'vertical', 'vertical_relaxed', 'diagonal', 'diagonal_relaxed', 'horizontal', 'align')
1052
- parallel: Use parallel concatenation
1053
1054
Returns:
1055
- DataFrame | LazyFrame | Series: Concatenated result
1056
"""
1057
```
1058
1059
### Other Utility Functions
1060
1061
Additional utility functions for various operations.
1062
1063
```python { .api }
1064
def first(*args: IntoExpr) -> Expr:
1065
"""Get first value."""
1066
1067
def last(*args: IntoExpr) -> Expr:
1068
"""Get last value."""
1069
1070
def nth(column: IntoExpr, n: int | IntoExpr) -> Expr:
1071
"""Get nth value."""
1072
1073
def head(*args: IntoExpr, n: int = 10) -> Expr:
1074
"""Get first n values."""
1075
1076
def tail(*args: IntoExpr, n: int = 10) -> Expr:
1077
"""Get last n values."""
1078
1079
def groups() -> Expr:
1080
"""Get group indices."""
1081
1082
def implode(column: IntoExpr) -> Expr:
1083
"""Collect values into list."""
1084
1085
def len() -> Expr:
1086
"""Return the number of rows in the current context (similar to COUNT(*) in SQL)."""
1087
1088
def element() -> Expr:
1089
"""Refer to the element being evaluated inside an `eval` or `filter` expression (e.g. `list.eval`)."""
1090
1091
def arg_sort_by(*by: IntoExpr, descending: bool = False) -> Expr:
1092
"""Get indices that would sort by expressions."""
1093
1094
def arg_where(condition: IntoExpr) -> Expr:
1095
"""Get indices where condition is true."""
1096
1097
def business_day_count(
1098
start: IntoExpr,
1099
end: IntoExpr,
1100
*,
1101
week_mask: list[bool] = [True, True, True, True, True, False, False],
1102
holidays: list[date] | None = None
1103
) -> Expr:
1104
"""Count business days between dates."""
1105
1106
def set_random_seed(seed: int) -> None:
1107
"""Set random seed for reproducible operations."""
1108
1109
def field(name: str) -> Expr:
1110
"""Select struct field."""
1111
1112
def fold(
1113
acc: IntoExpr,
1114
function: Callable[[Expr, Expr], Expr],
1115
*exprs: IntoExpr
1116
) -> Expr:
1117
"""Fold expressions with function."""
1118
1119
def reduce(function: Callable[[Expr, Expr], Expr], *exprs: IntoExpr) -> Expr:
1120
"""Reduce expressions with function."""
1121
1122
def map_batches(
1123
function: Callable[[DataFrame], DataFrame],
1124
*exprs: IntoExpr,
1125
returns_scalar: bool = False,
1126
agg_list: bool = False
1127
) -> Expr:
1128
"""Apply function to DataFrame batches."""
1129
1130
def map_groups(
1131
function: Callable[[DataFrame], DataFrame],
1132
*exprs: IntoExpr,
1133
returns_scalar: bool = False
1134
) -> Expr:
1135
"""Apply function to grouped DataFrames."""
1136
1137
def align_frames(
1138
*frames: DataFrame | LazyFrame,
1139
on: str | Expr | list[str | Expr],
1140
select: str | Expr | list[str | Expr] | None = None,
1141
reverse: bool | list[bool] = False
1142
) -> list[DataFrame | LazyFrame]:
1143
"""Align frames on common values."""
1144
1145
def row_index() -> Expr:
1146
"""Create an expression that yields the integer index of each row."""
1147
1148
def explain_all(*lazy_frames: LazyFrame, **kwargs) -> None:
1149
"""Print query plans for multiple LazyFrames."""
1150
1151
def sql_expr(sql: str) -> Expr:
1152
"""Create expression from SQL fragment."""
1153
```
1154
1155
## Usage Examples
1156
1157
### Basic Function Usage
1158
1159
```python
1160
import polars as pl
1161
1162
df = pl.DataFrame({
1163
"a": [1, 2, 3, 4],
1164
"b": [10, 20, 30, 40],
1165
"c": [100, 200, 300, 400]
1166
})
1167
1168
# Column selection and manipulation
1169
result = df.select([
1170
pl.col("a"),
1171
pl.lit(42).alias("literal"),
1172
pl.when(pl.col("a") > 2).then(pl.col("b")).otherwise(0).alias("conditional")
1173
])
1174
1175
# Aggregations
1176
agg_result = df.select([
1177
pl.sum("a").alias("sum_a"),
1178
pl.mean("b").alias("mean_b"),
1179
pl.max("c").alias("max_c"),
1180
pl.count().alias("count")
1181
])
1182
```
1183
1184
### Horizontal Operations
1185
1186
```python
1187
# Horizontal aggregations
1188
result = df.with_columns([
1189
pl.sum_horizontal("a", "b", "c").alias("row_sum"),
1190
pl.max_horizontal("a", "b", "c").alias("row_max"),
1191
pl.mean_horizontal("a", "b", "c").alias("row_mean")
1192
])
1193
```
1194
1195
### Range Functions
1196
1197
```python
1198
# Create ranges
1199
ranges_df = pl.DataFrame({
1200
"int_range": pl.arange(0, 10, eager=True),
1201
"date_range": pl.date_range(
1202
pl.date(2023, 1, 1),
1203
pl.date(2023, 1, 10),
1204
"1d",
1205
eager=True
1206
)
1207
})
1208
1209
# Linear space
1210
linear_vals = pl.linear_space(0, 100, 11, eager=True)
1211
```
1212
1213
### String Operations
1214
1215
```python
1216
text_df = pl.DataFrame({
1217
"first": ["hello", "world"],
1218
"second": ["polars", "rocks"]
1219
})
1220
1221
result = text_df.select([
1222
pl.concat_str("first", "second", separator=" ").alias("combined"),
1223
pl.format("{} is {}", pl.col("first"), pl.col("second")).alias("formatted")
1224
])
1225
```
1226
1227
### Mathematical Functions
1228
1229
```python
1230
coords_df = pl.DataFrame({
1231
"x": [1.0, 2.0, 3.0],
1232
"y": [1.0, 2.0, 3.0]
1233
})
1234
1235
result = coords_df.with_columns([
1236
pl.arctan2("y", "x").alias("angle_rad"),
1237
pl.arctan2d("y", "x").alias("angle_deg")
1238
])
1239
```
1240
1241
### Statistical Functions
1242
1243
```python
1244
stats_df = pl.DataFrame({
1245
"x": [1, 2, 3, 4, 5],
1246
"y": [2, 4, 6, 8, 10]
1247
})
1248
1249
result = stats_df.select([
1250
pl.corr("x", "y").alias("correlation"),
1251
pl.cov("x", "y").alias("covariance")
1252
])
1253
```
1254
1255
### Collection Operations
1256
1257
```python
1258
# Multiple LazyFrames
1259
lazy1 = pl.scan_csv("file1.csv")
1260
lazy2 = pl.scan_csv("file2.csv")
1261
lazy3 = pl.scan_csv("file3.csv")
1262
1263
# Collect all with shared optimization
1264
results = pl.collect_all([lazy1, lazy2, lazy3])
1265
1266
# Concatenate DataFrames
1267
combined = pl.concat([results[0], results[1], results[2]], how="vertical")
1268
```
1269
1270
### Advanced Function Usage
1271
1272
```python
1273
# Cumulative operations
1274
cumulative_df = df.select([
1275
pl.col("a"),
1276
pl.cum_sum("a").alias("cumsum_a"),
1277
pl.cum_count("a").alias("cumcount_a")
1278
])
1279
1280
# Complex folding
1281
folded = df.select([
1282
pl.fold(
1283
acc=pl.lit(0),
1284
function=lambda acc, x: acc + x,
1285
exprs=["a", "b", "c"]
1286
).alias("total")
1287
])
1288
1289
# Utility functions
1290
utility_result = df.select([
1291
pl.coalesce("a", pl.lit(0)).alias("coalesced"),
1292
pl.n_unique("a").alias("unique_count"),
1293
pl.dtype_of("a").alias("data_type")
1294
])
1295
```