0
# Configuration and Utilities
1
2
This document covers configuration options, column selectors, the string cache for categorical data, meta-information utilities, testing utilities for comparing DataFrames and Series, and the `CompatLevel` interchange setting. Together these components let you customize Polars behavior and work efficiently with data.
3
4
## Capabilities
5
6
### Configuration
7
8
Customize Polars display options, performance settings, and behavior through the Config class.
9
10
```python { .api }
11
class Config:
12
def __init__(self):
13
"""Global configuration manager for Polars."""
14
15
# Table Display Configuration
16
def set_tbl_cols(self, n: int) -> Config:
17
"""
18
Set maximum number of columns to display.
19
20
Parameters:
21
- n: Maximum columns (-1 for unlimited)
22
23
Returns:
24
- Config: Self for method chaining
25
"""
26
27
def set_tbl_rows(self, n: int) -> Config:
28
"""
29
Set maximum number of rows to display.
30
31
Parameters:
32
- n: Maximum rows (-1 for unlimited)
33
34
Returns:
35
- Config: Self for method chaining
36
"""
37
38
def set_tbl_width_chars(self, width: int) -> Config:
39
"""
40
Set maximum table width in characters.
41
42
Parameters:
43
- width: Maximum width in characters
44
45
Returns:
46
- Config: Self for method chaining
47
"""
48
49
def set_tbl_column_data_type_inline(self, active: bool = True) -> Config:
50
"""
51
Show column data types inline with headers.
52
53
Parameters:
54
- active: Enable inline data types
55
56
Returns:
57
- Config: Self for method chaining
58
"""
59
60
def set_tbl_dataframe_shape_below(self, active: bool = True) -> Config:
61
"""
62
Display DataFrame shape below the table.
63
64
Parameters:
65
- active: Show shape below table
66
67
Returns:
68
- Config: Self for method chaining
69
"""
70
71
def set_tbl_formatting(
72
self,
73
format: str = "UTF8_FULL_CONDENSED",
74
rounded_corners: bool = False
75
) -> Config:
76
"""
77
Set table formatting style.
78
79
Parameters:
80
- format: Table format style
81
- rounded_corners: Use rounded table corners
82
83
Returns:
84
- Config: Self for method chaining
85
"""
86
87
def set_tbl_hide_column_data_types(self, active: bool = True) -> Config:
88
"""
89
Hide column data types from display.
90
91
Parameters:
92
- active: Hide data types
93
94
Returns:
95
- Config: Self for method chaining
96
"""
97
98
def set_tbl_hide_column_names(self, active: bool = True) -> Config:
99
"""
100
Hide column names from display.
101
102
Parameters:
103
- active: Hide column names
104
105
Returns:
106
- Config: Self for method chaining
107
"""
108
109
def set_tbl_hide_dtype_separator(self, active: bool = True) -> Config:
110
"""
111
Hide separator between column names and types.
112
113
Parameters:
114
- active: Hide dtype separator
115
116
Returns:
117
- Config: Self for method chaining
118
"""
119
120
# Performance and Behavior Configuration
121
def set_verbose(self, active: bool = True) -> Config:
122
"""
123
Enable verbose output for debugging.
124
125
Parameters:
126
- active: Enable verbose mode
127
128
Returns:
129
- Config: Self for method chaining
130
"""
131
132
def set_streaming_chunk_size(self, size: int) -> Config:
133
"""
134
Set chunk size for streaming operations.
135
136
Parameters:
137
- size: Chunk size in rows
138
139
Returns:
140
- Config: Self for method chaining
141
"""
142
143
def set_auto_structify(self, active: bool = True) -> Config:
144
"""
145
Allow multi-output expressions to be automatically converted into a single Struct column.
146
147
Parameters:
148
- active: Enable auto structification
149
150
Returns:
151
- Config: Self for method chaining
152
"""
153
154
# Context Manager Support
155
def __enter__(self) -> Config:
156
"""Enter configuration context."""
157
158
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
159
"""Exit configuration context, restoring previous settings."""
160
161
# Function Decorator Support
162
def __call__(self, func: Callable) -> Callable:
163
"""Use as function decorator to apply config temporarily."""
164
```
165
166
### String Cache
167
168
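Use a global string cache so that Categorical columns created separately share the same string-to-integer encoding and can be joined, concatenated, or compared without re-encoding. The cache affects Categorical data only; plain String columns are unaffected.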
169
170
```python { .api }
171
class StringCache:
172
def __init__(self):
173
"""Context manager for string cache operations."""
174
175
def __enter__(self) -> StringCache:
176
"""Enable string cache."""
177
178
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
179
"""Disable string cache and clean up."""
180
181
def enable_string_cache() -> None:
182
"""
183
Enable global string cache for categorical operations.
184
Strings are interned for memory efficiency and faster comparisons.
185
"""
186
187
def disable_string_cache() -> None:
188
"""
189
Disable global string cache.
190
Clean up interned strings and return to normal string handling.
191
"""
192
193
def using_string_cache() -> bool:
194
"""
195
Check if string cache is currently enabled.
196
197
Returns:
198
- bool: True if string cache is active
199
"""
200
```
201
202
### Meta Information
203
204
Access build information, version details, and system configuration.
205
206
```python { .api }
207
def build_info() -> dict[str, str]:
208
"""
209
Get Polars build information.
210
211
Returns:
212
- dict[str, str]: Build details including version, features, target
213
"""
214
215
def show_versions() -> None:
216
"""
217
Display version information for Polars and key dependencies.
218
Prints version details to stdout for debugging and support.
219
"""
220
221
def thread_pool_size() -> int:
222
"""
223
Get current thread pool size for parallel operations.
224
225
Returns:
226
- int: Number of threads in the pool
227
"""
228
229
def threadpool_size() -> int:
230
"""
231
Deprecated alias for thread_pool_size(); prefer thread_pool_size() in new code.
232
233
Returns:
234
- int: Number of threads in the pool
235
"""
236
237
def get_index_type() -> type:
238
"""
239
Get the data type Polars uses for row indexing (row positions and counts).
240
241
Returns:
242
- type: Index data type (UInt32 in regular builds, UInt64 in the big-index build)
243
"""
244
```
245
246
### Selectors System
247
248
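Selectors (`polars.selectors`) choose columns by data type, name pattern, or position, and can be combined with set-style operators such as `|` (union), `&` (intersection), and `~` (complement).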
249
250
```python { .api }
251
class Selector:
252
"""Base class for column selectors."""
253
254
# Type-based Selectors
255
def by_dtype(*dtypes: type) -> Selector:
256
"""Select columns by data type."""
257
258
def numeric() -> Selector:
259
"""Select numeric columns (int, float, decimal)."""
260
261
def integer() -> Selector:
262
"""Select integer columns."""
263
264
def signed_integer() -> Selector:
265
"""Select signed integer columns."""
266
267
def unsigned_integer() -> Selector:
268
"""Select unsigned integer columns."""
269
270
def float() -> Selector:
271
"""Select floating-point columns."""
272
273
def string() -> Selector:
274
"""Select string/text columns."""
275
276
def boolean() -> Selector:
277
"""Select boolean columns."""
278
279
def temporal() -> Selector:
280
"""Select temporal columns (date, datetime, time, duration)."""
281
282
def date() -> Selector:
283
"""Select date columns."""
284
285
def datetime(time_unit: str | None = None, time_zone: str | None = None) -> Selector:
286
"""Select datetime columns with optional unit/timezone filtering."""
287
288
def time() -> Selector:
289
"""Select time columns."""
290
291
def duration(time_unit: str | None = None) -> Selector:
292
"""Select duration columns with optional unit filtering."""
293
294
def categorical() -> Selector:
295
"""Select categorical columns."""
296
297
def enum() -> Selector:
298
"""Select enum columns."""
299
300
def binary() -> Selector:
301
"""Select binary data columns."""
302
303
def decimal() -> Selector:
304
"""Select decimal columns."""
305
306
# Complex Type Selectors
307
def list() -> Selector:
308
"""Select list columns."""
309
310
def array() -> Selector:
311
"""Select array columns."""
312
313
def struct() -> Selector:
314
"""Select struct columns."""
315
316
def nested() -> Selector:
317
"""Select nested columns (list, array, struct)."""
318
319
# Position-based Selectors
320
def first() -> Selector:
321
"""Select first column."""
322
323
def last() -> Selector:
324
"""Select last column."""
325
326
def by_index(*indices: int) -> Selector:
327
"""Select columns by index positions."""
328
329
# Name-based Selectors
330
def by_name(*names: str | list[str]) -> Selector:
331
"""Select columns by exact names."""
332
333
def matches(pattern: str, *, flags: int = 0) -> Selector:
334
"""Select columns matching regex pattern."""
335
336
def contains(substring: str) -> Selector:
337
"""Select columns containing substring."""
338
339
def starts_with(prefix: str) -> Selector:
340
"""Select columns starting with prefix."""
341
342
def ends_with(suffix: str) -> Selector:
343
"""Select columns ending with suffix."""
344
345
# Character Class Selectors
346
def alpha() -> Selector:
347
"""Select columns with alphabetic names."""
348
349
def alphanumeric() -> Selector:
350
"""Select columns with alphanumeric names."""
351
352
def digit() -> Selector:
353
"""Select columns whose names consist entirely of digits."""
354
355
# Utility Selectors
356
def all() -> Selector:
357
"""Select all columns."""
358
359
def exclude(*selectors: Selector | str) -> Selector:
360
"""Exclude specified selectors or column names."""
361
362
# Selector Operations
363
def expand_selector(
364
frame: DataFrame | LazyFrame,
365
*selectors: Selector | str
366
) -> list[str]:
367
"""
368
Expand selectors to column names for given frame.
369
370
Parameters:
371
- frame: DataFrame or LazyFrame to expand selectors against
372
- selectors: Selectors to expand
373
374
Returns:
375
- list[str]: Column names matching selectors
376
"""
377
378
def is_selector(obj: Any) -> bool:
379
"""
380
Check if object is a selector.
381
382
Parameters:
383
- obj: Object to check
384
385
Returns:
386
- bool: True if object is a selector
387
"""
388
```
389
390
### Testing Utilities
391
392
Assertion functions for comparing DataFrames and Series in tests.
393
394
```python { .api }
395
def assert_frame_equal(
396
left: DataFrame | LazyFrame,
397
right: DataFrame | LazyFrame,
398
*,
399
check_dtype: bool = True,
400
check_exact: bool = False,
401
rtol: float = 1e-5,
402
atol: float = 1e-8,
403
categorical_as_str: bool = False,
404
check_column_order: bool = True,
405
check_row_order: bool = True
406
) -> None:
407
"""
408
Assert that two DataFrames are equal.
409
410
Parameters:
411
- left: First DataFrame
412
- right: Second DataFrame
413
- check_dtype: Check column data types
414
- check_exact: Check exact floating-point equality
415
- rtol: Relative tolerance for floating-point comparison
416
- atol: Absolute tolerance for floating-point comparison
417
- categorical_as_str: Compare categoricals as strings
418
- check_column_order: Check column order
419
- check_row_order: Check row order
420
421
Raises:
422
- AssertionError: If DataFrames are not equal
423
"""
424
425
def assert_frame_not_equal(
426
left: DataFrame | LazyFrame,
427
right: DataFrame | LazyFrame,
428
**kwargs
429
) -> None:
430
"""
431
Assert that two DataFrames are not equal.
432
433
Parameters:
434
- left: First DataFrame
435
- right: Second DataFrame
436
- **kwargs: Same parameters as assert_frame_equal
437
438
Raises:
439
- AssertionError: If DataFrames are equal
440
"""
441
442
def assert_series_equal(
443
left: Series,
444
right: Series,
445
*,
446
check_dtype: bool = True,
447
check_exact: bool = False,
448
rtol: float = 1e-5,
449
atol: float = 1e-8,
450
categorical_as_str: bool = False,
451
check_names: bool = True
452
) -> None:
453
"""
454
Assert that two Series are equal.
455
456
Parameters:
457
- left: First Series
458
- right: Second Series
459
- check_dtype: Check data types
460
- check_exact: Check exact floating-point equality
461
- rtol: Relative tolerance for floating-point comparison
462
- atol: Absolute tolerance for floating-point comparison
463
- categorical_as_str: Compare categoricals as strings
464
- check_names: Check Series names
465
466
Raises:
467
- AssertionError: If Series are not equal
468
"""
469
470
def assert_series_not_equal(
471
left: Series,
472
right: Series,
473
**kwargs
474
) -> None:
475
"""
476
Assert that two Series are not equal.
477
478
Parameters:
479
- left: First Series
480
- right: Second Series
481
- **kwargs: Same parameters as assert_series_equal
482
483
Raises:
484
- AssertionError: If Series are equal
485
"""
486
```
487
488
## Usage Examples
489
490
### Configuration Usage
491
492
```python
493
import polars as pl
494
495
# Global configuration changes
496
pl.Config.set_tbl_rows(10)
497
pl.Config.set_tbl_cols(8)
498
pl.Config.set_verbose(True)
499
500
# Context manager for temporary config
501
with pl.Config() as cfg:
502
cfg.set_tbl_rows(20)
503
cfg.set_tbl_cols(12)
504
# Configuration active only within this block
505
print(large_df) # Uses temporary settings
506
507
# Function decorator for config
508
@pl.Config(set_tbl_rows=5, set_verbose=False)
509
def analyze_data(df):
510
return df.describe()
511
512
# Streaming configuration
513
pl.Config.set_streaming_chunk_size(50000)
514
```
515
516
### String Cache Usage
517
518
```python
519
# Context manager approach
520
with pl.StringCache():
521
# Categorical columns created within this block share one string cache
522
df1 = pl.DataFrame({"category": ["A", "B", "A", "C", "B"]}, schema={"category": pl.Categorical})
523
df2 = pl.DataFrame({"category": ["A", "B", "C"]}, schema={"category": pl.Categorical})
524
525
# Categoricals built under the same cache can be joined directly
526
result = df1.join(df2, on="category")
527
528
# Global enable/disable
529
pl.enable_string_cache()
530
531
# Check if enabled
532
if pl.using_string_cache():
533
print("String cache is active")
534
535
# Categorical operations benefit from string cache
536
df_cat = df.with_columns(pl.col("category").cast(pl.Categorical))
537
538
pl.disable_string_cache()
539
```
540
541
### Meta Information
542
543
```python
544
# Get build information
545
build_info = pl.build_info()
546
print(f"Polars version: {build_info['version']}")
547
print(f"Build features: {build_info['features']}")
548
549
# Show all version information
550
pl.show_versions()
551
552
# Thread pool information
553
thread_count = pl.thread_pool_size()
554
print(f"Using {thread_count} threads")
555
556
# Index type information
557
index_type = pl.get_index_type()
558
print(f"Index type: {index_type}")
559
```
560
561
### Selectors Usage
562
563
```python
564
import polars.selectors as cs
565
566
df = pl.DataFrame({
567
"id": [1, 2, 3],
568
"name": ["Alice", "Bob", "Charlie"],
569
"age": [25, 30, 35],
570
"salary": [50000.0, 60000.0, 70000.0],
571
"active": [True, False, True],
572
"start_date": pl.Series(["2020-01-01", "2019-05-15", "2021-03-10"]).str.to_date()
573
})
574
575
# Type-based selection
576
numeric_cols = df.select(cs.numeric())
577
string_cols = df.select(cs.string())
578
temporal_cols = df.select(cs.temporal())
579
580
# Name-based selection
581
name_pattern_cols = df.select(cs.matches(r".*a.*")) # Contains 'a'
582
prefix_cols = df.select(cs.starts_with("s")) # Starts with 's'
583
584
# Combined selectors
585
analysis_cols = df.select(cs.numeric() | cs.temporal())
586
non_id_cols = df.select(cs.all() & ~cs.by_name("id"))
587
588
# Complex selector operations
589
selected_cols = df.select(
590
cs.numeric() & ~cs.by_name("id"), # Numeric except id
591
cs.string(), # All strings
592
cs.exclude(cs.boolean(), cs.numeric(), cs.string())  # Remaining non-boolean columns (a column may only be selected once)
593
)
594
595
# Expand selectors to column names
596
expanded = cs.expand_selector(df, cs.numeric(), cs.string())
597
print(f"Selected columns: {expanded}")
598
```
599
600
### Testing Utilities
601
602
```python
603
import polars.testing as plt
604
605
# Create test DataFrames
606
df1 = pl.DataFrame({
607
"a": [1, 2, 3],
608
"b": [4.0, 5.0, 6.0],
609
"c": ["x", "y", "z"]
610
})
611
612
df2 = pl.DataFrame({
613
"a": [1, 2, 3],
614
"b": [4.0, 5.0, 6.0],
615
"c": ["x", "y", "z"]
616
})
617
618
# Assert DataFrames are equal
619
plt.assert_frame_equal(df1, df2)
620
621
# Assert with tolerance for floating-point
622
df3 = pl.DataFrame({
623
"a": [1, 2, 3],
624
"b": [4.0001, 5.0001, 6.0001],
625
"c": ["x", "y", "z"]
626
})
627
628
plt.assert_frame_equal(df1, df3, rtol=1e-3)
629
630
# Assert Series equality
631
s1 = pl.Series("values", [1, 2, 3])
632
s2 = pl.Series("values", [1, 2, 3])
633
plt.assert_series_equal(s1, s2)
634
635
# Assert inequality
636
df_different = pl.DataFrame({"a": [1, 2, 4]}) # Different values
637
plt.assert_frame_not_equal(df1, df_different)
638
639
# Testing in unit tests
640
def test_data_processing():
641
input_df = pl.DataFrame({"x": [1, 2, 3]})
642
expected_df = pl.DataFrame({"x": [2, 4, 6]})
643
644
result_df = input_df.select(pl.col("x") * 2)
645
646
plt.assert_frame_equal(result_df, expected_df)
647
```
648
649
### Advanced Configuration Patterns
650
651
```python
652
# Chained configuration
653
config_result = (
654
pl.Config()
655
.set_tbl_rows(15)
656
.set_tbl_cols(10)
657
.set_verbose(True)
658
.set_streaming_chunk_size(25000)
659
)
660
661
# Configuration for different environments
662
def setup_dev_config():
663
return (
664
pl.Config()
665
.set_verbose(True)
666
.set_tbl_rows(-1) # Show all rows
667
.set_tbl_cols(-1) # Show all columns
668
)
669
670
def setup_prod_config():
671
return (
672
pl.Config()
673
.set_verbose(False)
674
.set_tbl_rows(10)
675
.set_streaming_chunk_size(100000)
676
)
677
678
# Environment-specific setup (requires `import os`)
679
if os.getenv("ENV") == "development":
680
setup_dev_config()
681
else:
682
setup_prod_config()
683
```
684
685
### String Cache Performance Benefits
686
687
```python
688
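# Timing comparison. Note: the string cache only affects Categorical columns, so cast "cat" to pl.Categorical for the cache to have any effect on these timings.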
689
import time
690
691
# Without string cache
692
start_time = time.time()
693
for _ in range(1000):
694
df = pl.DataFrame({"cat": ["A", "B", "C"] * 1000})
695
result = df.filter(pl.col("cat") == "A")
696
no_cache_time = time.time() - start_time
697
698
# With string cache
699
pl.enable_string_cache()
700
start_time = time.time()
701
for _ in range(1000):
702
df = pl.DataFrame({"cat": ["A", "B", "C"] * 1000})
703
result = df.filter(pl.col("cat") == "A")
704
cache_time = time.time() - start_time
705
pl.disable_string_cache()
706
707
print(f"Without cache: {no_cache_time:.3f}s")
708
print(f"With cache: {cache_time:.3f}s")
709
print(f"Speedup: {no_cache_time/cache_time:.2f}x")
710
```
711
712
### CompatLevel
713
714
`CompatLevel` controls which data structures Polars uses when exchanging data with external systems, for example when converting to or from Arrow.
715
716
```python { .api }
717
class CompatLevel:
718
"""
719
Data structure compatibility level for interchange protocols.
720
721
Used to control compatibility when converting to/from external formats
722
like Arrow, ensuring data structures are compatible with different
723
system requirements.
724
"""
725
726
@staticmethod
727
def newest() -> CompatLevel:
728
"""
729
Get the highest supported compatibility level.
730
731
Warning: Highest compatibility level is considered unstable
732
and may change without notice.
733
"""
734
735
@staticmethod
736
def oldest() -> CompatLevel:
737
"""Get the most compatible level for maximum compatibility."""
738
```