# I/O Operations

Extensive support for reading and writing data in various formats including CSV, Parquet, JSON, Arrow IPC, databases, Excel, and cloud storage, with streaming capabilities for efficient processing of large datasets.

## Capabilities

### CSV Operations

Read and write CSV files with extensive customization options for delimiters, encoding, and data types.

```python { .api }
11
def read_csv(
12
source: str | Path | list[str] | list[Path] | BinaryIO,
13
*,
14
has_header: bool = True,
15
columns: list[int] | list[str] | None = None,
16
new_columns: list[str] | None = None,
17
dtypes: dict[str, type] | Sequence[type] | None = None,
18
separator: str = ",",
19
comment_prefix: str | None = None,
20
quote_char: str | None = '"',
21
skip_rows: int = 0,
22
skip_rows_after_header: int = 0,
23
row_index_name: str | None = None,
24
row_index_offset: int = 0,
25
sample_size: int = 1024,
26
eol_char: str = "\n",
27
raise_if_empty: bool = True,
28
truncate_ragged_lines: bool = False,
29
rechunk: bool = False,
30
schema_overrides: dict[str, type] | None = None,
31
null_values: str | list[str] | dict[str, str] | None = None,
32
missing_utf8_is_empty_string: bool = False,
33
max_rows: int | None = None,
34
encoding: str = "utf8",
35
try_parse_dates: bool = False,
36
n_threads: int | None = None,
37
infer_schema_length: int | None = 100,
38
batch_size: int = 8192,
39
n_rows: int | None = None,
40
low_memory: bool = False,
41
rechunk_end: bool = True,
42
skip_blank_lines: bool = True,
43
ignore_errors: bool = False
44
) -> DataFrame:
45
"""
46
Read CSV file(s) into DataFrame.
47
48
Parameters:
49
- source: File path(s) or file-like object
50
- has_header: Whether first row contains headers
51
- columns: Columns to select by index or name
52
- dtypes: Data types for columns
53
- separator: Field separator character
54
- quote_char: Quote character for fields
55
- null_values: Values to interpret as null
56
- encoding: Text encoding
57
- n_threads: Number of threads for parallel processing
58
59
Returns:
60
- DataFrame: Parsed CSV data
61
"""
62
63
def read_csv_batched(
64
source: str | Path | BinaryIO,
65
*,
66
batch_size: int = 50000,
67
**kwargs
68
) -> BatchedCsvReader:
69
"""
70
Read CSV file in batches for memory-efficient processing.
71
72
Parameters:
73
- source: File path or file-like object
74
- batch_size: Number of rows per batch
75
- **kwargs: Same parameters as read_csv
76
77
Returns:
78
- BatchedCsvReader: Iterator yielding DataFrame batches
79
"""
80
81
def scan_csv(
82
source: str | Path | list[str] | list[Path],
83
*,
84
has_header: bool = True,
85
separator: str = ",",
86
comment_prefix: str | None = None,
87
quote_char: str | None = '"',
88
skip_rows: int = 0,
89
dtypes: dict[str, type] | None = None,
90
null_values: str | list[str] | dict[str, str] | None = None,
91
missing_utf8_is_empty_string: bool = False,
92
cache: bool = True,
93
with_column_names: Callable[[list[str]], list[str]] | None = None,
94
infer_schema_length: int | None = 100,
95
n_rows: int | None = None,
96
encoding: str = "utf8",
97
low_memory: bool = False,
98
rechunk: bool = False,
99
skip_rows_after_header: int = 0,
100
row_index_name: str | None = None,
101
row_index_offset: int = 0,
102
try_parse_dates: bool = False,
103
eol_char: str = "\n",
104
raise_if_empty: bool = True,
105
truncate_ragged_lines: bool = False,
106
schema: dict[str, type] | None = None,
107
ignore_errors: bool = False
108
) -> LazyFrame:
109
"""
110
Scan CSV file(s) for lazy processing.
111
112
Returns:
113
- LazyFrame: Lazy representation of CSV data
114
"""
115
```

### Parquet Operations

Read and write Apache Parquet files with compression and metadata options.

```python { .api }
122
def read_parquet(
123
source: str | Path | list[str] | list[Path] | BinaryIO,
124
*,
125
columns: list[int] | list[str] | None = None,
126
n_rows: int | None = None,
127
row_index_name: str | None = None,
128
row_index_offset: int = 0,
129
parallel: str = "auto",
130
use_statistics: bool = True,
131
hive_partitioning: bool | None = None,
132
hive_schema: dict[str, type] | None = None,
133
try_parse_hive_dates: bool = True,
134
glob: bool = True,
135
schema: dict[str, type] | None = None,
136
rechunk: bool = False,
137
low_memory: bool = False,
138
storage_options: dict[str, Any] | None = None,
139
credential_provider: CredentialProvider | None = None,
140
retries: int = 2,
141
use_pyarrow: bool = False,
142
pyarrow_options: dict[str, Any] | None = None,
143
memory_map: bool = True
144
) -> DataFrame:
145
"""
146
Read Parquet file(s) into DataFrame.
147
148
Parameters:
149
- source: File path(s) or file-like object
150
- columns: Columns to select
151
- parallel: Parallel reading mode ('auto', 'columns', 'row_groups', 'none')
152
- use_statistics: Use Parquet statistics for optimization
153
- hive_partitioning: Enable Hive-style partitioning
154
- storage_options: Cloud storage configuration
155
- credential_provider: Cloud credentials
156
157
Returns:
158
- DataFrame: Parquet data
159
"""
160
161
def scan_parquet(
162
source: str | Path | list[str] | list[Path],
163
*,
164
n_rows: int | None = None,
165
row_index_name: str | None = None,
166
row_index_offset: int = 0,
167
parallel: str = "auto",
168
use_statistics: bool = True,
169
hive_partitioning: bool | None = None,
170
hive_schema: dict[str, type] | None = None,
171
try_parse_hive_dates: bool = True,
172
glob: bool = True,
173
schema: dict[str, type] | None = None,
174
cache: bool = True,
175
cloud_options: dict[str, Any] | None = None,
176
credential_provider: CredentialProvider | None = None,
177
retries: int = 2
178
) -> LazyFrame:
179
"""
180
Scan Parquet file(s) for lazy processing.
181
182
Returns:
183
- LazyFrame: Lazy representation of Parquet data
184
"""
185
186
def read_parquet_schema(source: str | Path | BinaryIO) -> dict[str, type]:
187
"""
188
Read schema from Parquet file without loading data.
189
190
Parameters:
191
- source: File path or file-like object
192
193
Returns:
194
- dict[str, type]: Column names and types
195
"""
196
197
def read_parquet_metadata(source: str | Path | BinaryIO) -> dict[str, Any]:
198
"""
199
Read metadata from Parquet file.
200
201
Parameters:
202
- source: File path or file-like object
203
204
Returns:
205
- dict[str, Any]: Parquet metadata
206
"""
207
```

### JSON Operations

Read and write JSON and newline-delimited JSON (NDJSON) files.

```python { .api }
214
def read_json(
215
source: str | Path | IOBase | bytes,
216
*,
217
schema: dict[str, type] | None = None,
218
schema_overrides: dict[str, type] | None = None,
219
infer_schema_length: int | None = 100
220
) -> DataFrame:
221
"""
222
Read JSON file into DataFrame.
223
224
Parameters:
225
- source: JSON file path or content
226
- schema: Expected schema
227
- schema_overrides: Override inferred types
228
- infer_schema_length: Rows to scan for schema inference
229
230
Returns:
231
- DataFrame: JSON data
232
"""
233
234
def read_ndjson(
235
source: str | Path | IOBase | bytes,
236
*,
237
schema: dict[str, type] | None = None,
238
schema_overrides: dict[str, type] | None = None,
239
ignore_errors: bool = False
240
) -> DataFrame:
241
"""
242
Read newline-delimited JSON file into DataFrame.
243
244
Parameters:
245
- source: NDJSON file path or content
246
- schema: Expected schema
247
- ignore_errors: Skip malformed JSON lines
248
249
Returns:
250
- DataFrame: NDJSON data
251
"""
252
253
def scan_ndjson(
254
source: str | Path | list[str] | list[Path],
255
*,
256
schema: dict[str, type] | None = None,
257
ignore_errors: bool = False,
258
batch_size: int | None = None,
259
n_rows: int | None = None,
260
low_memory: bool = False,
261
rechunk: bool = False,
262
row_index_name: str | None = None,
263
row_index_offset: int = 0,
264
infer_schema_length: int | None = 100
265
) -> LazyFrame:
266
"""
267
Scan NDJSON file(s) for lazy processing.
268
269
Returns:
270
- LazyFrame: Lazy representation of NDJSON data
271
"""
272
```

### Arrow IPC Operations

Read and write Apache Arrow IPC format for efficient columnar data exchange.

```python { .api }
279
def read_ipc(
280
source: str | Path | BinaryIO,
281
*,
282
columns: list[int] | list[str] | None = None,
283
n_rows: int | None = None,
284
row_index_name: str | None = None,
285
row_index_offset: int = 0,
286
rechunk: bool = False,
287
memory_map: bool = True,
288
storage_options: dict[str, Any] | None = None,
289
credential_provider: CredentialProvider | None = None,
290
retries: int = 2
291
) -> DataFrame:
292
"""
293
Read Arrow IPC file into DataFrame.
294
295
Parameters:
296
- source: IPC file path or file-like object
297
- columns: Columns to select
298
- memory_map: Use memory mapping for better performance
299
- storage_options: Cloud storage configuration
300
301
Returns:
302
- DataFrame: IPC data
303
"""
304
305
def read_ipc_stream(
306
source: str | Path | BinaryIO,
307
*,
308
columns: list[int] | list[str] | None = None,
309
n_rows: int | None = None,
310
row_index_name: str | None = None,
311
row_index_offset: int = 0,
312
rechunk: bool = False,
313
storage_options: dict[str, Any] | None = None,
314
credential_provider: CredentialProvider | None = None,
315
retries: int = 2
316
) -> DataFrame:
317
"""
318
Read Arrow IPC stream into DataFrame.
319
320
Returns:
321
- DataFrame: IPC stream data
322
"""
323
324
def scan_ipc(
325
source: str | Path | list[str] | list[Path],
326
*,
327
n_rows: int | None = None,
328
cache: bool = True,
329
rechunk: bool = False,
330
row_index_name: str | None = None,
331
row_index_offset: int = 0,
332
storage_options: dict[str, Any] | None = None,
333
credential_provider: CredentialProvider | None = None,
334
retries: int = 2,
335
memory_map: bool = True
336
) -> LazyFrame:
337
"""
338
Scan IPC file(s) for lazy processing.
339
340
Returns:
341
- LazyFrame: Lazy representation of IPC data
342
"""
343
344
def read_ipc_schema(source: str | Path | BinaryIO) -> dict[str, type]:
345
"""
346
Read schema from IPC file without loading data.
347
348
Returns:
349
- dict[str, type]: Column names and types
350
"""
351
```

### Database Operations

Connect to and query various databases with full SQL support.

```python { .api }
358
def read_database(
359
query: str | RawExpr,
360
connection: str | ConnectionProtocol,
361
*,
362
partition_on: str | None = None,
363
partition_range: tuple[int, int] | None = None,
364
partition_num: int | None = None,
365
protocol: str | None = None,
366
engine: str | None = None,
367
schema_overrides: dict[str, type] | None = None,
368
execute_options: dict[str, Any] | None = None,
369
iter_batches: bool = False,
370
batch_size: int | None = None
371
) -> DataFrame:
372
"""
373
Execute database query and return DataFrame.
374
375
Parameters:
376
- query: SQL query string
377
- connection: Database connection string or object
378
- partition_on: Column for parallel partitioning
379
- protocol: Database protocol ('adbc', 'connectorx')
380
- engine: Database engine
381
- schema_overrides: Override inferred column types
382
383
Returns:
- DataFrame: Query results (note: with iter_batches=True, results are produced in batches of batch_size rows rather than as one DataFrame)
385
"""
386
387
def read_database_uri(
388
query: str | RawExpr,
389
uri: str,
390
*,
391
partition_on: str | None = None,
392
partition_range: tuple[int, int] | None = None,
393
partition_num: int | None = None,
394
protocol: str | None = None,
395
engine: str | None = None,
396
schema_overrides: dict[str, type] | None = None,
397
execute_options: dict[str, Any] | None = None
398
) -> DataFrame:
399
"""
400
Execute database query using URI connection string.
401
402
Parameters:
403
- query: SQL query string
404
- uri: Database URI connection string
405
406
Returns:
407
- DataFrame: Query results
408
"""
409
```

### Spreadsheet Operations

Read Excel and OpenDocument spreadsheet files.

```python { .api }
416
def read_excel(
417
source: str | Path | BinaryIO,
418
*,
419
sheet_id: int | None = None,
420
sheet_name: str | None = None,
421
engine: str | None = None,
422
engine_options: dict[str, Any] | None = None,
423
read_options: dict[str, Any] | None = None,
424
schema_overrides: dict[str, type] | None = None,
425
infer_schema_length: int | None = None,
426
raise_if_empty: bool = True
427
) -> DataFrame:
428
"""
429
Read Excel file into DataFrame.
430
431
Parameters:
432
- source: Excel file path or file-like object
433
- sheet_id: Sheet index to read
434
- sheet_name: Sheet name to read
435
- engine: Excel engine ('calamine', 'openpyxl', 'xlsx2csv')
436
- schema_overrides: Override inferred column types
437
438
Returns:
439
- DataFrame: Excel data
440
"""
441
442
def read_ods(
443
source: str | Path | BinaryIO,
444
*,
445
sheet_id: int | None = None,
446
sheet_name: str | None = None,
447
schema_overrides: dict[str, type] | None = None,
448
infer_schema_length: int | None = None,
449
raise_if_empty: bool = True
450
) -> DataFrame:
451
"""
452
Read OpenDocument Spreadsheet file into DataFrame.
453
454
Parameters:
455
- source: ODS file path or file-like object
456
- sheet_id: Sheet index to read
457
- sheet_name: Sheet name to read
458
459
Returns:
460
- DataFrame: ODS data
461
"""
462
```

### Other Formats

Support for additional data formats.

```python { .api }
469
def read_avro(
470
source: str | Path | BinaryIO,
471
*,
472
columns: list[int] | list[str] | None = None,
473
n_rows: int | None = None
474
) -> DataFrame:
475
"""
476
Read Apache Avro file into DataFrame.
477
478
Parameters:
479
- source: Avro file path or file-like object
480
- columns: Columns to select
481
- n_rows: Number of rows to read
482
483
Returns:
484
- DataFrame: Avro data
485
"""
486
487
def read_clipboard(*, separator: str = "\t", **kwargs) -> DataFrame:
488
"""
489
Read data from system clipboard.
490
491
Parameters:
492
- separator: Field separator
493
- **kwargs: Additional CSV parsing options
494
495
Returns:
496
- DataFrame: Clipboard data
497
"""
498
499
def read_delta(
500
source: str | Path,
501
*,
502
version: int | str | datetime | None = None,
503
columns: list[str] | None = None,
504
storage_options: dict[str, str] | None = None,
505
delta_table_options: dict[str, Any] | None = None,
506
pyarrow_options: dict[str, Any] | None = None
507
) -> DataFrame:
508
"""
509
Read Delta Lake table into DataFrame.
510
511
Parameters:
512
- source: Delta table path
513
- version: Table version to read
514
- columns: Columns to select
515
- storage_options: Cloud storage configuration
516
517
Returns:
518
- DataFrame: Delta table data
519
"""
520
521
def scan_delta(
522
source: str | Path,
523
*,
524
version: int | str | datetime | None = None,
525
storage_options: dict[str, str] | None = None,
526
delta_table_options: dict[str, Any] | None = None,
527
pyarrow_options: dict[str, Any] | None = None
528
) -> LazyFrame:
529
"""
530
Scan Delta Lake table for lazy processing.
531
532
Returns:
533
- LazyFrame: Lazy representation of Delta table
534
"""
535
```

### Cloud Storage Support

Integration with cloud storage providers and object stores.

```python { .api }
542
# Cloud credential providers
543
class CredentialProvider:
544
"""Base class for cloud credential providers"""
545
546
class CredentialProviderAWS:
547
def __init__(
548
self,
549
*,
550
access_key_id: str | None = None,
551
secret_access_key: str | None = None,
552
session_token: str | None = None,
553
region: str | None = None,
554
profile: str | None = None
555
):
556
"""
557
AWS credential provider.
558
559
Parameters:
560
- access_key_id: AWS access key
561
- secret_access_key: AWS secret key
562
- session_token: AWS session token
563
- region: AWS region
564
- profile: AWS CLI profile name
565
"""
566
567
class CredentialProviderAzure:
568
def __init__(
569
self,
570
*,
571
account_name: str | None = None,
572
account_key: str | None = None,
573
sas_token: str | None = None,
574
tenant_id: str | None = None,
575
client_id: str | None = None,
576
client_secret: str | None = None
577
):
578
"""
579
Azure credential provider.
580
581
Parameters:
- account_name: Storage account name
- account_key: Storage account key
- sas_token: Shared access signature token
- tenant_id: Microsoft Entra ID tenant ID (for service principal authentication)
- client_id: Service principal client ID
- client_secret: Service principal client secret
585
"""
586
587
class CredentialProviderGCP:
588
def __init__(
589
self,
590
*,
591
service_account_path: str | None = None,
592
service_account_key: str | None = None,
593
project_id: str | None = None
594
):
595
"""
596
Google Cloud Platform credential provider.
597
598
Parameters:
599
- service_account_path: Path to service account JSON file
600
- service_account_key: Service account key JSON string
601
- project_id: GCP project ID
602
"""
603
604
class CredentialProviderFunction:
605
def __init__(self, func: Callable[[], dict[str, str]]):
606
"""
607
Function-based credential provider.
608
609
Parameters:
610
- func: Function returning credential dictionary
611
"""
612
613
# Cloud scanning
614
def scan_iceberg(
615
source: str,
616
*,
617
mode: str = "convert",
618
pyarrow_options: dict[str, Any] | None = None
619
) -> LazyFrame:
620
"""
621
Scan Apache Iceberg table for lazy processing.
622
623
Parameters:
624
- source: Iceberg table path or catalog reference
625
- mode: Scanning mode ('convert' or 'arrow')
626
627
Returns:
628
- LazyFrame: Lazy representation of Iceberg table
629
"""
630
631
def scan_pyarrow_dataset(
632
source: str | Path,
633
*,
634
schema: dict[str, type] | None = None,
635
allow_pyarrow_filter: bool = True,
636
cache: bool = True
637
) -> LazyFrame:
638
"""
639
Scan PyArrow dataset for lazy processing.
640
641
Parameters:
642
- source: Dataset path
643
- schema: Expected schema
644
- allow_pyarrow_filter: Enable PyArrow predicate pushdown
645
646
Returns:
647
- LazyFrame: Lazy representation of PyArrow dataset
648
"""
649
```

### Scan Configuration

Advanced configuration options for scanning operations.

```python { .api }
656
class ScanCastOptions:
657
def __init__(
658
self,
659
*,
660
cast_time_unit: str | None = None,
661
cast_string_strict: bool = True
662
):
663
"""
664
Options for type casting during scanning.
665
666
Parameters:
667
- cast_time_unit: Time unit for temporal casts
668
- cast_string_strict: Strict string casting
669
"""
670
671
# Partitioning classes
672
class PartitionByKey:
673
def __init__(self, by: str | list[str]):
674
"""Partition by column values."""
675
676
class PartitionMaxSize:
677
def __init__(self, max_size: int):
678
"""Partition by maximum size."""
679
680
class PartitionParted:
681
def __init__(self, n_partitions: int):
682
"""Partition into fixed number of parts."""
683
684
# Context classes for advanced partitioning
685
class BasePartitionContext:
686
"""Base partition context"""
687
688
class KeyedPartitionContext(BasePartitionContext):
689
def __init__(self, key: Any): ...
690
691
class KeyedPartition:
692
def __init__(self, key: Any, partition: DataFrame): ...
693
```

### Deferred I/O

Utilities for deferred I/O operations.

```python { .api }
700
def defer() -> Expr:
701
"""
702
Create deferred I/O expression for use in scan operations.
703
704
Returns:
705
- Expr: Deferred expression
706
"""
707
```

## Usage Examples

### CSV Operations

```python
714
import polars as pl
715
716
# Basic CSV reading
717
df = pl.read_csv("data.csv")
718
719
# CSV with custom options
720
df = pl.read_csv(
721
"data.csv",
722
separator=";",
723
has_header=True,
724
dtypes={"id": pl.Int32, "date": pl.Date},
725
null_values=["", "NULL", "N/A"]
726
)
727
728
# Lazy CSV scanning for large files
729
lazy_df = pl.scan_csv("large_file.csv").filter(pl.col("date") >= "2023-01-01")
730
result = lazy_df.collect()
731
732
# Batched reading for memory efficiency
733
reader = pl.read_csv_batched("huge_file.csv", batch_size=10000)
734
for batch in reader:
735
process_batch(batch)
736
```

### Parquet Operations

```python
741
# Read Parquet file
742
df = pl.read_parquet("data.parquet")
743
744
# Parquet with column selection
745
df = pl.read_parquet("data.parquet", columns=["id", "name", "value"])
746
747
# Lazy Parquet scanning with predicate pushdown
748
lazy_df = (
749
pl.scan_parquet("partitioned/*.parquet")
750
.filter(pl.col("year") == 2023)
751
.select(["id", "amount"])
752
)
753
result = lazy_df.collect()
754
755
# Read Parquet metadata
756
schema = pl.read_parquet_schema("data.parquet")
757
metadata = pl.read_parquet_metadata("data.parquet")
758
```

### Database Operations

```python
763
# Read from database
764
df = pl.read_database(
765
"SELECT * FROM customers WHERE active = true",
766
"postgresql://user:pass@localhost/db"
767
)
768
769
# Partitioned database reading
770
df = pl.read_database(
771
"SELECT * FROM large_table",
772
connection,
773
partition_on="id",
774
partition_num=4
775
)
776
777
# Using different protocols
778
df = pl.read_database(
779
"SELECT * FROM table",
780
connection,
781
protocol="adbc" # or "connectorx"
782
)
783
```

### Cloud Storage

```python
788
# AWS S3
789
aws_creds = pl.CredentialProviderAWS(
790
access_key_id="ACCESS_KEY",
791
secret_access_key="SECRET_KEY",
792
region="us-east-1"
793
)
794
795
df = pl.read_parquet(
796
"s3://bucket/data.parquet",
797
credential_provider=aws_creds
798
)
799
800
# Azure Blob Storage
801
azure_creds = pl.CredentialProviderAzure(
802
account_name="account",
803
account_key="key"
804
)
805
806
df = pl.read_csv(
807
"az://container/data.csv",
808
credential_provider=azure_creds
809
)
810
811
# Google Cloud Storage
812
gcp_creds = pl.CredentialProviderGCP(
813
service_account_path="service-account.json"
814
)
815
816
df = pl.scan_parquet(
817
"gs://bucket/partitioned/*.parquet",
818
credential_provider=gcp_creds
819
)
820
```

### Excel and Spreadsheets

```python
825
# Read Excel file
826
df = pl.read_excel("data.xlsx", sheet_name="Sheet1")
827
828
# Excel with specific engine
829
df = pl.read_excel(
830
"data.xlsx",
831
engine="openpyxl",
832
schema_overrides={"date": pl.Date}
833
)
834
835
# OpenDocument Spreadsheet
836
df = pl.read_ods("data.ods", sheet_id=0)
837
```

### JSON Operations

```python
842
# Read JSON
843
df = pl.read_json("data.json")
844
845
# Read NDJSON (newline-delimited JSON)
846
df = pl.read_ndjson("logs.jsonl")
847
848
# Lazy NDJSON scanning
849
lazy_df = pl.scan_ndjson("large_logs.jsonl").filter(
850
pl.col("timestamp") >= "2023-01-01"
851
)
852
```

### Delta Lake

```python
857
# Read Delta table
858
df = pl.read_delta("path/to/delta/table")
859
860
# Read specific version
861
df = pl.read_delta("delta/table", version=5)
862
863
# Lazy scanning with time travel
864
lazy_df = pl.scan_delta("delta/table", version="2023-01-01T00:00:00Z")
865
```
866

### Advanced Scanning

870
# Scan with custom options
871
cast_options = pl.ScanCastOptions(
872
cast_time_unit="us",
873
cast_string_strict=False
874
)
875
876
lazy_df = pl.scan_csv(
877
"data.csv",
878
cast_options=cast_options
879
)
880
881
# Iceberg table scanning
882
lazy_df = pl.scan_iceberg("catalog.database.table")
883
884
# PyArrow dataset scanning
885
lazy_df = pl.scan_pyarrow_dataset("partitioned/dataset/")
886
```