# Input/Output Operations

Comprehensive I/O support for 15+ file formats including CSV, Parquet, JSON, Excel, databases, and cloud storage with both eager reading and lazy scanning capabilities for optimal performance and memory usage.

## Capabilities

### CSV Operations

Read and scan CSV files with extensive customization options for delimiters, headers, data types, and parsing behavior.

```python { .api }
def read_csv(
    source,
    *,
    has_header=True,
    columns=None,
    new_columns=None,
    dtypes=None,
    separator=",",
    comment_prefix=None,
    quote_char='"',
    skip_rows=0,
    skip_rows_after_header=0,
    row_index_name=None,
    row_index_offset=0,
    sample_size=1024,
    eol_char="\n",
    raise_if_empty=True,
    truncate_ragged_lines=False,
    decimal_comma=False,
    glob=True,
    rechunk=False,
    low_memory=False,
    use_pyarrow=False,
    storage_options=None,
    credential_provider=None,
    retries=2,
    file_cache_ttl=None,
    include_file_paths=None
) -> DataFrame:
    """
    Read CSV file(s) into DataFrame.

    Parameters:
    - source: File path, URL, or file-like object
    - has_header: Whether CSV has header row
    - columns: Column subset to read
    - new_columns: New column names
    - dtypes: Column data types
    - separator: Field delimiter
    - comment_prefix: Comment line prefix
    - quote_char: Quote character
    - skip_rows: Rows to skip at start
    - skip_rows_after_header: Rows to skip after header
    - row_index_name: Add row index column
    - row_index_offset: Row index starting value
    - sample_size: Rows to sample for type inference
    - eol_char: End-of-line character
    - raise_if_empty: Raise error if empty
    - truncate_ragged_lines: Handle ragged lines
    - decimal_comma: Use comma as decimal separator
    - glob: Enable glob patterns
    - rechunk: Rechunk to contiguous memory
    - low_memory: Use low memory mode
    - use_pyarrow: Use PyArrow parser
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - retries: Number of retries
    - file_cache_ttl: File cache TTL
    - include_file_paths: Include file path column

    Returns:
    DataFrame with CSV data
    """

def scan_csv(
    source,
    *,
    has_header=True,
    separator=",",
    comment_prefix=None,
    quote_char='"',
    skip_rows=0,
    dtypes=None,
    null_values=None,
    missing_utf8_is_empty_string=False,
    ignore_errors=False,
    cache=True,
    with_columns=None,
    include_file_paths=None,
    n_rows=None,
    encoding="utf8",
    low_memory=False,
    rechunk=False,
    skip_rows_after_header=0,
    row_index_name=None,
    row_index_offset=0,
    sample_size=1024,
    eol_char="\n",
    raise_if_empty=True,
    truncate_ragged_lines=False,
    decimal_comma=False,
    glob=True,
    storage_options=None,
    credential_provider=None,
    retries=2,
    file_cache_ttl=None
) -> LazyFrame:
    """
    Scan CSV file(s) lazily into LazyFrame.

    Parameters: Similar to read_csv

    Returns:
    LazyFrame for deferred CSV reading
    """
```
### Parquet Operations

Read and scan Parquet files with column selection, predicate pushdown, and parallel processing.

```python { .api }
def read_parquet(
    source,
    *,
    columns=None,
    n_rows=None,
    parallel="auto",
    row_index_name=None,
    row_index_offset=0,
    low_memory=False,
    use_pyarrow=False,
    storage_options=None,
    credential_provider=None,
    retries=2,
    rechunk=False,
    hive_partitioning=None,
    hive_schema=None,
    try_parse_hive_dates=True,
    include_file_paths=None,
    allow_missing_columns=False
) -> DataFrame:
    """
    Read Parquet file(s) into DataFrame.

    Parameters:
    - source: File path, URL, or file-like object
    - columns: Column subset to read
    - n_rows: Number of rows to read
    - parallel: Parallel reading mode
    - row_index_name: Add row index column
    - row_index_offset: Row index starting value
    - low_memory: Use low memory mode
    - use_pyarrow: Use PyArrow reader
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - retries: Number of retries
    - rechunk: Rechunk to contiguous memory
    - hive_partitioning: Enable Hive partitioning
    - hive_schema: Hive partition schema
    - try_parse_hive_dates: Parse Hive date partitions
    - include_file_paths: Include file path column
    - allow_missing_columns: Allow missing columns

    Returns:
    DataFrame with Parquet data
    """

def scan_parquet(
    source,
    *,
    n_rows=None,
    row_index_name=None,
    row_index_offset=0,
    parallel="auto",
    glob=True,
    rechunk=False,
    low_memory=False,
    cache=True,
    storage_options=None,
    credential_provider=None,
    retries=2,
    hive_partitioning=None,
    hive_schema=None,
    try_parse_hive_dates=True,
    include_file_paths=None,
    allow_missing_columns=False
) -> LazyFrame:
    """
    Scan Parquet file(s) lazily into LazyFrame.

    Parameters: Similar to read_parquet

    Returns:
    LazyFrame for deferred Parquet reading
    """

def read_parquet_schema(source) -> Schema:
    """
    Read schema from Parquet file without loading data.

    Parameters:
    - source: Parquet file path or URL

    Returns:
    Schema of Parquet file
    """

def read_parquet_metadata(source):
    """
    Read metadata from Parquet file.

    Parameters:
    - source: Parquet file path or URL

    Returns:
    Parquet file metadata
    """
```
### JSON Operations

Read JSON and newline-delimited JSON files with flexible schema inference and nested data handling.

```python { .api }
def read_json(
    source,
    *,
    schema=None,
    schema_overrides=None,
    infer_schema_length=None,
    batch_size=None,
    n_rows=None,
    row_index_name=None,
    row_index_offset=0,
    storage_options=None,
    credential_provider=None,
    retries=2
) -> DataFrame:
    """
    Read JSON file into DataFrame.

    Parameters:
    - source: File path, URL, or file-like object
    - schema: Column schema
    - schema_overrides: Override specific column types
    - infer_schema_length: Rows to scan for schema inference
    - batch_size: Processing batch size
    - n_rows: Number of rows to read
    - row_index_name: Add row index column
    - row_index_offset: Row index starting value
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - retries: Number of retries

    Returns:
    DataFrame with JSON data
    """

def read_ndjson(
    source,
    *,
    schema=None,
    schema_overrides=None,
    infer_schema_length=None,
    batch_size=None,
    n_rows=None,
    row_index_name=None,
    row_index_offset=0,
    ignore_errors=False,
    storage_options=None,
    credential_provider=None,
    retries=2
) -> DataFrame:
    """
    Read newline-delimited JSON file into DataFrame.

    Parameters: Similar to read_json with additional:
    - ignore_errors: Continue on parsing errors

    Returns:
    DataFrame with NDJSON data
    """

def scan_ndjson(
    source,
    *,
    infer_schema_length=100,
    batch_size=1024,
    n_rows=None,
    low_memory=False,
    rechunk=False,
    row_index_name=None,
    row_index_offset=0,
    ignore_errors=False,
    schema=None,
    schema_overrides=None,
    include_file_paths=None,
    retries=2
) -> LazyFrame:
    """
    Scan NDJSON file(s) lazily into LazyFrame.

    Parameters: Similar to read_ndjson

    Returns:
    LazyFrame for deferred NDJSON reading
    """
```
### Excel and ODS Operations

Read Excel and OpenDocument spreadsheet files with sheet selection and range specification.

```python { .api }
def read_excel(
    source,
    *,
    sheet_id=None,
    sheet_name=None,
    engine=None,
    engine_options=None,
    read_options=None,
    schema_overrides=None,
    infer_schema_length=1000,
    raise_if_empty=True
) -> DataFrame:
    """
    Read Excel file into DataFrame.

    Parameters:
    - source: Excel file path or file-like object
    - sheet_id: Sheet index (0-based)
    - sheet_name: Sheet name
    - engine: Excel engine ("xlsx2csv", "openpyxl", "python")
    - engine_options: Engine-specific options
    - read_options: Additional read options
    - schema_overrides: Override column types
    - infer_schema_length: Rows for schema inference
    - raise_if_empty: Raise error if empty

    Returns:
    DataFrame with Excel data
    """

def read_ods(
    source,
    *,
    sheet_id=None,
    sheet_name=None,
    schema_overrides=None,
    infer_schema_length=1000,
    raise_if_empty=True
) -> DataFrame:
    """
    Read OpenDocument Spreadsheet into DataFrame.

    Parameters:
    - source: ODS file path or file-like object
    - sheet_id: Sheet index (0-based)
    - sheet_name: Sheet name
    - schema_overrides: Override column types
    - infer_schema_length: Rows for schema inference
    - raise_if_empty: Raise error if empty

    Returns:
    DataFrame with ODS data
    """
```
### Arrow IPC Operations

Read and scan Arrow IPC files and streams for efficient data exchange.

```python { .api }
def read_ipc(
    source,
    *,
    columns=None,
    n_rows=None,
    row_index_name=None,
    row_index_offset=0,
    rechunk=False,
    memory_map=True,
    storage_options=None,
    credential_provider=None,
    retries=2,
    include_file_paths=None
) -> DataFrame:
    """
    Read Arrow IPC file into DataFrame.

    Parameters:
    - source: IPC file path, URL, or file-like object
    - columns: Column subset to read
    - n_rows: Number of rows to read
    - row_index_name: Add row index column
    - row_index_offset: Row index starting value
    - rechunk: Rechunk to contiguous memory
    - memory_map: Use memory mapping
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - retries: Number of retries
    - include_file_paths: Include file path column

    Returns:
    DataFrame with IPC data
    """

def read_ipc_stream(
    source,
    *,
    columns=None,
    n_rows=None,
    row_index_name=None,
    row_index_offset=0,
    rechunk=False,
    storage_options=None,
    credential_provider=None,
    retries=2,
    include_file_paths=None
) -> DataFrame:
    """
    Read Arrow IPC stream into DataFrame.

    Parameters: Similar to read_ipc

    Returns:
    DataFrame with IPC stream data
    """

def scan_ipc(
    source,
    *,
    n_rows=None,
    cache=True,
    rechunk=False,
    row_index_name=None,
    row_index_offset=0,
    storage_options=None,
    credential_provider=None,
    retries=2,
    include_file_paths=None,
    memory_map=True
) -> LazyFrame:
    """
    Scan Arrow IPC file(s) lazily into LazyFrame.

    Parameters: Similar to read_ipc

    Returns:
    LazyFrame for deferred IPC reading
    """

def read_ipc_schema(source) -> Schema:
    """
    Read schema from Arrow IPC file without loading data.

    Parameters:
    - source: IPC file path or URL

    Returns:
    Schema of IPC file
    """
```
### Database Operations

Read data from databases using SQL queries with connection management and credential handling.

```python { .api }
def read_database(
    query,
    connection,
    *,
    partition_on=None,
    partition_range=None,
    partition_num=None,
    protocol=None,
    engine=None,
    schema_overrides=None,
    execute_options=None
) -> DataFrame:
    """
    Read from database using SQL query.

    Parameters:
    - query: SQL query string
    - connection: Database connection or connection string
    - partition_on: Column for partitioning
    - partition_range: Range for partitioning
    - partition_num: Number of partitions
    - protocol: Database protocol
    - engine: Database engine
    - schema_overrides: Override column types
    - execute_options: Execution options

    Returns:
    DataFrame with query results
    """

def read_database_uri(
    query,
    uri,
    *,
    partition_on=None,
    partition_range=None,
    partition_num=None,
    protocol=None,
    engine=None,
    schema_overrides=None,
    execute_options=None
) -> DataFrame:
    """
    Read from database using connection URI.

    Parameters:
    - query: SQL query string
    - uri: Database connection URI
    - partition_on: Column for partitioning
    - partition_range: Range for partitioning
    - partition_num: Number of partitions
    - protocol: Database protocol
    - engine: Database engine
    - schema_overrides: Override column types
    - execute_options: Execution options

    Returns:
    DataFrame with query results
    """
```
### Cloud Storage and Credentials

Credential providers for accessing cloud storage services with authentication.

```python { .api }
class CredentialProvider:
    """Base class for credential providers."""

class CredentialProviderAWS(CredentialProvider):
    def __init__(
        self,
        *,
        access_key_id=None,
        secret_access_key=None,
        session_token=None,
        region_name=None,
        profile_name=None,
        assume_role_arn=None,
        assume_role_session_name=None,
        assume_role_external_id=None
    ):
        """
        AWS credential provider.

        Parameters:
        - access_key_id: AWS access key ID
        - secret_access_key: AWS secret access key
        - session_token: AWS session token
        - region_name: AWS region
        - profile_name: AWS profile name
        - assume_role_arn: Role ARN to assume
        - assume_role_session_name: Assume role session name
        - assume_role_external_id: External ID for assume role
        """

class CredentialProviderGCP(CredentialProvider):
    def __init__(
        self,
        *,
        service_account_path=None,
        service_account_key=None,
        project_id=None
    ):
        """
        Google Cloud credential provider.

        Parameters:
        - service_account_path: Path to service account key file
        - service_account_key: Service account key JSON
        - project_id: GCP project ID
        """

class CredentialProviderAzure(CredentialProvider):
    def __init__(
        self,
        *,
        account_name=None,
        account_key=None,
        sas_token=None,
        tenant_id=None,
        client_id=None,
        client_secret=None
    ):
        """
        Azure credential provider.

        Parameters:
        - account_name: Storage account name
        - account_key: Storage account key
        - sas_token: SAS token
        - tenant_id: Azure tenant ID
        - client_id: Azure client ID
        - client_secret: Azure client secret
        """

class CredentialProviderFunction(CredentialProvider):
    def __init__(self, function):
        """
        Function-based credential provider.

        Parameters:
        - function: Function returning credentials
        """

# Type alias for function return
CredentialProviderFunctionReturn = dict[str, str]
```
### Specialized Formats

Support for additional formats including Avro, Delta Lake, and Iceberg.

```python { .api }
def read_avro(
    source,
    *,
    columns=None,
    n_rows=None,
    storage_options=None,
    credential_provider=None,
    retries=2
) -> DataFrame:
    """
    Read Avro file into DataFrame.

    Parameters:
    - source: Avro file path or URL
    - columns: Column subset to read
    - n_rows: Number of rows to read
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - retries: Number of retries

    Returns:
    DataFrame with Avro data
    """

def read_delta(
    source,
    *,
    version=None,
    columns=None,
    storage_options=None,
    credential_provider=None,
    delta_table_options=None
) -> DataFrame:
    """
    Read Delta Lake table into DataFrame.

    Parameters:
    - source: Delta table path or URL
    - version: Table version to read
    - columns: Column subset to read
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials
    - delta_table_options: Delta-specific options

    Returns:
    DataFrame with Delta table data
    """

def scan_delta(
    source,
    *,
    version=None,
    storage_options=None,
    credential_provider=None,
    delta_table_options=None
) -> LazyFrame:
    """
    Scan Delta Lake table lazily into LazyFrame.

    Parameters: Similar to read_delta

    Returns:
    LazyFrame for deferred Delta reading
    """

def scan_iceberg(
    source,
    *,
    storage_options=None,
    credential_provider=None
) -> LazyFrame:
    """
    Scan Apache Iceberg table lazily into LazyFrame.

    Parameters:
    - source: Iceberg table path or URL
    - storage_options: Cloud storage options
    - credential_provider: Cloud credentials

    Returns:
    LazyFrame for deferred Iceberg reading
    """

def scan_pyarrow_dataset(
    source,
    *,
    allow_pyarrow_filter=True,
    pyarrow_options=None
) -> LazyFrame:
    """
    Scan PyArrow dataset lazily into LazyFrame.

    Parameters:
    - source: PyArrow dataset
    - allow_pyarrow_filter: Enable PyArrow filtering
    - pyarrow_options: PyArrow-specific options

    Returns:
    LazyFrame for deferred dataset reading
    """
```
### Utility Operations

Additional I/O utilities including clipboard support and deferred computation.

```python { .api }
def read_clipboard(**kwargs) -> DataFrame:
    """
    Read data from system clipboard.

    Parameters:
    - kwargs: Additional options (passed to pandas)

    Returns:
    DataFrame with clipboard data
    """

def defer() -> Expr:
    """
    Create deferred computation placeholder.

    Returns:
    Deferred expression
    """
```
## Usage Examples

### Basic File Operations

```python
import polars as pl

# Read CSV with custom options
df = pl.read_csv(
    "data.csv",
    has_header=True,
    separator=",",
    dtypes={"id": pl.Int32, "name": pl.String},
    null_values=["", "NULL"]
)

# Read Parquet with column selection
df = pl.read_parquet(
    "data.parquet",
    columns=["id", "name", "value"],
    n_rows=1000
)

# Read JSON with schema override
df = pl.read_json(
    "data.json",
    schema_overrides={"timestamp": pl.Datetime}
)
```

### Lazy I/O with Query Optimization

```python
# Lazy CSV scanning with predicate pushdown
lazy_df = (
    pl.scan_csv("large_data.csv")
    .filter(pl.col("amount") > 1000)
    .select(["customer_id", "amount", "date"])
    .group_by("customer_id")
    .agg([
        pl.col("amount").sum().alias("total_amount"),
        pl.col("date").max().alias("last_date")
    ])
)

# Execute optimized query
result = lazy_df.collect()

# Lazy Parquet scanning with column selection
result = (
    pl.scan_parquet("*.parquet")
    .select(["id", "value"])
    .filter(pl.col("value") > 0)
    .collect()
)
```

### Cloud Storage Access

```python
# AWS S3 access with credentials
aws_creds = pl.CredentialProviderAWS(
    access_key_id="YOUR_KEY",
    secret_access_key="YOUR_SECRET",
    region_name="us-east-1"
)

df = pl.read_parquet(
    "s3://my-bucket/data.parquet",
    credential_provider=aws_creds
)

# Google Cloud Storage
gcp_creds = pl.CredentialProviderGCP(
    service_account_path="path/to/service-account.json"
)

df = pl.read_csv(
    "gs://my-bucket/data.csv",
    credential_provider=gcp_creds
)
```

### Database Operations

```python
# Read from database
df = pl.read_database(
    "SELECT * FROM users WHERE age > 18",
    connection="postgresql://user:pass@localhost/db"
)

# Partitioned database reading
df = pl.read_database(
    "SELECT * FROM large_table WHERE date >= ? AND date < ?",
    "postgresql://user:pass@localhost/db",
    partition_on="id",
    partition_range=(1, 1000000),
    partition_num=10
)
```

### Multiple File Formats

```python
# Read Excel with specific sheet
df = pl.read_excel(
    "report.xlsx",
    sheet_name="Summary",
    schema_overrides={"date": pl.Date}
)

# Read Avro file
df = pl.read_avro("data.avro")

# Read Delta Lake table
df = pl.read_delta(
    "path/to/delta/table",
    version=5  # Read specific version
)

# Scan multiple Parquet files with glob
lazy_df = pl.scan_parquet("data/year=*/month=*/*.parquet")
result = lazy_df.collect()
```

### Advanced I/O Patterns

```python
# Streaming large files
for batch in pl.read_csv_batched("very_large.csv", batch_size=10000):
    # Process each batch
    processed = batch.with_columns([
        pl.col("amount").mul(1.1).alias("amount_with_tax")
    ])
    # Write or accumulate results

# Reading with error handling
df = pl.read_ndjson(
    "messy_data.jsonl",
    ignore_errors=True,  # Skip malformed lines
    infer_schema_length=1000
)

# Include file paths in multi-file reading
df = pl.read_csv(
    "data/*.csv",
    include_file_paths="source_file"
)
```