# I/O Operations

Comprehensive I/O capabilities supporting 10+ file formats with both eager reading and lazy scanning for performance optimization. Polars provides efficient data ingestion and export across various formats with advanced features like predicate pushdown and schema inference.

## Capabilities

### CSV Operations

Reading and scanning CSV files with extensive configuration options.
```python { .api }
def read_csv(
    source: str | Path | IO[str] | IO[bytes] | bytes,
    *,
    has_header: bool = True,
    columns: list[int] | list[str] | None = None,
    new_columns: list[str] | None = None,
    dtypes: dict[int | str, DataType] | Sequence[DataType] | None = None,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    sample_size: int = 1024,
    eol_char: str = "\n",
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    try_parse_dates: bool = False,
    n_threads: int | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    batch_size: int | None = None,
    n_rows: int | None = None,
    encoding: CsvEncoding = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    skip_blank_lines: bool = True,
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
    glob: bool = True
) -> DataFrame:
    """
    Read CSV file into DataFrame.

    Parameters:
    - source: File path, URL, or file-like object
    - has_header: First row contains column names
    - columns: Columns to select by index or name
    - new_columns: Override column names
    - dtypes: Column data types
    - separator: Field delimiter
    - comment_prefix: Comment line prefix to skip
    - quote_char: Quote character for strings
    - skip_rows: Number of rows to skip at start
    - skip_rows_after_header: Rows to skip after header
    - row_index_name: Add row index column with this name
    - row_index_offset: Start value for row index
    - sample_size: Rows to sample for type inference
    - eol_char: End-of-line character
    - null_values: Values to interpret as null
    - missing_utf8_is_empty_string: Treat invalid UTF-8 as empty
    - ignore_errors: Continue on parse errors
    - try_parse_dates: Attempt date parsing
    - n_threads: Number of threads for parsing
    - infer_schema_length: Rows to scan for schema inference
    - batch_size: Batch size for processing
    - n_rows: Maximum rows to read
    - encoding: Text encoding
    - low_memory: Use less memory (slower)
    - rechunk: Rechunk to single chunk
    - skip_blank_lines: Skip empty lines
    - raise_if_empty: Raise error if no data
    - truncate_ragged_lines: Handle inconsistent columns
    - decimal_comma: Use comma as decimal separator
    - glob: Use glob patterns for multiple files

    Returns:
    DataFrame with CSV data
    """

def scan_csv(
    source: str | Path | list[str] | list[Path],
    **kwargs
) -> LazyFrame:
    """
    Lazy scan CSV file(s) for optimized processing.

    Parameters:
    Similar to read_csv but returns LazyFrame for deferred execution

    Returns:
    LazyFrame for lazy evaluation
    """
```
97
98
### Parquet Operations
99
100
High-performance columnar format operations with advanced features.
101
102
```python { .api }
103
def read_parquet(
104
source: str | Path | IO[bytes] | bytes,
105
*,
106
columns: list[int] | list[str] | None = None,
107
n_rows: int | None = None,
108
row_index_name: str | None = None,
109
row_index_offset: int = 0,
110
parallel: ParallelStrategy = "auto",
111
use_statistics: bool = True,
112
hive_partitioning: bool | None = None,
113
glob: bool = True,
114
rechunk: bool = False,
115
low_memory: bool = False,
116
storage_options: dict[str, Any] | None = None,
117
credential_provider: CredentialProvider | None = None,
118
retries: int = 2,
119
file_cache_ttl: int | None = None
120
) -> DataFrame:
121
"""
122
Read Parquet file into DataFrame.
123
124
Parameters:
125
- source: File path, URL, or bytes
126
- columns: Columns to select
127
- n_rows: Maximum rows to read
128
- row_index_name: Add row index column
129
- row_index_offset: Row index start value
130
- parallel: Parallelization strategy
131
- use_statistics: Use Parquet statistics for optimization
132
- hive_partitioning: Enable Hive-style partitioning
133
- glob: Use glob patterns
134
- rechunk: Rechunk to single chunk
135
- low_memory: Use less memory
136
- storage_options: Cloud storage options
137
- credential_provider: Cloud credentials
138
- retries: Number of retry attempts
139
- file_cache_ttl: File cache time-to-live
140
141
Returns:
142
DataFrame with Parquet data
143
"""
144
145
def scan_parquet(
146
source: str | Path | list[str] | list[Path],
147
**kwargs
148
) -> LazyFrame:
149
"""Lazy scan Parquet file(s)."""
150
151
def read_parquet_metadata(source: str | Path | IO[bytes] | bytes) -> dict[str, Any]:
152
"""Read Parquet file metadata."""
153
154
def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> Schema:
155
"""Read Parquet file schema."""
156
```
157
158
### JSON Operations
159
160
JSON and newline-delimited JSON file operations.
161
162
```python { .api }
163
def read_json(
164
source: str | Path | IO[str] | IO[bytes] | bytes,
165
*,
166
schema: dict[str, DataType] | None = None,
167
schema_overrides: dict[str, DataType] | None = None,
168
infer_schema_length: int | None = N_INFER_DEFAULT
169
) -> DataFrame:
170
"""
171
Read JSON file into DataFrame.
172
173
Parameters:
174
- source: JSON file path or data
175
- schema: Expected schema
176
- schema_overrides: Override inferred types
177
- infer_schema_length: Rows for schema inference
178
179
Returns:
180
DataFrame with JSON data
181
"""
182
183
def read_ndjson(
184
source: str | Path | IO[str] | IO[bytes] | bytes,
185
*,
186
schema: dict[str, DataType] | None = None,
187
schema_overrides: dict[str, DataType] | None = None,
188
batch_size: int | None = None,
189
n_rows: int | None = None,
190
low_memory: bool = False,
191
rechunk: bool = False,
192
row_index_name: str | None = None,
193
row_index_offset: int = 0,
194
ignore_errors: bool = False
195
) -> DataFrame:
196
"""
197
Read newline-delimited JSON file.
198
199
Parameters:
200
- source: NDJSON file path or data
201
- schema: Expected schema
202
- schema_overrides: Override inferred types
203
- batch_size: Processing batch size
204
- n_rows: Maximum rows to read
205
- low_memory: Use less memory
206
- rechunk: Rechunk to single chunk
207
- row_index_name: Add row index column
208
- row_index_offset: Row index start value
209
- ignore_errors: Continue on parse errors
210
211
Returns:
212
DataFrame with NDJSON data
213
"""
214
215
def scan_ndjson(
216
source: str | Path | list[str] | list[Path],
217
**kwargs
218
) -> LazyFrame:
219
"""Lazy scan NDJSON file(s)."""
220
```
221
222
### Database Operations
223
224
Reading data from various databases using connection strings or objects.
225
226
```python { .api }
227
def read_database(
228
query: str,
229
connection: str | ConnectionOrCursor,
230
*,
231
partition_on: str | None = None,
232
partition_range: tuple[int, int] | None = None,
233
partition_num: int | None = None,
234
protocol: str | None = None,
235
engine: DbReadEngine | None = None,
236
schema_overrides: dict[str, DataType] | None = None,
237
execute_options: dict[str, Any] | None = None
238
) -> DataFrame:
239
"""
240
Read database query results into DataFrame.
241
242
Parameters:
243
- query: SQL query string
244
- connection: Database connection string or object
245
- partition_on: Column for partitioned reading
246
- partition_range: Range for partitioned reading
247
- partition_num: Number of partitions
248
- protocol: Database protocol
249
- engine: Database engine to use
250
- schema_overrides: Override inferred types
251
- execute_options: Additional execution options
252
253
Returns:
254
DataFrame with query results
255
"""
256
257
def read_database_uri(
258
query: str,
259
uri: str,
260
*,
261
partition_on: str | None = None,
262
partition_range: tuple[int, int] | None = None,
263
partition_num: int | None = None,
264
protocol: str | None = None,
265
engine: DbReadEngine | None = None,
266
schema_overrides: dict[str, DataType] | None = None
267
) -> DataFrame:
268
"""
269
Read from database using URI connection string.
270
271
Parameters:
272
- query: SQL query string
273
- uri: Database URI
274
- Other parameters: Same as read_database
275
276
Returns:
277
DataFrame with query results
278
"""
279
```
280
281
### IPC/Arrow Operations
282
283
Apache Arrow IPC format operations for efficient cross-language data exchange.
284
285
```python { .api }
286
def read_ipc(
287
source: str | Path | IO[bytes] | bytes,
288
*,
289
columns: list[int] | list[str] | None = None,
290
n_rows: int | None = None,
291
row_index_name: str | None = None,
292
row_index_offset: int = 0,
293
rechunk: bool = False,
294
memory_map: bool = True,
295
storage_options: dict[str, Any] | None = None,
296
credential_provider: CredentialProvider | None = None
297
) -> DataFrame:
298
"""
299
Read IPC/Arrow file into DataFrame.
300
301
Parameters:
302
- source: IPC file path or bytes
303
- columns: Columns to select
304
- n_rows: Maximum rows to read
305
- row_index_name: Add row index column
306
- row_index_offset: Row index start value
307
- rechunk: Rechunk to single chunk
308
- memory_map: Use memory mapping
309
- storage_options: Cloud storage options
310
- credential_provider: Cloud credentials
311
312
Returns:
313
DataFrame with IPC data
314
"""
315
316
def read_ipc_stream(
317
source: str | Path | IO[bytes] | bytes,
318
**kwargs
319
) -> DataFrame:
320
"""Read IPC stream format."""
321
322
def scan_ipc(
323
source: str | Path | list[str] | list[Path],
324
**kwargs
325
) -> LazyFrame:
326
"""Lazy scan IPC file(s)."""
327
328
def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> Schema:
329
"""Read IPC file schema."""
330
```
331
332
### Excel Operations
333
334
Reading Excel and OpenDocument spreadsheet files.
335
336
```python { .api }
337
def read_excel(
338
source: str | Path | IO[bytes] | bytes,
339
*,
340
sheet_id: int | Sequence[int] | None = None,
341
sheet_name: str | list[str] | None = None,
342
engine: ExcelSpreadsheetEngine | None = None,
343
engine_options: dict[str, Any] | None = None,
344
read_options: dict[str, Any] | None = None,
345
schema_overrides: dict[str, DataType] | None = None,
346
infer_schema_length: int | None = N_INFER_DEFAULT,
347
raise_if_empty: bool = True
348
) -> DataFrame | dict[str, DataFrame]:
349
"""
350
Read Excel file into DataFrame.
351
352
Parameters:
353
- source: Excel file path or bytes
354
- sheet_id: Sheet index(es) to read
355
- sheet_name: Sheet name(s) to read
356
- engine: Excel engine to use
357
- engine_options: Engine-specific options
358
- read_options: Reading options
359
- schema_overrides: Override inferred types
360
- infer_schema_length: Rows for schema inference
361
- raise_if_empty: Raise error if no data
362
363
Returns:
364
DataFrame or dict of DataFrames (if multiple sheets)
365
"""
366
367
def read_ods(
368
source: str | Path | IO[bytes] | bytes,
369
**kwargs
370
) -> DataFrame | dict[str, DataFrame]:
371
"""Read OpenDocument Spreadsheet file."""
372
```
373
374
### Cloud and Advanced I/O
375
376
Cloud storage integration and advanced I/O features.
377
378
```python { .api }
379
def read_avro(
380
source: str | Path | IO[bytes] | bytes,
381
*,
382
columns: list[int] | list[str] | None = None,
383
n_rows: int | None = None
384
) -> DataFrame:
385
"""Read Apache Avro file."""
386
387
def read_clipboard(**kwargs) -> DataFrame:
388
"""Read data from system clipboard."""
389
390
def scan_iceberg(
391
source: str,
392
**kwargs
393
) -> LazyFrame:
394
"""Lazy scan Apache Iceberg table."""
395
396
def scan_delta(
397
source: str,
398
*,
399
version: int | str | None = None,
400
storage_options: dict[str, str] | None = None,
401
delta_table_options: dict[str, Any] | None = None
402
) -> LazyFrame:
403
"""
404
Lazy scan Delta Lake table.
405
406
Parameters:
407
- source: Delta table path
408
- version: Table version to read
409
- storage_options: Cloud storage options
410
- delta_table_options: Delta table options
411
412
Returns:
413
LazyFrame for Delta table
414
"""
415
416
def read_delta(
417
source: str,
418
**kwargs
419
) -> DataFrame:
420
"""Read Delta Lake table."""
421
422
def scan_pyarrow_dataset(
423
source: str | Path,
424
**kwargs
425
) -> LazyFrame:
426
"""Lazy scan PyArrow dataset."""
427
```
428
429
### Partitioning and Scan Options
430
431
Advanced partitioning strategies and scan configuration.
432
433
```python { .api }
434
class ScanCastOptions:
435
"""Options for casting during scan operations."""
436
def __init__(
437
self,
438
*,
439
enabled: bool = True,
440
dtypes: dict[str, DataType] | None = None,
441
strict: bool = True
442
):
443
"""
444
Configure scan casting.
445
446
Parameters:
447
- enabled: Enable automatic casting
448
- dtypes: Target data types
449
- strict: Strict casting mode
450
"""
451
452
class BasePartitionContext:
453
"""Base class for partition contexts."""
454
455
class KeyedPartitionContext(BasePartitionContext):
456
"""Partition context with key-based partitioning."""
457
458
class KeyedPartition:
459
"""Partition information for keyed partitioning."""
460
def __init__(self, key: Any, df: DataFrame):
461
"""
462
Create keyed partition.
463
464
Parameters:
465
- key: Partition key
466
- df: Partition DataFrame
467
"""
468
469
class PartitionByKey:
470
"""Partition strategy based on column values."""
471
def __init__(self, by: str | list[str]):
472
"""
473
Partition by column key(s).
474
475
Parameters:
476
- by: Column name(s) for partitioning
477
"""
478
479
class PartitionMaxSize:
480
"""Partition strategy based on maximum size."""
481
def __init__(self, max_size: int):
482
"""
483
Partition by maximum size.
484
485
Parameters:
486
- max_size: Maximum partition size
487
"""
488
489
class PartitionParted:
490
"""Information about partitioned data."""
491
```
492
493
### Cloud Credential Providers
494
495
Authentication for cloud storage access.
496
497
```python { .api }
498
class CredentialProvider:
499
"""Base credential provider."""
500
501
class CredentialProviderAWS(CredentialProvider):
502
"""AWS credential provider."""
503
def __init__(
504
self,
505
*,
506
access_key_id: str | None = None,
507
secret_access_key: str | None = None,
508
session_token: str | None = None,
509
region: str | None = None,
510
profile: str | None = None
511
):
512
"""
513
AWS credentials.
514
515
Parameters:
516
- access_key_id: AWS access key
517
- secret_access_key: AWS secret key
518
- session_token: AWS session token
519
- region: AWS region
520
- profile: AWS profile name
521
"""
522
523
class CredentialProviderAzure(CredentialProvider):
524
"""Azure credential provider."""
525
526
class CredentialProviderGCP(CredentialProvider):
527
"""Google Cloud credential provider."""
528
529
class CredentialProviderFunction(CredentialProvider):
530
"""Function-based credential provider."""
531
def __init__(self, func: Callable[[], CredentialProviderFunctionReturn]):
532
"""
533
Function-based credentials.
534
535
Parameters:
536
- func: Function returning credentials
537
"""
538
539
class CredentialProviderFunctionReturn:
540
"""Return type for credential function."""
541
```
542
543
## Usage Examples
544
545
### Basic File Reading
546
547
```python
548
import polars as pl
549
550
# Read CSV with automatic type inference
551
df = pl.read_csv("data.csv")
552
553
# Read with specific options
554
df = pl.read_csv(
555
"data.csv",
556
separator=";",
557
null_values=["", "NULL", "N/A"],
558
try_parse_dates=True,
559
infer_schema_length=1000
560
)
561
562
# Read specific columns
563
df = pl.read_csv("data.csv", columns=["name", "age", "salary"])
564
```
565
566
### Lazy Scanning for Large Files
567
568
```python
569
# Lazy scan for memory efficiency
570
lazy_df = (pl
571
.scan_csv("large_file.csv")
572
.filter(pl.col("amount") > 1000)
573
.select(["customer_id", "amount", "date"])
574
.group_by("customer_id")
575
.agg([
576
pl.col("amount").sum(),
577
pl.col("date").max()
578
])
579
)
580
581
# Execute when ready
582
result = lazy_df.collect()
583
```
584
585
### Working with Multiple Files
586
587
```python
588
# Read multiple CSV files at once
589
df = pl.read_csv("data_*.csv", glob=True)
590
591
# Scan multiple Parquet files
592
lazy_df = pl.scan_parquet(["file1.parquet", "file2.parquet", "file3.parquet"])
593
```
594
595
### Database Integration
596
597
```python
598
# Read from database
599
df = pl.read_database(
600
"SELECT * FROM customers WHERE age > 25",
601
"postgresql://user:pass@localhost:5432/db"
602
)
603
604
# Partitioned database reading for large tables
605
df = pl.read_database(
606
"SELECT * FROM large_table",
607
"postgresql://user:pass@localhost:5432/db",
608
partition_on="id",
609
partition_num=4
610
)
611
```
612
613
### Cloud Storage Access
614
615
```python
616
# Read from S3 with credentials
617
df = pl.read_parquet(
618
"s3://bucket/data.parquet",
619
credential_provider=pl.CredentialProviderAWS(
620
access_key_id="key",
621
secret_access_key="secret",
622
region="us-east-1"
623
)
624
)
625
626
# Read from Azure Blob Storage
627
df = pl.read_csv(
628
"az://container/data.csv",
629
credential_provider=pl.CredentialProviderAzure()
630
)
631
```
### Advanced Excel Reading

```python
# Read specific Excel sheet
df = pl.read_excel("report.xlsx", sheet_name="Summary")

# Read multiple sheets
sheets = pl.read_excel("report.xlsx", sheet_id=[0, 1, 2])
summary_df = sheets["Summary"]
details_df = sheets["Details"]

# Excel with custom options
df = pl.read_excel(
    "data.xlsx",
    engine="openpyxl",
    read_options={
        "has_header": True,
        "skip_rows": 2
    },
    schema_overrides={
        "date": pl.Date,
        "amount": pl.Decimal(10, 2)
    }
)
```

### Data Export

```python
# DataFrame write methods
df.write_csv("output.csv")
df.write_parquet("output.parquet")
df.write_json("output.json")
df.write_ipc("output.arrow")

# LazyFrame collect and write
lazy_df.collect().write_parquet("result.parquet")

# Write with options
df.write_csv(
    "output.csv",
    separator="|",
    quote_char="'",
    null_value="NULL"
)
```

### Schema Management

```python
# Define schema for consistent reading
schema = pl.Schema({
    "id": pl.Int32,
    "name": pl.String,
    "amount": pl.Decimal(10, 2),
    "timestamp": pl.Datetime("us", "UTC")
})

df = pl.read_csv("data.csv", schema=schema)

# Override specific column types
df = pl.read_csv(
    "data.csv",
    schema_overrides={
        "customer_id": pl.String,  # Keep as string
        "amount": pl.Decimal(12, 4)  # Higher precision
    }
)
```