# I/O Operations

Extensive support for reading and writing data in various formats including CSV, Parquet, JSON, Arrow IPC, databases, Excel, and cloud storage, with streaming capabilities for efficient processing of large datasets.

## Capabilities

### CSV Operations

Read and write CSV files with extensive customization options for delimiters, encoding, and data types.

```python { .api }
11
def read_csv(
12
source: str | Path | list[str] | list[Path] | BinaryIO,
13
*,
14
has_header: bool = True,
15
columns: list[int] | list[str] | None = None,
16
new_columns: list[str] | None = None,
17
dtypes: dict[str, type] | Sequence[type] | None = None,
18
separator: str = ",",
19
comment_prefix: str | None = None,
20
quote_char: str | None = '"',
21
skip_rows: int = 0,
22
skip_rows_after_header: int = 0,
23
row_index_name: str | None = None,
24
row_index_offset: int = 0,
25
sample_size: int = 1024,
26
eol_char: str = "\n",
27
raise_if_empty: bool = True,
28
truncate_ragged_lines: bool = False,
29
rechunk: bool = False,
30
schema_overrides: dict[str, type] | None = None,
31
null_values: str | list[str] | dict[str, str] | None = None,
32
missing_utf8_is_empty_string: bool = False,
33
max_rows: int | None = None,
34
encoding: str = "utf8",
35
try_parse_dates: bool = False,
36
n_threads: int | None = None,
37
infer_schema_length: int | None = 100,
38
batch_size: int = 8192,
39
n_rows: int | None = None,
40
low_memory: bool = False,
41
rechunk_end: bool = True,
42
skip_blank_lines: bool = True,
43
ignore_errors: bool = False
44
) -> DataFrame:
45
"""
46
Read CSV file(s) into DataFrame.
47
48
Parameters:
49
- source: File path(s) or file-like object
50
- has_header: Whether first row contains headers
51
- columns: Columns to select by index or name
52
- dtypes: Data types for columns
53
- separator: Field separator character
54
- quote_char: Quote character for fields
55
- null_values: Values to interpret as null
56
- encoding: Text encoding
57
- n_threads: Number of threads for parallel processing
58
59
Returns:
60
- DataFrame: Parsed CSV data
61
"""
62
63
def read_csv_batched(
64
source: str | Path | BinaryIO,
65
*,
66
batch_size: int = 50000,
67
**kwargs
68
) -> BatchedCsvReader:
69
"""
70
Read CSV file in batches for memory-efficient processing.
71
72
Parameters:
73
- source: File path or file-like object
74
- batch_size: Number of rows per batch
75
- **kwargs: Same parameters as read_csv
76
77
Returns:
78
- BatchedCsvReader: Iterator yielding DataFrame batches
79
"""
80
81
def scan_csv(
82
source: str | Path | list[str] | list[Path],
83
*,
84
has_header: bool = True,
85
separator: str = ",",
86
comment_prefix: str | None = None,
87
quote_char: str | None = '"',
88
skip_rows: int = 0,
89
dtypes: dict[str, type] | None = None,
90
null_values: str | list[str] | dict[str, str] | None = None,
91
missing_utf8_is_empty_string: bool = False,
92
cache: bool = True,
93
with_column_names: Callable[[list[str]], list[str]] | None = None,
94
infer_schema_length: int | None = 100,
95
n_rows: int | None = None,
96
encoding: str = "utf8",
97
low_memory: bool = False,
98
rechunk: bool = False,
99
skip_rows_after_header: int = 0,
100
row_index_name: str | None = None,
101
row_index_offset: int = 0,
102
try_parse_dates: bool = False,
103
eol_char: str = "\n",
104
raise_if_empty: bool = True,
105
truncate_ragged_lines: bool = False,
106
schema: dict[str, type] | None = None,
107
ignore_errors: bool = False
108
) -> LazyFrame:
109
"""
110
Scan CSV file(s) for lazy processing.
111
112
Returns:
113
- LazyFrame: Lazy representation of CSV data
114
"""
115
```

### Parquet Operations

Read and write Apache Parquet files with compression and metadata options.

```python { .api }
122
def read_parquet(
123
source: str | Path | list[str] | list[Path] | BinaryIO,
124
*,
125
columns: list[int] | list[str] | None = None,
126
n_rows: int | None = None,
127
row_index_name: str | None = None,
128
row_index_offset: int = 0,
129
parallel: str = "auto",
130
use_statistics: bool = True,
131
hive_partitioning: bool | None = None,
132
hive_schema: dict[str, type] | None = None,
133
try_parse_hive_dates: bool = True,
134
glob: bool = True,
135
schema: dict[str, type] | None = None,
136
rechunk: bool = False,
137
low_memory: bool = False,
138
storage_options: dict[str, Any] | None = None,
139
credential_provider: CredentialProvider | None = None,
140
retries: int = 2,
141
use_pyarrow: bool = False,
142
pyarrow_options: dict[str, Any] | None = None,
143
memory_map: bool = True
144
) -> DataFrame:
145
"""
146
Read Parquet file(s) into DataFrame.
147
148
Parameters:
149
- source: File path(s) or file-like object
150
- columns: Columns to select
151
- parallel: Parallel reading mode ('auto', 'columns', 'row_groups', 'none')
152
- use_statistics: Use Parquet statistics for optimization
153
- hive_partitioning: Enable Hive-style partitioning
154
- storage_options: Cloud storage configuration
155
- credential_provider: Cloud credentials
156
157
Returns:
158
- DataFrame: Parquet data
159
"""
160
161
def scan_parquet(
162
source: str | Path | list[str] | list[Path],
163
*,
164
n_rows: int | None = None,
165
row_index_name: str | None = None,
166
row_index_offset: int = 0,
167
parallel: str = "auto",
168
use_statistics: bool = True,
169
hive_partitioning: bool | None = None,
170
hive_schema: dict[str, type] | None = None,
171
try_parse_hive_dates: bool = True,
172
glob: bool = True,
173
schema: dict[str, type] | None = None,
174
cache: bool = True,
175
cloud_options: dict[str, Any] | None = None,
176
credential_provider: CredentialProvider | None = None,
177
retries: int = 2
178
) -> LazyFrame:
179
"""
180
Scan Parquet file(s) for lazy processing.
181
182
Returns:
183
- LazyFrame: Lazy representation of Parquet data
184
"""
185
186
def read_parquet_schema(source: str | Path | BinaryIO) -> dict[str, type]:
187
"""
188
Read schema from Parquet file without loading data.
189
190
Parameters:
191
- source: File path or file-like object
192
193
Returns:
194
- dict[str, type]: Column names and types
195
"""
196
197
def read_parquet_metadata(source: str | Path | BinaryIO) -> dict[str, Any]:
198
"""
199
Read metadata from Parquet file.
200
201
Parameters:
202
- source: File path or file-like object
203
204
Returns:
205
- dict[str, Any]: Parquet metadata
206
"""
207
```

### JSON Operations

Read and write JSON and newline-delimited JSON (NDJSON) files.

```python { .api }
214
def read_json(
215
source: str | Path | IOBase | bytes,
216
*,
217
schema: dict[str, type] | None = None,
218
schema_overrides: dict[str, type] | None = None,
219
infer_schema_length: int | None = 100
220
) -> DataFrame:
221
"""
222
Read JSON file into DataFrame.
223
224
Parameters:
225
- source: JSON file path or content
226
- schema: Expected schema
227
- schema_overrides: Override inferred types
228
- infer_schema_length: Rows to scan for schema inference
229
230
Returns:
231
- DataFrame: JSON data
232
"""
233
234
def read_ndjson(
235
source: str | Path | IOBase | bytes,
236
*,
237
schema: dict[str, type] | None = None,
238
schema_overrides: dict[str, type] | None = None,
239
ignore_errors: bool = False
240
) -> DataFrame:
241
"""
242
Read newline-delimited JSON file into DataFrame.
243
244
Parameters:
245
- source: NDJSON file path or content
246
- schema: Expected schema
247
- ignore_errors: Skip malformed JSON lines
248
249
Returns:
250
- DataFrame: NDJSON data
251
"""
252
253
def scan_ndjson(
254
source: str | Path | list[str] | list[Path],
255
*,
256
schema: dict[str, type] | None = None,
257
ignore_errors: bool = False,
258
batch_size: int | None = None,
259
n_rows: int | None = None,
260
low_memory: bool = False,
261
rechunk: bool = False,
262
row_index_name: str | None = None,
263
row_index_offset: int = 0,
264
infer_schema_length: int | None = 100
265
) -> LazyFrame:
266
"""
267
Scan NDJSON file(s) for lazy processing.
268
269
Returns:
270
- LazyFrame: Lazy representation of NDJSON data
271
"""
272
```

### Arrow IPC Operations

Read and write Apache Arrow IPC format for efficient columnar data exchange.

```python { .api }
279
def read_ipc(
280
source: str | Path | BinaryIO,
281
*,
282
columns: list[int] | list[str] | None = None,
283
n_rows: int | None = None,
284
row_index_name: str | None = None,
285
row_index_offset: int = 0,
286
rechunk: bool = False,
287
memory_map: bool = True,
288
storage_options: dict[str, Any] | None = None,
289
credential_provider: CredentialProvider | None = None,
290
retries: int = 2
291
) -> DataFrame:
292
"""
293
Read Arrow IPC file into DataFrame.
294
295
Parameters:
296
- source: IPC file path or file-like object
297
- columns: Columns to select
298
- memory_map: Use memory mapping for better performance
299
- storage_options: Cloud storage configuration
300
301
Returns:
302
- DataFrame: IPC data
303
"""
304
305
def read_ipc_stream(
306
source: str | Path | BinaryIO,
307
*,
308
columns: list[int] | list[str] | None = None,
309
n_rows: int | None = None,
310
row_index_name: str | None = None,
311
row_index_offset: int = 0,
312
rechunk: bool = False,
313
storage_options: dict[str, Any] | None = None,
314
credential_provider: CredentialProvider | None = None,
315
retries: int = 2
316
) -> DataFrame:
317
"""
318
Read Arrow IPC stream into DataFrame.
319
320
Returns:
321
- DataFrame: IPC stream data
322
"""
323
324
def scan_ipc(
325
source: str | Path | list[str] | list[Path],
326
*,
327
n_rows: int | None = None,
328
cache: bool = True,
329
rechunk: bool = False,
330
row_index_name: str | None = None,
331
row_index_offset: int = 0,
332
storage_options: dict[str, Any] | None = None,
333
credential_provider: CredentialProvider | None = None,
334
retries: int = 2,
335
memory_map: bool = True
336
) -> LazyFrame:
337
"""
338
Scan IPC file(s) for lazy processing.
339
340
Returns:
341
- LazyFrame: Lazy representation of IPC data
342
"""
343
344
def read_ipc_schema(source: str | Path | BinaryIO) -> dict[str, type]:
345
"""
346
Read schema from IPC file without loading data.
347
348
Returns:
349
- dict[str, type]: Column names and types
350
"""
351
```

### Database Operations

Connect to and query various databases with full SQL support.

```python { .api }
358
def read_database(
359
query: str | RawExpr,
360
connection: str | ConnectionProtocol,
361
*,
362
partition_on: str | None = None,
363
partition_range: tuple[int, int] | None = None,
364
partition_num: int | None = None,
365
protocol: str | None = None,
366
engine: str | None = None,
367
schema_overrides: dict[str, type] | None = None,
368
execute_options: dict[str, Any] | None = None,
369
iter_batches: bool = False,
370
batch_size: int | None = None
371
) -> DataFrame:
372
"""
373
Execute database query and return DataFrame.
374
375
Parameters:
376
- query: SQL query string
377
- connection: Database connection string or object
378
- partition_on: Column for parallel partitioning
379
- protocol: Database protocol ('adbc', 'connectorx')
380
- engine: Database engine
381
- schema_overrides: Override inferred column types
382
383
Returns:
- DataFrame: Query results (note: with iter_batches=True, results are produced in batches of batch_size rows rather than as one DataFrame)
385
"""
386
387
def read_database_uri(
388
query: str | RawExpr,
389
uri: str,
390
*,
391
partition_on: str | None = None,
392
partition_range: tuple[int, int] | None = None,
393
partition_num: int | None = None,
394
protocol: str | None = None,
395
engine: str | None = None,
396
schema_overrides: dict[str, type] | None = None,
397
execute_options: dict[str, Any] | None = None
398
) -> DataFrame:
399
"""
400
Execute database query using URI connection string.
401
402
Parameters:
403
- query: SQL query string
404
- uri: Database URI connection string
405
406
Returns:
407
- DataFrame: Query results
408
"""
409
```

### Spreadsheet Operations

Read Excel and OpenDocument spreadsheet files.

```python { .api }
416
def read_excel(
417
source: str | Path | BinaryIO,
418
*,
419
sheet_id: int | None = None,
420
sheet_name: str | None = None,
421
engine: str | None = None,
422
engine_options: dict[str, Any] | None = None,
423
read_options: dict[str, Any] | None = None,
424
schema_overrides: dict[str, type] | None = None,
425
infer_schema_length: int | None = None,
426
raise_if_empty: bool = True
427
) -> DataFrame:
428
"""
429
Read Excel file into DataFrame.
430
431
Parameters:
432
- source: Excel file path or file-like object
433
- sheet_id: Sheet index to read
434
- sheet_name: Sheet name to read
435
- engine: Excel engine ('calamine', 'openpyxl', 'xlsx2csv')
436
- schema_overrides: Override inferred column types
437
438
Returns:
439
- DataFrame: Excel data
440
"""
441
442
def read_ods(
443
source: str | Path | BinaryIO,
444
*,
445
sheet_id: int | None = None,
446
sheet_name: str | None = None,
447
schema_overrides: dict[str, type] | None = None,
448
infer_schema_length: int | None = None,
449
raise_if_empty: bool = True
450
) -> DataFrame:
451
"""
452
Read OpenDocument Spreadsheet file into DataFrame.
453
454
Parameters:
455
- source: ODS file path or file-like object
456
- sheet_id: Sheet index to read
457
- sheet_name: Sheet name to read
458
459
Returns:
460
- DataFrame: ODS data
461
"""
462
```

### Other Formats

Support for additional data formats.

```python { .api }
469
def read_avro(
470
source: str | Path | BinaryIO,
471
*,
472
columns: list[int] | list[str] | None = None,
473
n_rows: int | None = None
474
) -> DataFrame:
475
"""
476
Read Apache Avro file into DataFrame.
477
478
Parameters:
479
- source: Avro file path or file-like object
480
- columns: Columns to select
481
- n_rows: Number of rows to read
482
483
Returns:
484
- DataFrame: Avro data
485
"""
486
487
def read_clipboard(*, separator: str = "\t", **kwargs) -> DataFrame:
488
"""
489
Read data from system clipboard.
490
491
Parameters:
492
- separator: Field separator
493
- **kwargs: Additional CSV parsing options
494
495
Returns:
496
- DataFrame: Clipboard data
497
"""
498
499
def read_delta(
500
source: str | Path,
501
*,
502
version: int | str | datetime | None = None,
503
columns: list[str] | None = None,
504
storage_options: dict[str, str] | None = None,
505
delta_table_options: dict[str, Any] | None = None,
506
pyarrow_options: dict[str, Any] | None = None
507
) -> DataFrame:
508
"""
509
Read Delta Lake table into DataFrame.
510
511
Parameters:
512
- source: Delta table path
513
- version: Table version to read
514
- columns: Columns to select
515
- storage_options: Cloud storage configuration
516
517
Returns:
518
- DataFrame: Delta table data
519
"""
520
521
def scan_delta(
522
source: str | Path,
523
*,
524
version: int | str | datetime | None = None,
525
storage_options: dict[str, str] | None = None,
526
delta_table_options: dict[str, Any] | None = None,
527
pyarrow_options: dict[str, Any] | None = None
528
) -> LazyFrame:
529
"""
530
Scan Delta Lake table for lazy processing.
531
532
Returns:
533
- LazyFrame: Lazy representation of Delta table
534
"""
535
```

### Cloud Storage Support

Integration with cloud storage providers and object stores.

```python { .api }
542
# Cloud credential providers
543
class CredentialProvider:
544
"""Base class for cloud credential providers"""
545
546
class CredentialProviderAWS:
547
def __init__(
548
self,
549
*,
550
access_key_id: str | None = None,
551
secret_access_key: str | None = None,
552
session_token: str | None = None,
553
region: str | None = None,
554
profile: str | None = None
555
):
556
"""
557
AWS credential provider.
558
559
Parameters:
560
- access_key_id: AWS access key
561
- secret_access_key: AWS secret key
562
- session_token: AWS session token
563
- region: AWS region
564
- profile: AWS CLI profile name
565
"""
566
567
class CredentialProviderAzure:
568
def __init__(
569
self,
570
*,
571
account_name: str | None = None,
572
account_key: str | None = None,
573
sas_token: str | None = None,
574
tenant_id: str | None = None,
575
client_id: str | None = None,
576
client_secret: str | None = None
577
):
578
"""
579
Azure credential provider.
580
581
Parameters:
- account_name: Storage account name
- account_key: Storage account key
- sas_token: Shared access signature token
- tenant_id: Microsoft Entra ID tenant ID (for service principal authentication)
- client_id: Service principal client ID
- client_secret: Service principal client secret
585
"""
586
587
class CredentialProviderGCP:
588
def __init__(
589
self,
590
*,
591
service_account_path: str | None = None,
592
service_account_key: str | None = None,
593
project_id: str | None = None
594
):
595
"""
596
Google Cloud Platform credential provider.
597
598
Parameters:
599
- service_account_path: Path to service account JSON file
600
- service_account_key: Service account key JSON string
601
- project_id: GCP project ID
602
"""
603
604
class CredentialProviderFunction:
605
def __init__(self, func: Callable[[], dict[str, str]]):
606
"""
607
Function-based credential provider.
608
609
Parameters:
610
- func: Function returning credential dictionary
611
"""
612
613
# Cloud scanning
614
def scan_iceberg(
615
source: str,
616
*,
617
mode: str = "convert",
618
pyarrow_options: dict[str, Any] | None = None
619
) -> LazyFrame:
620
"""
621
Scan Apache Iceberg table for lazy processing.
622
623
Parameters:
624
- source: Iceberg table path or catalog reference
625
- mode: Scanning mode ('convert' or 'arrow')
626
627
Returns:
628
- LazyFrame: Lazy representation of Iceberg table
629
"""
630
631
def scan_pyarrow_dataset(
632
source: str | Path,
633
*,
634
schema: dict[str, type] | None = None,
635
allow_pyarrow_filter: bool = True,
636
cache: bool = True
637
) -> LazyFrame:
638
"""
639
Scan PyArrow dataset for lazy processing.
640
641
Parameters:
642
- source: Dataset path
643
- schema: Expected schema
644
- allow_pyarrow_filter: Enable PyArrow predicate pushdown
645
646
Returns:
647
- LazyFrame: Lazy representation of PyArrow dataset
648
"""
649
```

### Scan Configuration

Advanced configuration options for scanning operations.

```python { .api }
656
class ScanCastOptions:
657
def __init__(
658
self,
659
*,
660
cast_time_unit: str | None = None,
661
cast_string_strict: bool = True
662
):
663
"""
664
Options for type casting during scanning.
665
666
Parameters:
667
- cast_time_unit: Time unit for temporal casts
668
- cast_string_strict: Strict string casting
669
"""
670
671
# Partitioning classes
672
class PartitionByKey:
673
def __init__(self, by: str | list[str]):
674
"""Partition by column values."""
675
676
class PartitionMaxSize:
677
def __init__(self, max_size: int):
678
"""Partition by maximum size."""
679
680
class PartitionParted:
681
def __init__(self, n_partitions: int):
682
"""Partition into fixed number of parts."""
683
684
# Context classes for advanced partitioning
685
class BasePartitionContext:
686
"""Base partition context"""
687
688
class KeyedPartitionContext(BasePartitionContext):
689
def __init__(self, key: Any): ...
690
691
class KeyedPartition:
692
def __init__(self, key: Any, partition: DataFrame): ...
693
```

### Deferred I/O

Utilities for deferred I/O operations.

```python { .api }
700
def defer() -> Expr:
701
"""
702
Create deferred I/O expression for use in scan operations.
703
704
Returns:
705
- Expr: Deferred expression
706
"""
707
```

## Usage Examples

### CSV Operations

```python
714
import polars as pl
715
716
# Basic CSV reading
717
df = pl.read_csv("data.csv")
718
719
# CSV with custom options
720
df = pl.read_csv(
721
"data.csv",
722
separator=";",
723
has_header=True,
724
dtypes={"id": pl.Int32, "date": pl.Date},
725
null_values=["", "NULL", "N/A"]
726
)
727
728
# Lazy CSV scanning for large files
729
lazy_df = pl.scan_csv("large_file.csv").filter(pl.col("date") >= "2023-01-01")
730
result = lazy_df.collect()
731
732
# Batched reading for memory efficiency
733
reader = pl.read_csv_batched("huge_file.csv", batch_size=10000)
734
for batch in reader:
735
process_batch(batch)
736
```

### Parquet Operations

```python
741
# Read Parquet file
742
df = pl.read_parquet("data.parquet")
743
744
# Parquet with column selection
745
df = pl.read_parquet("data.parquet", columns=["id", "name", "value"])
746
747
# Lazy Parquet scanning with predicate pushdown
748
lazy_df = (
749
pl.scan_parquet("partitioned/*.parquet")
750
.filter(pl.col("year") == 2023)
751
.select(["id", "amount"])
752
)
753
result = lazy_df.collect()
754
755
# Read Parquet metadata
756
schema = pl.read_parquet_schema("data.parquet")
757
metadata = pl.read_parquet_metadata("data.parquet")
758
```

### Database Operations

```python
763
# Read from database
764
df = pl.read_database(
765
"SELECT * FROM customers WHERE active = true",
766
"postgresql://user:pass@localhost/db"
767
)
768
769
# Partitioned database reading
770
df = pl.read_database(
771
"SELECT * FROM large_table",
772
connection,
773
partition_on="id",
774
partition_num=4
775
)
776
777
# Using different protocols
778
df = pl.read_database(
779
"SELECT * FROM table",
780
connection,
781
protocol="adbc" # or "connectorx"
782
)
783
```

### Cloud Storage

```python
788
# AWS S3
789
aws_creds = pl.CredentialProviderAWS(
790
access_key_id="ACCESS_KEY",
791
secret_access_key="SECRET_KEY",
792
region="us-east-1"
793
)
794
795
df = pl.read_parquet(
796
"s3://bucket/data.parquet",
797
credential_provider=aws_creds
798
)
799
800
# Azure Blob Storage
801
azure_creds = pl.CredentialProviderAzure(
802
account_name="account",
803
account_key="key"
804
)
805
806
df = pl.read_csv(
807
"az://container/data.csv",
808
credential_provider=azure_creds
809
)
810
811
# Google Cloud Storage
812
gcp_creds = pl.CredentialProviderGCP(
813
service_account_path="service-account.json"
814
)
815
816
df = pl.scan_parquet(
817
"gs://bucket/partitioned/*.parquet",
818
credential_provider=gcp_creds
819
)
820
```

### Excel and Spreadsheets

```python
825
# Read Excel file
826
df = pl.read_excel("data.xlsx", sheet_name="Sheet1")
827
828
# Excel with specific engine
829
df = pl.read_excel(
830
"data.xlsx",
831
engine="openpyxl",
832
schema_overrides={"date": pl.Date}
833
)
834
835
# OpenDocument Spreadsheet
836
df = pl.read_ods("data.ods", sheet_id=0)
837
```

### JSON Operations

```python
842
# Read JSON
843
df = pl.read_json("data.json")
844
845
# Read NDJSON (newline-delimited JSON)
846
df = pl.read_ndjson("logs.jsonl")
847
848
# Lazy NDJSON scanning
849
lazy_df = pl.scan_ndjson("large_logs.jsonl").filter(
850
pl.col("timestamp") >= "2023-01-01"
851
)
852
```

### Delta Lake

```python
857
# Read Delta table
858
df = pl.read_delta("path/to/delta/table")
859
860
# Read specific version
861
df = pl.read_delta("delta/table", version=5)
862
863
# Lazy scanning with time travel
864
lazy_df = pl.scan_delta("delta/table", version="2023-01-01T00:00:00Z")
865
```
866

### Advanced Scanning

870
# Scan with custom options
871
cast_options = pl.ScanCastOptions(
872
cast_time_unit="us",
873
cast_string_strict=False
874
)
875
876
lazy_df = pl.scan_csv(
877
"data.csv",
878
cast_options=cast_options
879
)
880
881
# Iceberg table scanning
882
lazy_df = pl.scan_iceberg("catalog.database.table")
883
884
# PyArrow dataset scanning
885
lazy_df = pl.scan_pyarrow_dataset("partitioned/dataset/")
886
```