# I/O Operations

Comprehensive I/O capabilities supporting 10+ file formats with both eager reading and lazy scanning for performance optimization. Polars provides efficient data ingestion and export across various formats with advanced features like predicate pushdown and schema inference.

## Capabilities

### CSV Operations

Reading and scanning CSV files with extensive configuration options.
```python { .api }
def read_csv(
    source: str | Path | IO[str] | IO[bytes] | bytes,
    *,
    has_header: bool = True,
    columns: list[int] | list[str] | None = None,
    new_columns: list[str] | None = None,
    dtypes: dict[int | str, DataType] | Sequence[DataType] | None = None,
    separator: str = ",",
    comment_prefix: str | None = None,
    quote_char: str | None = '"',
    skip_rows: int = 0,
    skip_rows_after_header: int = 0,
    row_index_name: str | None = None,
    row_index_offset: int = 0,
    sample_size: int = 1024,
    eol_char: str = "\n",
    null_values: str | Sequence[str] | dict[str, str] | None = None,
    missing_utf8_is_empty_string: bool = False,
    ignore_errors: bool = False,
    try_parse_dates: bool = False,
    n_threads: int | None = None,
    infer_schema_length: int | None = N_INFER_DEFAULT,
    batch_size: int | None = None,
    n_rows: int | None = None,
    encoding: CsvEncoding = "utf8",
    low_memory: bool = False,
    rechunk: bool = False,
    skip_blank_lines: bool = True,
    raise_if_empty: bool = True,
    truncate_ragged_lines: bool = False,
    decimal_comma: bool = False,
    glob: bool = True
) -> DataFrame:
    """
    Read CSV file into DataFrame.

    Parameters:
    - source: File path, URL, or file-like object
    - has_header: First row contains column names
    - columns: Columns to select by index or name
    - new_columns: Override column names
    - dtypes: Column data types
    - separator: Field delimiter
    - comment_prefix: Comment line prefix to skip
    - quote_char: Quote character for strings
    - skip_rows: Number of rows to skip at start
    - skip_rows_after_header: Rows to skip after header
    - row_index_name: Add row index column with this name
    - row_index_offset: Start value for row index
    - sample_size: Rows to sample for type inference
    - eol_char: End-of-line character
    - null_values: Values to interpret as null
    - missing_utf8_is_empty_string: Treat invalid UTF-8 as empty
    - ignore_errors: Continue on parse errors
    - try_parse_dates: Attempt date parsing
    - n_threads: Number of threads for parsing
    - infer_schema_length: Rows to scan for schema inference
    - batch_size: Batch size for processing
    - n_rows: Maximum rows to read
    - encoding: Text encoding
    - low_memory: Use less memory (slower)
    - rechunk: Rechunk to single chunk
    - skip_blank_lines: Skip empty lines
    - raise_if_empty: Raise error if no data
    - truncate_ragged_lines: Handle inconsistent columns
    - decimal_comma: Use comma as decimal separator
    - glob: Use glob patterns for multiple files

    Returns:
    DataFrame with CSV data
    """

def scan_csv(
    source: str | Path | list[str] | list[Path],
    **kwargs
) -> LazyFrame:
    """
    Lazy scan CSV file(s) for optimized processing.

    Parameters:
    Similar to read_csv but returns LazyFrame for deferred execution

    Returns:
    LazyFrame for lazy evaluation
    """
```
97
98
### Parquet Operations
99
100
High-performance columnar format operations with advanced features.
101
102
```python { .api }
103
def read_parquet(
104
source: str | Path | IO[bytes] | bytes,
105
*,
106
columns: list[int] | list[str] | None = None,
107
n_rows: int | None = None,
108
row_index_name: str | None = None,
109
row_index_offset: int = 0,
110
parallel: ParallelStrategy = "auto",
111
use_statistics: bool = True,
112
hive_partitioning: bool | None = None,
113
glob: bool = True,
114
rechunk: bool = False,
115
low_memory: bool = False,
116
storage_options: dict[str, Any] | None = None,
117
credential_provider: CredentialProvider | None = None,
118
retries: int = 2,
119
file_cache_ttl: int | None = None
120
) -> DataFrame:
121
"""
122
Read Parquet file into DataFrame.
123
124
Parameters:
125
- source: File path, URL, or bytes
126
- columns: Columns to select
127
- n_rows: Maximum rows to read
128
- row_index_name: Add row index column
129
- row_index_offset: Row index start value
130
- parallel: Parallelization strategy
131
- use_statistics: Use Parquet statistics for optimization
132
- hive_partitioning: Enable Hive-style partitioning
133
- glob: Use glob patterns
134
- rechunk: Rechunk to single chunk
135
- low_memory: Use less memory
136
- storage_options: Cloud storage options
137
- credential_provider: Cloud credentials
138
- retries: Number of retry attempts
139
- file_cache_ttl: File cache time-to-live
140
141
Returns:
142
DataFrame with Parquet data
143
"""
144
145
def scan_parquet(
146
source: str | Path | list[str] | list[Path],
147
**kwargs
148
) -> LazyFrame:
149
"""Lazy scan Parquet file(s)."""
150
151
def read_parquet_metadata(source: str | Path | IO[bytes] | bytes) -> dict[str, Any]:
152
"""Read Parquet file metadata."""
153
154
def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> Schema:
155
"""Read Parquet file schema."""
156
```
157
158
### JSON Operations
159
160
JSON and newline-delimited JSON file operations.
161
162
```python { .api }
163
def read_json(
164
source: str | Path | IO[str] | IO[bytes] | bytes,
165
*,
166
schema: dict[str, DataType] | None = None,
167
schema_overrides: dict[str, DataType] | None = None,
168
infer_schema_length: int | None = N_INFER_DEFAULT
169
) -> DataFrame:
170
"""
171
Read JSON file into DataFrame.
172
173
Parameters:
174
- source: JSON file path or data
175
- schema: Expected schema
176
- schema_overrides: Override inferred types
177
- infer_schema_length: Rows for schema inference
178
179
Returns:
180
DataFrame with JSON data
181
"""
182
183
def read_ndjson(
184
source: str | Path | IO[str] | IO[bytes] | bytes,
185
*,
186
schema: dict[str, DataType] | None = None,
187
schema_overrides: dict[str, DataType] | None = None,
188
batch_size: int | None = None,
189
n_rows: int | None = None,
190
low_memory: bool = False,
191
rechunk: bool = False,
192
row_index_name: str | None = None,
193
row_index_offset: int = 0,
194
ignore_errors: bool = False
195
) -> DataFrame:
196
"""
197
Read newline-delimited JSON file.
198
199
Parameters:
200
- source: NDJSON file path or data
201
- schema: Expected schema
202
- schema_overrides: Override inferred types
203
- batch_size: Processing batch size
204
- n_rows: Maximum rows to read
205
- low_memory: Use less memory
206
- rechunk: Rechunk to single chunk
207
- row_index_name: Add row index column
208
- row_index_offset: Row index start value
209
- ignore_errors: Continue on parse errors
210
211
Returns:
212
DataFrame with NDJSON data
213
"""
214
215
def scan_ndjson(
216
source: str | Path | list[str] | list[Path],
217
**kwargs
218
) -> LazyFrame:
219
"""Lazy scan NDJSON file(s)."""
220
```
221
222
### Database Operations
223
224
Reading data from various databases using connection strings or objects.
225
226
```python { .api }
227
def read_database(
228
query: str,
229
connection: str | ConnectionOrCursor,
230
*,
231
partition_on: str | None = None,
232
partition_range: tuple[int, int] | None = None,
233
partition_num: int | None = None,
234
protocol: str | None = None,
235
engine: DbReadEngine | None = None,
236
schema_overrides: dict[str, DataType] | None = None,
237
execute_options: dict[str, Any] | None = None
238
) -> DataFrame:
239
"""
240
Read database query results into DataFrame.
241
242
Parameters:
243
- query: SQL query string
244
- connection: Database connection string or object
245
- partition_on: Column for partitioned reading
246
- partition_range: Range for partitioned reading
247
- partition_num: Number of partitions
248
- protocol: Database protocol
249
- engine: Database engine to use
250
- schema_overrides: Override inferred types
251
- execute_options: Additional execution options
252
253
Returns:
254
DataFrame with query results
255
"""
256
257
def read_database_uri(
258
query: str,
259
uri: str,
260
*,
261
partition_on: str | None = None,
262
partition_range: tuple[int, int] | None = None,
263
partition_num: int | None = None,
264
protocol: str | None = None,
265
engine: DbReadEngine | None = None,
266
schema_overrides: dict[str, DataType] | None = None
267
) -> DataFrame:
268
"""
269
Read from database using URI connection string.
270
271
Parameters:
272
- query: SQL query string
273
- uri: Database URI
274
- Other parameters: Same as read_database
275
276
Returns:
277
DataFrame with query results
278
"""
279
```
280
281
### IPC/Arrow Operations
282
283
Apache Arrow IPC format operations for efficient cross-language data exchange.
284
285
```python { .api }
286
def read_ipc(
287
source: str | Path | IO[bytes] | bytes,
288
*,
289
columns: list[int] | list[str] | None = None,
290
n_rows: int | None = None,
291
row_index_name: str | None = None,
292
row_index_offset: int = 0,
293
rechunk: bool = False,
294
memory_map: bool = True,
295
storage_options: dict[str, Any] | None = None,
296
credential_provider: CredentialProvider | None = None
297
) -> DataFrame:
298
"""
299
Read IPC/Arrow file into DataFrame.
300
301
Parameters:
302
- source: IPC file path or bytes
303
- columns: Columns to select
304
- n_rows: Maximum rows to read
305
- row_index_name: Add row index column
306
- row_index_offset: Row index start value
307
- rechunk: Rechunk to single chunk
308
- memory_map: Use memory mapping
309
- storage_options: Cloud storage options
310
- credential_provider: Cloud credentials
311
312
Returns:
313
DataFrame with IPC data
314
"""
315
316
def read_ipc_stream(
317
source: str | Path | IO[bytes] | bytes,
318
**kwargs
319
) -> DataFrame:
320
"""Read IPC stream format."""
321
322
def scan_ipc(
323
source: str | Path | list[str] | list[Path],
324
**kwargs
325
) -> LazyFrame:
326
"""Lazy scan IPC file(s)."""
327
328
def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> Schema:
329
"""Read IPC file schema."""
330
```
331
332
### Excel Operations
333
334
Reading Excel and OpenDocument spreadsheet files.
335
336
```python { .api }
337
def read_excel(
338
source: str | Path | IO[bytes] | bytes,
339
*,
340
sheet_id: int | Sequence[int] | None = None,
341
sheet_name: str | list[str] | None = None,
342
engine: ExcelSpreadsheetEngine | None = None,
343
engine_options: dict[str, Any] | None = None,
344
read_options: dict[str, Any] | None = None,
345
schema_overrides: dict[str, DataType] | None = None,
346
infer_schema_length: int | None = N_INFER_DEFAULT,
347
raise_if_empty: bool = True
348
) -> DataFrame | dict[str, DataFrame]:
349
"""
350
Read Excel file into DataFrame.
351
352
Parameters:
353
- source: Excel file path or bytes
354
- sheet_id: Sheet index(es) to read
355
- sheet_name: Sheet name(s) to read
356
- engine: Excel engine to use
357
- engine_options: Engine-specific options
358
- read_options: Reading options
359
- schema_overrides: Override inferred types
360
- infer_schema_length: Rows for schema inference
361
- raise_if_empty: Raise error if no data
362
363
Returns:
364
DataFrame or dict of DataFrames (if multiple sheets)
365
"""
366
367
def read_ods(
368
source: str | Path | IO[bytes] | bytes,
369
**kwargs
370
) -> DataFrame | dict[str, DataFrame]:
371
"""Read OpenDocument Spreadsheet file."""
372
```
373
374
### Cloud and Advanced I/O
375
376
Cloud storage integration and advanced I/O features.
377
378
```python { .api }
379
def read_avro(
380
source: str | Path | IO[bytes] | bytes,
381
*,
382
columns: list[int] | list[str] | None = None,
383
n_rows: int | None = None
384
) -> DataFrame:
385
"""Read Apache Avro file."""
386
387
def read_clipboard(**kwargs) -> DataFrame:
388
"""Read data from system clipboard."""
389
390
def scan_iceberg(
391
source: str,
392
**kwargs
393
) -> LazyFrame:
394
"""Lazy scan Apache Iceberg table."""
395
396
def scan_delta(
397
source: str,
398
*,
399
version: int | str | None = None,
400
storage_options: dict[str, str] | None = None,
401
delta_table_options: dict[str, Any] | None = None
402
) -> LazyFrame:
403
"""
404
Lazy scan Delta Lake table.
405
406
Parameters:
407
- source: Delta table path
408
- version: Table version to read
409
- storage_options: Cloud storage options
410
- delta_table_options: Delta table options
411
412
Returns:
413
LazyFrame for Delta table
414
"""
415
416
def read_delta(
417
source: str,
418
**kwargs
419
) -> DataFrame:
420
"""Read Delta Lake table."""
421
422
def scan_pyarrow_dataset(
423
source: str | Path,
424
**kwargs
425
) -> LazyFrame:
426
"""Lazy scan PyArrow dataset."""
427
```
428
429
### Partitioning and Scan Options
430
431
Advanced partitioning strategies and scan configuration.
432
433
```python { .api }
434
class ScanCastOptions:
435
"""Options for casting during scan operations."""
436
def __init__(
437
self,
438
*,
439
enabled: bool = True,
440
dtypes: dict[str, DataType] | None = None,
441
strict: bool = True
442
):
443
"""
444
Configure scan casting.
445
446
Parameters:
447
- enabled: Enable automatic casting
448
- dtypes: Target data types
449
- strict: Strict casting mode
450
"""
451
452
class BasePartitionContext:
453
"""Base class for partition contexts."""
454
455
class KeyedPartitionContext(BasePartitionContext):
456
"""Partition context with key-based partitioning."""
457
458
class KeyedPartition:
459
"""Partition information for keyed partitioning."""
460
def __init__(self, key: Any, df: DataFrame):
461
"""
462
Create keyed partition.
463
464
Parameters:
465
- key: Partition key
466
- df: Partition DataFrame
467
"""
468
469
class PartitionByKey:
470
"""Partition strategy based on column values."""
471
def __init__(self, by: str | list[str]):
472
"""
473
Partition by column key(s).
474
475
Parameters:
476
- by: Column name(s) for partitioning
477
"""
478
479
class PartitionMaxSize:
480
"""Partition strategy based on maximum size."""
481
def __init__(self, max_size: int):
482
"""
483
Partition by maximum size.
484
485
Parameters:
486
- max_size: Maximum partition size
487
"""
488
489
class PartitionParted:
490
"""Information about partitioned data."""
491
```
492
493
### Cloud Credential Providers
494
495
Authentication for cloud storage access.
496
497
```python { .api }
498
class CredentialProvider:
499
"""Base credential provider."""
500
501
class CredentialProviderAWS(CredentialProvider):
502
"""AWS credential provider."""
503
def __init__(
504
self,
505
*,
506
access_key_id: str | None = None,
507
secret_access_key: str | None = None,
508
session_token: str | None = None,
509
region: str | None = None,
510
profile: str | None = None
511
):
512
"""
513
AWS credentials.
514
515
Parameters:
516
- access_key_id: AWS access key
517
- secret_access_key: AWS secret key
518
- session_token: AWS session token
519
- region: AWS region
520
- profile: AWS profile name
521
"""
522
523
class CredentialProviderAzure(CredentialProvider):
524
"""Azure credential provider."""
525
526
class CredentialProviderGCP(CredentialProvider):
527
"""Google Cloud credential provider."""
528
529
class CredentialProviderFunction(CredentialProvider):
530
"""Function-based credential provider."""
531
def __init__(self, func: Callable[[], CredentialProviderFunctionReturn]):
532
"""
533
Function-based credentials.
534
535
Parameters:
536
- func: Function returning credentials
537
"""
538
539
class CredentialProviderFunctionReturn:
540
"""Return type for credential function."""
541
```
542
543
## Usage Examples
544
545
### Basic File Reading
546
547
```python
548
import polars as pl
549
550
# Read CSV with automatic type inference
551
df = pl.read_csv("data.csv")
552
553
# Read with specific options
554
df = pl.read_csv(
555
"data.csv",
556
separator=";",
557
null_values=["", "NULL", "N/A"],
558
try_parse_dates=True,
559
infer_schema_length=1000
560
)
561
562
# Read specific columns
563
df = pl.read_csv("data.csv", columns=["name", "age", "salary"])
564
```
565
566
### Lazy Scanning for Large Files
567
568
```python
569
# Lazy scan for memory efficiency
570
lazy_df = (pl
571
.scan_csv("large_file.csv")
572
.filter(pl.col("amount") > 1000)
573
.select(["customer_id", "amount", "date"])
574
.group_by("customer_id")
575
.agg([
576
pl.col("amount").sum(),
577
pl.col("date").max()
578
])
579
)
580
581
# Execute when ready
582
result = lazy_df.collect()
583
```
584
585
### Working with Multiple Files
586
587
```python
588
# Read multiple CSV files at once
589
df = pl.read_csv("data_*.csv", glob=True)
590
591
# Scan multiple Parquet files
592
lazy_df = pl.scan_parquet(["file1.parquet", "file2.parquet", "file3.parquet"])
593
```
594
595
### Database Integration
596
597
```python
598
# Read from database
599
df = pl.read_database(
600
"SELECT * FROM customers WHERE age > 25",
601
"postgresql://user:pass@localhost:5432/db"
602
)
603
604
# Partitioned database reading for large tables
605
df = pl.read_database(
606
"SELECT * FROM large_table",
607
"postgresql://user:pass@localhost:5432/db",
608
partition_on="id",
609
partition_num=4
610
)
611
```
612
613
### Cloud Storage Access
614
615
```python
616
# Read from S3 with credentials
617
df = pl.read_parquet(
618
"s3://bucket/data.parquet",
619
credential_provider=pl.CredentialProviderAWS(
620
access_key_id="key",
621
secret_access_key="secret",
622
region="us-east-1"
623
)
624
)
625
626
# Read from Azure Blob Storage
627
df = pl.read_csv(
628
"az://container/data.csv",
629
credential_provider=pl.CredentialProviderAzure()
630
)
631
```
### Advanced Excel Reading

```python
# Read specific Excel sheet
df = pl.read_excel("report.xlsx", sheet_name="Summary")

# Read multiple sheets
sheets = pl.read_excel("report.xlsx", sheet_id=[0, 1, 2])
summary_df = sheets["Summary"]
details_df = sheets["Details"]

# Excel with custom options
df = pl.read_excel(
    "data.xlsx",
    engine="openpyxl",
    read_options={
        "has_header": True,
        "skip_rows": 2
    },
    schema_overrides={
        "date": pl.Date,
        "amount": pl.Decimal(10, 2)
    }
)
```

### Data Export

```python
# DataFrame write methods
df.write_csv("output.csv")
df.write_parquet("output.parquet")
df.write_json("output.json")
df.write_ipc("output.arrow")

# LazyFrame collect and write
lazy_df.collect().write_parquet("result.parquet")

# Write with options
df.write_csv(
    "output.csv",
    separator="|",
    quote_char="'",
    null_value="NULL"
)
```

### Schema Management

```python
# Define schema for consistent reading
schema = pl.Schema({
    "id": pl.Int32,
    "name": pl.String,
    "amount": pl.Decimal(10, 2),
    "timestamp": pl.Datetime("us", "UTC")
})

df = pl.read_csv("data.csv", schema=schema)

# Override specific column types
df = pl.read_csv(
    "data.csv",
    schema_overrides={
        "customer_id": pl.String,  # Keep as string
        "amount": pl.Decimal(12, 4)  # Higher precision
    }
)
```