or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

configuration.md · core-classes.md · data-types.md · expressions.md · functions.md · index.md · io-operations.md · sql-functionality.md

docs/io-operations.md

0

# I/O Operations

1

2

Extensive support for reading and writing data in various formats including CSV, Parquet, JSON, Arrow IPC, databases, Excel, and cloud storage with streaming capabilities for efficient processing of large datasets.

3

4

## Capabilities

5

6

### CSV Operations

7

8

Read and write CSV files with extensive customization options for delimiters, encoding, and data types.

9

10

```python { .api }

11

def read_csv(

12

source: str | Path | list[str] | list[Path] | BinaryIO,

13

*,

14

has_header: bool = True,

15

columns: list[int] | list[str] | None = None,

16

new_columns: list[str] | None = None,

17

dtypes: dict[str, type] | Sequence[type] | None = None,

18

separator: str = ",",

19

comment_prefix: str | None = None,

20

quote_char: str | None = '"',

21

skip_rows: int = 0,

22

skip_rows_after_header: int = 0,

23

row_index_name: str | None = None,

24

row_index_offset: int = 0,

25

sample_size: int = 1024,

26

eol_char: str = "\n",

27

raise_if_empty: bool = True,

28

truncate_ragged_lines: bool = False,

29

rechunk: bool = False,

30

schema_overrides: dict[str, type] | None = None,

31

null_values: str | list[str] | dict[str, str] | None = None,

32

missing_utf8_is_empty_string: bool = False,

33

max_rows: int | None = None,

34

encoding: str = "utf8",

35

try_parse_dates: bool = False,

36

n_threads: int | None = None,

37

infer_schema_length: int | None = 100,

38

batch_size: int = 8192,

39

n_rows: int | None = None,

40

low_memory: bool = False,

41

rechunk_end: bool = True,

42

skip_blank_lines: bool = True,

43

ignore_errors: bool = False

44

) -> DataFrame:

45

"""

46

Read CSV file(s) into DataFrame.

47

48

Parameters:

49

- source: File path(s) or file-like object

50

- has_header: Whether first row contains headers

51

- columns: Columns to select by index or name

52

- dtypes: Data types for columns

53

- separator: Field separator character

54

- quote_char: Quote character for fields

55

- null_values: Values to interpret as null

56

- encoding: Text encoding

57

- n_threads: Number of threads for parallel processing

58

59

Returns:

60

- DataFrame: Parsed CSV data

61

"""

62

63

def read_csv_batched(

64

source: str | Path | BinaryIO,

65

*,

66

batch_size: int = 50000,

67

**kwargs

68

) -> BatchedCsvReader:

69

"""

70

Read CSV file in batches for memory-efficient processing.

71

72

Parameters:

73

- source: File path or file-like object

74

- batch_size: Number of rows per batch

75

- **kwargs: Same parameters as read_csv

76

77

Returns:

78

- BatchedCsvReader: Iterator yielding DataFrame batches

79

"""

80

81

def scan_csv(

82

source: str | Path | list[str] | list[Path],

83

*,

84

has_header: bool = True,

85

separator: str = ",",

86

comment_prefix: str | None = None,

87

quote_char: str | None = '"',

88

skip_rows: int = 0,

89

dtypes: dict[str, type] | None = None,

90

null_values: str | list[str] | dict[str, str] | None = None,

91

missing_utf8_is_empty_string: bool = False,

92

cache: bool = True,

93

with_column_names: Callable[[list[str]], list[str]] | None = None,

94

infer_schema_length: int | None = 100,

95

n_rows: int | None = None,

96

encoding: str = "utf8",

97

low_memory: bool = False,

98

rechunk: bool = False,

99

skip_rows_after_header: int = 0,

100

row_index_name: str | None = None,

101

row_index_offset: int = 0,

102

try_parse_dates: bool = False,

103

eol_char: str = "\n",

104

raise_if_empty: bool = True,

105

truncate_ragged_lines: bool = False,

106

schema: dict[str, type] | None = None,

107

ignore_errors: bool = False

108

) -> LazyFrame:

109

"""

110

Scan CSV file(s) for lazy processing.

111

112

Returns:

113

- LazyFrame: Lazy representation of CSV data

114

"""

115

```

116

117

### Parquet Operations

118

119

Read and write Apache Parquet files with compression and metadata options.

120

121

```python { .api }

122

def read_parquet(

123

source: str | Path | list[str] | list[Path] | BinaryIO,

124

*,

125

columns: list[int] | list[str] | None = None,

126

n_rows: int | None = None,

127

row_index_name: str | None = None,

128

row_index_offset: int = 0,

129

parallel: str = "auto",

130

use_statistics: bool = True,

131

hive_partitioning: bool | None = None,

132

hive_schema: dict[str, type] | None = None,

133

try_parse_hive_dates: bool = True,

134

glob: bool = True,

135

schema: dict[str, type] | None = None,

136

rechunk: bool = False,

137

low_memory: bool = False,

138

storage_options: dict[str, Any] | None = None,

139

credential_provider: CredentialProvider | None = None,

140

retries: int = 2,

141

use_pyarrow: bool = False,

142

pyarrow_options: dict[str, Any] | None = None,

143

memory_map: bool = True

144

) -> DataFrame:

145

"""

146

Read Parquet file(s) into DataFrame.

147

148

Parameters:

149

- source: File path(s) or file-like object

150

- columns: Columns to select

151

- parallel: Parallel reading mode ('auto', 'columns', 'row_groups', 'none')

152

- use_statistics: Use Parquet statistics for optimization

153

- hive_partitioning: Enable Hive-style partitioning

154

- storage_options: Cloud storage configuration

155

- credential_provider: Cloud credentials

156

157

Returns:

158

- DataFrame: Parquet data

159

"""

160

161

def scan_parquet(

162

source: str | Path | list[str] | list[Path],

163

*,

164

n_rows: int | None = None,

165

row_index_name: str | None = None,

166

row_index_offset: int = 0,

167

parallel: str = "auto",

168

use_statistics: bool = True,

169

hive_partitioning: bool | None = None,

170

hive_schema: dict[str, type] | None = None,

171

try_parse_hive_dates: bool = True,

172

glob: bool = True,

173

schema: dict[str, type] | None = None,

174

cache: bool = True,

175

cloud_options: dict[str, Any] | None = None,

176

credential_provider: CredentialProvider | None = None,

177

retries: int = 2

178

) -> LazyFrame:

179

"""

180

Scan Parquet file(s) for lazy processing.

181

182

Returns:

183

- LazyFrame: Lazy representation of Parquet data

184

"""

185

186

def read_parquet_schema(source: str | Path | BinaryIO) -> dict[str, type]:

187

"""

188

Read schema from Parquet file without loading data.

189

190

Parameters:

191

- source: File path or file-like object

192

193

Returns:

194

- dict[str, type]: Column names and types

195

"""

196

197

def read_parquet_metadata(source: str | Path | BinaryIO) -> dict[str, Any]:

198

"""

199

Read metadata from Parquet file.

200

201

Parameters:

202

- source: File path or file-like object

203

204

Returns:

205

- dict[str, Any]: Parquet metadata

206

"""

207

```

208

209

### JSON Operations

210

211

Read and write JSON and newline-delimited JSON (NDJSON) files.

212

213

```python { .api }

214

def read_json(

215

source: str | Path | IOBase | bytes,

216

*,

217

schema: dict[str, type] | None = None,

218

schema_overrides: dict[str, type] | None = None,

219

infer_schema_length: int | None = 100

220

) -> DataFrame:

221

"""

222

Read JSON file into DataFrame.

223

224

Parameters:

225

- source: JSON file path or content

226

- schema: Expected schema

227

- schema_overrides: Override inferred types

228

- infer_schema_length: Rows to scan for schema inference

229

230

Returns:

231

- DataFrame: JSON data

232

"""

233

234

def read_ndjson(

235

source: str | Path | IOBase | bytes,

236

*,

237

schema: dict[str, type] | None = None,

238

schema_overrides: dict[str, type] | None = None,

239

ignore_errors: bool = False

240

) -> DataFrame:

241

"""

242

Read newline-delimited JSON file into DataFrame.

243

244

Parameters:

245

- source: NDJSON file path or content

246

- schema: Expected schema

247

- ignore_errors: Skip malformed JSON lines

248

249

Returns:

250

- DataFrame: NDJSON data

251

"""

252

253

def scan_ndjson(

254

source: str | Path | list[str] | list[Path],

255

*,

256

schema: dict[str, type] | None = None,

257

ignore_errors: bool = False,

258

batch_size: int | None = None,

259

n_rows: int | None = None,

260

low_memory: bool = False,

261

rechunk: bool = False,

262

row_index_name: str | None = None,

263

row_index_offset: int = 0,

264

infer_schema_length: int | None = 100

265

) -> LazyFrame:

266

"""

267

Scan NDJSON file(s) for lazy processing.

268

269

Returns:

270

- LazyFrame: Lazy representation of NDJSON data

271

"""

272

```

273

274

### Arrow IPC Operations

275

276

Read and write Apache Arrow IPC format for efficient columnar data exchange.

277

278

```python { .api }

279

def read_ipc(

280

source: str | Path | BinaryIO,

281

*,

282

columns: list[int] | list[str] | None = None,

283

n_rows: int | None = None,

284

row_index_name: str | None = None,

285

row_index_offset: int = 0,

286

rechunk: bool = False,

287

memory_map: bool = True,

288

storage_options: dict[str, Any] | None = None,

289

credential_provider: CredentialProvider | None = None,

290

retries: int = 2

291

) -> DataFrame:

292

"""

293

Read Arrow IPC file into DataFrame.

294

295

Parameters:

296

- source: IPC file path or file-like object

297

- columns: Columns to select

298

- memory_map: Use memory mapping for better performance

299

- storage_options: Cloud storage configuration

300

301

Returns:

302

- DataFrame: IPC data

303

"""

304

305

def read_ipc_stream(

306

source: str | Path | BinaryIO,

307

*,

308

columns: list[int] | list[str] | None = None,

309

n_rows: int | None = None,

310

row_index_name: str | None = None,

311

row_index_offset: int = 0,

312

rechunk: bool = False,

313

storage_options: dict[str, Any] | None = None,

314

credential_provider: CredentialProvider | None = None,

315

retries: int = 2

316

) -> DataFrame:

317

"""

318

Read Arrow IPC stream into DataFrame.

319

320

Returns:

321

- DataFrame: IPC stream data

322

"""

323

324

def scan_ipc(

325

source: str | Path | list[str] | list[Path],

326

*,

327

n_rows: int | None = None,

328

cache: bool = True,

329

rechunk: bool = False,

330

row_index_name: str | None = None,

331

row_index_offset: int = 0,

332

storage_options: dict[str, Any] | None = None,

333

credential_provider: CredentialProvider | None = None,

334

retries: int = 2,

335

memory_map: bool = True

336

) -> LazyFrame:

337

"""

338

Scan IPC file(s) for lazy processing.

339

340

Returns:

341

- LazyFrame: Lazy representation of IPC data

342

"""

343

344

def read_ipc_schema(source: str | Path | BinaryIO) -> dict[str, type]:

345

"""

346

Read schema from IPC file without loading data.

347

348

Returns:

349

- dict[str, type]: Column names and types

350

"""

351

```

352

353

### Database Operations

354

355

Connect to and query various databases with full SQL support.

356

357

```python { .api }

358

def read_database(

359

query: str | RawExpr,

360

connection: str | ConnectionProtocol,

361

*,

362

partition_on: str | None = None,

363

partition_range: tuple[int, int] | None = None,

364

partition_num: int | None = None,

365

protocol: str | None = None,

366

engine: str | None = None,

367

schema_overrides: dict[str, type] | None = None,

368

execute_options: dict[str, Any] | None = None,

369

iter_batches: bool = False,

370

batch_size: int | None = None

371

) -> DataFrame:

372

"""

373

Execute database query and return DataFrame.

374

375

Parameters:

376

- query: SQL query string

377

- connection: Database connection string or object

378

- partition_on: Column for parallel partitioning

379

- protocol: Database protocol ('adbc', 'connectorx')

380

- engine: Database engine

381

- schema_overrides: Override inferred column types

382

383

Returns:

384

- DataFrame: Query results

385

"""

386

387

def read_database_uri(

388

query: str | RawExpr,

389

uri: str,

390

*,

391

partition_on: str | None = None,

392

partition_range: tuple[int, int] | None = None,

393

partition_num: int | None = None,

394

protocol: str | None = None,

395

engine: str | None = None,

396

schema_overrides: dict[str, type] | None = None,

397

execute_options: dict[str, Any] | None = None

398

) -> DataFrame:

399

"""

400

Execute database query using URI connection string.

401

402

Parameters:

403

- query: SQL query string

404

- uri: Database URI connection string

405

406

Returns:

407

- DataFrame: Query results

408

"""

409

```

410

411

### Spreadsheet Operations

412

413

Read Excel and OpenDocument spreadsheet files.

414

415

```python { .api }

416

def read_excel(

417

source: str | Path | BinaryIO,

418

*,

419

sheet_id: int | None = None,

420

sheet_name: str | None = None,

421

engine: str | None = None,

422

engine_options: dict[str, Any] | None = None,

423

read_options: dict[str, Any] | None = None,

424

schema_overrides: dict[str, type] | None = None,

425

infer_schema_length: int | None = None,

426

raise_if_empty: bool = True

427

) -> DataFrame:

428

"""

429

Read Excel file into DataFrame.

430

431

Parameters:

432

- source: Excel file path or file-like object

433

- sheet_id: Sheet index to read

434

- sheet_name: Sheet name to read

435

- engine: Excel engine ('calamine', 'openpyxl', 'xlsx2csv')

436

- schema_overrides: Override inferred column types

437

438

Returns:

439

- DataFrame: Excel data

440

"""

441

442

def read_ods(

443

source: str | Path | BinaryIO,

444

*,

445

sheet_id: int | None = None,

446

sheet_name: str | None = None,

447

schema_overrides: dict[str, type] | None = None,

448

infer_schema_length: int | None = None,

449

raise_if_empty: bool = True

450

) -> DataFrame:

451

"""

452

Read OpenDocument Spreadsheet file into DataFrame.

453

454

Parameters:

455

- source: ODS file path or file-like object

456

- sheet_id: Sheet index to read

457

- sheet_name: Sheet name to read

458

459

Returns:

460

- DataFrame: ODS data

461

"""

462

```

463

464

### Other Formats

465

466

Support for additional data formats.

467

468

```python { .api }

469

def read_avro(

470

source: str | Path | BinaryIO,

471

*,

472

columns: list[int] | list[str] | None = None,

473

n_rows: int | None = None

474

) -> DataFrame:

475

"""

476

Read Apache Avro file into DataFrame.

477

478

Parameters:

479

- source: Avro file path or file-like object

480

- columns: Columns to select

481

- n_rows: Number of rows to read

482

483

Returns:

484

- DataFrame: Avro data

485

"""

486

487

def read_clipboard(*, separator: str = "\t", **kwargs) -> DataFrame:

488

"""

489

Read data from system clipboard.

490

491

Parameters:

492

- separator: Field separator

493

- **kwargs: Additional CSV parsing options

494

495

Returns:

496

- DataFrame: Clipboard data

497

"""

498

499

def read_delta(

500

source: str | Path,

501

*,

502

version: int | str | datetime | None = None,

503

columns: list[str] | None = None,

504

storage_options: dict[str, str] | None = None,

505

delta_table_options: dict[str, Any] | None = None,

506

pyarrow_options: dict[str, Any] | None = None

507

) -> DataFrame:

508

"""

509

Read Delta Lake table into DataFrame.

510

511

Parameters:

512

- source: Delta table path

513

- version: Table version to read

514

- columns: Columns to select

515

- storage_options: Cloud storage configuration

516

517

Returns:

518

- DataFrame: Delta table data

519

"""

520

521

def scan_delta(

522

source: str | Path,

523

*,

524

version: int | str | datetime | None = None,

525

storage_options: dict[str, str] | None = None,

526

delta_table_options: dict[str, Any] | None = None,

527

pyarrow_options: dict[str, Any] | None = None

528

) -> LazyFrame:

529

"""

530

Scan Delta Lake table for lazy processing.

531

532

Returns:

533

- LazyFrame: Lazy representation of Delta table

534

"""

535

```

536

537

### Cloud Storage Support

538

539

Integration with cloud storage providers and object stores.

540

541

```python { .api }

542

# Cloud credential providers

543

class CredentialProvider:

544

"""Base class for cloud credential providers"""

545

546

class CredentialProviderAWS:

547

def __init__(

548

self,

549

*,

550

access_key_id: str | None = None,

551

secret_access_key: str | None = None,

552

session_token: str | None = None,

553

region: str | None = None,

554

profile: str | None = None

555

):

556

"""

557

AWS credential provider.

558

559

Parameters:

560

- access_key_id: AWS access key

561

- secret_access_key: AWS secret key

562

- session_token: AWS session token

563

- region: AWS region

564

- profile: AWS CLI profile name

565

"""

566

567

class CredentialProviderAzure:

568

def __init__(

569

self,

570

*,

571

account_name: str | None = None,

572

account_key: str | None = None,

573

sas_token: str | None = None,

574

tenant_id: str | None = None,

575

client_id: str | None = None,

576

client_secret: str | None = None

577

):

578

"""

579

Azure credential provider.

580

581

Parameters:

582

- account_name: Storage account name

583

- account_key: Storage account key

584

- sas_token: Shared access signature token

585

"""

586

587

class CredentialProviderGCP:

588

def __init__(

589

self,

590

*,

591

service_account_path: str | None = None,

592

service_account_key: str | None = None,

593

project_id: str | None = None

594

):

595

"""

596

Google Cloud Platform credential provider.

597

598

Parameters:

599

- service_account_path: Path to service account JSON file

600

- service_account_key: Service account key JSON string

601

- project_id: GCP project ID

602

"""

603

604

class CredentialProviderFunction:

605

def __init__(self, func: Callable[[], dict[str, str]]):

606

"""

607

Function-based credential provider.

608

609

Parameters:

610

- func: Function returning credential dictionary

611

"""

612

613

# Cloud scanning

614

def scan_iceberg(

615

source: str,

616

*,

617

mode: str = "convert",

618

pyarrow_options: dict[str, Any] | None = None

619

) -> LazyFrame:

620

"""

621

Scan Apache Iceberg table for lazy processing.

622

623

Parameters:

624

- source: Iceberg table path or catalog reference

625

- mode: Scanning mode ('convert' or 'arrow')

626

627

Returns:

628

- LazyFrame: Lazy representation of Iceberg table

629

"""

630

631

def scan_pyarrow_dataset(

632

source: str | Path,

633

*,

634

schema: dict[str, type] | None = None,

635

allow_pyarrow_filter: bool = True,

636

cache: bool = True

637

) -> LazyFrame:

638

"""

639

Scan PyArrow dataset for lazy processing.

640

641

Parameters:

642

- source: Dataset path

643

- schema: Expected schema

644

- allow_pyarrow_filter: Enable PyArrow predicate pushdown

645

646

Returns:

647

- LazyFrame: Lazy representation of PyArrow dataset

648

"""

649

```

650

651

### Scan Configuration

652

653

Advanced configuration options for scanning operations.

654

655

```python { .api }

656

class ScanCastOptions:

657

def __init__(

658

self,

659

*,

660

cast_time_unit: str | None = None,

661

cast_string_strict: bool = True

662

):

663

"""

664

Options for type casting during scanning.

665

666

Parameters:

667

- cast_time_unit: Time unit for temporal casts

668

- cast_string_strict: Strict string casting

669

"""

670

671

# Partitioning classes

672

class PartitionByKey:

673

def __init__(self, by: str | list[str]):

674

"""Partition by column values."""

675

676

class PartitionMaxSize:

677

def __init__(self, max_size: int):

678

"""Partition by maximum size."""

679

680

class PartitionParted:

681

def __init__(self, n_partitions: int):

682

"""Partition into fixed number of parts."""

683

684

# Context classes for advanced partitioning

685

class BasePartitionContext:

686

"""Base partition context"""

687

688

class KeyedPartitionContext(BasePartitionContext):

689

def __init__(self, key: Any): ...

690

691

class KeyedPartition:

692

def __init__(self, key: Any, partition: DataFrame): ...

693

```

694

695

### Deferred I/O

696

697

Utilities for deferred I/O operations.

698

699

```python { .api }

700

def defer() -> Expr:

701

"""

702

Create deferred I/O expression for use in scan operations.

703

704

Returns:

705

- Expr: Deferred expression

706

"""

707

```

708

709

## Usage Examples

710

711

### CSV Operations

712

713

```python

714

import polars as pl

715

716

# Basic CSV reading

717

df = pl.read_csv("data.csv")

718

719

# CSV with custom options

720

df = pl.read_csv(

721

"data.csv",

722

separator=";",

723

has_header=True,

724

dtypes={"id": pl.Int32, "date": pl.Date},

725

null_values=["", "NULL", "N/A"]

726

)

727

728

# Lazy CSV scanning for large files

729

lazy_df = pl.scan_csv("large_file.csv").filter(pl.col("date") >= "2023-01-01")

730

result = lazy_df.collect()

731

732

# Batched reading for memory efficiency

733

reader = pl.read_csv_batched("huge_file.csv", batch_size=10000)

734

for batch in reader:

735

process_batch(batch)

736

```

737

738

### Parquet Operations

739

740

```python

741

# Read Parquet file

742

df = pl.read_parquet("data.parquet")

743

744

# Parquet with column selection

745

df = pl.read_parquet("data.parquet", columns=["id", "name", "value"])

746

747

# Lazy Parquet scanning with predicate pushdown

748

lazy_df = (

749

pl.scan_parquet("partitioned/*.parquet")

750

.filter(pl.col("year") == 2023)

751

.select(["id", "amount"])

752

)

753

result = lazy_df.collect()

754

755

# Read Parquet metadata

756

schema = pl.read_parquet_schema("data.parquet")

757

metadata = pl.read_parquet_metadata("data.parquet")

758

```

759

760

### Database Operations

761

762

```python

763

# Read from database

764

df = pl.read_database(

765

"SELECT * FROM customers WHERE active = true",

766

"postgresql://user:pass@localhost/db"

767

)

768

769

# Partitioned database reading

770

df = pl.read_database(

771

"SELECT * FROM large_table",

772

connection,

773

partition_on="id",

774

partition_num=4

775

)

776

777

# Using different protocols

778

df = pl.read_database(

779

"SELECT * FROM table",

780

connection,

781

protocol="adbc" # or "connectorx"

782

)

783

```

784

785

### Cloud Storage

786

787

```python

788

# AWS S3

789

aws_creds = pl.CredentialProviderAWS(

790

access_key_id="ACCESS_KEY",

791

secret_access_key="SECRET_KEY",

792

region="us-east-1"

793

)

794

795

df = pl.read_parquet(

796

"s3://bucket/data.parquet",

797

credential_provider=aws_creds

798

)

799

800

# Azure Blob Storage

801

azure_creds = pl.CredentialProviderAzure(

802

account_name="account",

803

account_key="key"

804

)

805

806

df = pl.read_csv(

807

"az://container/data.csv",

808

credential_provider=azure_creds

809

)

810

811

# Google Cloud Storage

812

gcp_creds = pl.CredentialProviderGCP(

813

service_account_path="service-account.json"

814

)

815

816

df = pl.scan_parquet(

817

"gs://bucket/partitioned/*.parquet",

818

credential_provider=gcp_creds

819

)

820

```

821

822

### Excel and Spreadsheets

823

824

```python

825

# Read Excel file

826

df = pl.read_excel("data.xlsx", sheet_name="Sheet1")

827

828

# Excel with specific engine

829

df = pl.read_excel(

830

"data.xlsx",

831

engine="openpyxl",

832

schema_overrides={"date": pl.Date}

833

)

834

835

# OpenDocument Spreadsheet

836

df = pl.read_ods("data.ods", sheet_id=0)

837

```

838

839

### JSON Operations

840

841

```python

842

# Read JSON

843

df = pl.read_json("data.json")

844

845

# Read NDJSON (newline-delimited JSON)

846

df = pl.read_ndjson("logs.jsonl")

847

848

# Lazy NDJSON scanning

849

lazy_df = pl.scan_ndjson("large_logs.jsonl").filter(

850

pl.col("timestamp") >= "2023-01-01"

851

)

852

```

853

854

### Delta Lake

855

856

```python

857

# Read Delta table

858

df = pl.read_delta("path/to/delta/table")

859

860

# Read specific version

861

df = pl.read_delta("delta/table", version=5)

862

863

# Lazy scanning with time travel

864

lazy_df = pl.scan_delta("delta/table", version="2023-01-01T00:00:00Z")

865

```

866

867

### Advanced Scanning

868

869

```python

870

# Scan with custom options

871

cast_options = pl.ScanCastOptions(

872

cast_time_unit="us",

873

cast_string_strict=False

874

)

875

876

lazy_df = pl.scan_csv(

877

"data.csv",

878

cast_options=cast_options

879

)

880

881

# Iceberg table scanning

882

lazy_df = pl.scan_iceberg("catalog.database.table")

883

884

# PyArrow dataset scanning

885

lazy_df = pl.scan_pyarrow_dataset("partitioned/dataset/")

886

```