or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

column-selection.md, configuration.md, core-data-structures.md, data-conversion.md, data-types.md, error-handling.md, functions-expressions.md, index.md, io-operations.md, sql-interface.md

io-operations.mddocs/

0

# Input/Output Operations

1

2

Comprehensive I/O support for 15+ file formats including CSV, Parquet, JSON, Excel, databases, and cloud storage with both eager reading and lazy scanning capabilities for optimal performance and memory usage.

3

4

## Capabilities

5

6

### CSV Operations

7

8

Read and scan CSV files with extensive customization options for delimiters, headers, data types, and parsing behavior.

9

10

```python { .api }

11

def read_csv(

12

source,

13

*,

14

has_header=True,

15

columns=None,

16

new_columns=None,

17

dtypes=None,

18

separator=",",

19

comment_prefix=None,

20

quote_char='"',

21

skip_rows=0,

22

skip_rows_after_header=0,

23

row_index_name=None,

24

row_index_offset=0,

25

sample_size=1024,

26

eol_char="\n",

27

raise_if_empty=True,

28

truncate_ragged_lines=False,

29

decimal_comma=False,

30

glob=True,

31

rechunk=False,

32

low_memory=False,

33

use_pyarrow=False,

34

storage_options=None,

35

credential_provider=None,

36

retries=2,

37

file_cache_ttl=None,

38

include_file_paths=None

39

) -> DataFrame:

40

"""

41

Read CSV file(s) into DataFrame.

42

43

Parameters:

44

- source: File path, URL, or file-like object

45

- has_header: Whether CSV has header row

46

- columns: Column subset to read

47

- new_columns: New column names

48

- dtypes: Column data types

49

- separator: Field delimiter

50

- comment_prefix: Comment line prefix

51

- quote_char: Quote character

52

- skip_rows: Rows to skip at start

53

- skip_rows_after_header: Rows to skip after header

54

- row_index_name: Add row index column

55

- row_index_offset: Row index starting value

56

- sample_size: Rows to sample for type inference

57

- eol_char: End-of-line character

58

- raise_if_empty: Raise error if empty

59

- truncate_ragged_lines: Handle ragged lines

60

- decimal_comma: Use comma as decimal separator

61

- glob: Enable glob patterns

62

- rechunk: Rechunk to contiguous memory

63

- low_memory: Use low memory mode

64

- use_pyarrow: Use PyArrow parser

65

- storage_options: Cloud storage options

66

- credential_provider: Cloud credentials

67

- retries: Number of retries

68

- file_cache_ttl: File cache TTL

69

- include_file_paths: Include file path column

70

71

Returns:

72

DataFrame with CSV data

73

"""

74

75

def scan_csv(

76

source,

77

*,

78

has_header=True,

79

separator=",",

80

comment_prefix=None,

81

quote_char='"',

82

skip_rows=0,

83

dtypes=None,

84

null_values=None,

85

missing_utf8_is_empty_string=False,

86

ignore_errors=False,

87

cache=True,

88

with_columns=None,

89

include_file_paths=None,

90

n_rows=None,

91

encoding="utf8",

92

low_memory=False,

93

rechunk=False,

94

skip_rows_after_header=0,

95

row_index_name=None,

96

row_index_offset=0,

97

sample_size=1024,

98

eol_char="\n",

99

raise_if_empty=True,

100

truncate_ragged_lines=False,

101

decimal_comma=False,

102

glob=True,

103

storage_options=None,

104

credential_provider=None,

105

retries=2,

106

file_cache_ttl=None

107

) -> LazyFrame:

108

"""

109

Scan CSV file(s) lazily into LazyFrame.

110

111

Parameters: Similar to read_csv

112

113

Returns:

114

LazyFrame for deferred CSV reading

115

"""

116

```

117

118

### Parquet Operations

119

120

Read and scan Parquet files with column selection, predicate pushdown, and parallel processing.

121

122

```python { .api }

123

def read_parquet(

124

source,

125

*,

126

columns=None,

127

n_rows=None,

128

parallel="auto",

129

row_index_name=None,

130

row_index_offset=0,

131

low_memory=False,

132

use_pyarrow=False,

133

storage_options=None,

134

credential_provider=None,

135

retries=2,

136

rechunk=False,

137

hive_partitioning=None,

138

hive_schema=None,

139

try_parse_hive_dates=True,

140

include_file_paths=None,

141

allow_missing_columns=False

142

) -> DataFrame:

143

"""

144

Read Parquet file(s) into DataFrame.

145

146

Parameters:

147

- source: File path, URL, or file-like object

148

- columns: Column subset to read

149

- n_rows: Number of rows to read

150

- parallel: Parallel reading mode

151

- row_index_name: Add row index column

152

- row_index_offset: Row index starting value

153

- low_memory: Use low memory mode

154

- use_pyarrow: Use PyArrow reader

155

- storage_options: Cloud storage options

156

- credential_provider: Cloud credentials

157

- retries: Number of retries

158

- rechunk: Rechunk to contiguous memory

159

- hive_partitioning: Enable Hive partitioning

160

- hive_schema: Hive partition schema

161

- try_parse_hive_dates: Parse Hive date partitions

162

- include_file_paths: Include file path column

163

- allow_missing_columns: Allow missing columns

164

165

Returns:

166

DataFrame with Parquet data

167

"""

168

169

def scan_parquet(

170

source,

171

*,

172

n_rows=None,

173

row_index_name=None,

174

row_index_offset=0,

175

parallel="auto",

176

glob=True,

177

rechunk=False,

178

low_memory=False,

179

cache=True,

180

storage_options=None,

181

credential_provider=None,

182

retries=2,

183

hive_partitioning=None,

184

hive_schema=None,

185

try_parse_hive_dates=True,

186

include_file_paths=None,

187

allow_missing_columns=False

188

) -> LazyFrame:

189

"""

190

Scan Parquet file(s) lazily into LazyFrame.

191

192

Parameters: Similar to read_parquet

193

194

Returns:

195

LazyFrame for deferred Parquet reading

196

"""

197

198

def read_parquet_schema(source) -> Schema:

199

"""

200

Read schema from Parquet file without loading data.

201

202

Parameters:

203

- source: Parquet file path or URL

204

205

Returns:

206

Schema of Parquet file

207

"""

208

209

def read_parquet_metadata(source):

210

"""

211

Read metadata from Parquet file.

212

213

Parameters:

214

- source: Parquet file path or URL

215

216

Returns:

217

Parquet file metadata

218

"""

219

```

220

221

### JSON Operations

222

223

Read JSON and newline-delimited JSON files with flexible schema inference and nested data handling.

224

225

```python { .api }

226

def read_json(

227

source,

228

*,

229

schema=None,

230

schema_overrides=None,

231

infer_schema_length=None,

232

batch_size=None,

233

n_rows=None,

234

row_index_name=None,

235

row_index_offset=0,

236

storage_options=None,

237

credential_provider=None,

238

retries=2

239

) -> DataFrame:

240

"""

241

Read JSON file into DataFrame.

242

243

Parameters:

244

- source: File path, URL, or file-like object

245

- schema: Column schema

246

- schema_overrides: Override specific column types

247

- infer_schema_length: Rows to scan for schema inference

248

- batch_size: Processing batch size

249

- n_rows: Number of rows to read

250

- row_index_name: Add row index column

251

- row_index_offset: Row index starting value

252

- storage_options: Cloud storage options

253

- credential_provider: Cloud credentials

254

- retries: Number of retries

255

256

Returns:

257

DataFrame with JSON data

258

"""

259

260

def read_ndjson(

261

source,

262

*,

263

schema=None,

264

schema_overrides=None,

265

infer_schema_length=None,

266

batch_size=None,

267

n_rows=None,

268

row_index_name=None,

269

row_index_offset=0,

270

ignore_errors=False,

271

storage_options=None,

272

credential_provider=None,

273

retries=2

274

) -> DataFrame:

275

"""

276

Read newline-delimited JSON file into DataFrame.

277

278

Parameters: Similar to read_json with additional:

279

- ignore_errors: Continue on parsing errors

280

281

Returns:

282

DataFrame with NDJSON data

283

"""

284

285

def scan_ndjson(

286

source,

287

*,

288

infer_schema_length=100,

289

batch_size=1024,

290

n_rows=None,

291

low_memory=False,

292

rechunk=False,

293

row_index_name=None,

294

row_index_offset=0,

295

ignore_errors=False,

296

schema=None,

297

schema_overrides=None,

298

include_file_paths=None,

299

retries=2

300

) -> LazyFrame:

301

"""

302

Scan NDJSON file(s) lazily into LazyFrame.

303

304

Parameters: Similar to read_ndjson

305

306

Returns:

307

LazyFrame for deferred NDJSON reading

308

"""

309

```

310

311

### Excel and ODS Operations

312

313

Read Excel and OpenDocument spreadsheet files with sheet selection and range specification.

314

315

```python { .api }

316

def read_excel(

317

source,

318

*,

319

sheet_id=None,

320

sheet_name=None,

321

engine=None,

322

engine_options=None,

323

read_options=None,

324

schema_overrides=None,

325

infer_schema_length=1000,

326

raise_if_empty=True

327

) -> DataFrame:

328

"""

329

Read Excel file into DataFrame.

330

331

Parameters:

332

- source: Excel file path or file-like object

333

- sheet_id: Sheet number (1-indexed; 0 loads all sheets)

334

- sheet_name: Sheet name

335

- engine: Excel engine ("calamine", "xlsx2csv", "openpyxl")

336

- engine_options: Engine-specific options

337

- read_options: Additional read options

338

- schema_overrides: Override column types

339

- infer_schema_length: Rows for schema inference

340

- raise_if_empty: Raise error if empty

341

342

Returns:

343

DataFrame with Excel data

344

"""

345

346

def read_ods(

347

source,

348

*,

349

sheet_id=None,

350

sheet_name=None,

351

schema_overrides=None,

352

infer_schema_length=1000,

353

raise_if_empty=True

354

) -> DataFrame:

355

"""

356

Read OpenDocument Spreadsheet into DataFrame.

357

358

Parameters:

359

- source: ODS file path or file-like object

360

- sheet_id: Sheet number (1-indexed; 0 loads all sheets)

361

- sheet_name: Sheet name

362

- schema_overrides: Override column types

363

- infer_schema_length: Rows for schema inference

364

- raise_if_empty: Raise error if empty

365

366

Returns:

367

DataFrame with ODS data

368

"""

369

```

370

371

### Arrow IPC Operations

372

373

Read and scan Arrow IPC files and streams for efficient data exchange.

374

375

```python { .api }

376

def read_ipc(

377

source,

378

*,

379

columns=None,

380

n_rows=None,

381

row_index_name=None,

382

row_index_offset=0,

383

rechunk=False,

384

memory_map=True,

385

storage_options=None,

386

credential_provider=None,

387

retries=2,

388

include_file_paths=None

389

) -> DataFrame:

390

"""

391

Read Arrow IPC file into DataFrame.

392

393

Parameters:

394

- source: IPC file path, URL, or file-like object

395

- columns: Column subset to read

396

- n_rows: Number of rows to read

397

- row_index_name: Add row index column

398

- row_index_offset: Row index starting value

399

- rechunk: Rechunk to contiguous memory

400

- memory_map: Use memory mapping

401

- storage_options: Cloud storage options

402

- credential_provider: Cloud credentials

403

- retries: Number of retries

404

- include_file_paths: Include file path column

405

406

Returns:

407

DataFrame with IPC data

408

"""

409

410

def read_ipc_stream(

411

source,

412

*,

413

columns=None,

414

n_rows=None,

415

row_index_name=None,

416

row_index_offset=0,

417

rechunk=False,

418

storage_options=None,

419

credential_provider=None,

420

retries=2,

421

include_file_paths=None

422

) -> DataFrame:

423

"""

424

Read Arrow IPC stream into DataFrame.

425

426

Parameters: Similar to read_ipc

427

428

Returns:

429

DataFrame with IPC stream data

430

"""

431

432

def scan_ipc(

433

source,

434

*,

435

n_rows=None,

436

cache=True,

437

rechunk=False,

438

row_index_name=None,

439

row_index_offset=0,

440

storage_options=None,

441

credential_provider=None,

442

retries=2,

443

include_file_paths=None,

444

memory_map=True

445

) -> LazyFrame:

446

"""

447

Scan Arrow IPC file(s) lazily into LazyFrame.

448

449

Parameters: Similar to read_ipc

450

451

Returns:

452

LazyFrame for deferred IPC reading

453

"""

454

455

def read_ipc_schema(source) -> Schema:

456

"""

457

Read schema from Arrow IPC file without loading data.

458

459

Parameters:

460

- source: IPC file path or URL

461

462

Returns:

463

Schema of IPC file

464

"""

465

```

466

467

### Database Operations

468

469

Read data from databases using SQL queries with connection management and credential handling.

470

471

```python { .api }

472

def read_database(

473

query,

474

connection,

475

*,

476

partition_on=None,

477

partition_range=None,

478

partition_num=None,

479

protocol=None,

480

engine=None,

481

schema_overrides=None,

482

execute_options=None

483

) -> DataFrame:

484

"""

485

Read from database using SQL query.

486

487

Parameters:

488

- query: SQL query string

489

- connection: Database connection or connection string

490

- partition_on: Column for partitioning

491

- partition_range: Range for partitioning

492

- partition_num: Number of partitions

493

- protocol: Database protocol

494

- engine: Database engine

495

- schema_overrides: Override column types

496

- execute_options: Execution options

497

498

Returns:

499

DataFrame with query results

500

"""

501

502

def read_database_uri(

503

query,

504

uri,

505

*,

506

partition_on=None,

507

partition_range=None,

508

partition_num=None,

509

protocol=None,

510

engine=None,

511

schema_overrides=None,

512

execute_options=None

513

) -> DataFrame:

514

"""

515

Read from database using connection URI.

516

517

Parameters:

518

- query: SQL query string

519

- uri: Database connection URI

520

- partition_on: Column for partitioning

521

- partition_range: Range for partitioning

522

- partition_num: Number of partitions

523

- protocol: Database protocol

524

- engine: Database engine

525

- schema_overrides: Override column types

526

- execute_options: Execution options

527

528

Returns:

529

DataFrame with query results

530

"""

531

```

532

533

### Cloud Storage and Credentials

534

535

Credential providers for accessing cloud storage services with authentication.

536

537

```python { .api }

538

class CredentialProvider:

539

"""Base class for credential providers."""

540

541

class CredentialProviderAWS(CredentialProvider):

542

def __init__(

543

self,

544

*,

545

access_key_id=None,

546

secret_access_key=None,

547

session_token=None,

548

region_name=None,

549

profile_name=None,

550

assume_role_arn=None,

551

assume_role_session_name=None,

552

assume_role_external_id=None

553

):

554

"""

555

AWS credential provider.

556

557

Parameters:

558

- access_key_id: AWS access key ID

559

- secret_access_key: AWS secret access key

560

- session_token: AWS session token

561

- region_name: AWS region

562

- profile_name: AWS profile name

563

- assume_role_arn: Role ARN to assume

564

- assume_role_session_name: Assume role session name

565

- assume_role_external_id: External ID for assume role

566

"""

567

568

class CredentialProviderGCP(CredentialProvider):

569

def __init__(

570

self,

571

*,

572

service_account_path=None,

573

service_account_key=None,

574

project_id=None

575

):

576

"""

577

Google Cloud credential provider.

578

579

Parameters:

580

- service_account_path: Path to service account key file

581

- service_account_key: Service account key JSON

582

- project_id: GCP project ID

583

"""

584

585

class CredentialProviderAzure(CredentialProvider):

586

def __init__(

587

self,

588

*,

589

account_name=None,

590

account_key=None,

591

sas_token=None,

592

tenant_id=None,

593

client_id=None,

594

client_secret=None

595

):

596

"""

597

Azure credential provider.

598

599

Parameters:

600

- account_name: Storage account name

601

- account_key: Storage account key

602

- sas_token: SAS token

603

- tenant_id: Azure tenant ID

604

- client_id: Azure client ID

605

- client_secret: Azure client secret

606

"""

607

608

class CredentialProviderFunction(CredentialProvider):

609

def __init__(self, function):

610

"""

611

Function-based credential provider.

612

613

Parameters:

614

- function: Function returning credentials

615

"""

616

617

# Type alias for function return

618

CredentialProviderFunctionReturn = dict[str, str]

619

```

620

621

### Specialized Formats

622

623

Support for additional formats including Avro, Delta Lake, and Iceberg.

624

625

```python { .api }

626

def read_avro(

627

source,

628

*,

629

columns=None,

630

n_rows=None,

631

storage_options=None,

632

credential_provider=None,

633

retries=2

634

) -> DataFrame:

635

"""

636

Read Avro file into DataFrame.

637

638

Parameters:

639

- source: Avro file path or URL

640

- columns: Column subset to read

641

- n_rows: Number of rows to read

642

- storage_options: Cloud storage options

643

- credential_provider: Cloud credentials

644

- retries: Number of retries

645

646

Returns:

647

DataFrame with Avro data

648

"""

649

650

def read_delta(

651

source,

652

*,

653

version=None,

654

columns=None,

655

storage_options=None,

656

credential_provider=None,

657

delta_table_options=None

658

) -> DataFrame:

659

"""

660

Read Delta Lake table into DataFrame.

661

662

Parameters:

663

- source: Delta table path or URL

664

- version: Table version to read

665

- columns: Column subset to read

666

- storage_options: Cloud storage options

667

- credential_provider: Cloud credentials

668

- delta_table_options: Delta-specific options

669

670

Returns:

671

DataFrame with Delta table data

672

"""

673

674

def scan_delta(

675

source,

676

*,

677

version=None,

678

storage_options=None,

679

credential_provider=None,

680

delta_table_options=None

681

) -> LazyFrame:

682

"""

683

Scan Delta Lake table lazily into LazyFrame.

684

685

Parameters: Similar to read_delta

686

687

Returns:

688

LazyFrame for deferred Delta reading

689

"""

690

691

def scan_iceberg(

692

source,

693

*,

694

storage_options=None,

695

credential_provider=None

696

) -> LazyFrame:

697

"""

698

Scan Apache Iceberg table lazily into LazyFrame.

699

700

Parameters:

701

- source: Iceberg table path or URL

702

- storage_options: Cloud storage options

703

- credential_provider: Cloud credentials

704

705

Returns:

706

LazyFrame for deferred Iceberg reading

707

"""

708

709

def scan_pyarrow_dataset(

710

source,

711

*,

712

allow_pyarrow_filter=True,

713

pyarrow_options=None

714

) -> LazyFrame:

715

"""

716

Scan PyArrow dataset lazily into LazyFrame.

717

718

Parameters:

719

- source: PyArrow dataset

720

- allow_pyarrow_filter: Enable PyArrow filtering

721

- pyarrow_options: PyArrow-specific options

722

723

Returns:

724

LazyFrame for deferred dataset reading

725

"""

726

```

727

728

### Utility Operations

729

730

Additional I/O utilities including clipboard support and deferred computation.

731

732

```python { .api }

733

def read_clipboard(**kwargs) -> DataFrame:

734

"""

735

Read data from system clipboard.

736

737

Parameters:

738

- kwargs: Additional options (passed to pandas)

739

740

Returns:

741

DataFrame with clipboard data

742

"""

743

744

def defer() -> Expr:

745

"""

746

Create deferred computation placeholder.

747

748

Returns:

749

Deferred expression

750

"""

751

```

752

753

## Usage Examples

754

755

### Basic File Operations

756

757

```python

758

import polars as pl

759

760

# Read CSV with custom options

761

df = pl.read_csv(

762

"data.csv",

763

has_header=True,

764

separator=",",

765

dtypes={"id": pl.Int32, "name": pl.String},

766

null_values=["", "NULL"]

767

)

768

769

# Read Parquet with column selection

770

df = pl.read_parquet(

771

"data.parquet",

772

columns=["id", "name", "value"],

773

n_rows=1000

774

)

775

776

# Read JSON with schema override

777

df = pl.read_json(

778

"data.json",

779

schema_overrides={"timestamp": pl.Datetime}

780

)

781

```

782

783

### Lazy I/O with Query Optimization

784

785

```python

786

# Lazy CSV scanning with predicate pushdown

787

lazy_df = (

788

pl.scan_csv("large_data.csv")

789

.filter(pl.col("amount") > 1000)

790

.select(["customer_id", "amount", "date"])

791

.group_by("customer_id")

792

.agg([

793

pl.col("amount").sum().alias("total_amount"),

794

pl.col("date").max().alias("last_date")

795

])

796

)

797

798

# Execute optimized query

799

result = lazy_df.collect()

800

801

# Lazy Parquet scanning with column selection

802

result = (

803

pl.scan_parquet("*.parquet")

804

.select(["id", "value"])

805

.filter(pl.col("value") > 0)

806

.collect()

807

)

808

```

809

810

### Cloud Storage Access

811

812

```python

813

# AWS S3 access with credentials

814

aws_creds = pl.CredentialProviderAWS(

815

access_key_id="YOUR_KEY",

816

secret_access_key="YOUR_SECRET",

817

region_name="us-east-1"

818

)

819

820

df = pl.read_parquet(

821

"s3://my-bucket/data.parquet",

822

credential_provider=aws_creds

823

)

824

825

# Google Cloud Storage

826

gcp_creds = pl.CredentialProviderGCP(

827

service_account_path="path/to/service-account.json"

828

)

829

830

df = pl.read_csv(

831

"gs://my-bucket/data.csv",

832

credential_provider=gcp_creds

833

)

834

```

835

836

### Database Operations

837

838

```python

839

# Read from database

840

df = pl.read_database(

841

"SELECT * FROM users WHERE age > 18",

842

connection="postgresql://user:pass@localhost/db"

843

)

844

845

# Partitioned database reading

846

df = pl.read_database(

847

"SELECT * FROM large_table",

848

"postgresql://user:pass@localhost/db",

849

partition_on="id",

850

partition_range=(1, 1000000),

851

partition_num=10

852

)

853

```

854

855

### Multiple File Formats

856

857

```python

858

# Read Excel with specific sheet

859

df = pl.read_excel(

860

"report.xlsx",

861

sheet_name="Summary",

862

schema_overrides={"date": pl.Date}

863

)

864

865

# Read Avro file

866

df = pl.read_avro("data.avro")

867

868

# Read Delta Lake table

869

df = pl.read_delta(

870

"path/to/delta/table",

871

version=5 # Read specific version

872

)

873

874

# Scan multiple Parquet files with glob

875

lazy_df = pl.scan_parquet("data/year=*/month=*/*.parquet")

876

result = lazy_df.collect()

877

```

878

879

### Advanced I/O Patterns

880

881

```python

882

# Streaming large files

883

reader = pl.read_csv_batched("very_large.csv", batch_size=10000)
while (batches := reader.next_batches(10)) is not None:
    for batch in batches:
        # Process each batch
        processed = batch.with_columns(
            pl.col("amount").mul(1.1).alias("amount_with_tax")
        )
        # Write or accumulate results

889

890

# Reading with error handling

891

df = pl.read_ndjson(

892

"messy_data.jsonl",

893

ignore_errors=True, # Skip malformed lines

894

infer_schema_length=1000

895

)

896

897

# Include file paths in multi-file reading

898

df = pl.read_csv(

899

"data/*.csv",

900

include_file_paths="source_file"

901

)

902

```