or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

config-utilities.md core-data-structures.md data-types.md expressions.md functions.md index.md io-operations.md selectors.md sql-interface.md

docs/io-operations.md

0

# I/O Operations

1

2

Comprehensive I/O capabilities supporting 10+ file formats with both eager reading and lazy scanning for performance optimization. Polars provides efficient data ingestion and export across various formats with advanced features like predicate pushdown and schema inference.

3

4

## Capabilities

5

6

### CSV Operations

7

8

Reading and scanning CSV files with extensive configuration options.

9

10

```python { .api }

11

def read_csv(

12

source: str | Path | IO[str] | IO[bytes] | bytes,

13

*,

14

has_header: bool = True,

15

columns: list[int] | list[str] | None = None,

16

new_columns: list[str] | None = None,

17

dtypes: dict[int | str, DataType] | Sequence[DataType] | None = None,

18

separator: str = ",",

19

comment_prefix: str | None = None,

20

quote_char: str | None = '"',

21

skip_rows: int = 0,

22

skip_rows_after_header: int = 0,

23

row_index_name: str | None = None,

24

row_index_offset: int = 0,

25

sample_size: int = 1024,

26

eol_char: str = "\n",

27

null_values: str | Sequence[str] | dict[str, str] | None = None,

28

missing_utf8_is_empty_string: bool = False,

29

ignore_errors: bool = False,

30

try_parse_dates: bool = False,

31

n_threads: int | None = None,

32

infer_schema_length: int | None = N_INFER_DEFAULT,

33

batch_size: int | None = None,

34

n_rows: int | None = None,

35

encoding: CsvEncoding = "utf8",

36

low_memory: bool = False,

37

rechunk: bool = False,

38

skip_blank_lines: bool = True,

39

raise_if_empty: bool = True,

40

truncate_ragged_lines: bool = False,

41

decimal_comma: bool = False,

42

glob: bool = True

43

) -> DataFrame:

44

"""

45

Read CSV file into DataFrame.

46

47

Parameters:

48

- source: File path, URL, or file-like object

49

- has_header: First row contains column names

50

- columns: Columns to select by index or name

51

- new_columns: Override column names

52

- dtypes: Column data types

53

- separator: Field delimiter

54

- comment_prefix: Comment line prefix to skip

55

- quote_char: Quote character for strings

56

- skip_rows: Number of rows to skip at start

57

- skip_rows_after_header: Rows to skip after header

58

- row_index_name: Add row index column with this name

59

- row_index_offset: Start value for row index

60

- sample_size: Rows to sample for type inference

61

- eol_char: End-of-line character

62

- null_values: Values to interpret as null

63

- missing_utf8_is_empty_string: Treat invalid UTF-8 as empty

64

- ignore_errors: Continue on parse errors

65

- try_parse_dates: Attempt date parsing

66

- n_threads: Number of threads for parsing

67

- infer_schema_length: Rows to scan for schema inference

68

- batch_size: Batch size for processing

69

- n_rows: Maximum rows to read

70

- encoding: Text encoding

71

- low_memory: Use less memory (slower)

72

- rechunk: Rechunk to single chunk

73

- skip_blank_lines: Skip empty lines

74

- raise_if_empty: Raise error if no data

75

- truncate_ragged_lines: Handle inconsistent columns

76

- decimal_comma: Use comma as decimal separator

77

- glob: Use glob patterns for multiple files

78

79

Returns:

80

DataFrame with CSV data

81

"""

82

83

def scan_csv(

84

source: str | Path | list[str] | list[Path],

85

**kwargs

86

) -> LazyFrame:

87

"""

88

Lazy scan CSV file(s) for optimized processing.

89

90

Parameters:

91

Similar to read_csv but returns LazyFrame for deferred execution

92

93

Returns:

94

LazyFrame for lazy evaluation

95

"""

96

```

97

98

### Parquet Operations

99

100

High-performance columnar format operations with advanced features.

101

102

```python { .api }

103

def read_parquet(

104

source: str | Path | IO[bytes] | bytes,

105

*,

106

columns: list[int] | list[str] | None = None,

107

n_rows: int | None = None,

108

row_index_name: str | None = None,

109

row_index_offset: int = 0,

110

parallel: ParallelStrategy = "auto",

111

use_statistics: bool = True,

112

hive_partitioning: bool | None = None,

113

glob: bool = True,

114

rechunk: bool = False,

115

low_memory: bool = False,

116

storage_options: dict[str, Any] | None = None,

117

credential_provider: CredentialProvider | None = None,

118

retries: int = 2,

119

file_cache_ttl: int | None = None

120

) -> DataFrame:

121

"""

122

Read Parquet file into DataFrame.

123

124

Parameters:

125

- source: File path, URL, or bytes

126

- columns: Columns to select

127

- n_rows: Maximum rows to read

128

- row_index_name: Add row index column

129

- row_index_offset: Row index start value

130

- parallel: Parallelization strategy

131

- use_statistics: Use Parquet statistics for optimization

132

- hive_partitioning: Enable Hive-style partitioning

133

- glob: Use glob patterns

134

- rechunk: Rechunk to single chunk

135

- low_memory: Use less memory

136

- storage_options: Cloud storage options

137

- credential_provider: Cloud credentials

138

- retries: Number of retry attempts

139

- file_cache_ttl: File cache time-to-live

140

141

Returns:

142

DataFrame with Parquet data

143

"""

144

145

def scan_parquet(

146

source: str | Path | list[str] | list[Path],

147

**kwargs

148

) -> LazyFrame:

149

"""Lazy scan Parquet file(s)."""

150

151

def read_parquet_metadata(source: str | Path | IO[bytes] | bytes) -> dict[str, Any]:

152

"""Read Parquet file metadata."""

153

154

def read_parquet_schema(source: str | Path | IO[bytes] | bytes) -> Schema:

155

"""Read Parquet file schema."""

156

```

157

158

### JSON Operations

159

160

JSON and newline-delimited JSON file operations.

161

162

```python { .api }

163

def read_json(

164

source: str | Path | IO[str] | IO[bytes] | bytes,

165

*,

166

schema: dict[str, DataType] | None = None,

167

schema_overrides: dict[str, DataType] | None = None,

168

infer_schema_length: int | None = N_INFER_DEFAULT

169

) -> DataFrame:

170

"""

171

Read JSON file into DataFrame.

172

173

Parameters:

174

- source: JSON file path or data

175

- schema: Expected schema

176

- schema_overrides: Override inferred types

177

- infer_schema_length: Rows for schema inference

178

179

Returns:

180

DataFrame with JSON data

181

"""

182

183

def read_ndjson(

184

source: str | Path | IO[str] | IO[bytes] | bytes,

185

*,

186

schema: dict[str, DataType] | None = None,

187

schema_overrides: dict[str, DataType] | None = None,

188

batch_size: int | None = None,

189

n_rows: int | None = None,

190

low_memory: bool = False,

191

rechunk: bool = False,

192

row_index_name: str | None = None,

193

row_index_offset: int = 0,

194

ignore_errors: bool = False

195

) -> DataFrame:

196

"""

197

Read newline-delimited JSON file.

198

199

Parameters:

200

- source: NDJSON file path or data

201

- schema: Expected schema

202

- schema_overrides: Override inferred types

203

- batch_size: Processing batch size

204

- n_rows: Maximum rows to read

205

- low_memory: Use less memory

206

- rechunk: Rechunk to single chunk

207

- row_index_name: Add row index column

208

- row_index_offset: Row index start value

209

- ignore_errors: Continue on parse errors

210

211

Returns:

212

DataFrame with NDJSON data

213

"""

214

215

def scan_ndjson(

216

source: str | Path | list[str] | list[Path],

217

**kwargs

218

) -> LazyFrame:

219

"""Lazy scan NDJSON file(s)."""

220

```

221

222

### Database Operations

223

224

Reading data from various databases using connection strings or objects.

225

226

```python { .api }

227

def read_database(

228

query: str,

229

connection: str | ConnectionOrCursor,

230

*,

231

partition_on: str | None = None,

232

partition_range: tuple[int, int] | None = None,

233

partition_num: int | None = None,

234

protocol: str | None = None,

235

engine: DbReadEngine | None = None,

236

schema_overrides: dict[str, DataType] | None = None,

237

execute_options: dict[str, Any] | None = None

238

) -> DataFrame:

239

"""

240

Read database query results into DataFrame.

241

242

Parameters:

243

- query: SQL query string

244

- connection: Database connection string or object

245

- partition_on: Column for partitioned reading

246

- partition_range: Range for partitioned reading

247

- partition_num: Number of partitions

248

- protocol: Database protocol

249

- engine: Database engine to use

250

- schema_overrides: Override inferred types

251

- execute_options: Additional execution options

252

253

Returns:

254

DataFrame with query results

255

"""

256

257

def read_database_uri(

258

query: str,

259

uri: str,

260

*,

261

partition_on: str | None = None,

262

partition_range: tuple[int, int] | None = None,

263

partition_num: int | None = None,

264

protocol: str | None = None,

265

engine: DbReadEngine | None = None,

266

schema_overrides: dict[str, DataType] | None = None

267

) -> DataFrame:

268

"""

269

Read from database using URI connection string.

270

271

Parameters:

272

- query: SQL query string

273

- uri: Database URI

274

- Other parameters: Same as read_database

275

276

Returns:

277

DataFrame with query results

278

"""

279

```

280

281

### IPC/Arrow Operations

282

283

Apache Arrow IPC format operations for efficient cross-language data exchange.

284

285

```python { .api }

286

def read_ipc(

287

source: str | Path | IO[bytes] | bytes,

288

*,

289

columns: list[int] | list[str] | None = None,

290

n_rows: int | None = None,

291

row_index_name: str | None = None,

292

row_index_offset: int = 0,

293

rechunk: bool = False,

294

memory_map: bool = True,

295

storage_options: dict[str, Any] | None = None,

296

credential_provider: CredentialProvider | None = None

297

) -> DataFrame:

298

"""

299

Read IPC/Arrow file into DataFrame.

300

301

Parameters:

302

- source: IPC file path or bytes

303

- columns: Columns to select

304

- n_rows: Maximum rows to read

305

- row_index_name: Add row index column

306

- row_index_offset: Row index start value

307

- rechunk: Rechunk to single chunk

308

- memory_map: Use memory mapping

309

- storage_options: Cloud storage options

310

- credential_provider: Cloud credentials

311

312

Returns:

313

DataFrame with IPC data

314

"""

315

316

def read_ipc_stream(

317

source: str | Path | IO[bytes] | bytes,

318

**kwargs

319

) -> DataFrame:

320

"""Read IPC stream format."""

321

322

def scan_ipc(

323

source: str | Path | list[str] | list[Path],

324

**kwargs

325

) -> LazyFrame:

326

"""Lazy scan IPC file(s)."""

327

328

def read_ipc_schema(source: str | Path | IO[bytes] | bytes) -> Schema:

329

"""Read IPC file schema."""

330

```

331

332

### Excel Operations

333

334

Reading Excel and OpenDocument spreadsheet files.

335

336

```python { .api }

337

def read_excel(

338

source: str | Path | IO[bytes] | bytes,

339

*,

340

sheet_id: int | Sequence[int] | None = None,

341

sheet_name: str | list[str] | None = None,

342

engine: ExcelSpreadsheetEngine | None = None,

343

engine_options: dict[str, Any] | None = None,

344

read_options: dict[str, Any] | None = None,

345

schema_overrides: dict[str, DataType] | None = None,

346

infer_schema_length: int | None = N_INFER_DEFAULT,

347

raise_if_empty: bool = True

348

) -> DataFrame | dict[str, DataFrame]:

349

"""

350

Read Excel file into DataFrame.

351

352

Parameters:

353

- source: Excel file path or bytes

354

- sheet_id: Sheet index(es) to read

355

- sheet_name: Sheet name(s) to read

356

- engine: Excel engine to use

357

- engine_options: Engine-specific options

358

- read_options: Reading options

359

- schema_overrides: Override inferred types

360

- infer_schema_length: Rows for schema inference

361

- raise_if_empty: Raise error if no data

362

363

Returns:

364

DataFrame or dict of DataFrames (if multiple sheets)

365

"""

366

367

def read_ods(

368

source: str | Path | IO[bytes] | bytes,

369

**kwargs

370

) -> DataFrame | dict[str, DataFrame]:

371

"""Read OpenDocument Spreadsheet file."""

372

```

373

374

### Cloud and Advanced I/O

375

376

Cloud storage integration and advanced I/O features.

377

378

```python { .api }

379

def read_avro(

380

source: str | Path | IO[bytes] | bytes,

381

*,

382

columns: list[int] | list[str] | None = None,

383

n_rows: int | None = None

384

) -> DataFrame:

385

"""Read Apache Avro file."""

386

387

def read_clipboard(**kwargs) -> DataFrame:

388

"""Read data from system clipboard."""

389

390

def scan_iceberg(

391

source: str,

392

**kwargs

393

) -> LazyFrame:

394

"""Lazy scan Apache Iceberg table."""

395

396

def scan_delta(

397

source: str,

398

*,

399

version: int | str | None = None,

400

storage_options: dict[str, str] | None = None,

401

delta_table_options: dict[str, Any] | None = None

402

) -> LazyFrame:

403

"""

404

Lazy scan Delta Lake table.

405

406

Parameters:

407

- source: Delta table path

408

- version: Table version to read

409

- storage_options: Cloud storage options

410

- delta_table_options: Delta table options

411

412

Returns:

413

LazyFrame for Delta table

414

"""

415

416

def read_delta(

417

source: str,

418

**kwargs

419

) -> DataFrame:

420

"""Read Delta Lake table."""

421

422

def scan_pyarrow_dataset(

423

source: str | Path,

424

**kwargs

425

) -> LazyFrame:

426

"""Lazy scan PyArrow dataset."""

427

```

428

429

### Partitioning and Scan Options

430

431

Advanced partitioning strategies and scan configuration.

432

433

```python { .api }

434

class ScanCastOptions:

435

"""Options for casting during scan operations."""

436

def __init__(

437

self,

438

*,

439

enabled: bool = True,

440

dtypes: dict[str, DataType] | None = None,

441

strict: bool = True

442

):

443

"""

444

Configure scan casting.

445

446

Parameters:

447

- enabled: Enable automatic casting

448

- dtypes: Target data types

449

- strict: Strict casting mode

450

"""

451

452

class BasePartitionContext:

453

"""Base class for partition contexts."""

454

455

class KeyedPartitionContext(BasePartitionContext):

456

"""Partition context with key-based partitioning."""

457

458

class KeyedPartition:

459

"""Partition information for keyed partitioning."""

460

def __init__(self, key: Any, df: DataFrame):

461

"""

462

Create keyed partition.

463

464

Parameters:

465

- key: Partition key

466

- df: Partition DataFrame

467

"""

468

469

class PartitionByKey:

470

"""Partition strategy based on column values."""

471

def __init__(self, by: str | list[str]):

472

"""

473

Partition by column key(s).

474

475

Parameters:

476

- by: Column name(s) for partitioning

477

"""

478

479

class PartitionMaxSize:

480

"""Partition strategy based on maximum size."""

481

def __init__(self, max_size: int):

482

"""

483

Partition by maximum size.

484

485

Parameters:

486

- max_size: Maximum partition size

487

"""

488

489

class PartitionParted:

490

"""Information about partitioned data."""

491

```

492

493

### Cloud Credential Providers

494

495

Authentication for cloud storage access.

496

497

```python { .api }

498

class CredentialProvider:

499

"""Base credential provider."""

500

501

class CredentialProviderAWS(CredentialProvider):

502

"""AWS credential provider."""

503

def __init__(

504

self,

505

*,

506

access_key_id: str | None = None,

507

secret_access_key: str | None = None,

508

session_token: str | None = None,

509

region: str | None = None,

510

profile: str | None = None

511

):

512

"""

513

AWS credentials.

514

515

Parameters:

516

- access_key_id: AWS access key

517

- secret_access_key: AWS secret key

518

- session_token: AWS session token

519

- region: AWS region

520

- profile: AWS profile name

521

"""

522

523

class CredentialProviderAzure(CredentialProvider):

524

"""Azure credential provider."""

525

526

class CredentialProviderGCP(CredentialProvider):

527

"""Google Cloud credential provider."""

528

529

class CredentialProviderFunction(CredentialProvider):

530

"""Function-based credential provider."""

531

def __init__(self, func: Callable[[], CredentialProviderFunctionReturn]):

532

"""

533

Function-based credentials.

534

535

Parameters:

536

- func: Function returning credentials

537

"""

538

539

class CredentialProviderFunctionReturn:

540

"""Return type for credential function."""

541

```

542

543

## Usage Examples

544

545

### Basic File Reading

546

547

```python

548

import polars as pl

549

550

# Read CSV with automatic type inference

551

df = pl.read_csv("data.csv")

552

553

# Read with specific options

554

df = pl.read_csv(

555

"data.csv",

556

separator=";",

557

null_values=["", "NULL", "N/A"],

558

try_parse_dates=True,

559

infer_schema_length=1000

560

)

561

562

# Read specific columns

563

df = pl.read_csv("data.csv", columns=["name", "age", "salary"])

564

```

565

566

### Lazy Scanning for Large Files

567

568

```python

569

# Lazy scan for memory efficiency

570

lazy_df = (pl

571

.scan_csv("large_file.csv")

572

.filter(pl.col("amount") > 1000)

573

.select(["customer_id", "amount", "date"])

574

.group_by("customer_id")

575

.agg([

576

pl.col("amount").sum(),

577

pl.col("date").max()

578

])

579

)

580

581

# Execute when ready

582

result = lazy_df.collect()

583

```

584

585

### Working with Multiple Files

586

587

```python

588

# Read multiple CSV files at once

589

df = pl.read_csv("data_*.csv", glob=True)

590

591

# Scan multiple Parquet files

592

lazy_df = pl.scan_parquet(["file1.parquet", "file2.parquet", "file3.parquet"])

593

```

594

595

### Database Integration

596

597

```python

598

# Read from database

599

df = pl.read_database(

600

"SELECT * FROM customers WHERE age > 25",

601

"postgresql://user:pass@localhost:5432/db"

602

)

603

604

# Partitioned database reading for large tables

605

df = pl.read_database(

606

"SELECT * FROM large_table",

607

"postgresql://user:pass@localhost:5432/db",

608

partition_on="id",

609

partition_num=4

610

)

611

```

612

613

### Cloud Storage Access

614

615

```python

616

# Read from S3 with credentials

617

df = pl.read_parquet(

618

"s3://bucket/data.parquet",

619

credential_provider=pl.CredentialProviderAWS(

620

access_key_id="key",

621

secret_access_key="secret",

622

region="us-east-1"

623

)

624

)

625

626

# Read from Azure Blob Storage

627

df = pl.read_csv(

628

"az://container/data.csv",

629

credential_provider=pl.CredentialProviderAzure()

630

)

631

```

632

633

### Advanced Excel Reading

634

635

```python

636

# Read specific Excel sheet

637

df = pl.read_excel("report.xlsx", sheet_name="Summary")

638

639

# Read multiple sheets

640

sheets = pl.read_excel("report.xlsx", sheet_name=["Summary", "Details"])

641

summary_df = sheets["Summary"]

642

details_df = sheets["Details"]

643

644

# Excel with custom options

645

df = pl.read_excel(

646

"data.xlsx",

647

engine="openpyxl",

648

read_options={

649

"has_header": True,

650

"skip_rows": 2

651

},

652

schema_overrides={

653

"date": pl.Date,

654

"amount": pl.Decimal(10, 2)

655

}

656

)

657

```

658

659

### Data Export

660

661

```python

662

# DataFrame write methods

663

df.write_csv("output.csv")

664

df.write_parquet("output.parquet")

665

df.write_json("output.json")

666

df.write_ipc("output.arrow")

667

668

# LazyFrame collect and write

669

lazy_df.collect().write_parquet("result.parquet")

670

671

# Write with options

672

df.write_csv(

673

"output.csv",

674

separator="|",

675

quote_char="'",

676

null_value="NULL"

677

)

678

```

679

680

### Schema Management

681

682

```python

683

# Define schema for consistent reading

684

schema = pl.Schema({

685

"id": pl.Int32,

686

"name": pl.String,

687

"amount": pl.Decimal(10, 2),

688

"timestamp": pl.Datetime("us", "UTC")

689

})

690

691

df = pl.read_csv("data.csv", schema=schema)

692

693

# Override specific column types

694

df = pl.read_csv(

695

"data.csv",

696

schema_overrides={

697

"customer_id": pl.String, # Keep as string

698

"amount": pl.Decimal(12, 4) # Higher precision

699

}

700

)

701

```