# File Format Support

Native support for reading and writing multiple file formats, including Parquet, CSV, JSON, Feather, and ORC, with high-performance I/O and configurable options for compression, encoding, metadata handling, and integration with cloud storage systems.

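For instance, the `filesystem` parameters accepted by the readers and writers below let the same calls target object storage directly. A minimal sketch, assuming S3-enabled PyArrow, valid AWS credentials, and an illustrative bucket name and region:

```python
import pyarrow.parquet as pq
from pyarrow import fs

# Illustrative bucket, path, and region; assumes credentials are available in the environment
s3 = fs.S3FileSystem(region="us-east-1")

# Read and write Parquet against S3 by passing an explicit filesystem
table = pq.read_table("example-bucket/data/input.parquet", filesystem=s3)
pq.write_table(table, "example-bucket/data/output.parquet", filesystem=s3)
```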

## Capabilities

### Parquet Format

High-performance columnar storage format with advanced features including compression, encoding, statistics, and schema evolution support.

```python { .api }
# Main I/O functions
def read_table(source, columns=None, use_threads=True, metadata=None, schema=None, use_pandas_metadata=False, read_dictionary=None, memory_map=False, buffer_size=None, partitioning=None, filesystem=None, filters=None, use_legacy_dataset=None, ignore_prefixes=None, pre_buffer=None, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, thrift_container_size_limit=None):
    """
    Read Parquet file as Arrow Table.

    Parameters:
    - source: str or file-like, path or file object
    - columns: list of str, columns to read
    - use_threads: bool, use multiple threads
    - metadata: FileMetaData, pre-loaded metadata
    - schema: Schema, expected schema
    - use_pandas_metadata: bool, use pandas metadata
    - read_dictionary: list, columns to dictionary encode
    - memory_map: bool, use memory mapping
    - buffer_size: int, read buffer size
    - partitioning: Partitioning, dataset partitioning
    - filesystem: FileSystem, filesystem to use
    - filters: list, row filters
    - use_legacy_dataset: bool, use legacy dataset API
    - ignore_prefixes: list, prefixes to ignore
    - pre_buffer: bool, pre-buffer columns
    - coerce_int96_timestamp_unit: str, int96 timestamp unit
    - thrift_string_size_limit: int, thrift string size limit
    - thrift_container_size_limit: int, thrift container size limit

    Returns:
    Table: Arrow table with data from Parquet file
    """

def write_table(table, where, row_group_size=None, version='2.6', use_dictionary=None, compression='snappy', write_statistics=None, use_deprecated_int96_timestamps=None, coerce_timestamps=None, allow_truncated_timestamps=False, data_page_size=None, data_page_version='1.0', compression_level=None, use_byte_stream_split=None, column_encoding=None, data_encoding=None, use_compliant_nested_type=None, encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, store_schema=None, write_page_index=None, write_page_checksum=None, sorting_columns=None, filesystem=None, metadata_collector=None):
    """
    Write Arrow Table to Parquet file.

    Parameters:
    - table: Table, Arrow table to write
    - where: str or file-like, output path or file
    - row_group_size: int, maximum rows per row group
    - version: str, Parquet format version
    - use_dictionary: bool or list, dictionary encoding
    - compression: str or dict, compression codec
    - write_statistics: bool or list, write column statistics
    - use_deprecated_int96_timestamps: bool, use int96 for timestamps
    - coerce_timestamps: str, timestamp coercion unit
    - allow_truncated_timestamps: bool, allow timestamp truncation
    - data_page_size: int, target data page size
    - data_page_version: str, data page version
    - compression_level: int, compression level
    - use_byte_stream_split: bool or list, byte stream split encoding
    - column_encoding: dict, column encoding options
    - data_encoding: dict, data encoding options
    - use_compliant_nested_type: bool, compliant nested type naming
    - encryption_properties: FileEncryptionProperties, encryption settings
    - write_batch_size: int, write batch size
    - dictionary_pagesize_limit: int, dictionary page size limit
    - store_schema: bool, store schema in metadata
    - write_page_index: bool, write page index
    - write_page_checksum: bool, write page checksums
    - sorting_columns: list, column sorting information
    - filesystem: FileSystem, filesystem to use
    - metadata_collector: list, collect metadata
    """

def read_pandas(source, columns=None, **kwargs):
    """Read Parquet file optimized for pandas DataFrame."""

def read_schema(where, memory_map=False, metadata=None, filesystem=None):
    """
    Read schema from Parquet file.

    Parameters:
    - where: str or file-like, path or file object
    - memory_map: bool, use memory mapping
    - metadata: FileMetaData, pre-loaded metadata
    - filesystem: FileSystem, filesystem to use

    Returns:
    Schema: Arrow schema from Parquet file
    """

def read_metadata(where, memory_map=False, decryption_properties=None, filesystem=None):
    """
    Read metadata from Parquet file.

    Parameters:
    - where: str or file-like, path or file object
    - memory_map: bool, use memory mapping
    - decryption_properties: FileDecryptionProperties, decryption settings
    - filesystem: FileSystem, filesystem to use

    Returns:
    FileMetaData: Parquet file metadata
    """

class ParquetFile:
    """
    Interface for reading Parquet files.

    Attributes:
    - metadata: FileMetaData object
    - schema: Arrow schema
    - schema_arrow: Arrow schema (alias)
    - num_row_groups: Number of row groups
    """

    def __init__(self, source, metadata=None, common_metadata=None, read_dictionary=None, memory_map=False, buffer_size=None, pre_buffer=None, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None): ...

    def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
        """Read entire file as Table."""

    def read_row_group(self, i, columns=None, use_threads=True, use_pandas_metadata=False):
        """Read specific row group."""

    def read_row_groups(self, row_groups, columns=None, use_threads=True, use_pandas_metadata=False):
        """Read multiple row groups."""

    def iter_batches(self, batch_size=1024, row_groups=None, columns=None, use_threads=True, use_pandas_metadata=False):
        """Iterate over record batches."""

    def scan_contents(self, columns=None, batch_size=1024):
        """Scan file contents."""

class ParquetWriter:
    """
    Writer for Parquet files.
    """

    def __init__(self, where, schema, filesystem=None, **kwargs): ...

    def write_batch(self, batch, row_group_size=None):
        """Write record batch."""

    def write_table(self, table, row_group_size=None):
        """Write table."""

    def close(self):
        """Close writer and finalize file."""

# Metadata classes
class FileMetaData:
    """
    Parquet file metadata.

    Attributes:
    - created_by: Creator information
    - format_version: Parquet format version
    - metadata: Key-value metadata
    - num_columns: Number of columns
    - num_row_groups: Number of row groups
    - num_rows: Total number of rows
    - schema: Parquet schema
    - serialized_size: Serialized metadata size
    """

    def row_group(self, i):
        """Get row group metadata."""

    def to_dict(self):
        """Convert to dictionary."""

class RowGroupMetaData:
    """
    Row group metadata.

    Attributes:
    - num_columns: Number of columns in row group
    - num_rows: Number of rows in row group
    - total_byte_size: Total byte size
    """

    def column(self, i):
        """Get column chunk metadata."""

class ColumnChunkMetaData:
    """
    Column chunk metadata.

    Attributes:
    - column_path: Column path in schema
    - compression: Compression codec
    - data_page_offset: Data page offset
    - dictionary_page_offset: Dictionary page offset
    - encodings: List of encodings used
    - file_offset: File offset
    - file_path: File path (for external columns)
    - has_dictionary_page: Whether a dictionary page is present
    - index_page_offset: Index page offset
    - num_values: Number of values
    - physical_type: Physical storage type
    - statistics: Column statistics
    - total_compressed_size: Compressed size
    - total_uncompressed_size: Uncompressed size
    """

    def to_dict(self):
        """Convert to dictionary."""

class ParquetSchema:
    """
    Parquet schema representation.

    Attributes:
    - names: Column names
    - pandas_metadata: Pandas metadata
    """

    def column(self, i):
        """Get column schema."""

    def to_arrow_schema(self):
        """Convert to Arrow schema."""

# Encryption support
class FileEncryptionProperties:
    """File-level encryption properties."""

class FileDecryptionProperties:
    """File-level decryption properties."""
```
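The `ParquetWriter` class above can build a file incrementally instead of writing one table in a single call. A minimal sketch (the file name and chunking are illustrative):

```python
import pyarrow as pa
import pyarrow.parquet as pq

schema = pa.schema([('id', pa.int64()), ('value', pa.float64())])

# Append several tables to one Parquet file, then finalize it with close()
writer = pq.ParquetWriter('incremental.parquet', schema)
for start in (0, 5):
    chunk = pa.table({
        'id': list(range(start, start + 5)),
        'value': [float(i) for i in range(start, start + 5)]
    }, schema=schema)
    writer.write_table(chunk)
writer.close()
```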

### CSV Format

Flexible CSV reading and writing with extensive parsing options, type inference, and error handling capabilities.

```python { .api }
def read_csv(input_file, read_options=None, parse_options=None, convert_options=None):
    """
    Read CSV file as Arrow Table.

    Parameters:
    - input_file: str or file-like, CSV file to read
    - read_options: ReadOptions, reading configuration
    - parse_options: ParseOptions, parsing configuration
    - convert_options: ConvertOptions, conversion configuration

    Returns:
    Table: Arrow table with CSV data
    """

def write_csv(data, output_file, write_options=None):
    """
    Write Table to CSV file.

    Parameters:
    - data: Table or RecordBatch, data to write
    - output_file: str or file-like, output CSV file
    - write_options: WriteOptions, writing configuration
    """

def open_csv(input_file, read_options=None, parse_options=None, convert_options=None):
    """
    Open CSV file for streaming.

    Parameters:
    - input_file: str or file-like, CSV file to open
    - read_options: ReadOptions, reading configuration
    - parse_options: ParseOptions, parsing configuration
    - convert_options: ConvertOptions, conversion configuration

    Returns:
    CSVStreamingReader: Streaming CSV reader
    """

class ReadOptions:
    """
    CSV reading options.

    Attributes:
    - use_threads: Whether to use multiple threads
    - block_size: Block size for reading
    - skip_rows: Number of rows to skip at start
    - skip_rows_after_names: Rows to skip after header
    - column_names: Explicit column names
    - autogenerate_column_names: Auto-generate column names
    - encoding: Character encoding (default: utf8)
    """

class ParseOptions:
    """
    CSV parsing options.

    Attributes:
    - delimiter: Field delimiter character
    - quote_char: Quote character
    - double_quote: Whether quotes are doubled for escaping
    - escape_char: Escape character
    - newlines_in_values: Allow newlines in values
    - ignore_empty_lines: Skip empty lines
    """

class ConvertOptions:
    """
    CSV type conversion options.

    Attributes:
    - check_utf8: Validate UTF-8 encoding
    - column_types: Explicit column types (dict)
    - null_values: Values to treat as null
    - true_values: Values to treat as True
    - false_values: Values to treat as False
    - decimal_point: Decimal point character
    - strings_can_be_null: Whether strings can be null
    - quoted_strings_can_be_null: Whether quoted strings can be null
    - auto_dict_encode: Auto dictionary-encode string columns
    - auto_dict_max_cardinality: Max cardinality for auto dict encoding
    - include_columns: Columns to include
    - include_missing_columns: Include missing columns as null
    - timestamp_parsers: Custom timestamp parsers
    """

class WriteOptions:
    """
    CSV writing options.

    Attributes:
    - include_header: Include column names as header
    - batch_size: Batch size for writing
    - delimiter: Field delimiter
    - quoting_style: When to quote fields
    """

class CSVStreamingReader:
    """
    Streaming CSV reader for large files.
    """

    def __iter__(self): ...

    def read_next_batch(self):
        """Read next batch of records."""

    def schema(self):
        """Get schema of CSV data."""

class CSVWriter:
    """CSV writer with configurable options."""

    def __init__(self, sink, schema, write_options=None): ...

    def write_batch(self, batch):
        """Write record batch."""

    def write_table(self, table):
        """Write table."""

    def close(self):
        """Close writer."""

class InvalidRow:
    """Information about invalid rows during parsing."""

ISO8601 = ...  # ISO8601 timestamp parsing constant
```
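`CSVWriter` supports the same incremental pattern for CSV output. A minimal sketch (the schema, file name, and batches are illustrative):

```python
import pyarrow as pa
import pyarrow.csv as csv

schema = pa.schema([('id', pa.int64()), ('name', pa.string())])

# Stream record batches into a CSV file without materializing one large table
writer = csv.CSVWriter('incremental.csv', schema)
for start in (0, 3):
    batch = pa.record_batch(
        [pa.array(range(start, start + 3)), pa.array([f'row-{i}' for i in range(start, start + 3)])],
        schema=schema
    )
    writer.write_batch(batch)
writer.close()
```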

### JSON Format

Line-delimited JSON reading with schema inference and flexible parsing options for semi-structured data.

```python { .api }
def read_json(input_file, read_options=None, parse_options=None):
    """
    Read line-delimited JSON file as Arrow Table.

    Parameters:
    - input_file: str or file-like, JSON file to read
    - read_options: ReadOptions, reading configuration
    - parse_options: ParseOptions, parsing configuration

    Returns:
    Table: Arrow table with JSON data
    """

def open_json(input_file, read_options=None, parse_options=None):
    """
    Open JSON file for streaming.

    Parameters:
    - input_file: str or file-like, JSON file to open
    - read_options: ReadOptions, reading configuration
    - parse_options: ParseOptions, parsing configuration

    Returns:
    Iterator: Streaming JSON reader
    """

class ReadOptions:
    """
    JSON reading options.

    Attributes:
    - use_threads: Whether to use multiple threads
    - block_size: Block size for reading
    - schema: Explicit schema
    """

class ParseOptions:
    """
    JSON parsing options.

    Attributes:
    - newlines_in_values: Allow newlines in string values
    - explicit_schema: Use explicit schema
    - unexpected_field_behavior: How to handle unexpected fields
    """
```
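A short usage sketch for the JSON reader; the file name and records are illustrative, and the input is assumed to be line-delimited JSON (one object per line):

```python
import pyarrow as pa
import pyarrow.json as pj

# events.jsonl is assumed to contain lines such as: {"id": 1, "tags": ["a", "b"]}
table = pj.read_json('events.jsonl')
print(table.schema)

# Pin the schema up front instead of relying on type inference
opts = pj.ParseOptions(
    explicit_schema=pa.schema([('id', pa.int64()), ('tags', pa.list_(pa.string()))])
)
table = pj.read_json('events.jsonl', parse_options=opts)
```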

### Feather Format

Fast, language-agnostic columnar serialization format optimized for data interchange and temporary storage.

```python { .api }
def read_table(source, columns=None, use_threads=True, memory_map=False):
    """
    Read Feather file as Arrow Table.

    Parameters:
    - source: str or file-like, Feather file to read
    - columns: list of str, columns to read
    - use_threads: bool, use multiple threads
    - memory_map: bool, use memory mapping

    Returns:
    Table: Arrow table with Feather data
    """

def read_feather(source, columns=None, use_threads=True, memory_map=False):
    """Read Feather file (pandas compatibility)."""

def write_feather(df, dest, compression=None, compression_level=None, chunksize=None, version=None):
    """
    Write Table to Feather file.

    Parameters:
    - df: Table or pandas DataFrame, data to write
    - dest: str or file-like, output Feather file
    - compression: str, compression codec
    - compression_level: int, compression level
    - chunksize: int, maximum rows per chunk
    - version: int, Feather format version
    """

class FeatherDataset:
    """Multi-file Feather dataset interface."""

class FeatherError(Exception):
    """Feather format-specific errors."""
```
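A small sketch of a compressed Feather round trip (the file name is illustrative; version 2 files support `lz4` and `zstd` compression):

```python
import pyarrow as pa
import pyarrow.feather as feather

table = pa.table({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})

# Write with zstd compression and read the file back
feather.write_feather(table, 'compressed.feather', compression='zstd', compression_level=3)
roundtrip = feather.read_table('compressed.feather')
print(roundtrip.equals(table))
```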

### ORC Format

Optimized Row Columnar format with advanced compression and indexing for big data processing.

```python { .api }
def read_table(source, columns=None, use_threads=True, memory_map=False):
    """
    Read ORC file as Arrow Table.

    Parameters:
    - source: str or file-like, ORC file to read
    - columns: list of str, columns to read
    - use_threads: bool, use multiple threads
    - memory_map: bool, use memory mapping

    Returns:
    Table: Arrow table with ORC data
    """

def write_table(table, where, file_version='0.12', batch_size=1024, stripe_size=67108864, compression='ZLIB', compression_block_size=65536, compression_strategy='speed', row_index_stride=10000, padding_tolerance=0.0, dictionary_key_size_threshold=0.0, bloom_filter_columns=None, bloom_filter_fpp=0.05):
    """
    Write Arrow Table to ORC file.

    Parameters:
    - table: Table, Arrow table to write
    - where: str or file-like, output ORC file
    - file_version: str, ORC file format version
    - batch_size: int, batch size for writing
    - stripe_size: int, target stripe size in bytes
    - compression: str, compression codec
    - compression_block_size: int, compression block size
    - compression_strategy: str, compression strategy
    - row_index_stride: int, row index stride
    - padding_tolerance: float, padding tolerance
    - dictionary_key_size_threshold: float, dictionary encoding threshold
    - bloom_filter_columns: list, columns for bloom filters
    - bloom_filter_fpp: float, bloom filter false positive probability
    """

class ORCFile:
    """
    ORC file reader interface.

    Attributes:
    - metadata: ORC file metadata
    - schema: Arrow schema
    - nrows: Number of rows
    - nstripes: Number of stripes
    """

    def __init__(self, source, memory_map=False): ...

    def read(self, columns=None, use_threads=True):
        """Read entire file as Table."""

    def read_stripe(self, n, columns=None):
        """Read specific stripe."""
```
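The `ORCFile` reader above can also inspect a file and read it stripe by stripe. A minimal sketch (the file name is illustrative):

```python
import pyarrow.orc as orc

orc_file = orc.ORCFile('data.orc')
print(orc_file.schema)
print(f"{orc_file.nrows} rows in {orc_file.nstripes} stripe(s)")

# Read one stripe at a time instead of the whole file
for i in range(orc_file.nstripes):
    stripe = orc_file.read_stripe(i)
    print(stripe.num_rows)
```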

## Usage Examples

### Working with Parquet Files

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Write Parquet file
table = pa.table({
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'value': [10.5, 20.3, 30.1, 40.7, 50.2]
})

# Basic write
pq.write_table(table, 'example.parquet')

# Advanced write with options
pq.write_table(
    table,
    'advanced.parquet',
    compression='snappy',
    use_dictionary=['name'],
    row_group_size=2,
    write_statistics=True
)

# Read Parquet file
loaded_table = pq.read_table('example.parquet')

# Read specific columns
subset = pq.read_table('example.parquet', columns=['id', 'name'])

# Read with filtering
filtered = pq.read_table(
    'example.parquet',
    filters=[('value', '>', 25.0)]
)

# Working with ParquetFile class
parquet_file = pq.ParquetFile('example.parquet')
print(f"Schema: {parquet_file.schema}")
print(f"Metadata: {parquet_file.metadata}")
print(f"Row groups: {parquet_file.num_row_groups}")

# Read row group
row_group_0 = parquet_file.read_row_group(0)

# Iterate over batches
for batch in parquet_file.iter_batches(batch_size=2):
    print(batch)
```
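`read_pandas` (documented above) additionally restores pandas-specific metadata stored in the file, such as the index. A short sketch, assuming pandas is installed (the file name is illustrative):

```python
import pandas as pd
import pyarrow.parquet as pq

df = pd.DataFrame({'id': [1, 2, 3], 'value': [10.5, 20.3, 30.1]})
df.to_parquet('pandas_example.parquet')

# read_pandas keeps the pandas metadata written alongside the data
table = pq.read_pandas('pandas_example.parquet')
restored = table.to_pandas()
print(restored.equals(df))
```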

### CSV File Operations

```python
import pyarrow as pa
import pyarrow.csv as csv

# Basic CSV reading
table = csv.read_csv('data.csv')

# Advanced CSV reading with options
read_options = csv.ReadOptions(
    skip_rows=1,
    column_names=['id', 'name', 'age', 'salary']
)
parse_options = csv.ParseOptions(
    delimiter=',',
    quote_char='"',
    escape_char='\\'
)
convert_options = csv.ConvertOptions(
    column_types={
        'id': pa.int64(),
        'name': pa.string(),
        'age': pa.int32(),
        'salary': pa.float64()
    },
    null_values=['', 'NULL', 'null'],
    strings_can_be_null=True
)

table = csv.read_csv(
    'data.csv',
    read_options=read_options,
    parse_options=parse_options,
    convert_options=convert_options
)

# Streaming CSV reading
reader = csv.open_csv('large_data.csv')
for batch in reader:
    # Process batch
    print(f"Batch shape: {batch.num_rows} x {batch.num_columns}")

# Write CSV
csv.write_csv(table, 'output.csv')

# Write with options
write_options = csv.WriteOptions(
    include_header=True,
    delimiter=';',
    quoting_style='needed'
)
csv.write_csv(table, 'output_custom.csv', write_options=write_options)
```

### Multi-Format Workflow

```python
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.csv as csv
import pyarrow.feather as feather
import pyarrow.orc as orc

# Create sample data
table = pa.table({
    'date': pa.array(['2023-01-01', '2023-01-02', '2023-01-03']),
    'value': [100.5, 200.3, 150.7],
    'category': ['A', 'B', 'A']
})

# Write to different formats
pq.write_table(table, 'data.parquet')
csv.write_csv(table, 'data.csv')
feather.write_feather(table, 'data.feather')
orc.write_table(table, 'data.orc')

# Read from different formats
parquet_table = pq.read_table('data.parquet')
# Pin the date column to string so CSV type inference does not turn the
# ISO date strings into a date type and break the equality checks below
csv_table = csv.read_csv(
    'data.csv',
    convert_options=csv.ConvertOptions(column_types={'date': pa.string()})
)
feather_table = feather.read_table('data.feather')
orc_table = orc.read_table('data.orc')

# Verify all tables are equal
assert parquet_table.equals(csv_table)
assert csv_table.equals(feather_table)
assert feather_table.equals(orc_table)

# Performance comparison
import time

def time_format(read_func, write_func, filename):
    # Write timing
    start = time.time()
    write_func(table, filename)
    write_time = time.time() - start

    # Read timing
    start = time.time()
    result = read_func(filename)
    read_time = time.time() - start

    return write_time, read_time

# Compare formats
formats = [
    ('Parquet', pq.read_table, pq.write_table, 'test.parquet'),
    ('Feather', feather.read_table, feather.write_feather, 'test.feather'),
    ('ORC', orc.read_table, orc.write_table, 'test.orc')
]

for name, read_func, write_func, filename in formats:
    write_time, read_time = time_format(read_func, write_func, filename)
    print(f"{name}: Write {write_time:.4f}s, Read {read_time:.4f}s")
```

### Advanced Parquet Features

688

689

```python
import pyarrow as pa
import pyarrow.parquet as pq

# Schema evolution example
old_schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('value', pa.float64())
])

new_schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('value', pa.float64()),
    pa.field('category', pa.string())  # New column
])

# Write with old schema
old_table = pa.table([
    [1, 2, 3],
    ['A', 'B', 'C'],
    [10.5, 20.3, 30.1]
], schema=old_schema)

pq.write_table(old_table, 'old_format.parquet')

# Read and extend with new schema
loaded = pq.read_table('old_format.parquet')
extended = loaded.append_column('category', pa.array([None, None, None], type=pa.string()))

# Write with new schema
pq.write_table(extended, 'new_format.parquet')

# Metadata handling
metadata = {'version': '1.0', 'created_by': 'pyarrow_example'}
table_with_metadata = extended.replace_schema_metadata(metadata)
pq.write_table(table_with_metadata, 'with_metadata.parquet')

# Read metadata
file_metadata = pq.read_metadata('with_metadata.parquet')
print(f"File metadata: {file_metadata.metadata}")
print(f"Schema metadata: {file_metadata.schema.to_arrow_schema().metadata}")
```
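The metadata classes described earlier (`FileMetaData`, `RowGroupMetaData`, `ColumnChunkMetaData`) can be drilled into without reading any row data. A short sketch that continues from the file written above:

```python
import pyarrow.parquet as pq

# Walk row groups and column chunks using only the file footer
file_metadata = pq.read_metadata('with_metadata.parquet')
for rg in range(file_metadata.num_row_groups):
    row_group = file_metadata.row_group(rg)
    print(f"row group {rg}: {row_group.num_rows} rows, {row_group.total_byte_size} bytes")
    for col in range(row_group.num_columns):
        chunk = row_group.column(col)
        print(f"  column {col}: {chunk.physical_type}, {chunk.compression}, {chunk.num_values} values")
```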