# File Format Support

Native support for reading and writing multiple file formats including Parquet, CSV, JSON, Feather, and ORC. Provides high-performance I/O with configurable options for compression, encoding, metadata handling, and integration with cloud storage systems.

## Capabilities
5
6
### Parquet Format
7
8
High-performance columnar storage format with advanced features including compression, encoding, statistics, and schema evolution support.
9
10
```python { .api }
11
# Main I/O functions
12
def read_table(source, columns=None, use_threads=True, metadata=None, schema=None, use_pandas_metadata=False, read_dictionary=None, memory_map=False, buffer_size=None, partitioning=None, filesystem=None, filters=None, use_legacy_dataset=None, ignore_prefixes=None, pre_buffer=None, coerce_int96_timestamp_unit=None, thrift_string_size_limit=None, thrift_container_size_limit=None):
13
"""
14
Read Parquet file as Arrow Table.
15
16
Parameters:
17
- source: str or file-like, path or file object
18
- columns: list of str, columns to read
19
- use_threads: bool, use multiple threads
20
- metadata: FileMetaData, pre-loaded metadata
21
- schema: Schema, expected schema
22
- use_pandas_metadata: bool, use pandas metadata
23
- read_dictionary: list, columns to dictionary encode
24
- memory_map: bool, use memory mapping
25
- buffer_size: int, read buffer size
26
- partitioning: Partitioning, dataset partitioning
27
- filesystem: FileSystem, filesystem to use
28
- filters: list, row filters
29
- use_legacy_dataset: bool, use legacy dataset API
30
- ignore_prefixes: list, prefixes to ignore
31
- pre_buffer: bool, pre-buffer columns
32
- coerce_int96_timestamp_unit: str, int96 timestamp unit
33
- thrift_string_size_limit: int, thrift string size limit
34
- thrift_container_size_limit: int, thrift container size limit
35
36
Returns:
37
Table: Arrow table with data from Parquet file
38
"""
39
40
def write_table(table, where, row_group_size=None, version='2.6', use_dictionary=None, compression='snappy', write_statistics=None, use_deprecated_int96_timestamps=None, coerce_timestamps=None, allow_truncated_timestamps=False, data_page_size=None, data_page_version='1.0', compression_level=None, use_byte_stream_split=None, column_encoding=None, data_encoding=None, use_compliant_nested_type=None, encryption_properties=None, write_batch_size=None, dictionary_pagesize_limit=None, store_schema=None, write_page_index=None, write_page_checksum=None, sorting_columns=None, filesystem=None, metadata_collector=None):
41
"""
42
Write Arrow Table to Parquet file.
43
44
Parameters:
45
- table: Table, Arrow table to write
46
- where: str or file-like, output path or file
47
- row_group_size: int, maximum rows per row group
48
- version: str, Parquet format version
49
- use_dictionary: bool or list, dictionary encoding
50
- compression: str or dict, compression codec
51
- write_statistics: bool or list, write column statistics
52
- use_deprecated_int96_timestamps: bool, use int96 for timestamps
53
- coerce_timestamps: str, timestamp coercion unit
54
- allow_truncated_timestamps: bool, allow timestamp truncation
55
- data_page_size: int, target data page size
56
- data_page_version: str, data page version
57
- compression_level: int, compression level
58
- use_byte_stream_split: bool or list, byte stream split encoding
59
- column_encoding: dict, column encoding options
60
- data_encoding: dict, data encoding options
61
- use_compliant_nested_type: bool, compliant nested type naming
62
- encryption_properties: FileEncryptionProperties, encryption settings
63
- write_batch_size: int, write batch size
64
- dictionary_pagesize_limit: int, dictionary page size limit
65
- store_schema: bool, store schema in metadata
66
- write_page_index: bool, write page index
67
- write_page_checksum: bool, write page checksums
68
- sorting_columns: list, column sorting information
69
- filesystem: FileSystem, filesystem to use
70
- metadata_collector: list, collect metadata
71
"""
72
73
def read_pandas(source, columns=None, **kwargs):
74
"""Read Parquet file optimized for pandas DataFrame."""
75
76
def read_schema(where, memory_map=False, metadata=None, filesystem=None):
77
"""
78
Read schema from Parquet file.
79
80
Parameters:
81
- where: str or file-like, path or file object
82
- memory_map: bool, use memory mapping
83
- metadata: FileMetaData, pre-loaded metadata
84
- filesystem: FileSystem, filesystem to use
85
86
Returns:
87
Schema: Arrow schema from Parquet file
88
"""
89
90
def read_metadata(where, memory_map=False, decryption_properties=None, filesystem=None):
91
"""
92
Read metadata from Parquet file.
93
94
Parameters:
95
- where: str or file-like, path or file object
96
- memory_map: bool, use memory mapping
97
- decryption_properties: FileDecryptionProperties, decryption settings
98
- filesystem: FileSystem, filesystem to use
99
100
Returns:
101
FileMetaData: Parquet file metadata
102
"""
103
104
class ParquetFile:
105
"""
106
Interface for reading Parquet files.
107
108
Attributes:
109
- metadata: FileMetaData object
110
- schema: Arrow schema
111
- schema_arrow: Arrow schema (alias)
112
- num_row_groups: Number of row groups
113
"""
114
115
def __init__(self, source, metadata=None, common_metadata=None, read_dictionary=None, memory_map=False, buffer_size=None, pre_buffer=None, coerce_int96_timestamp_unit=None, decryption_properties=None, thrift_string_size_limit=None, thrift_container_size_limit=None): ...
116
117
def read(self, columns=None, use_threads=True, use_pandas_metadata=False):
118
"""Read entire file as Table."""
119
120
def read_row_group(self, i, columns=None, use_threads=True, use_pandas_metadata=False):
121
"""Read specific row group."""
122
123
def read_row_groups(self, row_groups, columns=None, use_threads=True, use_pandas_metadata=False):
124
"""Read multiple row groups."""
125
126
def iter_batches(self, batch_size=1024, row_groups=None, columns=None, use_threads=True, use_pandas_metadata=False):
127
"""Iterate over record batches."""
128
129
def scan_contents(self, columns=None, batch_size=1024):
130
"""Scan file contents."""
131
132
class ParquetWriter:
133
"""
134
Writer for Parquet files.
135
"""
136
137
def __init__(self, where, schema, filesystem=None, **kwargs): ...
138
139
def write_batch(self, batch, row_group_size=None):
140
"""Write record batch."""
141
142
def write_table(self, table, row_group_size=None):
143
"""Write table."""
144
145
def close(self):
146
"""Close writer and finalize file."""
147
148
# Metadata classes
149
class FileMetaData:
150
"""
151
Parquet file metadata.
152
153
Attributes:
154
- created_by: Creator information
155
- format_version: Parquet format version
156
- metadata: Key-value metadata
157
- num_columns: Number of columns
158
- num_row_groups: Number of row groups
159
- num_rows: Total number of rows
160
- schema: Parquet schema
161
- serialized_size: Serialized metadata size
162
"""
163
164
def row_group(self, i):
165
"""Get row group metadata."""
166
167
def to_dict(self):
168
"""Convert to dictionary."""
169
170
class RowGroupMetaData:
171
"""
172
Row group metadata.
173
174
Attributes:
175
- num_columns: Number of columns in row group
176
- num_rows: Number of rows in row group
177
- total_byte_size: Total byte size
178
"""
179
180
def column(self, i):
181
"""Get column chunk metadata."""
182
183
class ColumnChunkMetaData:
184
"""
185
Column chunk metadata.
186
187
Attributes:
188
- column_path: Column path in schema
189
- compression: Compression codec
190
- data_page_offset: Data page offset
191
- dictionary_page_offset: Dictionary page offset
192
- encodings: List of encodings used
193
- file_offset: File offset
194
- file_path: File path (for external columns)
195
- has_dictionary_page: Whether has dictionary page
196
- index_page_offset: Index page offset
197
- num_values: Number of values
198
- physical_type: Physical storage type
199
- statistics: Column statistics
200
- total_compressed_size: Compressed size
201
- total_uncompressed_size: Uncompressed size
202
"""
203
204
def to_dict(self):
205
"""Convert to dictionary."""
206
207
class ParquetSchema:
208
"""
209
Parquet schema representation.
210
211
Attributes:
212
- names: Column names
213
- pandas_metadata: Pandas metadata
214
"""
215
216
def column(self, i):
217
"""Get column schema."""
218
219
def to_arrow_schema(self):
220
"""Convert to Arrow schema."""
221
222
# Encryption support
223
class FileEncryptionProperties:
224
"""File-level encryption properties."""
225
226
class FileDecryptionProperties:
227
"""File-level decryption properties."""
228
```
229
230
### CSV Format
231
232
Flexible CSV reading and writing with extensive parsing options, type inference, and error handling capabilities.
233
234
```python { .api }
235
def read_csv(input_file, read_options=None, parse_options=None, convert_options=None):
236
"""
237
Read CSV file as Arrow Table.
238
239
Parameters:
240
- input_file: str or file-like, CSV file to read
241
- read_options: ReadOptions, reading configuration
242
- parse_options: ParseOptions, parsing configuration
243
- convert_options: ConvertOptions, conversion configuration
244
245
Returns:
246
Table: Arrow table with CSV data
247
"""
248
249
def write_csv(data, output_file, write_options=None):
250
"""
251
Write Table to CSV file.
252
253
Parameters:
254
- data: Table or RecordBatch, data to write
255
- output_file: str or file-like, output CSV file
256
- write_options: WriteOptions, writing configuration
257
"""
258
259
def open_csv(input_file, read_options=None, parse_options=None, convert_options=None):
260
"""
261
Open CSV file for streaming.
262
263
Parameters:
264
- input_file: str or file-like, CSV file to open
265
- read_options: ReadOptions, reading configuration
266
- parse_options: ParseOptions, parsing configuration
267
- convert_options: ConvertOptions, conversion configuration
268
269
Returns:
270
CSVStreamingReader: Streaming CSV reader
271
"""
272
273
class ReadOptions:
274
"""
275
CSV reading options.
276
277
Attributes:
278
- use_threads: Whether to use multiple threads
279
- block_size: Block size for reading
280
- skip_rows: Number of rows to skip at start
281
- skip_rows_after_names: Rows to skip after header
282
- column_names: Explicit column names
283
- autogenerate_column_names: Auto-generate column names
284
- encoding: Character encoding (default: utf8)
285
"""
286
287
class ParseOptions:
288
"""
289
CSV parsing options.
290
291
Attributes:
292
- delimiter: Field delimiter character
293
- quote_char: Quote character
294
- double_quote: Whether quotes are doubled for escaping
295
- escape_char: Escape character
296
- newlines_in_values: Allow newlines in values
297
- ignore_empty_lines: Skip empty lines
298
"""
299
300
class ConvertOptions:
301
"""
302
CSV type conversion options.
303
304
Attributes:
305
- check_utf8: Validate UTF-8 encoding
306
- column_types: Explicit column types (dict)
307
- null_values: Values to treat as null
308
- true_values: Values to treat as True
309
- false_values: Values to treat as False
310
- decimal_point: Decimal point character
311
- strings_can_be_null: Whether strings can be null
312
- quoted_strings_can_be_null: Whether quoted strings can be null
313
- auto_dict_encode: Auto dictionary-encode string columns
314
- auto_dict_max_cardinality: Max cardinality for auto dict encoding
315
- include_columns: Columns to include
316
- include_missing_columns: Include missing columns as null
317
- timestamp_parsers: Custom timestamp parsers
318
"""
319
320
class WriteOptions:
321
"""
322
CSV writing options.
323
324
Attributes:
325
- include_header: Include column names as header
326
- batch_size: Batch size for writing
327
- delimiter: Field delimiter
328
- quoting_style: When to quote fields
329
"""
330
331
class CSVStreamingReader:
332
"""
333
Streaming CSV reader for large files.
334
"""
335
336
def __iter__(self): ...
337
338
def read_next_batch(self):
339
"""Read next batch of records."""
340
341
def schema(self):
342
"""Get schema of CSV data."""
343
344
class CSVWriter:
345
"""CSV writer with configurable options."""
346
347
def __init__(self, sink, schema, write_options=None): ...
348
349
def write_batch(self, batch):
350
"""Write record batch."""
351
352
def write_table(self, table):
353
"""Write table."""
354
355
def close(self):
356
"""Close writer."""
357
358
class InvalidRow:
359
"""Information about invalid rows during parsing."""
360
361
ISO8601 = ... # ISO8601 timestamp parsing constant
362
```
363
364
### JSON Format
365
366
Line-delimited JSON reading with schema inference and flexible parsing options for semi-structured data.
367
368
```python { .api }
369
def read_json(input_file, read_options=None, parse_options=None):
370
"""
371
Read line-delimited JSON file as Arrow Table.
372
373
Parameters:
374
- input_file: str or file-like, JSON file to read
375
- read_options: ReadOptions, reading configuration
376
- parse_options: ParseOptions, parsing configuration
377
378
Returns:
379
Table: Arrow table with JSON data
380
"""
381
382
def open_json(input_file, read_options=None, parse_options=None):
383
"""
384
Open JSON file for streaming.
385
386
Parameters:
387
- input_file: str or file-like, JSON file to open
388
- read_options: ReadOptions, reading configuration
389
- parse_options: ParseOptions, parsing configuration
390
391
Returns:
392
Iterator: Streaming JSON reader
393
"""
394
395
class ReadOptions:
396
"""
397
JSON reading options.
398
399
Attributes:
400
- use_threads: Whether to use multiple threads
401
- block_size: Block size for reading
402
- schema: Explicit schema
403
"""
404
405
class ParseOptions:
406
"""
407
JSON parsing options.
408
409
Attributes:
410
- newlines_in_values: Allow newlines in string values
411
- explicit_schema: Use explicit schema
412
- unexpected_field_behavior: How to handle unexpected fields
413
"""
414
```
415
416
### Feather Format
417
418
Fast, language-agnostic columnar serialization format optimized for data interchange and temporary storage.
419
420
```python { .api }
421
def read_table(source, columns=None, use_threads=True, memory_map=False):
422
"""
423
Read Feather file as Arrow Table.
424
425
Parameters:
426
- source: str or file-like, Feather file to read
427
- columns: list of str, columns to read
428
- use_threads: bool, use multiple threads
429
- memory_map: bool, use memory mapping
430
431
Returns:
432
Table: Arrow table with Feather data
433
"""
434
435
def read_feather(source, columns=None, use_threads=True, memory_map=False):
436
"""Read Feather file (pandas compatibility)."""
437
438
def write_feather(df, dest, compression=None, compression_level=None, chunksize=None, version=None):
439
"""
440
Write Table to Feather file.
441
442
Parameters:
443
- df: Table or pandas DataFrame, data to write
444
- dest: str or file-like, output Feather file
445
- compression: str, compression codec
446
- compression_level: int, compression level
447
- chunksize: int, maximum rows per chunk
448
- version: int, Feather format version
449
"""
450
451
class FeatherDataset:
452
"""Multi-file Feather dataset interface."""
453
454
class FeatherError(Exception):
455
"""Feather format-specific errors."""
456
```
457
458
### ORC Format
459
460
Optimized Row Columnar format with advanced compression and indexing for big data processing.
461
462
```python { .api }
463
def read_table(source, columns=None, use_threads=True, memory_map=False):
464
"""
465
Read ORC file as Arrow Table.
466
467
Parameters:
468
- source: str or file-like, ORC file to read
469
- columns: list of str, columns to read
470
- use_threads: bool, use multiple threads
471
- memory_map: bool, use memory mapping
472
473
Returns:
474
Table: Arrow table with ORC data
475
"""
476
477
def write_table(table, where, file_version='0.12', batch_size=1024, stripe_size=67108864, compression='ZLIB', compression_block_size=65536, compression_strategy='speed', row_index_stride=10000, padding_tolerance=0.0, dictionary_key_size_threshold=0.0, bloom_filter_columns=None, bloom_filter_fpp=0.05):
478
"""
479
Write Arrow Table to ORC file.
480
481
Parameters:
482
- table: Table, Arrow table to write
483
- where: str or file-like, output ORC file
484
- file_version: str, ORC file format version
485
- batch_size: int, batch size for writing
486
- stripe_size: int, target stripe size in bytes
487
- compression: str, compression codec
488
- compression_block_size: int, compression block size
489
- compression_strategy: str, compression strategy
490
- row_index_stride: int, row index stride
491
- padding_tolerance: float, padding tolerance
492
- dictionary_key_size_threshold: float, dictionary encoding threshold
493
- bloom_filter_columns: list, columns for bloom filters
494
- bloom_filter_fpp: float, bloom filter false positive probability
495
"""
496
497
class ORCFile:
498
"""
499
ORC file reader interface.
500
501
Attributes:
502
- metadata: ORC file metadata
503
- schema: Arrow schema
504
- nrows: Number of rows
505
- nstripes: Number of stripes
506
"""
507
508
def __init__(self, source, memory_map=False): ...
509
510
def read(self, columns=None, use_threads=True):
511
"""Read entire file as Table."""
512
513
def read_stripe(self, n, columns=None):
514
"""Read specific stripe."""
515
```
516
517
## Usage Examples
518
519
### Working with Parquet Files
520
521
```python
522
import pyarrow as pa
523
import pyarrow.parquet as pq
524
525
# Write Parquet file
526
table = pa.table({
527
'id': [1, 2, 3, 4, 5],
528
'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
529
'value': [10.5, 20.3, 30.1, 40.7, 50.2]
530
})
531
532
# Basic write
533
pq.write_table(table, 'example.parquet')
534
535
# Advanced write with options
536
pq.write_table(
537
table,
538
'advanced.parquet',
539
compression='snappy',
540
use_dictionary=['name'],
541
row_group_size=2,
542
write_statistics=True
543
)
544
545
# Read Parquet file
546
loaded_table = pq.read_table('example.parquet')
547
548
# Read specific columns
549
subset = pq.read_table('example.parquet', columns=['id', 'name'])
550
551
# Read with filtering
552
filtered = pq.read_table(
553
'example.parquet',
554
filters=[('value', '>', 25.0)]
555
)
556
557
# Working with ParquetFile class
558
parquet_file = pq.ParquetFile('example.parquet')
559
print(f"Schema: {parquet_file.schema}")
560
print(f"Metadata: {parquet_file.metadata}")
561
print(f"Row groups: {parquet_file.num_row_groups}")
562
563
# Read row group
564
row_group_0 = parquet_file.read_row_group(0)
565
566
# Iterate over batches
567
for batch in parquet_file.iter_batches(batch_size=2):
568
print(batch)
569
```
570
571
### CSV File Operations
572
573
```python
574
import pyarrow as pa
575
import pyarrow.csv as csv
576
577
# Basic CSV reading
578
table = csv.read_csv('data.csv')
579
580
# Advanced CSV reading with options
581
read_options = csv.ReadOptions(
582
skip_rows=1,
583
column_names=['id', 'name', 'age', 'salary']
584
)
585
parse_options = csv.ParseOptions(
586
delimiter=',',
587
quote_char='"',
588
escape_char='\\'
589
)
590
convert_options = csv.ConvertOptions(
591
column_types={
592
'id': pa.int64(),
593
'name': pa.string(),
594
'age': pa.int32(),
595
'salary': pa.float64()
596
},
597
null_values=['', 'NULL', 'null'],
598
strings_can_be_null=True
599
)
600
601
table = csv.read_csv(
602
'data.csv',
603
read_options=read_options,
604
parse_options=parse_options,
605
convert_options=convert_options
606
)
607
608
# Streaming CSV reading
609
reader = csv.open_csv('large_data.csv')
610
for batch in reader:
611
# Process batch
612
print(f"Batch shape: {batch.num_rows} x {batch.num_columns}")
613
614
# Write CSV
615
csv.write_csv(table, 'output.csv')
616
617
# Write with options
618
write_options = csv.WriteOptions(
619
include_header=True,
620
delimiter=';',
621
quoting_style='needed'
622
)
623
csv.write_csv(table, 'output_custom.csv', write_options=write_options)
624
```
625
626
### Multi-Format Workflow
627
628
```python
629
import pyarrow as pa
630
import pyarrow.parquet as pq
631
import pyarrow.csv as csv
632
import pyarrow.feather as feather
633
import pyarrow.orc as orc
634
635
# Create sample data
636
table = pa.table({
637
'date': pa.array(['2023-01-01', '2023-01-02', '2023-01-03']),
638
'value': [100.5, 200.3, 150.7],
639
'category': ['A', 'B', 'A']
640
})
641
642
# Write to different formats
643
pq.write_table(table, 'data.parquet')
644
csv.write_csv(table, 'data.csv')
645
feather.write_feather(table, 'data.feather')
646
orc.write_table(table, 'data.orc')
647
648
# Read from different formats
649
parquet_table = pq.read_table('data.parquet')
650
csv_table = csv.read_csv('data.csv')
651
feather_table = feather.read_table('data.feather')
652
orc_table = orc.read_table('data.orc')
653
654
# Parquet, Feather, and ORC preserve Arrow types, so these round-trip exactly
assert parquet_table.equals(feather_table)
assert feather_table.equals(orc_table)
# CSV relies on type inference, so cast back to the original schema before comparing
assert csv_table.cast(table.schema).equals(table)
658
659
# Performance comparison
660
import time
661
662
def time_format(read_func, write_func, filename):
663
# Write timing
664
start = time.time()
665
write_func(table, filename)
666
write_time = time.time() - start
667
668
# Read timing
669
start = time.time()
670
result = read_func(filename)
671
read_time = time.time() - start
672
673
return write_time, read_time
674
675
# Compare formats
676
formats = [
677
('Parquet', pq.read_table, pq.write_table, 'test.parquet'),
678
('Feather', feather.read_table, feather.write_feather, 'test.feather'),
679
('ORC', orc.read_table, orc.write_table, 'test.orc')
680
]
681
682
for name, read_func, write_func, filename in formats:
683
write_time, read_time = time_format(read_func, write_func, filename)
684
print(f"{name}: Write {write_time:.4f}s, Read {read_time:.4f}s")
685
```
686
687
### Advanced Parquet Features
688
689
```python
690
import pyarrow as pa
691
import pyarrow.parquet as pq
692
693
# Schema evolution example
694
old_schema = pa.schema([
695
pa.field('id', pa.int64()),
696
pa.field('name', pa.string()),
697
pa.field('value', pa.float64())
698
])
699
700
new_schema = pa.schema([
701
pa.field('id', pa.int64()),
702
pa.field('name', pa.string()),
703
pa.field('value', pa.float64()),
704
pa.field('category', pa.string()) # New column
705
])
706
707
# Write with old schema
708
old_table = pa.table([
709
[1, 2, 3],
710
['A', 'B', 'C'],
711
[10.5, 20.3, 30.1]
712
], schema=old_schema)
713
714
pq.write_table(old_table, 'old_format.parquet')
715
716
# Read and extend with new schema
loaded = pq.read_table('old_format.parquet')
# Table.add_column requires a position index; append_column adds at the end.
# Give the all-null array an explicit string type so it matches new_schema.
extended = loaded.append_column('category', pa.array([None, None, None], type=pa.string()))
719
720
# Write with new schema
721
pq.write_table(extended, 'new_format.parquet')
722
723
# Metadata handling (attach custom key-value metadata to the schema)
metadata = {'version': '1.0', 'created_by': 'pyarrow_example'}
table_with_metadata = extended.replace_schema_metadata(metadata)
pq.write_table(table_with_metadata, 'with_metadata.parquet')
727
728
# Read metadata
729
file_metadata = pq.read_metadata('with_metadata.parquet')
730
print(f"File metadata: {file_metadata.metadata}")
731
print(f"Schema metadata: {file_metadata.schema.to_arrow_schema().metadata}")
732
```