# Core Data Structures

Fundamental data containers that form the foundation of PyArrow's columnar data processing capabilities. These structures provide efficient storage and manipulation of typed data in memory-optimized columnar layouts.

## Capabilities

### Arrays

One-dimensional sequences of values with a specific data type. Arrays are immutable and provide the basic building blocks for all other data structures in PyArrow.

```python { .api }
def array(obj, type=None, mask=None, size=None, from_pandas=None, safe=True):
    """
    Create Arrow array from Python sequence, NumPy array, or pandas data.

    Parameters:
    - obj: sequence, NumPy array, or pandas Series to convert
    - type: DataType, explicit type for the array
    - mask: array-like, boolean mask for null values
    - size: int, length of array if obj is scalar
    - from_pandas: bool, interpret pandas-specific data
    - safe: bool, check for overflow/truncation during conversion

    Returns:
    Array: Arrow array with specified type
    """

def chunked_array(arrays, type=None):
    """
    Create chunked array from list of arrays.

    Parameters:
    - arrays: sequence of Array objects
    - type: DataType, explicit type (must match all arrays)

    Returns:
    ChunkedArray: Chunked array composed of input arrays
    """

def nulls(size, type=None):
    """
    Create array of null values.

    Parameters:
    - size: int, length of array
    - type: DataType, type of nulls (default: null type)

    Returns:
    Array: Array of null values
    """

def repeat(value, size):
    """
    Create array by repeating a single value.

    Parameters:
    - value: scalar value to repeat
    - size: int, number of repetitions

    Returns:
    Array: Array with repeated value
    """

def arange(start, stop=None, step=1, dtype=None):
    """
    Create array with range of values.

    Parameters:
    - start: int, start value (or stop if stop is None)
    - stop: int, stop value (exclusive)
    - step: int, step size
    - dtype: DataType, array data type

    Returns:
    Array: Array with range values
    """

class Array:
    """
    Base class for all Arrow arrays.

    Attributes:
    - type: DataType of the array
    - length: Number of elements
    - null_count: Number of null values
    - is_valid: Boolean array indicating non-null values
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def to_pylist(self):
        """Convert to Python list."""

    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""

    def to_numpy(self, **kwargs):
        """Convert to NumPy array."""

    def slice(self, offset=0, length=None):
        """Return slice of array."""

    def take(self, indices):
        """Select elements by indices."""

    def filter(self, mask):
        """Filter array by boolean mask."""

    def sort(self, **kwargs):
        """Return sorted array."""

    def unique(self):
        """Return array of unique values."""

    def value_counts(self):
        """Return struct array of value counts."""

class ChunkedArray:
    """
    Array composed of multiple contiguous arrays (chunks).

    Attributes:
    - type: DataType of the chunked array
    - length: Total number of elements across chunks
    - null_count: Total number of null values
    - num_chunks: Number of chunks
    - chunks: List of Array chunks
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def chunk(self, i):
        """Get chunk at index i."""

    def to_pylist(self):
        """Convert to Python list."""

    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""

    def slice(self, offset=0, length=None):
        """Return slice of chunked array."""

    def take(self, indices):
        """Select elements by indices."""

    def filter(self, mask):
        """Filter by boolean mask."""

    def combine_chunks(self):
        """Combine chunks into single array."""
```

### Tables

Two-dimensional datasets with named columns, similar to SQL tables or pandas DataFrames. Tables provide the primary interface for working with tabular data in PyArrow.

```python { .api }
def table(data, schema=None, metadata=None, columns=None):
    """
    Create Arrow table from various data sources.

    Parameters:
    - data: dict, list of arrays, pandas DataFrame, or RecordBatch
    - schema: Schema, explicit schema for the table
    - metadata: dict, key-value metadata
    - columns: list of str, column names (when data is list)

    Returns:
    Table: Arrow table with specified schema
    """

def record_batch(data, schema=None, metadata=None):
    """
    Create RecordBatch from data.

    Parameters:
    - data: dict, list of arrays, or sequence
    - schema: Schema, explicit schema
    - metadata: dict, key-value metadata

    Returns:
    RecordBatch: Single batch of columnar data
    """

def concat_tables(tables, promote=False):
    """
    Concatenate tables vertically.

    Parameters:
    - tables: sequence of Table objects
    - promote: bool, promote schemas to compatible type

    Returns:
    Table: Concatenated table
    """

def concat_arrays(arrays):
    """
    Concatenate arrays into single array.

    Parameters:
    - arrays: sequence of Array objects with same type

    Returns:
    Array: Concatenated array
    """

def concat_batches(batches, promote=False):
    """
    Concatenate record batches.

    Parameters:
    - batches: sequence of RecordBatch objects
    - promote: bool, promote schemas to compatible type

    Returns:
    Table: Table created from concatenated batches
    """

class Table:
    """
    Two-dimensional table of columnar data.

    Attributes:
    - schema: Schema of the table
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of ChunkedArray columns
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def column(self, i):
        """Get column by index or name."""

    def select(self, columns):
        """Select subset of columns."""

    def slice(self, offset=0, length=None):
        """Return slice of table."""

    def filter(self, mask):
        """Filter rows by boolean mask."""

    def take(self, indices):
        """Select rows by indices."""

    def sort_by(self, sorting):
        """Sort table by columns."""

    def group_by(self, keys):
        """Group table by columns."""

    def join(self, right_table, **kwargs):
        """Join with another table."""

    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""

    def to_pydict(self):
        """Convert to dictionary of Python lists."""

    def to_batches(self, max_chunksize=None):
        """Convert to iterator of RecordBatch objects."""

    def add_column(self, i, field, column):
        """Add column at position i."""

    def append_column(self, field, column):
        """Append column to table."""

    def remove_column(self, i):
        """Remove column at position i."""

    def rename_columns(self, names):
        """Rename columns."""

    def drop(self, columns):
        """Drop columns by name."""

    def replace_schema_metadata(self, metadata):
        """Replace table metadata."""

class RecordBatch:
    """
    Collection of arrays with shared length representing a single batch.

    Attributes:
    - schema: Schema of the batch
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of Array columns
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def column(self, i):
        """Get column by index or name."""

    def select(self, columns):
        """Select subset of columns."""

    def slice(self, offset=0, length=None):
        """Return slice of batch."""

    def filter(self, mask):
        """Filter rows by boolean mask."""

    def take(self, indices):
        """Select rows by indices."""

    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""

    def to_pydict(self):
        """Convert to dictionary of Python lists."""

    def add_column(self, i, field, column):
        """Add column at position i."""

    def remove_column(self, i):
        """Remove column at position i."""

    def rename_columns(self, names):
        """Rename columns."""

class RecordBatchReader:
    """
    Interface for reading stream of record batches.
    """

    def __iter__(self): ...

    def read_next_batch(self):
        """Read next batch from stream."""

    def read_all(self):
        """Read all batches into table."""

    def schema(self):
        """Get schema of batches."""

class TableGroupBy:
    """
    Grouped table operations.
    """

    def aggregate(self, aggregations):
        """Perform aggregations on groups."""
```

### Schemas and Fields

Schema definitions that describe table structure, column types, and metadata. Schemas provide type safety and enable efficient data processing by defining the expected structure of tabular data.

```python { .api }
def schema(fields, metadata=None):
    """
    Create schema from list of fields.

    Parameters:
    - fields: sequence of Field objects or (name, type) tuples
    - metadata: dict, key-value metadata for schema

    Returns:
    Schema: Schema object with specified fields
    """

def field(name, type, nullable=True, metadata=None):
    """
    Create field with name and type.

    Parameters:
    - name: str, field name
    - type: DataType, field data type
    - nullable: bool, whether field can contain nulls
    - metadata: dict, key-value metadata for field

    Returns:
    Field: Field object with specified properties
    """

def unify_schemas(schemas):
    """
    Unify multiple schemas into compatible schema.

    Parameters:
    - schemas: sequence of Schema objects

    Returns:
    Schema: Unified schema compatible with all input schemas
    """

class Schema:
    """
    Schema defining structure of tabular data.

    Attributes:
    - names: List of field names
    - types: List of field types
    - metadata: Key-value metadata
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def field(self, i):
        """Get field by index or name."""

    def get_field_index(self, name):
        """Get index of field by name."""

    def select(self, names):
        """Select subset of fields."""

    def insert(self, i, field):
        """Insert field at position i."""

    def append(self, field):
        """Append field to schema."""

    def remove(self, i):
        """Remove field at position i."""

    def with_metadata(self, metadata):
        """Return schema with new metadata."""

    def equals(self, other, check_metadata=True):
        """Check equality with another schema."""

    def to_string(self, **kwargs):
        """String representation of schema."""

class Field:
    """
    Named field in a schema with type and metadata.

    Attributes:
    - name: Field name
    - type: DataType of field
    - nullable: Whether field can contain nulls
    - metadata: Key-value metadata
    """

    def with_name(self, name):
        """Return field with new name."""

    def with_type(self, type):
        """Return field with new type."""

    def with_nullable(self, nullable):
        """Return field with new nullable setting."""

    def with_metadata(self, metadata):
        """Return field with new metadata."""

    def equals(self, other, check_metadata=True):
        """Check equality with another field."""

    def to_string(self, **kwargs):
        """String representation of field."""

class KeyValueMetadata:
    """
    Key-value metadata container.
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def get(self, key, default=None):
        """Get value by key."""

    def keys(self):
        """Get all keys."""

    def values(self):
        """Get all values."""

    def items(self):
        """Get key-value pairs."""

    def to_dict(self):
        """Convert to Python dictionary."""
```

### Scalars

Single typed values that provide consistent interface for working with individual data elements. Scalars maintain type information and null state, enabling type-safe operations on individual values.

```python { .api }
def scalar(value, type=None):
    """
    Create scalar from Python value.

    Parameters:
    - value: Python value to wrap
    - type: DataType, explicit type for scalar

    Returns:
    Scalar: Typed scalar value
    """

# Scalar constants
NA = ... # Not Available scalar
NULL = ... # Null scalar

class Scalar:
    """
    Base class for typed scalar values.

    Attributes:
    - type: DataType of scalar
    - is_valid: Whether scalar is non-null
    """

    def __eq__(self, other): ...
    def __hash__(self): ...

    def as_py(self):
        """Convert to Python value."""

    def cast(self, target_type, safe=True):
        """Cast to different type."""

    def equals(self, other):
        """Check equality with another scalar."""

# Specific scalar types are available for all Arrow data types:
# NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
# UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar,
# FloatScalar, DoubleScalar, Decimal128Scalar, StringScalar, BinaryScalar,
# Date32Scalar, Date64Scalar, TimestampScalar, Time32Scalar, Time64Scalar,
# DurationScalar, ListScalar, StructScalar, MapScalar, DictionaryScalar, etc.
```

### Tensors and Sparse Data

Multi-dimensional arrays and sparse data structures for advanced numerical computing and machine learning applications.

```python { .api }
class Tensor:
    """
    Multi-dimensional array with Arrow data.

    Attributes:
    - type: DataType of tensor elements
    - shape: Shape tuple of tensor dimensions
    - strides: Strides tuple for memory layout
    - is_mutable: Whether tensor data is mutable
    """

    def __getitem__(self, key): ...

    def to_numpy(self):
        """Convert to NumPy array."""

    def equals(self, other):
        """Check equality with another tensor."""

class SparseCOOTensor:
    """Sparse tensor in COOrdinate format."""

class SparseCSRMatrix:
    """Sparse matrix in Compressed Sparse Row format."""

class SparseCSCMatrix:
    """Sparse matrix in Compressed Sparse Column format."""

class SparseCSFTensor:
    """Sparse tensor in Compressed Sparse Fiber format."""
```

## Type Definitions

### Memory Management

```python { .api }
class DictionaryMemo:
    """
    Memo for dictionary encoding to ensure consistent dictionaries.
    """

    def __init__(self): ...

    def get_dictionary(self, type):
        """Get dictionary for type."""

    def set_dictionary(self, type, dictionary):
        """Set dictionary for type."""
```

## Usage Examples

### Creating and Manipulating Arrays

```python
import pyarrow as pa
import numpy as np

# Create arrays from various sources
int_array = pa.array([1, 2, 3, 4, 5])
str_array = pa.array(['apple', 'banana', 'cherry', None])
np_array = pa.array(np.random.randn(1000))

# Create chunked array
chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
chunked = pa.chunked_array(chunks)

# Array operations
filtered = int_array.filter(pa.array([True, False, True, False, True]))
sorted_array = str_array.sort()
unique_values = str_array.unique()

# Convert to other formats
python_list = int_array.to_pylist()
pandas_series = int_array.to_pandas()
numpy_array = int_array.to_numpy()
```

### Working with Tables

```python
import pyarrow as pa

# Create table from dictionary
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000.0, 60000.0, 70000.0, 55000.0, 65000.0]
}
table = pa.table(data)

# Table operations
subset = table.select(['name', 'age'])
filtered = table.filter(pa.compute.greater(table['age'], 30))
sorted_table = table.sort_by([('age', 'descending')])

# Add/remove columns
new_table = table.add_column(4, pa.field('bonus', pa.float64()),
                             pa.array([5000.0, 6000.0, 7000.0, 5500.0, 6500.0]))
dropped = table.drop(['salary'])

# Convert to pandas
df = table.to_pandas()
```

### Schema Definition

```python
import pyarrow as pa

# Define schema explicitly
schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('scores', pa.list_(pa.float64())),
    pa.field('metadata', pa.map_(pa.string(), pa.string()))
])

# Create table with schema
table = pa.table({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'scores': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    'metadata': [{'key': 'value'}, {}, {'foo': 'bar'}]
}, schema=schema)

# Schema operations
field = schema.field('name')
field_index = schema.get_field_index('scores')
partial_schema = schema.select(['id', 'name'])
```