0
# Data Types
1
2
Comprehensive type system supporting primitive types, nested structures, temporal types, and custom extension types. PyArrow's type system provides rich data modeling capabilities with type checking, conversion, and inference for robust data processing workflows.
3
4
## Capabilities
5
6
### Type Factory Functions
7
8
Functions for creating Arrow data types. These factory functions return DataType objects that can be used to define schemas and create typed arrays.
9
10
```python { .api }
11
# Primitive types
12
def null():
13
"""Null type containing only null values."""
14
15
def bool_():
16
"""Boolean type (true/false values)."""
17
18
def int8():
19
"""8-bit signed integer type."""
20
21
def int16():
22
"""16-bit signed integer type."""
23
24
def int32():
25
"""32-bit signed integer type."""
26
27
def int64():
28
"""64-bit signed integer type."""
29
30
def uint8():
31
"""8-bit unsigned integer type."""
32
33
def uint16():
34
"""16-bit unsigned integer type."""
35
36
def uint32():
37
"""32-bit unsigned integer type."""
38
39
def uint64():
40
"""64-bit unsigned integer type."""
41
42
def float16():
43
"""16-bit floating point type."""
44
45
def float32():
46
"""32-bit floating point type."""
47
48
def float64():
49
"""64-bit floating point type."""
50
51
# Decimal types
52
def decimal32(precision, scale=0):
53
"""
54
32-bit decimal type.
55
56
Parameters:
57
- precision: int, total number of digits (1-9)
58
- scale: int, number of digits after decimal point
59
60
Returns:
61
Decimal32Type: 32-bit decimal type
62
"""
63
64
def decimal64(precision, scale=0):
65
"""
66
64-bit decimal type.
67
68
Parameters:
69
- precision: int, total number of digits (1-18)
70
- scale: int, number of digits after decimal point
71
72
Returns:
73
Decimal64Type: 64-bit decimal type
74
"""
75
76
def decimal128(precision, scale=0):
77
"""
78
128-bit decimal type.
79
80
Parameters:
81
- precision: int, total number of digits (1-38)
82
- scale: int, number of digits after decimal point
83
84
Returns:
85
Decimal128Type: 128-bit decimal type
86
"""
87
88
def decimal256(precision, scale=0):
89
"""
90
256-bit decimal type.
91
92
Parameters:
93
- precision: int, total number of digits (1-76)
94
- scale: int, number of digits after decimal point
95
96
Returns:
97
Decimal256Type: 256-bit decimal type
98
"""
99
100
# Temporal types
101
def time32(unit):
102
"""
103
32-bit time type.
104
105
Parameters:
106
- unit: str, time unit ('s' for seconds, 'ms' for milliseconds)
107
108
Returns:
109
Time32Type: 32-bit time type
110
"""
111
112
def time64(unit):
113
"""
114
64-bit time type.
115
116
Parameters:
117
- unit: str, time unit ('us' for microseconds, 'ns' for nanoseconds)
118
119
Returns:
120
Time64Type: 64-bit time type
121
"""
122
123
def timestamp(unit, tz=None):
124
"""
125
Timestamp type with timezone support.
126
127
Parameters:
128
- unit: str, time unit ('s', 'ms', 'us', 'ns')
129
- tz: str, timezone identifier (e.g., 'UTC', 'America/New_York')
130
131
Returns:
132
TimestampType: Timestamp type with specified precision and timezone
133
"""
134
135
def date32():
136
"""32-bit date type (days since epoch)."""
137
138
def date64():
139
"""64-bit date type (milliseconds since epoch)."""
140
141
def duration(unit):
142
"""
143
Duration type.
144
145
Parameters:
146
- unit: str, time unit ('s', 'ms', 'us', 'ns')
147
148
Returns:
149
DurationType: Duration type with specified unit
150
"""
151
152
def month_day_nano_interval():
153
"""Month-day-nanosecond interval type."""
154
155
# Binary and string types
156
def binary():
157
"""Variable-length binary type."""
158
159
def string():
160
"""Variable-length string type (UTF-8)."""
161
162
def utf8():
163
"""Alias for string() - UTF-8 encoded strings."""
164
165
def large_binary():
166
"""Large variable-length binary type (64-bit offsets)."""
167
168
def large_string():
169
"""Large variable-length string type (64-bit offsets)."""
170
171
def large_utf8():
172
"""Alias for large_string() - large UTF-8 strings."""
173
174
def binary_view():
175
"""Variable-length binary type using the view layout."""
176
177
def string_view():
178
"""Variable-length UTF-8 string type using the view layout."""
179
180
def fixed_size_binary(byte_width):
181
"""
182
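Fixed-size binary type. Note: PyArrow constructs this type with `pa.binary(length)` (e.g. `pa.binary(16)`) rather than a separate top-level `fixed_size_binary` factory.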
183
184
Parameters:
185
- byte_width: int, number of bytes per value
186
187
Returns:
188
FixedSizeBinaryType: Fixed-size binary type
189
"""
190
191
# Container types
192
def list_(value_type):
193
"""
194
Variable-length list type.
195
196
Parameters:
197
- value_type: DataType, type of list elements
198
199
Returns:
200
ListType: List type with specified element type
201
"""
202
203
def large_list(value_type):
204
"""
205
Large variable-length list type (64-bit offsets).
206
207
Parameters:
208
- value_type: DataType, type of list elements
209
210
Returns:
211
LargeListType: Large list type with specified element type
212
"""
213
214
def fixed_size_list(value_type, list_size):
215
"""
216
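Fixed-size list type. Note: PyArrow constructs this type with `pa.list_(value_type, list_size)` rather than a separate top-level `fixed_size_list` factory.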
217
218
Parameters:
219
- value_type: DataType, type of list elements
220
- list_size: int, number of elements per list
221
222
Returns:
223
FixedSizeListType: Fixed-size list type
224
"""
225
226
def list_view(value_type):
227
"""
228
List view type for efficient list operations.
229
230
Parameters:
231
- value_type: DataType, type of list elements
232
233
Returns:
234
ListViewType: List view type with specified element type
235
"""
236
237
def large_list_view(value_type):
238
"""
239
Large list view type.
240
241
Parameters:
242
- value_type: DataType, type of list elements
243
244
Returns:
245
LargeListViewType: Large list view type with specified element type
246
"""
247
248
def map_(key_type, item_type, keys_sorted=False):
249
"""
250
Map type (key-value pairs).
251
252
Parameters:
253
- key_type: DataType, type of map keys
254
- item_type: DataType, type of map values
255
- keys_sorted: bool, whether keys are sorted
256
257
Returns:
258
MapType: Map type with specified key and value types
259
"""
260
261
def struct(fields):
262
"""
263
Struct type with named fields.
264
265
Parameters:
266
- fields: list of Field objects or (name, type) tuples
267
268
Returns:
269
StructType: Struct type with specified fields
270
"""
271
272
def union(child_fields, mode, type_codes=None):
273
"""
274
Union type supporting multiple value types.
275
276
Parameters:
277
- child_fields: list of Field objects
278
- mode: str, required union mode ('sparse' or 'dense')
- type_codes: list of int, optional type code for each child field
279
280
Returns:
281
UnionType: Union type with specified fields and mode
282
"""
283
284
def sparse_union(fields):
285
"""
286
Sparse union type.
287
288
Parameters:
289
- fields: list of Field objects
290
291
Returns:
292
SparseUnionType: Sparse union type
293
"""
294
295
def dense_union(fields):
296
"""
297
Dense union type.
298
299
Parameters:
300
- fields: list of Field objects
301
302
Returns:
303
DenseUnionType: Dense union type
304
"""
305
306
def dictionary(index_type, value_type, ordered=False):
307
"""
308
Dictionary-encoded type.
309
310
Parameters:
311
- index_type: DataType, type of dictionary indices
312
- value_type: DataType, type of dictionary values
313
- ordered: bool, whether dictionary is ordered
314
315
Returns:
316
DictionaryType: Dictionary type
317
"""
318
319
def run_end_encoded(run_end_type, value_type):
320
"""
321
Run-end encoded type for efficient storage of repeated values.
322
323
Parameters:
324
- run_end_type: DataType, type for run end indices
325
- value_type: DataType, type of encoded values
326
327
Returns:
328
RunEndEncodedType: Run-end encoded type
329
"""
330
331
# Advanced types
332
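def fixed_shape_tensor(value_type, shape, dim_names=None, permutation=None):
333
"""
334
Fixed-shape tensor type.
335
336
Parameters:
337
- value_type: DataType, type of tensor elements
338
- shape: tuple or list of int, shape of each tensor
- dim_names: tuple or list of str, optional names of the tensor dimensions
- permutation: tuple or list of int, optional ordering of the logical dimensions relative to the physical layout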
339
340
Returns:
341
FixedShapeTensorType: Fixed-shape tensor type
342
"""
343
344
def json_():
345
"""JSON type for storing JSON documents."""
346
347
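def opaque(storage_type, type_name, vendor_name):
348
"""
349
Opaque type for application-specific data.
350
351
Parameters:
352
- storage_type: DataType, underlying storage type
- type_name: str, name of the type in the external system
- vendor_name: str, name of the external system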
353
354
Returns:
355
OpaqueType: Opaque type
356
"""
357
358
def uuid():
359
"""UUID type for universally unique identifiers."""
360
```
361
362
### Type System Functions
363
364
Utility functions for working with types, including type inference, conversion, and registration of custom types.
365
366
```python { .api }
367
def type_for_alias(name):
368
"""
369
Get Arrow type from string alias.
370
371
Parameters:
372
- name: str, type alias (e.g., 'int64', 'string', 'float32')
373
374
Returns:
375
DataType: Arrow type corresponding to alias
376
"""
377
378
def from_numpy_dtype(dtype):
379
"""
380
Convert NumPy dtype to Arrow type.
381
382
Parameters:
383
- dtype: numpy.dtype, NumPy data type
384
385
Returns:
386
DataType: Corresponding Arrow type
387
"""
388
389
def infer_type(values, mask=None, from_pandas=False):
390
"""
391
Infer Arrow type from Python sequence.
392
393
Parameters:
394
- values: sequence, data to infer type from
395
- mask: array-like, boolean mask for null values
396
- from_pandas: bool, use pandas-specific inference
397
398
Returns:
399
DataType: Inferred Arrow type
400
"""
401
402
def register_extension_type(ext_type):
403
"""
404
Register custom extension type.
405
406
Parameters:
407
- ext_type: ExtensionType, extension type to register
408
"""
409
410
def unregister_extension_type(type_name):
411
"""
412
Unregister extension type.
413
414
Parameters:
415
- type_name: str, name of extension type to unregister
416
"""
417
```
418
419
### Type Classes
420
421
Base classes and specific implementations for all Arrow data types. These classes provide type information and enable type-safe operations.
422
423
```python { .api }
424
class DataType:
425
"""
426
Base class for all Arrow data types.
427
428
Attributes:
429
- id: Type identifier
430
"""
431
432
def __eq__(self, other): ...
433
def __hash__(self): ...
434
435
def equals(self, other):
436
"""Check type equality."""
437
438
def to_pandas_dtype(self):
439
"""Convert to pandas dtype."""
440
441
class DictionaryType(DataType):
442
"""
443
Dictionary-encoded type.
444
445
Attributes:
446
- index_type: Type of dictionary indices
447
- value_type: Type of dictionary values
448
- ordered: Whether dictionary is ordered
449
"""
450
451
class StructType(DataType):
452
"""
453
Struct type with named fields.
454
455
Attributes:
456
- num_fields: Number of fields
457
"""
458
459
def field(self, i):
460
"""Get field by index."""
461
462
def get_field_index(self, name):
463
"""Get field index by name."""
464
465
def get_all_field_indices(self, name):
466
"""Get all field indices by name."""
467
468
class ListType(DataType):
469
"""
470
Variable-length list type.
471
472
Attributes:
473
- value_type: Type of list elements
474
"""
475
476
class LargeListType(DataType):
477
"""
478
Large variable-length list type.
479
480
Attributes:
481
- value_type: Type of list elements
482
"""
483
484
class FixedSizeListType(DataType):
485
"""
486
Fixed-size list type.
487
488
Attributes:
489
- value_type: Type of list elements
490
- list_size: Number of elements per list
491
"""
492
493
class ListViewType(DataType):
494
"""
495
List view type.
496
497
Attributes:
498
- value_type: Type of list elements
499
"""
500
501
class LargeListViewType(DataType):
502
"""
503
Large list view type.
504
505
Attributes:
506
- value_type: Type of list elements
507
"""
508
509
class MapType(DataType):
510
"""
511
Map type for key-value pairs.
512
513
Attributes:
514
- key_type: Type of map keys
515
- item_type: Type of map values
516
- keys_sorted: Whether keys are sorted
517
"""
518
519
class UnionType(DataType):
520
"""
521
Base class for union types.
522
523
Attributes:
524
- mode: Union mode ('sparse' or 'dense')
525
- num_fields: Number of union fields
526
"""
527
528
class SparseUnionType(UnionType):
529
"""Sparse union type."""
530
531
class DenseUnionType(UnionType):
532
"""Dense union type."""
533
534
class TimestampType(DataType):
535
"""
536
Timestamp type.
537
538
Attributes:
539
- unit: Time unit ('s', 'ms', 'us', 'ns')
540
- tz: Timezone identifier
541
"""
542
543
class Time32Type(DataType):
544
"""
545
32-bit time type.
546
547
Attributes:
548
- unit: Time unit ('s', 'ms')
549
"""
550
551
class Time64Type(DataType):
552
"""
553
64-bit time type.
554
555
Attributes:
556
- unit: Time unit ('us', 'ns')
557
"""
558
559
class DurationType(DataType):
560
"""
561
Duration type.
562
563
Attributes:
564
- unit: Time unit ('s', 'ms', 'us', 'ns')
565
"""
566
567
class FixedSizeBinaryType(DataType):
568
"""
569
Fixed-size binary type.
570
571
Attributes:
572
- byte_width: Number of bytes per value
573
"""
574
575
class Decimal32Type(DataType):
576
"""
577
32-bit decimal type.
578
579
Attributes:
580
- precision: Total number of digits
581
- scale: Number of digits after decimal point
582
"""
583
584
class Decimal64Type(DataType):
585
"""
586
64-bit decimal type.
587
588
Attributes:
589
- precision: Total number of digits
590
- scale: Number of digits after decimal point
591
"""
592
593
class Decimal128Type(DataType):
594
"""
595
128-bit decimal type.
596
597
Attributes:
598
- precision: Total number of digits
599
- scale: Number of digits after decimal point
600
"""
601
602
class Decimal256Type(DataType):
603
"""
604
256-bit decimal type.
605
606
Attributes:
607
- precision: Total number of digits
608
- scale: Number of digits after decimal point
609
"""
610
611
class BaseExtensionType(DataType):
612
"""Base class for extension types."""
613
614
class ExtensionType(BaseExtensionType):
615
"""
616
User-defined extension type.
617
618
Attributes:
619
- extension_name: Name of extension type
620
- storage_type: Underlying storage type
621
"""
622
623
def __arrow_ext_serialize__(self):
624
"""Serialize extension type metadata."""
625
626
@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):
627
"""Deserialize extension type from metadata."""
628
629
class RunEndEncodedType(DataType):
630
"""
631
Run-end encoded type.
632
633
Attributes:
634
- run_end_type: Type of run end indices
635
- value_type: Type of encoded values
636
"""
637
638
class FixedShapeTensorType(DataType):
639
"""
640
Fixed-shape tensor type.
641
642
Attributes:
643
- shape: Tensor shape
644
- value_type: Type of tensor elements
645
"""
646
647
class JsonType(DataType):
648
"""JSON document type."""
649
650
class OpaqueType(DataType):
651
"""
652
Opaque type for application-specific data.
653
654
Attributes:
655
- storage_type: Underlying storage type
- type_name: Name of the type in the external system
- vendor_name: Name of the external system
656
"""
657
658
class UuidType(DataType):
659
"""UUID type."""
660
661
class UnknownExtensionType(ExtensionType):
662
"""Unknown extension type placeholder."""
663
```
664
665
### Type Checking Functions
666
667
Functions to check and validate Arrow data types. These predicates live in the `pyarrow.types` module (for example, `pa.types.is_integer(t)`) and enable type-safe programming and conditional logic based on type information.
668
669
```python { .api }
670
# Primitive type checks
671
def is_null(type):
672
"""Check if type is null type."""
673
674
def is_boolean(type):
675
"""Check if type is boolean type."""
676
677
def is_integer(type):
678
"""Check if type is any integer type."""
679
680
def is_signed_integer(type):
681
"""Check if type is signed integer type."""
682
683
def is_unsigned_integer(type):
684
"""Check if type is unsigned integer type."""
685
686
def is_int8(type):
687
"""Check if type is 8-bit signed integer."""
688
689
def is_int16(type):
690
"""Check if type is 16-bit signed integer."""
691
692
def is_int32(type):
693
"""Check if type is 32-bit signed integer."""
694
695
def is_int64(type):
696
"""Check if type is 64-bit signed integer."""
697
698
def is_uint8(type):
699
"""Check if type is 8-bit unsigned integer."""
700
701
def is_uint16(type):
702
"""Check if type is 16-bit unsigned integer."""
703
704
def is_uint32(type):
705
"""Check if type is 32-bit unsigned integer."""
706
707
def is_uint64(type):
708
"""Check if type is 64-bit unsigned integer."""
709
710
def is_floating(type):
711
"""Check if type is floating point type."""
712
713
def is_float16(type):
714
"""Check if type is 16-bit floating point."""
715
716
def is_float32(type):
717
"""Check if type is 32-bit floating point."""
718
719
def is_float64(type):
720
"""Check if type is 64-bit floating point."""
721
722
# Container type checks
723
def is_list(type):
724
"""Check if type is variable-length list."""
725
726
def is_large_list(type):
727
"""Check if type is large variable-length list."""
728
729
def is_fixed_size_list(type):
730
"""Check if type is fixed-size list."""
731
732
def is_list_view(type):
733
"""Check if type is list view."""
734
735
def is_large_list_view(type):
736
"""Check if type is large list view."""
737
738
def is_struct(type):
739
"""Check if type is struct type."""
740
741
def is_union(type):
742
"""Check if type is union type."""
743
744
def is_nested(type):
745
"""Check if type is nested (list, struct, map, union)."""
746
747
def is_run_end_encoded(type):
748
"""Check if type is run-end encoded."""
749
750
# Temporal type checks
751
def is_temporal(type):
752
"""Check if type is temporal (timestamp, date, time, duration)."""
753
754
def is_timestamp(type):
755
"""Check if type is timestamp."""
756
757
def is_duration(type):
758
"""Check if type is duration."""
759
760
def is_time(type):
761
"""Check if type is time (32-bit or 64-bit)."""
762
763
def is_time32(type):
764
"""Check if type is 32-bit time."""
765
766
def is_time64(type):
767
"""Check if type is 64-bit time."""
768
769
def is_date(type):
770
"""Check if type is date (32-bit or 64-bit)."""
771
772
def is_date32(type):
773
"""Check if type is 32-bit date."""
774
775
def is_date64(type):
776
"""Check if type is 64-bit date."""
777
778
# Binary and string type checks
779
def is_binary(type):
780
"""Check if type is variable-length binary."""
781
782
def is_large_binary(type):
783
"""Check if type is large variable-length binary."""
784
785
def is_string(type):
786
"""Check if type is variable-length string."""
787
788
def is_large_string(type):
789
"""Check if type is large variable-length string."""
790
791
def is_binary_view(type):
792
"""Check if type is binary view."""
793
794
def is_string_view(type):
795
"""Check if type is string view."""
796
797
def is_fixed_size_binary(type):
798
"""Check if type is fixed-size binary."""
799
800
# Other type checks
801
def is_map(type):
802
"""Check if type is map type."""
803
804
def is_decimal(type):
805
"""Check if type is any decimal type."""
806
807
def is_decimal32(type):
808
"""Check if type is 32-bit decimal."""
809
810
def is_decimal64(type):
811
"""Check if type is 64-bit decimal."""
812
813
def is_decimal128(type):
814
"""Check if type is 128-bit decimal."""
815
816
def is_decimal256(type):
817
"""Check if type is 256-bit decimal."""
818
819
def is_dictionary(type):
820
"""Check if type is dictionary-encoded."""
821
822
def is_interval(type):
823
"""Check if type is interval type."""
824
825
def is_primitive(type):
826
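"""Check if type is a fixed-width primitive type such as boolean, integer, floating point, or temporal. Not every non-nested type qualifies: strings and binary are not primitive."""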
827
```
828
829
## Usage Examples
830
831
### Creating and Using Types
832
833
```python
834
import pyarrow as pa
835
836
# Create primitive types
837
int_type = pa.int64()
838
str_type = pa.string()
839
float_type = pa.float64()
840
841
# Create temporal types
842
timestamp_type = pa.timestamp('ms', tz='UTC')
843
date_type = pa.date32()
844
duration_type = pa.duration('us')
845
846
# Create decimal types
847
decimal_type = pa.decimal128(precision=10, scale=2)
848
849
# Create nested types
850
list_type = pa.list_(pa.int32())
851
struct_type = pa.struct([
852
pa.field('name', pa.string()),
853
pa.field('age', pa.int32()),
854
pa.field('scores', pa.list_(pa.float64()))
855
])
856
map_type = pa.map_(pa.string(), pa.int64())
857
```
858
859
### Type Checking and Conversion
860
861
```python
862
import pyarrow as pa
863
864
# Type checking
865
data_type = pa.int64()
866
print(pa.types.is_integer(data_type)) # True
867
print(pa.types.is_floating(data_type)) # False
868
print(pa.types.is_signed_integer(data_type)) # True
869
870
# Type inference
871
values = [1, 2, 3, 4, 5]
872
inferred_type = pa.infer_type(values)
873
print(inferred_type) # int64
874
875
# Convert from NumPy
876
import numpy as np
877
numpy_dtype = np.dtype('float32')
878
arrow_type = pa.from_numpy_dtype(numpy_dtype)
879
print(arrow_type) # float32
880
881
# Type aliases
882
string_type = pa.type_for_alias('string')
883
int_type = pa.type_for_alias('int64')
884
```
885
886
### Working with Complex Types
887
888
```python
889
import pyarrow as pa
890
891
# Create schema with complex types
892
schema = pa.schema([
893
pa.field('id', pa.int64()),
894
pa.field('name', pa.string()),
895
pa.field('tags', pa.list_(pa.string())),
896
pa.field('metadata', pa.map_(pa.string(), pa.string())),
897
pa.field('location', pa.struct([
898
pa.field('lat', pa.float64()),
899
pa.field('lon', pa.float64())
900
])),
901
pa.field('timestamp', pa.timestamp('ms', tz='UTC'))
902
])
903
904
# Create arrays with complex types
905
tags_array = pa.array([['python', 'data'], ['arrow', 'columnar'], ['analytics']])
906
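# Map values are given as lists of (key, value) tuples with an explicit map type;
# untyped dicts would be inferred as a struct and would not match the schema.
metadata_array = pa.array([
907
[('version', '1.0'), ('author', 'alice')],
908
[('version', '2.0')],
909
[]
910
], type=pa.map_(pa.string(), pa.string()))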
911
location_array = pa.array([
912
{'lat': 40.7128, 'lon': -74.0060},
913
{'lat': 51.5074, 'lon': -0.1278},
914
{'lat': 35.6762, 'lon': 139.6503}
915
])
916
917
# Create table with complex data
918
table = pa.table({
919
'id': [1, 2, 3],
920
'name': ['New York', 'London', 'Tokyo'],
921
'tags': tags_array,
922
'metadata': metadata_array,
923
'location': location_array,
924
'timestamp': pa.array([
925
'2023-01-01T00:00:00.000Z',
926
'2023-01-02T00:00:00.000Z',
927
'2023-01-03T00:00:00.000Z'
928
]).cast(pa.timestamp('ms', tz='UTC'))  # parse the ISO 8601 strings by casting; pa.array() does not convert str to timestamp
929
}, schema=schema)
930
```
931
932
### Extension Types
933
934
```python
935
import pyarrow as pa
936
937
# Define custom extension type
938
class UuidType(pa.ExtensionType):
939
def __init__(self):
940
super().__init__(pa.binary(16), "uuid")
941
942
def __arrow_ext_serialize__(self):
943
return b''
944
945
@classmethod
946
def __arrow_ext_deserialize__(cls, storage_type, serialized):
947
return UuidType()
948
949
# Register extension type
950
pa.register_extension_type(UuidType())
951
952
# Create array with extension type
953
uuid_type = UuidType()
954
uuid_array = pa.array([
955
b'\x12\x34\x56\x78\x90\xab\xcd\xef\x12\x34\x56\x78\x90\xab\xcd\xef',
956
b'\xfe\xdc\xba\x98\x76\x54\x32\x10\xfe\xdc\xba\x98\x76\x54\x32\x10'
957
], type=uuid_type)
958
```