or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md arrow-flight.md compute-functions.md core-data-structures.md data-types.md dataset-operations.md file-formats.md index.md memory-io.md

data-types.mddocs/

0

# Data Types

1

2

Comprehensive type system supporting primitive types, nested structures, temporal types, and custom extension types. PyArrow's type system provides rich data modeling capabilities with type checking, conversion, and inference for robust data processing workflows.

3

4

## Capabilities

5

6

### Type Factory Functions

7

8

Functions for creating Arrow data types. These factory functions return DataType objects that can be used to define schemas and create typed arrays.

9

10

```python { .api }

11

# Primitive types

12

def null():

13

"""Null type containing only null values."""

14

15

def bool_():

16

"""Boolean type (true/false values)."""

17

18

def int8():

19

"""8-bit signed integer type."""

20

21

def int16():

22

"""16-bit signed integer type."""

23

24

def int32():

25

"""32-bit signed integer type."""

26

27

def int64():

28

"""64-bit signed integer type."""

29

30

def uint8():

31

"""8-bit unsigned integer type."""

32

33

def uint16():

34

"""16-bit unsigned integer type."""

35

36

def uint32():

37

"""32-bit unsigned integer type."""

38

39

def uint64():

40

"""64-bit unsigned integer type."""

41

42

def float16():

43

"""16-bit floating point type."""

44

45

def float32():

46

"""32-bit floating point type."""

47

48

def float64():

49

"""64-bit floating point type."""

50

51

# Decimal types

52

def decimal32(precision, scale=0):

53

"""

54

32-bit decimal type.

55

56

Parameters:

57

- precision: int, total number of digits (1-9)

58

- scale: int, number of digits after decimal point

59

60

Returns:

61

Decimal32Type: 32-bit decimal type

62

"""

63

64

def decimal64(precision, scale=0):

65

"""

66

64-bit decimal type.

67

68

Parameters:

69

- precision: int, total number of digits (1-18)

70

- scale: int, number of digits after decimal point

71

72

Returns:

73

Decimal64Type: 64-bit decimal type

74

"""

75

76

def decimal128(precision, scale=0):

77

"""

78

128-bit decimal type.

79

80

Parameters:

81

- precision: int, total number of digits (1-38)

82

- scale: int, number of digits after decimal point

83

84

Returns:

85

Decimal128Type: 128-bit decimal type

86

"""

87

88

def decimal256(precision, scale=0):

89

"""

90

256-bit decimal type.

91

92

Parameters:

93

- precision: int, total number of digits (1-76)

94

- scale: int, number of digits after decimal point

95

96

Returns:

97

Decimal256Type: 256-bit decimal type

98

"""

99

100

# Temporal types

101

def time32(unit):

102

"""

103

32-bit time type.

104

105

Parameters:

106

- unit: str, time unit ('s' for seconds, 'ms' for milliseconds)

107

108

Returns:

109

Time32Type: 32-bit time type

110

"""

111

112

def time64(unit):

113

"""

114

64-bit time type.

115

116

Parameters:

117

- unit: str, time unit ('us' for microseconds, 'ns' for nanoseconds)

118

119

Returns:

120

Time64Type: 64-bit time type

121

"""

122

123

def timestamp(unit, tz=None):

124

"""

125

Timestamp type with timezone support.

126

127

Parameters:

128

- unit: str, time unit ('s', 'ms', 'us', 'ns')

129

- tz: str, timezone identifier (e.g., 'UTC', 'America/New_York')

130

131

Returns:

132

TimestampType: Timestamp type with specified precision and timezone

133

"""

134

135

def date32():

136

"""32-bit date type (days since epoch)."""

137

138

def date64():

139

"""64-bit date type (milliseconds since epoch)."""

140

141

def duration(unit):

142

"""

143

Duration type.

144

145

Parameters:

146

- unit: str, time unit ('s', 'ms', 'us', 'ns')

147

148

Returns:

149

DurationType: Duration type with specified unit

150

"""

151

152

def month_day_nano_interval():

153

"""Month-day-nanosecond interval type."""

154

155

# Binary and string types

156

def binary():

157

"""Variable-length binary type."""

158

159

def string():

160

"""Variable-length string type (UTF-8)."""

161

162

def utf8():

163

"""Alias for string() - UTF-8 encoded strings."""

164

165

def large_binary():

166

"""Large variable-length binary type (64-bit offsets)."""

167

168

def large_string():

169

"""Large variable-length string type (64-bit offsets)."""

170

171

def large_utf8():

172

"""Alias for large_string() - large UTF-8 strings."""

173

174

def binary_view():

175

"""Binary view type for large binary data."""

176

177

def string_view():

178

"""String view type for large string data."""

179

180

def fixed_size_binary(byte_width):

181

"""

182

Fixed-size binary type.

183

184

Parameters:

185

- byte_width: int, number of bytes per value

186

187

Returns:

188

FixedSizeBinaryType: Fixed-size binary type

189

"""

190

191

# Container types

192

def list_(value_type):

193

"""

194

Variable-length list type.

195

196

Parameters:

197

- value_type: DataType, type of list elements

198

199

Returns:

200

ListType: List type with specified element type

201

"""

202

203

def large_list(value_type):

204

"""

205

Large variable-length list type (64-bit offsets).

206

207

Parameters:

208

- value_type: DataType, type of list elements

209

210

Returns:

211

LargeListType: Large list type with specified element type

212

"""

213

214

def fixed_size_list(value_type, list_size):

215

"""

216

Fixed-size list type.

217

218

Parameters:

219

- value_type: DataType, type of list elements

220

- list_size: int, number of elements per list

221

222

Returns:

223

FixedSizeListType: Fixed-size list type

224

"""

225

226

def list_view(value_type):

227

"""

228

List view type for efficient list operations.

229

230

Parameters:

231

- value_type: DataType, type of list elements

232

233

Returns:

234

ListViewType: List view type with specified element type

235

"""

236

237

def large_list_view(value_type):

238

"""

239

Large list view type.

240

241

Parameters:

242

- value_type: DataType, type of list elements

243

244

Returns:

245

LargeListViewType: Large list view type with specified element type

246

"""

247

248

def map_(key_type, item_type, keys_sorted=False):

249

"""

250

Map type (key-value pairs).

251

252

Parameters:

253

- key_type: DataType, type of map keys

254

- item_type: DataType, type of map values

255

- keys_sorted: bool, whether keys are sorted

256

257

Returns:

258

MapType: Map type with specified key and value types

259

"""

260

261

def struct(fields):

262

"""

263

Struct type with named fields.

264

265

Parameters:

266

- fields: list of Field objects or (name, type) tuples

267

268

Returns:

269

StructType: Struct type with specified fields

270

"""

271

272

def union(fields, mode='sparse'):

273

"""

274

Union type supporting multiple value types.

275

276

Parameters:

277

- fields: list of Field objects

278

- mode: str, union mode ('sparse' or 'dense')

279

280

Returns:

281

UnionType: Union type with specified fields and mode

282

"""

283

284

def sparse_union(fields):

285

"""

286

Sparse union type.

287

288

Parameters:

289

- fields: list of Field objects

290

291

Returns:

292

SparseUnionType: Sparse union type

293

"""

294

295

def dense_union(fields):

296

"""

297

Dense union type.

298

299

Parameters:

300

- fields: list of Field objects

301

302

Returns:

303

DenseUnionType: Dense union type

304

"""

305

306

def dictionary(index_type, value_type, ordered=False):

307

"""

308

Dictionary-encoded type.

309

310

Parameters:

311

- index_type: DataType, type of dictionary indices

312

- value_type: DataType, type of dictionary values

313

- ordered: bool, whether dictionary is ordered

314

315

Returns:

316

DictionaryType: Dictionary type

317

"""

318

319

def run_end_encoded(run_end_type, value_type):

320

"""

321

Run-end encoded type for efficient storage of repeated values.

322

323

Parameters:

324

- run_end_type: DataType, type for run end indices

325

- value_type: DataType, type of encoded values

326

327

Returns:

328

RunEndEncodedType: Run-end encoded type

329

"""

330

331

# Advanced types

332

def fixed_shape_tensor(value_type, shape, dim_names=None, permutation=None):

333

"""

334

Fixed-shape tensor type.

335

336

Parameters:

337

- shape: tuple of int, tensor shape

338

- value_type: DataType, type of tensor elements

339

340

Returns:

341

FixedShapeTensorType: Fixed-shape tensor type

342

"""

343

344

def json_():

345

"""JSON type for storing JSON documents."""

346

347

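def opaque(storage_type, type_name, vendor_name):

348

"""

349

Opaque type for application-specific data.

350

351

Parameters:

352

- storage_type: DataType, underlying storage type
- type_name: str, name of the type in the external system
- vendor_name: str, name of the external system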

353

354

Returns:

355

OpaqueType: Opaque type

356

"""

357

358

def uuid():

359

"""UUID type for universally unique identifiers."""

360

```

361

362

### Type System Functions

363

364

Utility functions for working with types, including type inference, conversion, and registration of custom types.

365

366

```python { .api }

367

def type_for_alias(name):

368

"""

369

Get Arrow type from string alias.

370

371

Parameters:

372

- name: str, type alias (e.g., 'int64', 'string', 'float32')

373

374

Returns:

375

DataType: Arrow type corresponding to alias

376

"""

377

378

def from_numpy_dtype(dtype):

379

"""

380

Convert NumPy dtype to Arrow type.

381

382

Parameters:

383

- dtype: numpy.dtype, NumPy data type

384

385

Returns:

386

DataType: Corresponding Arrow type

387

"""

388

389

def infer_type(values, mask=None, from_pandas=False):

390

"""

391

Infer Arrow type from Python sequence.

392

393

Parameters:

394

- values: sequence, data to infer type from

395

- mask: array-like, boolean mask for null values

396

- from_pandas: bool, use pandas-specific inference

397

398

Returns:

399

DataType: Inferred Arrow type

400

"""

401

402

def register_extension_type(ext_type):

403

"""

404

Register custom extension type.

405

406

Parameters:

407

- ext_type: ExtensionType, extension type to register

408

"""

409

410

def unregister_extension_type(type_name):

411

"""

412

Unregister extension type.

413

414

Parameters:

415

- type_name: str, name of extension type to unregister

416

"""

417

```

418

419

### Type Classes

420

421

Base classes and specific implementations for all Arrow data types. These classes provide type information and enable type-safe operations.

422

423

```python { .api }

424

class DataType:

425

"""

426

Base class for all Arrow data types.

427

428

Attributes:

429

- id: Type identifier

430

"""

431

432

def __eq__(self, other): ...

433

def __hash__(self): ...

434

435

def equals(self, other):

436

"""Check type equality."""

437

438

def to_pandas_dtype(self):

439

"""Convert to pandas dtype."""

440

441

class DictionaryType(DataType):

442

"""

443

Dictionary-encoded type.

444

445

Attributes:

446

- index_type: Type of dictionary indices

447

- value_type: Type of dictionary values

448

- ordered: Whether dictionary is ordered

449

"""

450

451

class StructType(DataType):

452

"""

453

Struct type with named fields.

454

455

Attributes:

456

- num_fields: Number of fields

457

"""

458

459

def field(self, i):

460

"""Get field by index."""

461

462

def get_field_index(self, name):

463

"""Get field index by name."""

464

465

def get_all_field_indices(self, name):

466

"""Get all field indices by name."""

467

468

class ListType(DataType):

469

"""

470

Variable-length list type.

471

472

Attributes:

473

- value_type: Type of list elements

474

"""

475

476

class LargeListType(DataType):

477

"""

478

Large variable-length list type.

479

480

Attributes:

481

- value_type: Type of list elements

482

"""

483

484

class FixedSizeListType(DataType):

485

"""

486

Fixed-size list type.

487

488

Attributes:

489

- value_type: Type of list elements

490

- list_size: Number of elements per list

491

"""

492

493

class ListViewType(DataType):

494

"""

495

List view type.

496

497

Attributes:

498

- value_type: Type of list elements

499

"""

500

501

class LargeListViewType(DataType):

502

"""

503

Large list view type.

504

505

Attributes:

506

- value_type: Type of list elements

507

"""

508

509

class MapType(DataType):

510

"""

511

Map type for key-value pairs.

512

513

Attributes:

514

- key_type: Type of map keys

515

- item_type: Type of map values

516

- keys_sorted: Whether keys are sorted

517

"""

518

519

class UnionType(DataType):

520

"""

521

Base class for union types.

522

523

Attributes:

524

- mode: Union mode ('sparse' or 'dense')

525

- num_fields: Number of union fields

526

"""

527

528

class SparseUnionType(UnionType):

529

"""Sparse union type."""

530

531

class DenseUnionType(UnionType):

532

"""Dense union type."""

533

534

class TimestampType(DataType):

535

"""

536

Timestamp type.

537

538

Attributes:

539

- unit: Time unit ('s', 'ms', 'us', 'ns')

540

- tz: Timezone identifier

541

"""

542

543

class Time32Type(DataType):

544

"""

545

32-bit time type.

546

547

Attributes:

548

- unit: Time unit ('s', 'ms')

549

"""

550

551

class Time64Type(DataType):

552

"""

553

64-bit time type.

554

555

Attributes:

556

- unit: Time unit ('us', 'ns')

557

"""

558

559

class DurationType(DataType):

560

"""

561

Duration type.

562

563

Attributes:

564

- unit: Time unit ('s', 'ms', 'us', 'ns')

565

"""

566

567

class FixedSizeBinaryType(DataType):

568

"""

569

Fixed-size binary type.

570

571

Attributes:

572

- byte_width: Number of bytes per value

573

"""

574

575

class Decimal32Type(DataType):

576

"""

577

32-bit decimal type.

578

579

Attributes:

580

- precision: Total number of digits

581

- scale: Number of digits after decimal point

582

"""

583

584

class Decimal64Type(DataType):

585

"""

586

64-bit decimal type.

587

588

Attributes:

589

- precision: Total number of digits

590

- scale: Number of digits after decimal point

591

"""

592

593

class Decimal128Type(DataType):

594

"""

595

128-bit decimal type.

596

597

Attributes:

598

- precision: Total number of digits

599

- scale: Number of digits after decimal point

600

"""

601

602

class Decimal256Type(DataType):

603

"""

604

256-bit decimal type.

605

606

Attributes:

607

- precision: Total number of digits

608

- scale: Number of digits after decimal point

609

"""

610

611

class BaseExtensionType(DataType):

612

"""Base class for extension types."""

613

614

class ExtensionType(BaseExtensionType):

615

"""

616

User-defined extension type.

617

618

Attributes:

619

- extension_name: Name of extension type

620

- storage_type: Underlying storage type

621

"""

622

623

def __arrow_ext_serialize__(self):

624

"""Serialize extension type metadata."""

625

626

@classmethod
def __arrow_ext_deserialize__(cls, storage_type, serialized):

627

"""Deserialize extension type from metadata."""

628

629

class RunEndEncodedType(DataType):

630

"""

631

Run-end encoded type.

632

633

Attributes:

634

- run_end_type: Type of run end indices

635

- value_type: Type of encoded values

636

"""

637

638

class FixedShapeTensorType(DataType):

639

"""

640

Fixed-shape tensor type.

641

642

Attributes:

643

- shape: Tensor shape

644

- value_type: Type of tensor elements

645

"""

646

647

class JsonType(DataType):

648

"""JSON document type."""

649

650

class OpaqueType(DataType):

651

"""

652

Opaque type for application-specific data.

653

654

Attributes:

655

- opaque_type: Underlying storage type

656

"""

657

658

class UuidType(DataType):

659

"""UUID type."""

660

661

class UnknownExtensionType(ExtensionType):

662

"""Unknown extension type placeholder."""

663

```

664

665

### Type Checking Functions

666

667

Functions to check and validate Arrow data types. These predicates enable type-safe programming and conditional logic based on type information.

668

669

```python { .api }

670

# Primitive type checks

671

def is_null(type):

672

"""Check if type is null type."""

673

674

def is_boolean(type):

675

"""Check if type is boolean type."""

676

677

def is_integer(type):

678

"""Check if type is any integer type."""

679

680

def is_signed_integer(type):

681

"""Check if type is signed integer type."""

682

683

def is_unsigned_integer(type):

684

"""Check if type is unsigned integer type."""

685

686

def is_int8(type):

687

"""Check if type is 8-bit signed integer."""

688

689

def is_int16(type):

690

"""Check if type is 16-bit signed integer."""

691

692

def is_int32(type):

693

"""Check if type is 32-bit signed integer."""

694

695

def is_int64(type):

696

"""Check if type is 64-bit signed integer."""

697

698

def is_uint8(type):

699

"""Check if type is 8-bit unsigned integer."""

700

701

def is_uint16(type):

702

"""Check if type is 16-bit unsigned integer."""

703

704

def is_uint32(type):

705

"""Check if type is 32-bit unsigned integer."""

706

707

def is_uint64(type):

708

"""Check if type is 64-bit unsigned integer."""

709

710

def is_floating(type):

711

"""Check if type is floating point type."""

712

713

def is_float16(type):

714

"""Check if type is 16-bit floating point."""

715

716

def is_float32(type):

717

"""Check if type is 32-bit floating point."""

718

719

def is_float64(type):

720

"""Check if type is 64-bit floating point."""

721

722

# Container type checks

723

def is_list(type):

724

"""Check if type is variable-length list."""

725

726

def is_large_list(type):

727

"""Check if type is large variable-length list."""

728

729

def is_fixed_size_list(type):

730

"""Check if type is fixed-size list."""

731

732

def is_list_view(type):

733

"""Check if type is list view."""

734

735

def is_large_list_view(type):

736

"""Check if type is large list view."""

737

738

def is_struct(type):

739

"""Check if type is struct type."""

740

741

def is_union(type):

742

"""Check if type is union type."""

743

744

def is_nested(type):

745

"""Check if type is nested (list, struct, map, union)."""

746

747

def is_run_end_encoded(type):

748

"""Check if type is run-end encoded."""

749

750

# Temporal type checks

751

def is_temporal(type):

752

"""Check if type is temporal (timestamp, date, time, duration)."""

753

754

def is_timestamp(type):

755

"""Check if type is timestamp."""

756

757

def is_duration(type):

758

"""Check if type is duration."""

759

760

def is_time(type):

761

"""Check if type is time (32-bit or 64-bit)."""

762

763

def is_time32(type):

764

"""Check if type is 32-bit time."""

765

766

def is_time64(type):

767

"""Check if type is 64-bit time."""

768

769

def is_date(type):

770

"""Check if type is date (32-bit or 64-bit)."""

771

772

def is_date32(type):

773

"""Check if type is 32-bit date."""

774

775

def is_date64(type):

776

"""Check if type is 64-bit date."""

777

778

# Binary and string type checks

779

def is_binary(type):

780

"""Check if type is variable-length binary."""

781

782

def is_large_binary(type):

783

"""Check if type is large variable-length binary."""

784

785

def is_string(type):

786

"""Check if type is variable-length string."""

787

788

def is_large_string(type):

789

"""Check if type is large variable-length string."""

790

791

def is_binary_view(type):

792

"""Check if type is binary view."""

793

794

def is_string_view(type):

795

"""Check if type is string view."""

796

797

def is_fixed_size_binary(type):

798

"""Check if type is fixed-size binary."""

799

800

# Other type checks

801

def is_map(type):

802

"""Check if type is map type."""

803

804

def is_decimal(type):

805

"""Check if type is any decimal type."""

806

807

def is_decimal32(type):

808

"""Check if type is 32-bit decimal."""

809

810

def is_decimal64(type):

811

"""Check if type is 64-bit decimal."""

812

813

def is_decimal128(type):

814

"""Check if type is 128-bit decimal."""

815

816

def is_decimal256(type):

817

"""Check if type is 256-bit decimal."""

818

819

def is_dictionary(type):

820

"""Check if type is dictionary-encoded."""

821

822

def is_interval(type):

823

"""Check if type is interval type."""

824

825

def is_primitive(type):

826

"""Check if type is primitive (non-nested)."""

827

```

828

829

## Usage Examples

830

831

### Creating and Using Types

832

833

```python

834

import pyarrow as pa

835

836

# Create primitive types

837

int_type = pa.int64()

838

str_type = pa.string()

839

float_type = pa.float64()

840

841

# Create temporal types

842

timestamp_type = pa.timestamp('ms', tz='UTC')

843

date_type = pa.date32()

844

duration_type = pa.duration('us')

845

846

# Create decimal types

847

decimal_type = pa.decimal128(precision=10, scale=2)

848

849

# Create nested types

850

list_type = pa.list_(pa.int32())

851

struct_type = pa.struct([

852

pa.field('name', pa.string()),

853

pa.field('age', pa.int32()),

854

pa.field('scores', pa.list_(pa.float64()))

855

])

856

map_type = pa.map_(pa.string(), pa.int64())

857

```

858

859

### Type Checking and Conversion

860

861

```python

862

import pyarrow as pa

863

864

# Type checking

865

data_type = pa.int64()

866

print(pa.types.is_integer(data_type)) # True

867

print(pa.types.is_floating(data_type)) # False

868

print(pa.types.is_signed_integer(data_type)) # True

869

870

# Type inference

871

values = [1, 2, 3, 4, 5]

872

inferred_type = pa.infer_type(values)

873

print(inferred_type) # int64

874

875

# Convert from NumPy

876

import numpy as np

877

numpy_dtype = np.dtype('float32')

878

arrow_type = pa.from_numpy_dtype(numpy_dtype)

879

print(arrow_type) # float32

880

881

# Type aliases

882

string_type = pa.type_for_alias('string')

883

int_type = pa.type_for_alias('int64')

884

```

885

886

### Working with Complex Types

887

888

```python

889

import pyarrow as pa

890

891

# Create schema with complex types

892

schema = pa.schema([

893

pa.field('id', pa.int64()),

894

pa.field('name', pa.string()),

895

pa.field('tags', pa.list_(pa.string())),

896

pa.field('metadata', pa.map_(pa.string(), pa.string())),

897

pa.field('location', pa.struct([

898

pa.field('lat', pa.float64()),

899

pa.field('lon', pa.float64())

900

])),

901

pa.field('timestamp', pa.timestamp('ms', tz='UTC'))

902

])

903

904

# Create arrays with complex types

905

tags_array = pa.array([['python', 'data'], ['arrow', 'columnar'], ['analytics']])

906

metadata_array = pa.array([

907

{'version': '1.0', 'author': 'alice'},

908

{'version': '2.0'},

909

{}

910

])

911

location_array = pa.array([

912

{'lat': 40.7128, 'lon': -74.0060},

913

{'lat': 51.5074, 'lon': -0.1278},

914

{'lat': 35.6762, 'lon': 139.6503}

915

])

916

917

# Create table with complex data

918

table = pa.table({

919

'id': [1, 2, 3],

920

'name': ['New York', 'London', 'Tokyo'],

921

'tags': tags_array,

922

'metadata': metadata_array,

923

'location': location_array,

924

'timestamp': pa.array([

925

'2023-01-01T00:00:00.000Z',

926

'2023-01-02T00:00:00.000Z',

927

'2023-01-03T00:00:00.000Z'

928

], type=pa.timestamp('ms', tz='UTC'))

929

}, schema=schema)

930

```

931

932

### Extension Types

933

934

```python

935

import pyarrow as pa

936

937

# Define custom extension type

938

class UuidType(pa.ExtensionType):

939

def __init__(self):

940

super().__init__(pa.binary(16), "uuid")

941

942

def __arrow_ext_serialize__(self):

943

return b''

944

945

@classmethod

946

def __arrow_ext_deserialize__(cls, storage_type, serialized):

947

return UuidType()

948

949

# Register extension type

950

pa.register_extension_type(UuidType())

951

952

# Create array with extension type

953

uuid_type = UuidType()

954

uuid_array = pa.array([

955

b'\x12\x34\x56\x78\x90\xab\xcd\xef\x12\x34\x56\x78\x90\xab\xcd\xef',

956

b'\xfe\xdc\xba\x98\x76\x54\x32\x10\xfe\xdc\xba\x98\x76\x54\x32\x10'

957

], type=uuid_type)

958

```