or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

advanced-features.md arrow-flight.md compute-functions.md core-data-structures.md data-types.md dataset-operations.md file-formats.md index.md memory-io.md

docs/core-data-structures.md

0

# Core Data Structures

1

2

Fundamental data containers that form the foundation of PyArrow's columnar data processing capabilities. These structures provide efficient storage and manipulation of typed data in memory-optimized columnar layouts.

3

4

## Capabilities

5

6

### Arrays

7

8

One-dimensional sequences of values with a specific data type. Arrays are immutable and provide the basic building blocks for all other data structures in PyArrow.

9

10

```python { .api }

11

def array(obj, type=None, mask=None, size=None, from_pandas=None, safe=True):

12

"""

13

Create Arrow array from Python sequence, NumPy array, or pandas data.

14

15

Parameters:

16

- obj: sequence, NumPy array, or pandas Series to convert

17

- type: DataType, explicit type for the array

18

- mask: array-like, boolean mask for null values

19

- size: int, length of array if obj is scalar

20

- from_pandas: bool, interpret pandas-specific data

21

- safe: bool, check for overflow/truncation during conversion

22

23

Returns:

24

Array: Arrow array with specified type

25

"""

26

27

def chunked_array(arrays, type=None):

28

"""

29

Create chunked array from list of arrays.

30

31

Parameters:

32

- arrays: sequence of Array objects

33

- type: DataType, explicit type (must match all arrays)

34

35

Returns:

36

ChunkedArray: Chunked array composed of input arrays

37

"""

38

39

def nulls(size, type=None):

40

"""

41

Create array of null values.

42

43

Parameters:

44

- size: int, length of array

45

- type: DataType, type of nulls (default: null type)

46

47

Returns:

48

Array: Array of null values

49

"""

50

51

def repeat(value, size):

52

"""

53

Create array by repeating a single value.

54

55

Parameters:

56

- value: scalar value to repeat

57

- size: int, number of repetitions

58

59

Returns:

60

Array: Array with repeated value

61

"""

62

63

def arange(start, stop=None, step=1, dtype=None):

64

"""

65

Create array with range of values.

66

67

Parameters:

68

- start: int, start value (or stop if stop is None)

69

- stop: int, stop value (exclusive)

70

- step: int, step size

71

- dtype: DataType, array data type

72

73

Returns:

74

Array: Array with range values

75

"""

76

77

class Array:

78

"""

79

Base class for all Arrow arrays.

80

81

Attributes:

82

- type: DataType of the array

83

- length: Number of elements

84

- null_count: Number of null values

85

- is_valid: Boolean array indicating non-null values

86

"""

87

88

def __len__(self): ...

89

def __getitem__(self, key): ...

90

def __iter__(self): ...

91

92

def to_pylist(self):

93

"""Convert to Python list."""

94

95

def to_pandas(self, **kwargs):

96

"""Convert to pandas Series."""

97

98

def to_numpy(self, **kwargs):

99

"""Convert to NumPy array."""

100

101

def slice(self, offset=0, length=None):

102

"""Return slice of array."""

103

104

def take(self, indices):

105

"""Select elements by indices."""

106

107

def filter(self, mask):

108

"""Filter array by boolean mask."""

109

110

def sort(self, **kwargs):

111

"""Return sorted array."""

112

113

def unique(self):

114

"""Return array of unique values."""

115

116

def value_counts(self):

117

"""Return struct array of value counts."""

118

119

class ChunkedArray:

120

"""

121

Array composed of multiple contiguous arrays (chunks).

122

123

Attributes:

124

- type: DataType of the chunked array

125

- length: Total number of elements across chunks

126

- null_count: Total number of null values

127

- num_chunks: Number of chunks

128

- chunks: List of Array chunks

129

"""

130

131

def __len__(self): ...

132

def __getitem__(self, key): ...

133

def __iter__(self): ...

134

135

def chunk(self, i):

136

"""Get chunk at index i."""

137

138

def to_pylist(self):

139

"""Convert to Python list."""

140

141

def to_pandas(self, **kwargs):

142

"""Convert to pandas Series."""

143

144

def slice(self, offset=0, length=None):

145

"""Return slice of chunked array."""

146

147

def take(self, indices):

148

"""Select elements by indices."""

149

150

def filter(self, mask):

151

"""Filter by boolean mask."""

152

153

def combine_chunks(self):

154

"""Combine chunks into single array."""

155

```

156

157

### Tables

158

159

Two-dimensional datasets with named columns, similar to SQL tables or pandas DataFrames. Tables provide the primary interface for working with tabular data in PyArrow.

160

161

```python { .api }

162

def table(data, schema=None, metadata=None, columns=None):

163

"""

164

Create Arrow table from various data sources.

165

166

Parameters:

167

- data: dict, list of arrays, pandas DataFrame, or RecordBatch

168

- schema: Schema, explicit schema for the table

169

- metadata: dict, key-value metadata

170

- columns: list of str, column names (when data is list)

171

172

Returns:

173

Table: Arrow table with specified schema

174

"""

175

176

def record_batch(data, schema=None, metadata=None):

177

"""

178

Create RecordBatch from data.

179

180

Parameters:

181

- data: dict, list of arrays, or sequence

182

- schema: Schema, explicit schema

183

- metadata: dict, key-value metadata

184

185

Returns:

186

RecordBatch: Single batch of columnar data

187

"""

188

189

def concat_tables(tables, promote=False):

190

"""

191

Concatenate tables vertically.

192

193

Parameters:

194

- tables: sequence of Table objects

195

- promote: bool, promote schemas to compatible type

196

197

Returns:

198

Table: Concatenated table

199

"""

200

201

def concat_arrays(arrays):

202

"""

203

Concatenate arrays into single array.

204

205

Parameters:

206

- arrays: sequence of Array objects with same type

207

208

Returns:

209

Array: Concatenated array

210

"""

211

212

def concat_batches(batches, promote=False):

213

"""

214

Concatenate record batches.

215

216

Parameters:

217

- batches: sequence of RecordBatch objects

218

- promote: bool, promote schemas to compatible type

219

220

Returns:

221

Table: Table created from concatenated batches

222

"""

223

224

class Table:

225

"""

226

Two-dimensional table of columnar data.

227

228

Attributes:

229

- schema: Schema of the table

230

- num_columns: Number of columns

231

- num_rows: Number of rows

232

- column_names: List of column names

233

- columns: List of ChunkedArray columns

234

"""

235

236

def __len__(self): ...

237

def __getitem__(self, key): ...

238

def __iter__(self): ...

239

240

def column(self, i):

241

"""Get column by index or name."""

242

243

def select(self, columns):

244

"""Select subset of columns."""

245

246

def slice(self, offset=0, length=None):

247

"""Return slice of table."""

248

249

def filter(self, mask):

250

"""Filter rows by boolean mask."""

251

252

def take(self, indices):

253

"""Select rows by indices."""

254

255

def sort_by(self, sorting):

256

"""Sort table by columns."""

257

258

def group_by(self, keys):

259

"""Group table by columns."""

260

261

def join(self, right_table, **kwargs):

262

"""Join with another table."""

263

264

def to_pandas(self, **kwargs):

265

"""Convert to pandas DataFrame."""

266

267

def to_pydict(self):

268

"""Convert to dictionary of Python lists."""

269

270

def to_batches(self, max_chunksize=None):

271

"""Convert to iterator of RecordBatch objects."""

272

273

def add_column(self, i, field, column):

274

"""Add column at position i."""

275

276

def append_column(self, field, column):

277

"""Append column to table."""

278

279

def remove_column(self, i):

280

"""Remove column at position i."""

281

282

def rename_columns(self, names):

283

"""Rename columns."""

284

285

def drop(self, columns):

286

"""Drop columns by name."""

287

288

def replace_schema_metadata(self, metadata):

289

"""Replace table metadata."""

290

291

class RecordBatch:

292

"""

293

Collection of arrays with shared length representing a single batch.

294

295

Attributes:

296

- schema: Schema of the batch

297

- num_columns: Number of columns

298

- num_rows: Number of rows

299

- column_names: List of column names

300

- columns: List of Array columns

301

"""

302

303

def __len__(self): ...

304

def __getitem__(self, key): ...

305

def __iter__(self): ...

306

307

def column(self, i):

308

"""Get column by index or name."""

309

310

def select(self, columns):

311

"""Select subset of columns."""

312

313

def slice(self, offset=0, length=None):

314

"""Return slice of batch."""

315

316

def filter(self, mask):

317

"""Filter rows by boolean mask."""

318

319

def take(self, indices):

320

"""Select rows by indices."""

321

322

def to_pandas(self, **kwargs):

323

"""Convert to pandas DataFrame."""

324

325

def to_pydict(self):

326

"""Convert to dictionary of Python lists."""

327

328

def add_column(self, i, field, column):

329

"""Add column at position i."""

330

331

def remove_column(self, i):

332

"""Remove column at position i."""

333

334

def rename_columns(self, names):

335

"""Rename columns."""

336

337

class RecordBatchReader:

338

"""

339

Interface for reading stream of record batches.

340

"""

341

342

def __iter__(self): ...

343

344

def read_next_batch(self):

345

"""Read next batch from stream."""

346

347

def read_all(self):

348

"""Read all batches into table."""

349

350

def schema(self):

351

"""Get schema of batches."""

352

353

class TableGroupBy:

354

"""

355

Grouped table operations.

356

"""

357

358

def aggregate(self, aggregations):

359

"""Perform aggregations on groups."""

360

```

361

362

### Schemas and Fields

363

364

Schema definitions that describe table structure, column types, and metadata. Schemas provide type safety and enable efficient data processing by defining the expected structure of tabular data.

365

366

```python { .api }

367

def schema(fields, metadata=None):

368

"""

369

Create schema from list of fields.

370

371

Parameters:

372

- fields: sequence of Field objects or (name, type) tuples

373

- metadata: dict, key-value metadata for schema

374

375

Returns:

376

Schema: Schema object with specified fields

377

"""

378

379

def field(name, type, nullable=True, metadata=None):

380

"""

381

Create field with name and type.

382

383

Parameters:

384

- name: str, field name

385

- type: DataType, field data type

386

- nullable: bool, whether field can contain nulls

387

- metadata: dict, key-value metadata for field

388

389

Returns:

390

Field: Field object with specified properties

391

"""

392

393

def unify_schemas(schemas):

394

"""

395

Unify multiple schemas into compatible schema.

396

397

Parameters:

398

- schemas: sequence of Schema objects

399

400

Returns:

401

Schema: Unified schema compatible with all input schemas

402

"""

403

404

class Schema:

405

"""

406

Schema defining structure of tabular data.

407

408

Attributes:

409

- names: List of field names

410

- types: List of field types

411

- metadata: Key-value metadata

412

"""

413

414

def __len__(self): ...

415

def __getitem__(self, key): ...

416

def __iter__(self): ...

417

418

def field(self, i):

419

"""Get field by index or name."""

420

421

def get_field_index(self, name):

422

"""Get index of field by name."""

423

424

def select(self, names):

425

"""Select subset of fields."""

426

427

def insert(self, i, field):

428

"""Insert field at position i."""

429

430

def append(self, field):

431

"""Append field to schema."""

432

433

def remove(self, i):

434

"""Remove field at position i."""

435

436

def with_metadata(self, metadata):

437

"""Return schema with new metadata."""

438

439

def equals(self, other, check_metadata=True):

440

"""Check equality with another schema."""

441

442

def to_string(self, **kwargs):

443

"""String representation of schema."""

444

445

class Field:

446

"""

447

Named field in a schema with type and metadata.

448

449

Attributes:

450

- name: Field name

451

- type: DataType of field

452

- nullable: Whether field can contain nulls

453

- metadata: Key-value metadata

454

"""

455

456

def with_name(self, name):

457

"""Return field with new name."""

458

459

def with_type(self, type):

460

"""Return field with new type."""

461

462

def with_nullable(self, nullable):

463

"""Return field with new nullable setting."""

464

465

def with_metadata(self, metadata):

466

"""Return field with new metadata."""

467

468

def equals(self, other, check_metadata=True):

469

"""Check equality with another field."""

470

471

def to_string(self, **kwargs):

472

"""String representation of field."""

473

474

class KeyValueMetadata:

475

"""

476

Key-value metadata container.

477

"""

478

479

def __len__(self): ...

480

def __getitem__(self, key): ...

481

def __iter__(self): ...

482

483

def get(self, key, default=None):

484

"""Get value by key."""

485

486

def keys(self):

487

"""Get all keys."""

488

489

def values(self):

490

"""Get all values."""

491

492

def items(self):

493

"""Get key-value pairs."""

494

495

def to_dict(self):

496

"""Convert to Python dictionary."""

497

```

498

499

### Scalars

500

501

Single typed values that provide consistent interface for working with individual data elements. Scalars maintain type information and null state, enabling type-safe operations on individual values.

502

503

```python { .api }

504

def scalar(value, type=None):

505

"""

506

Create scalar from Python value.

507

508

Parameters:

509

- value: Python value to wrap

510

- type: DataType, explicit type for scalar

511

512

Returns:

513

Scalar: Typed scalar value

514

"""

515

516

# Scalar constants

517

NA = ... # Not Available scalar

518

NULL = ... # Null scalar

519

520

class Scalar:

521

"""

522

Base class for typed scalar values.

523

524

Attributes:

525

- type: DataType of scalar

526

- is_valid: Whether scalar is non-null

527

"""

528

529

def __eq__(self, other): ...

530

def __hash__(self): ...

531

532

def as_py(self):

533

"""Convert to Python value."""

534

535

def cast(self, target_type, safe=True):

536

"""Cast to different type."""

537

538

def equals(self, other):

539

"""Check equality with another scalar."""

540

541

# Specific scalar types are available for all Arrow data types:

542

# NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,

543

# UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar,

544

# FloatScalar, DoubleScalar, Decimal128Scalar, StringScalar, BinaryScalar,

545

# Date32Scalar, Date64Scalar, TimestampScalar, Time32Scalar, Time64Scalar,

546

# DurationScalar, ListScalar, StructScalar, MapScalar, DictionaryScalar, etc.

547

```

548

549

### Tensors and Sparse Data

550

551

Multi-dimensional arrays and sparse data structures for advanced numerical computing and machine learning applications.

552

553

```python { .api }

554

class Tensor:

555

"""

556

Multi-dimensional array with Arrow data.

557

558

Attributes:

559

- type: DataType of tensor elements

560

- shape: Shape tuple of tensor dimensions

561

- strides: Strides tuple for memory layout

562

- is_mutable: Whether tensor data is mutable

563

"""

564

565

def __getitem__(self, key): ...

566

567

def to_numpy(self):

568

"""Convert to NumPy array."""

569

570

def equals(self, other):

571

"""Check equality with another tensor."""

572

573

class SparseCOOTensor:

574

"""Sparse tensor in COOrdinate format."""

575

576

class SparseCSRMatrix:

577

"""Sparse matrix in Compressed Sparse Row format."""

578

579

class SparseCSCMatrix:

580

"""Sparse matrix in Compressed Sparse Column format."""

581

582

class SparseCSFTensor:

583

"""Sparse tensor in Compressed Sparse Fiber format."""

584

```

585

586

## Type Definitions

587

588

### Memory Management

589

590

```python { .api }

591

class DictionaryMemo:

592

"""

593

Memo for dictionary encoding to ensure consistent dictionaries.

594

"""

595

596

def __init__(self): ...

597

598

def get_dictionary(self, type):

599

"""Get dictionary for type."""

600

601

def set_dictionary(self, type, dictionary):

602

"""Set dictionary for type."""

603

```

604

605

## Usage Examples

606

607

### Creating and Manipulating Arrays

608

609

```python

610

import pyarrow as pa

611

import numpy as np

612

613

# Create arrays from various sources

614

int_array = pa.array([1, 2, 3, 4, 5])

615

str_array = pa.array(['apple', 'banana', 'cherry', None])

616

np_array = pa.array(np.random.randn(1000))

617

618

# Create chunked array

619

chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]

620

chunked = pa.chunked_array(chunks)

621

622

# Array operations

623

filtered = int_array.filter(pa.array([True, False, True, False, True]))

624

sorted_array = str_array.sort()

625

unique_values = str_array.unique()

626

627

# Convert to other formats

628

python_list = int_array.to_pylist()

629

pandas_series = int_array.to_pandas()

630

numpy_array = int_array.to_numpy()

631

```

632

633

### Working with Tables

634

635

```python

636

import pyarrow as pa

637

638

# Create table from dictionary

639

data = {

640

'id': [1, 2, 3, 4, 5],

641

'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],

642

'age': [25, 30, 35, 28, 32],

643

'salary': [50000.0, 60000.0, 70000.0, 55000.0, 65000.0]

644

}

645

table = pa.table(data)

646

647

# Table operations

648

subset = table.select(['name', 'age'])

649

filtered = table.filter(pa.compute.greater(table['age'], 30))

650

sorted_table = table.sort_by([('age', 'descending')])

651

652

# Add/remove columns

653

new_table = table.add_column(4, pa.field('bonus', pa.float64()),

654

pa.array([5000.0, 6000.0, 7000.0, 5500.0, 6500.0]))

655

dropped = table.drop(['salary'])

656

657

# Convert to pandas

658

df = table.to_pandas()

659

```

660

661

### Schema Definition

662

663

```python

664

import pyarrow as pa

665

666

# Define schema explicitly

667

schema = pa.schema([

668

pa.field('id', pa.int64()),

669

pa.field('name', pa.string()),

670

pa.field('scores', pa.list_(pa.float64())),

671

pa.field('metadata', pa.map_(pa.string(), pa.string()))

672

])

673

674

# Create table with schema

675

table = pa.table({

676

'id': [1, 2, 3],

677

'name': ['Alice', 'Bob', 'Charlie'],

678

'scores': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],

679

'metadata': [{'key': 'value'}, {}, {'foo': 'bar'}]

680

}, schema=schema)

681

682

# Schema operations

683

field = schema.field('name')

684

field_index = schema.get_field_index('scores')

685

partial_schema = schema.select(['id', 'name'])

686

```