# Core Data Structures

Fundamental data containers that form the foundation of PyArrow's columnar data processing capabilities. These structures provide efficient storage and manipulation of typed data in memory-optimized columnar layouts.

## Capabilities

### Arrays

One-dimensional sequences of values with a specific data type. Arrays are immutable and provide the basic building blocks for all other data structures in PyArrow.

```python { .api }
def array(obj, type=None, mask=None, size=None, from_pandas=None, safe=True):
    """
    Create Arrow array from Python sequence, NumPy array, or pandas data.

    Parameters:
    - obj: sequence, NumPy array, or pandas Series to convert
    - type: DataType, explicit type for the array
    - mask: array-like, boolean mask for null values
    - size: int, length of array if obj is scalar
    - from_pandas: bool, interpret pandas-specific data
    - safe: bool, check for overflow/truncation during conversion

    Returns:
    Array: Arrow array with specified type
    """

def chunked_array(arrays, type=None):
    """
    Create chunked array from list of arrays.

    Parameters:
    - arrays: sequence of Array objects
    - type: DataType, explicit type (must match all arrays)

    Returns:
    ChunkedArray: Chunked array composed of input arrays
    """

def nulls(size, type=None):
    """
    Create array of null values.

    Parameters:
    - size: int, length of array
    - type: DataType, type of nulls (default: null type)

    Returns:
    Array: Array of null values
    """

def repeat(value, size):
    """
    Create array by repeating a single value.

    Parameters:
    - value: scalar value to repeat
    - size: int, number of repetitions

    Returns:
    Array: Array with repeated value
    """

def arange(start, stop=None, step=1, dtype=None):
    """
    Create array with range of values.

    Parameters:
    - start: int, start value (or stop if stop is None)
    - stop: int, stop value (exclusive)
    - step: int, step size
    - dtype: DataType, array data type

    Returns:
    Array: Array with range values
    """

class Array:
    """
    Base class for all Arrow arrays.

    Attributes:
    - type: DataType of the array
    - length: Number of elements
    - null_count: Number of null values
    - is_valid: Boolean array indicating non-null values
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def to_pylist(self):
        """Convert to Python list."""

    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""

    def to_numpy(self, **kwargs):
        """Convert to NumPy array."""

    def slice(self, offset=0, length=None):
        """Return slice of array."""

    def take(self, indices):
        """Select elements by indices."""

    def filter(self, mask):
        """Filter array by boolean mask."""

    def sort(self, **kwargs):
        """Return sorted array."""

    def unique(self):
        """Return array of unique values."""

    def value_counts(self):
        """Return struct array of value counts."""

class ChunkedArray:
    """
    Array composed of multiple contiguous arrays (chunks).

    Attributes:
    - type: DataType of the chunked array
    - length: Total number of elements across chunks
    - null_count: Total number of null values
    - num_chunks: Number of chunks
    - chunks: List of Array chunks
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def chunk(self, i):
        """Get chunk at index i."""

    def to_pylist(self):
        """Convert to Python list."""

    def to_pandas(self, **kwargs):
        """Convert to pandas Series."""

    def slice(self, offset=0, length=None):
        """Return slice of chunked array."""

    def take(self, indices):
        """Select elements by indices."""

    def filter(self, mask):
        """Filter by boolean mask."""

    def combine_chunks(self):
        """Combine chunks into single array."""
```

### Tables

Two-dimensional datasets with named columns, similar to SQL tables or pandas DataFrames. Tables provide the primary interface for working with tabular data in PyArrow.

```python { .api }
def table(data, schema=None, metadata=None, columns=None):
    """
    Create Arrow table from various data sources.

    Parameters:
    - data: dict, list of arrays, pandas DataFrame, or RecordBatch
    - schema: Schema, explicit schema for the table
    - metadata: dict, key-value metadata
    - columns: list of str, column names (when data is list)

    Returns:
    Table: Arrow table with specified schema
    """

def record_batch(data, schema=None, metadata=None):
    """
    Create RecordBatch from data.

    Parameters:
    - data: dict, list of arrays, or sequence
    - schema: Schema, explicit schema
    - metadata: dict, key-value metadata

    Returns:
    RecordBatch: Single batch of columnar data
    """

def concat_tables(tables, promote=False):
    """
    Concatenate tables vertically.

    Parameters:
    - tables: sequence of Table objects
    - promote: bool, promote schemas to compatible type

    Returns:
    Table: Concatenated table
    """

def concat_arrays(arrays):
    """
    Concatenate arrays into single array.

    Parameters:
    - arrays: sequence of Array objects with same type

    Returns:
    Array: Concatenated array
    """

def concat_batches(batches, promote=False):
    """
    Concatenate record batches.

    Parameters:
    - batches: sequence of RecordBatch objects
    - promote: bool, promote schemas to compatible type

    Returns:
    Table: Table created from concatenated batches
    """

class Table:
    """
    Two-dimensional table of columnar data.

    Attributes:
    - schema: Schema of the table
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of ChunkedArray columns
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def column(self, i):
        """Get column by index or name."""

    def select(self, columns):
        """Select subset of columns."""

    def slice(self, offset=0, length=None):
        """Return slice of table."""

    def filter(self, mask):
        """Filter rows by boolean mask."""

    def take(self, indices):
        """Select rows by indices."""

    def sort_by(self, sorting):
        """Sort table by columns."""

    def group_by(self, keys):
        """Group table by columns."""

    def join(self, right_table, **kwargs):
        """Join with another table."""

    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""

    def to_pydict(self):
        """Convert to dictionary of Python lists."""

    def to_batches(self, max_chunksize=None):
        """Convert to iterator of RecordBatch objects."""

    def add_column(self, i, field, column):
        """Add column at position i."""

    def append_column(self, field, column):
        """Append column to table."""

    def remove_column(self, i):
        """Remove column at position i."""

    def rename_columns(self, names):
        """Rename columns."""

    def drop(self, columns):
        """Drop columns by name."""

    def replace_schema_metadata(self, metadata):
        """Replace table metadata."""

class RecordBatch:
    """
    Collection of arrays with shared length representing a single batch.

    Attributes:
    - schema: Schema of the batch
    - num_columns: Number of columns
    - num_rows: Number of rows
    - column_names: List of column names
    - columns: List of Array columns
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def column(self, i):
        """Get column by index or name."""

    def select(self, columns):
        """Select subset of columns."""

    def slice(self, offset=0, length=None):
        """Return slice of batch."""

    def filter(self, mask):
        """Filter rows by boolean mask."""

    def take(self, indices):
        """Select rows by indices."""

    def to_pandas(self, **kwargs):
        """Convert to pandas DataFrame."""

    def to_pydict(self):
        """Convert to dictionary of Python lists."""

    def add_column(self, i, field, column):
        """Add column at position i."""

    def remove_column(self, i):
        """Remove column at position i."""

    def rename_columns(self, names):
        """Rename columns."""

class RecordBatchReader:
    """
    Interface for reading stream of record batches.
    """

    def __iter__(self): ...

    def read_next_batch(self):
        """Read next batch from stream."""

    def read_all(self):
        """Read all batches into table."""

    def schema(self):
        """Get schema of batches."""

class TableGroupBy:
    """
    Grouped table operations.
    """

    def aggregate(self, aggregations):
        """Perform aggregations on groups."""
```

### Schemas and Fields

Schema definitions that describe table structure, column types, and metadata. Schemas provide type safety and enable efficient data processing by defining the expected structure of tabular data.

```python { .api }
def schema(fields, metadata=None):
    """
    Create schema from list of fields.

    Parameters:
    - fields: sequence of Field objects or (name, type) tuples
    - metadata: dict, key-value metadata for schema

    Returns:
    Schema: Schema object with specified fields
    """

def field(name, type, nullable=True, metadata=None):
    """
    Create field with name and type.

    Parameters:
    - name: str, field name
    - type: DataType, field data type
    - nullable: bool, whether field can contain nulls
    - metadata: dict, key-value metadata for field

    Returns:
    Field: Field object with specified properties
    """

def unify_schemas(schemas):
    """
    Unify multiple schemas into compatible schema.

    Parameters:
    - schemas: sequence of Schema objects

    Returns:
    Schema: Unified schema compatible with all input schemas
    """

class Schema:
    """
    Schema defining structure of tabular data.

    Attributes:
    - names: List of field names
    - types: List of field types
    - metadata: Key-value metadata
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def field(self, i):
        """Get field by index or name."""

    def get_field_index(self, name):
        """Get index of field by name."""

    def select(self, names):
        """Select subset of fields."""

    def insert(self, i, field):
        """Insert field at position i."""

    def append(self, field):
        """Append field to schema."""

    def remove(self, i):
        """Remove field at position i."""

    def with_metadata(self, metadata):
        """Return schema with new metadata."""

    def equals(self, other, check_metadata=True):
        """Check equality with another schema."""

    def to_string(self, **kwargs):
        """String representation of schema."""

class Field:
    """
    Named field in a schema with type and metadata.

    Attributes:
    - name: Field name
    - type: DataType of field
    - nullable: Whether field can contain nulls
    - metadata: Key-value metadata
    """

    def with_name(self, name):
        """Return field with new name."""

    def with_type(self, type):
        """Return field with new type."""

    def with_nullable(self, nullable):
        """Return field with new nullable setting."""

    def with_metadata(self, metadata):
        """Return field with new metadata."""

    def equals(self, other, check_metadata=True):
        """Check equality with another field."""

    def to_string(self, **kwargs):
        """String representation of field."""

class KeyValueMetadata:
    """
    Key-value metadata container.
    """

    def __len__(self): ...
    def __getitem__(self, key): ...
    def __iter__(self): ...

    def get(self, key, default=None):
        """Get value by key."""

    def keys(self):
        """Get all keys."""

    def values(self):
        """Get all values."""

    def items(self):
        """Get key-value pairs."""

    def to_dict(self):
        """Convert to Python dictionary."""
```

### Scalars

Single typed values that provide consistent interface for working with individual data elements. Scalars maintain type information and null state, enabling type-safe operations on individual values.

```python { .api }
def scalar(value, type=None):
    """
    Create scalar from Python value.

    Parameters:
    - value: Python value to wrap
    - type: DataType, explicit type for scalar

    Returns:
    Scalar: Typed scalar value
    """

# Scalar constants
NA = ... # Not Available scalar
NULL = ... # Null scalar

class Scalar:
    """
    Base class for typed scalar values.

    Attributes:
    - type: DataType of scalar
    - is_valid: Whether scalar is non-null
    """

    def __eq__(self, other): ...
    def __hash__(self): ...

    def as_py(self):
        """Convert to Python value."""

    def cast(self, target_type, safe=True):
        """Cast to different type."""

    def equals(self, other):
        """Check equality with another scalar."""

# Specific scalar types are available for all Arrow data types:
# NullScalar, BooleanScalar, Int8Scalar, Int16Scalar, Int32Scalar, Int64Scalar,
# UInt8Scalar, UInt16Scalar, UInt32Scalar, UInt64Scalar, HalfFloatScalar,
# FloatScalar, DoubleScalar, Decimal128Scalar, StringScalar, BinaryScalar,
# Date32Scalar, Date64Scalar, TimestampScalar, Time32Scalar, Time64Scalar,
# DurationScalar, ListScalar, StructScalar, MapScalar, DictionaryScalar, etc.
```

### Tensors and Sparse Data

Multi-dimensional arrays and sparse data structures for advanced numerical computing and machine learning applications.

```python { .api }
class Tensor:
    """
    Multi-dimensional array with Arrow data.

    Attributes:
    - type: DataType of tensor elements
    - shape: Shape tuple of tensor dimensions
    - strides: Strides tuple for memory layout
    - is_mutable: Whether tensor data is mutable
    """

    def __getitem__(self, key): ...

    def to_numpy(self):
        """Convert to NumPy array."""

    def equals(self, other):
        """Check equality with another tensor."""

class SparseCOOTensor:
    """Sparse tensor in COOrdinate format."""

class SparseCSRMatrix:
    """Sparse matrix in Compressed Sparse Row format."""

class SparseCSCMatrix:
    """Sparse matrix in Compressed Sparse Column format."""

class SparseCSFTensor:
    """Sparse tensor in Compressed Sparse Fiber format."""
```

## Type Definitions

### Memory Management

```python { .api }
class DictionaryMemo:
    """
    Memo for dictionary encoding to ensure consistent dictionaries.
    """

    def __init__(self): ...

    def get_dictionary(self, type):
        """Get dictionary for type."""

    def set_dictionary(self, type, dictionary):
        """Set dictionary for type."""
```

## Usage Examples

### Creating and Manipulating Arrays

```python
import pyarrow as pa
import numpy as np

# Create arrays from various sources
int_array = pa.array([1, 2, 3, 4, 5])
str_array = pa.array(['apple', 'banana', 'cherry', None])
np_array = pa.array(np.random.randn(1000))

# Create chunked array
chunks = [pa.array([1, 2, 3]), pa.array([4, 5, 6])]
chunked = pa.chunked_array(chunks)

# Array operations
filtered = int_array.filter(pa.array([True, False, True, False, True]))
sorted_array = str_array.sort()
unique_values = str_array.unique()

# Convert to other formats
python_list = int_array.to_pylist()
pandas_series = int_array.to_pandas()
numpy_array = int_array.to_numpy()
```

### Working with Tables

```python
import pyarrow as pa

# Create table from dictionary
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana', 'Eve'],
    'age': [25, 30, 35, 28, 32],
    'salary': [50000.0, 60000.0, 70000.0, 55000.0, 65000.0]
}
table = pa.table(data)

# Table operations
subset = table.select(['name', 'age'])
filtered = table.filter(pa.compute.greater(table['age'], 30))
sorted_table = table.sort_by([('age', 'descending')])

# Add/remove columns
new_table = table.add_column(4, pa.field('bonus', pa.float64()),
                             pa.array([5000.0, 6000.0, 7000.0, 5500.0, 6500.0]))
dropped = table.drop(['salary'])

# Convert to pandas
df = table.to_pandas()
```

### Schema Definition

```python
import pyarrow as pa

# Define schema explicitly
schema = pa.schema([
    pa.field('id', pa.int64()),
    pa.field('name', pa.string()),
    pa.field('scores', pa.list_(pa.float64())),
    pa.field('metadata', pa.map_(pa.string(), pa.string()))
])

# Create table with schema
table = pa.table({
    'id': [1, 2, 3],
    'name': ['Alice', 'Bob', 'Charlie'],
    'scores': [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]],
    'metadata': [{'key': 'value'}, {}, {'foo': 'bar'}]
}, schema=schema)

# Schema operations
field = schema.field('name')
field_index = schema.get_field_index('scores')
partial_schema = schema.select(['id', 'name'])
```