# Data Conversion and I/O

These functions provide extensive support for reading and writing data in a variety of formats, including Arrow, Parquet, JSON, and NumPy, along with integration with popular frameworks such as PyTorch, TensorFlow, and JAX. Together they enable seamless interoperability with the broader data science ecosystem.

## Capabilities

### NumPy Integration

Conversion functions for seamless integration with NumPy arrays, the foundation of the Python scientific computing ecosystem.
```python { .api }
def to_numpy(array, allow_missing=True):
    """
    Convert array to NumPy format.

    Parameters:
    - array: Array to convert
    - allow_missing: bool, if False raise error for arrays with missing values

    Returns:
    numpy.ndarray containing the array data

    Raises:
    ValueError if array contains variable-length lists, or missing values when allow_missing=False
    """

def to_list(array, behavior=None):
    """
    Convert array to Python list of nested objects.

    Parameters:
    - array: Array to convert
    - behavior: dict, custom behavior for conversion

    Returns:
    Python list/dict structure containing the array data
    """

def to_packed(array, highlevel=True, behavior=None):
    """
    Pack array into contiguous memory layout for efficient I/O.

    Parameters:
    - array: Array to pack
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with packed, contiguous memory layout
    """
```
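
Packing pays off before serializing a small slice of a large array, since a slice keeps referencing the original buffers. A minimal sketch, assuming the `ak` namespace used in the usage examples below:

```python
import awkward as ak

big = ak.Array([[1, 2, 3], [4], [5, 6]] * 1000)
view = big[:10]              # slicing shares the original, oversized buffers
packed = ak.to_packed(view)  # copies only the bytes the slice actually needs
```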

### Apache Arrow Integration

Comprehensive support for the Apache Arrow format, enabling high-performance data exchange and columnar analytics.

```python { .api }
def to_arrow(array, list_to32=False, string_to32=True, bytestring_to32=True,
             extension_array=True, count_nulls=True, extensionarray=None,
             categorical_as_dictionary=False):
    """
    Convert array to Apache Arrow format.

    Parameters:
    - array: Array to convert
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Array containing the converted data
    """

def to_arrow_table(arrays, list_to32=False, string_to32=True, bytestring_to32=True,
                   extension_array=True, count_nulls=True, extensionarray=None,
                   categorical_as_dictionary=False):
    """
    Convert arrays to Apache Arrow Table format.

    Parameters:
    - arrays: dict mapping column names to Arrays, or single Array
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Table containing the converted data
    """

def to_buffers(array):
    """
    Convert array to buffers format for serialization.

    Parameters:
    - array: Array to convert to buffers

    Returns:
    (form, length, container) tuple, where container is a dict of named buffers
    """
```
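
to_buffers decomposes an array into a form (the structure), a length, and named buffers, which is the basis for custom serialization schemes. A minimal round-trip sketch; the from_buffers counterpart is assumed here, since it is not documented in this section:

```python
import awkward as ak

array = ak.Array([[1, 2], [], [3]])
form, length, container = ak.to_buffers(array)  # container: buffer name -> raw data
roundtrip = ak.from_buffers(form, length, container)
assert ak.to_list(roundtrip) == ak.to_list(array)
```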

### Parquet File Format

High-performance columnar storage with compression, metadata preservation, and chunked I/O support.

```python { .api }
def to_parquet(array, destination, list_to32=False, string_to32=True,
               bytestring_to32=True, extension_array=True, count_nulls=True,
               compression="zstd", compression_level=None, row_group_size=64*1024*1024,
               data_page_size=None, parquet_flavor=None, parquet_version="2.4",
               parquet_page_version="1.0", parquet_metadata_statistics=True,
               parquet_dictionary_encoding=True, parquet_byte_stream_split=False,
               parquet_coerce_timestamps=None, parquet_old_int96_timestamps=None,
               parquet_compliant_nested=False, parquet_extra_options=None):
    """
    Write array to Parquet file format.

    Parameters:
    - array: Array to write
    - destination: str, file path or file-like object
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - compression: str, compression algorithm ("none", "snappy", "gzip", "lz4", "zstd", "brotli")
    - compression_level: int, compression level (algorithm-specific)
    - row_group_size: int, target row group size in bytes
    - data_page_size: int, target data page size in bytes
    - parquet_flavor: str, Parquet flavor ("spark", None)
    - parquet_version: str, Parquet format version
    - parquet_page_version: str, Parquet page format version
    - parquet_metadata_statistics: bool, include column statistics
    - parquet_dictionary_encoding: bool, use dictionary encoding
    - parquet_byte_stream_split: bool, use byte stream split encoding
    - parquet_coerce_timestamps: str, timestamp coercion behavior
    - parquet_old_int96_timestamps: bool, use old int96 timestamp format
    - parquet_compliant_nested: bool, use Parquet-compliant nested encoding
    - parquet_extra_options: dict, additional Parquet options
    """

def to_parquet_dataset(arrays, destination, **kwargs):
    """
    Write arrays as a Parquet dataset with partitioning.

    Parameters:
    - arrays: dict mapping column names to Arrays
    - destination: str, directory path for dataset
    - kwargs: additional arguments passed to to_parquet
    """

def to_parquet_row_groups(arrays, destination, **kwargs):
    """
    Write arrays as a Parquet file with multiple row groups.

    Parameters:
    - arrays: sequence of dicts, each containing Arrays for one row group
    - destination: str, file path
    - kwargs: additional arguments passed to to_parquet
    """
```
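
For data that arrives in chunks, writing one row group per chunk avoids materializing the whole dataset in memory. A sketch following the to_parquet_row_groups signature documented above:

```python
import awkward as ak

chunks = [
    {"x": ak.Array([[1, 2], [3]]), "y": ak.Array([1.0, 2.0])},
    {"x": ak.Array([[4, 5]]), "y": ak.Array([3.0])},
]

# Each dict in the sequence becomes one row group in a single Parquet file.
ak.to_parquet_row_groups(chunks, "chunked.parquet", compression="zstd")
```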

### Feather/Arrow IPC Format

Fast binary columnar format for efficient data exchange between processes and languages.

```python { .api }
def to_feather(array, file, compression="zstd", compression_level=None):
    """
    Write array to Feather (Arrow IPC) format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - compression: str, compression algorithm ("none", "zstd", "lz4")
    - compression_level: int, compression level
    """
```

### JSON Format

Human-readable text format supporting complex nested structures and mixed data types.

```python { .api }
def to_json(array, destination=None, pretty=False, maxdecimals=None,
            convert_bytes=None, convert_other=None):
    """
    Convert array to JSON format.

    Parameters:
    - array: Array to convert
    - destination: str or file-like, output destination (None for string return)
    - pretty: bool, if True format with indentation
    - maxdecimals: int, maximum decimal places for floats
    - convert_bytes: callable, function to convert bytes objects
    - convert_other: callable, function to convert unrecognized types

    Returns:
    str containing JSON data if destination is None
    """
```
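
When destination is None, to_json returns the JSON text directly, which is convenient for small payloads and debugging:

```python
import awkward as ak

text = ak.to_json(ak.Array([{"x": [1, 2], "y": 3.14}]), pretty=True)
print(text)  # a JSON string, since no destination was given
```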

### DataFrame Integration

Conversion to and from Pandas DataFrames for integration with data analysis workflows.

```python { .api }
def to_dataframe(array, how="inner", levelname="sublevel", anonymous="values"):
    """
    Convert array to Pandas DataFrame.

    Parameters:
    - array: Array to convert
    - how: str, how to handle nested structure ("inner", "outer")
    - levelname: str, name for MultiIndex levels
    - anonymous: str, name for arrays without field names

    Returns:
    pandas.DataFrame containing the array data
    """

def to_rdataframe(array):
    """
    Convert array to ROOT RDataFrame.

    Parameters:
    - array: Array to convert

    Returns:
    ROOT.RDataFrame containing the array data
    """
```
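
to_rdataframe bridges into ROOT's analysis ecosystem and requires a ROOT installation with RDataFrame support. A minimal sketch following the signature documented above:

```python
import awkward as ak

events = ak.Array([{"pt": 12.5, "eta": 0.3}, {"pt": 8.1, "eta": -1.2}])
rdf = ak.to_rdataframe(events)  # record fields "pt" and "eta" become RDataFrame columns
```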

### Machine Learning Framework Integration

Seamless conversion to and from popular ML frameworks for deep learning and numerical computing workflows.

```python { .api }
def to_torch(array, device=None):
    """
    Convert array to PyTorch tensor.

    Parameters:
    - array: Array to convert (must be rectangular/regular)
    - device: torch.device, target device for tensor

    Returns:
    torch.Tensor containing the array data
    """

def to_tensorflow(array):
    """
    Convert array to TensorFlow tensor.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    tf.Tensor containing the array data
    """

def to_raggedtensor(array):
    """
    Convert array to TensorFlow RaggedTensor.

    Parameters:
    - array: Array to convert

    Returns:
    tf.RaggedTensor containing the array data with nested structure
    """

def to_jax(array):
    """
    Convert array to JAX array.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    jax.numpy.ndarray containing the array data
    """

def to_cupy(array):
    """
    Convert array to CuPy array for GPU computation.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    cupy.ndarray containing the array data
    """

def to_cudf(array):
    """
    Convert array to cuDF DataFrame for GPU-accelerated analytics.

    Parameters:
    - array: Array to convert

    Returns:
    cudf.DataFrame containing the array data
    """
```
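
The GPU- and JAX-oriented converters follow the same pattern as to_torch and to_tensorflow; for example, to_jax (a sketch assuming jax is installed):

```python
import awkward as ak

regular = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
jax_values = ak.to_jax(regular)  # jax.numpy array; the input must be rectangular
```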
313
314
### Type and Layout Conversion
315
316
Functions for converting between different array representations and type systems.
317
318
```python { .api }
319
def to_layout(array):
320
"""
321
Get low-level Content layout from high-level Array.
322
323
Parameters:
324
- array: Array to get layout from
325
326
Returns:
327
Content layout object representing array structure
328
"""
329
330
def to_regular(array, axis=1, highlevel=True, behavior=None):
331
"""
332
Convert variable-length lists to regular (fixed-length) array.
333
334
Parameters:
335
- array: Array to convert
336
- axis: int, axis along which to regularize
337
- highlevel: bool, if True return Array, if False return Content layout
338
- behavior: dict, custom behavior for the result
339
340
Returns:
341
Array with regular structure (fails if lists have different lengths)
342
"""
343
344
def values_astype(array, to, highlevel=True, behavior=None):
345
"""
346
Cast array values to specified dtype.
347
348
Parameters:
349
- array: Array to cast
350
- to: numpy.dtype or str, target data type
351
- highlevel: bool, if True return Array, if False return Content layout
352
- behavior: dict, custom behavior for the result
353
354
Returns:
355
Array with values cast to new type
356
"""
357
358
def strings_astype(array, to, highlevel=True, behavior=None):
359
"""
360
Cast string array to specified type by parsing.
361
362
Parameters:
363
- array: Array of strings to parse
364
- to: numpy.dtype or str, target data type
365
- highlevel: bool, if True return Array, if False return Content layout
366
- behavior: dict, custom behavior for the result
367
368
Returns:
369
Array with strings parsed to new type
370
"""
371
372
def categories(array):
373
"""
374
Get categories from categorical array.
375
376
Parameters:
377
- array: Categorical Array
378
379
Returns:
380
Array containing the category values
381
"""
382
```
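
to_layout is mostly useful for inspecting how an array is physically represented. A quick sketch:

```python
import awkward as ak

layout = ak.to_layout(ak.Array([[1, 2], [3]]))
print(type(layout).__name__)  # e.g. "ListOffsetArray" for variable-length lists
```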

### Backend Management

Functions for managing computational backends and moving data between different execution environments.

```python { .api }
def backend(array):
    """
    Get the computational backend used by array.

    Parameters:
    - array: Array to check backend for

    Returns:
    str indicating backend ("cpu", "cuda", "jax", etc.)
    """

def to_backend(array, backend, highlevel=True, behavior=None):
    """
    Move array to specified computational backend.

    Parameters:
    - array: Array to move
    - backend: str, target backend ("cpu", "cuda", "jax")
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array moved to target backend
    """
```
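
A quick check-and-move sketch; the "cuda" backend additionally requires CuPy and a CUDA-capable device, so that call is left commented out:

```python
import awkward as ak

arr = ak.Array([[1, 2], [3]])
print(ak.backend(arr))  # "cpu"

same = ak.to_backend(arr, "cpu")   # no-op round trip on the CPU backend
# gpu = ak.to_backend(arr, "cuda")  # requires CuPy and a GPU
```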

### Specialized Formats

Support for domain-specific data formats common in scientific computing.

```python { .api }
def to_avro(array, file, schema=None):
    """
    Write array to Avro format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - schema: dict, Avro schema (inferred if None)
    """
```
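
A minimal usage sketch based on the to_avro signature documented above; with schema=None the Avro schema is inferred from the array's type:

```python
import awkward as ak

records = ak.Array([{"x": 1.1, "n": 1}, {"x": 2.2, "n": 2}])
ak.to_avro(records, "data.avro")  # schema inferred from the array's type
```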

## Usage Examples

### Basic Conversions

```python
import awkward as ak
import numpy as np

# Create nested array
data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Python lists
python_list = ak.to_list(data)  # [[1, 2, 3], [4], [5, 6]]

# Convert flat data to NumPy
flat_data = ak.Array([1, 2, 3, 4, 5])
numpy_array = ak.to_numpy(flat_data)  # np.array([1, 2, 3, 4, 5])
```

### File I/O

```python
import awkward as ak

# Create sample data
records = ak.Array([
    {"x": [1, 2], "y": 3.14, "name": "alice"},
    {"x": [4], "y": 2.71, "name": "bob"}
])

# Write to Parquet
ak.to_parquet(records, "data.parquet")

# Write to JSON
ak.to_json(records, "data.json", pretty=True)

# Write to Feather
ak.to_feather(records, "data.feather")
```

### Arrow Integration

```python
import awkward as ak
import pyarrow as pa

data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Arrow array
arrow_array = ak.to_arrow(data)

# Convert to Arrow table
table_data = {"numbers": data, "counts": ak.num(data)}
arrow_table = ak.to_arrow_table(table_data)
```
486
487
### DataFrame Conversion
488
489
```python
490
import awkward as ak
491
import pandas as pd
492
493
# Nested data
494
records = ak.Array([
495
{"a": [1, 2], "b": "x"},
496
{"a": [3, 4, 5], "b": "y"}
497
])
498
499
# Convert to DataFrame (flattens nested structure)
500
df = ak.to_dataframe(records)
501
print(df)
502
# a b
503
# 0 1 x
504
# 1 2 x
505
# 2 3 y
506
# 3 4 y
507
# 4 5 y
508
```

### ML Framework Integration

```python
import awkward as ak
import torch
import tensorflow as tf

# Regular (rectangular) data for ML frameworks
regular_data = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# Convert to PyTorch
torch_tensor = ak.to_torch(regular_data)  # torch.Tensor([[1, 2], [3, 4], [5, 6]])

# Convert to TensorFlow
tf_tensor = ak.to_tensorflow(regular_data)  # tf.Tensor([[1, 2], [3, 4], [5, 6]])

# Variable-length data for TensorFlow RaggedTensor
variable_data = ak.Array([[1, 2, 3], [4], [5, 6]])
ragged_tensor = ak.to_raggedtensor(variable_data)
```

### Type Conversion

```python
import awkward as ak
import numpy as np

# String to numeric conversion
strings = ak.Array(["1.5", "2.7", "3.14"])
parsed = ak.strings_astype(strings, np.float64)

# Change numeric type
integers = ak.Array([1, 2, 3])
floats32 = ak.values_astype(integers, np.float32)

# Convert to regular array (if possible)
data = ak.Array([[1, 2], [3, 4], [5, 6]])  # all lists have length 2
regular = ak.to_regular(data)  # RegularArray with size=2
```