# Data Conversion and I/O

These functions provide extensive support for reading and writing data in a variety of formats, including Arrow, Parquet, JSON, and NumPy, along with integration with popular frameworks such as PyTorch, TensorFlow, and JAX. Together they enable seamless interoperability with the broader data science ecosystem.

## Capabilities

### NumPy Integration

Conversion functions for seamless integration with NumPy arrays, the foundation of the Python scientific computing ecosystem.
```python { .api }
def to_numpy(array, allow_missing=True):
    """
    Convert array to NumPy format.

    Parameters:
    - array: Array to convert
    - allow_missing: bool, if False raise error for arrays with missing values

    Returns:
    numpy.ndarray containing the array data

    Raises:
    ValueError if array contains variable-length lists, or missing values when allow_missing=False
    """

def to_list(array, behavior=None):
    """
    Convert array to Python list of nested objects.

    Parameters:
    - array: Array to convert
    - behavior: dict, custom behavior for conversion

    Returns:
    Python list/dict structure containing the array data
    """

def to_packed(array, highlevel=True, behavior=None):
    """
    Pack array into contiguous memory layout for efficient I/O.

    Parameters:
    - array: Array to pack
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with packed, contiguous memory layout
    """
```
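
Packing pays off before serializing a small slice of a large array, since a slice keeps referencing the original buffers. A minimal sketch, assuming the `ak` namespace used in the usage examples below:

```python
import awkward as ak

big = ak.Array([[1, 2, 3], [4], [5, 6]] * 1000)
view = big[:10]              # slicing shares the original, oversized buffers
packed = ak.to_packed(view)  # copies only the bytes the slice actually needs
```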

### Apache Arrow Integration

Comprehensive support for the Apache Arrow format, enabling high-performance data exchange and columnar analytics.

```python { .api }
def to_arrow(array, list_to32=False, string_to32=True, bytestring_to32=True,
             extension_array=True, count_nulls=True, extensionarray=None,
             categorical_as_dictionary=False):
    """
    Convert array to Apache Arrow format.

    Parameters:
    - array: Array to convert
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Array containing the converted data
    """

def to_arrow_table(arrays, list_to32=False, string_to32=True, bytestring_to32=True,
                   extension_array=True, count_nulls=True, extensionarray=None,
                   categorical_as_dictionary=False):
    """
    Convert arrays to Apache Arrow Table format.

    Parameters:
    - arrays: dict mapping column names to Arrays, or single Array
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Table containing the converted data
    """

def to_buffers(array):
    """
    Convert array to buffers format for serialization.

    Parameters:
    - array: Array to convert to buffers

    Returns:
    (form, length, container) tuple, where container is a dict of named buffers
    """
```
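
to_buffers decomposes an array into a form (the structure), a length, and named buffers, which is the basis for custom serialization schemes. A minimal round-trip sketch; the from_buffers counterpart is assumed here, since it is not documented in this section:

```python
import awkward as ak

array = ak.Array([[1, 2], [], [3]])
form, length, container = ak.to_buffers(array)  # container: buffer name -> raw data
roundtrip = ak.from_buffers(form, length, container)
assert ak.to_list(roundtrip) == ak.to_list(array)
```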

### Parquet File Format

High-performance columnar storage with compression, metadata preservation, and chunked I/O support.

```python { .api }
def to_parquet(array, destination, list_to32=False, string_to32=True,
               bytestring_to32=True, extension_array=True, count_nulls=True,
               compression="zstd", compression_level=None, row_group_size=64*1024*1024,
               data_page_size=None, parquet_flavor=None, parquet_version="2.4",
               parquet_page_version="1.0", parquet_metadata_statistics=True,
               parquet_dictionary_encoding=True, parquet_byte_stream_split=False,
               parquet_coerce_timestamps=None, parquet_old_int96_timestamps=None,
               parquet_compliant_nested=False, parquet_extra_options=None):
    """
    Write array to Parquet file format.

    Parameters:
    - array: Array to write
    - destination: str, file path or file-like object
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - compression: str, compression algorithm ("none", "snappy", "gzip", "lz4", "zstd", "brotli")
    - compression_level: int, compression level (algorithm-specific)
    - row_group_size: int, target row group size in bytes
    - data_page_size: int, target data page size in bytes
    - parquet_flavor: str, Parquet flavor ("spark", None)
    - parquet_version: str, Parquet format version
    - parquet_page_version: str, Parquet page format version
    - parquet_metadata_statistics: bool, include column statistics
    - parquet_dictionary_encoding: bool, use dictionary encoding
    - parquet_byte_stream_split: bool, use byte stream split encoding
    - parquet_coerce_timestamps: str, timestamp coercion behavior
    - parquet_old_int96_timestamps: bool, use old int96 timestamp format
    - parquet_compliant_nested: bool, use Parquet-compliant nested encoding
    - parquet_extra_options: dict, additional Parquet options
    """

def to_parquet_dataset(arrays, destination, **kwargs):
    """
    Write arrays as a Parquet dataset with partitioning.

    Parameters:
    - arrays: dict mapping column names to Arrays
    - destination: str, directory path for dataset
    - kwargs: additional arguments passed to to_parquet
    """

def to_parquet_row_groups(arrays, destination, **kwargs):
    """
    Write arrays as a Parquet file with multiple row groups.

    Parameters:
    - arrays: sequence of dicts, each containing Arrays for one row group
    - destination: str, file path
    - kwargs: additional arguments passed to to_parquet
    """
```
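
For data that arrives in chunks, writing one row group per chunk avoids materializing the whole dataset in memory. A sketch following the to_parquet_row_groups signature documented above:

```python
import awkward as ak

chunks = [
    {"x": ak.Array([[1, 2], [3]]), "y": ak.Array([1.0, 2.0])},
    {"x": ak.Array([[4, 5]]), "y": ak.Array([3.0])},
]

# Each dict in the sequence becomes one row group in a single Parquet file.
ak.to_parquet_row_groups(chunks, "chunked.parquet", compression="zstd")
```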

### Feather/Arrow IPC Format

Fast binary columnar format for efficient data exchange between processes and languages.

```python { .api }
def to_feather(array, file, compression="zstd", compression_level=None):
    """
    Write array to Feather (Arrow IPC) format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - compression: str, compression algorithm ("none", "zstd", "lz4")
    - compression_level: int, compression level
    """
```

### JSON Format

Human-readable text format supporting complex nested structures and mixed data types.

```python { .api }
def to_json(array, destination=None, pretty=False, maxdecimals=None,
            convert_bytes=None, convert_other=None):
    """
    Convert array to JSON format.

    Parameters:
    - array: Array to convert
    - destination: str or file-like, output destination (None for string return)
    - pretty: bool, if True format with indentation
    - maxdecimals: int, maximum decimal places for floats
    - convert_bytes: callable, function to convert bytes objects
    - convert_other: callable, function to convert unrecognized types

    Returns:
    str containing JSON data if destination is None
    """
```
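
When destination is None, to_json returns the JSON text directly, which is convenient for small payloads and debugging:

```python
import awkward as ak

text = ak.to_json(ak.Array([{"x": [1, 2], "y": 3.14}]), pretty=True)
print(text)  # a JSON string, since no destination was given
```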

### DataFrame Integration

Conversion to and from Pandas DataFrames for integration with data analysis workflows.

```python { .api }
def to_dataframe(array, how="inner", levelname="sublevel", anonymous="values"):
    """
    Convert array to Pandas DataFrame.

    Parameters:
    - array: Array to convert
    - how: str, how to handle nested structure ("inner", "outer")
    - levelname: str, name for MultiIndex levels
    - anonymous: str, name for arrays without field names

    Returns:
    pandas.DataFrame containing the array data
    """

def to_rdataframe(array):
    """
    Convert array to ROOT RDataFrame.

    Parameters:
    - array: Array to convert

    Returns:
    ROOT.RDataFrame containing the array data
    """
```
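
to_rdataframe bridges into ROOT's analysis ecosystem and requires a ROOT installation with RDataFrame support. A minimal sketch following the signature documented above:

```python
import awkward as ak

events = ak.Array([{"pt": 12.5, "eta": 0.3}, {"pt": 8.1, "eta": -1.2}])
rdf = ak.to_rdataframe(events)  # record fields "pt" and "eta" become RDataFrame columns
```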

### Machine Learning Framework Integration

Seamless conversion to and from popular ML frameworks for deep learning and numerical computing workflows.

```python { .api }
def to_torch(array, device=None):
    """
    Convert array to PyTorch tensor.

    Parameters:
    - array: Array to convert (must be rectangular/regular)
    - device: torch.device, target device for tensor

    Returns:
    torch.Tensor containing the array data
    """

def to_tensorflow(array):
    """
    Convert array to TensorFlow tensor.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    tf.Tensor containing the array data
    """

def to_raggedtensor(array):
    """
    Convert array to TensorFlow RaggedTensor.

    Parameters:
    - array: Array to convert

    Returns:
    tf.RaggedTensor containing the array data with nested structure
    """

def to_jax(array):
    """
    Convert array to JAX array.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    jax.numpy.ndarray containing the array data
    """

def to_cupy(array):
    """
    Convert array to CuPy array for GPU computation.

    Parameters:
    - array: Array to convert (must be rectangular/regular)

    Returns:
    cupy.ndarray containing the array data
    """

def to_cudf(array):
    """
    Convert array to cuDF DataFrame for GPU-accelerated analytics.

    Parameters:
    - array: Array to convert

    Returns:
    cudf.DataFrame containing the array data
    """
```
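
The GPU- and JAX-oriented converters follow the same pattern as to_torch and to_tensorflow; for example, to_jax (a sketch assuming jax is installed):

```python
import awkward as ak

regular = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
jax_values = ak.to_jax(regular)  # jax.numpy array; the input must be rectangular
```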
313
314
### Type and Layout Conversion
315
316
Functions for converting between different array representations and type systems.
317
318
```python { .api }
319
def to_layout(array):
320
"""
321
Get low-level Content layout from high-level Array.
322
323
Parameters:
324
- array: Array to get layout from
325
326
Returns:
327
Content layout object representing array structure
328
"""
329
330
def to_regular(array, axis=1, highlevel=True, behavior=None):
331
"""
332
Convert variable-length lists to regular (fixed-length) array.
333
334
Parameters:
335
- array: Array to convert
336
- axis: int, axis along which to regularize
337
- highlevel: bool, if True return Array, if False return Content layout
338
- behavior: dict, custom behavior for the result
339
340
Returns:
341
Array with regular structure (fails if lists have different lengths)
342
"""
343
344
def values_astype(array, to, highlevel=True, behavior=None):
345
"""
346
Cast array values to specified dtype.
347
348
Parameters:
349
- array: Array to cast
350
- to: numpy.dtype or str, target data type
351
- highlevel: bool, if True return Array, if False return Content layout
352
- behavior: dict, custom behavior for the result
353
354
Returns:
355
Array with values cast to new type
356
"""
357
358
def strings_astype(array, to, highlevel=True, behavior=None):
359
"""
360
Cast string array to specified type by parsing.
361
362
Parameters:
363
- array: Array of strings to parse
364
- to: numpy.dtype or str, target data type
365
- highlevel: bool, if True return Array, if False return Content layout
366
- behavior: dict, custom behavior for the result
367
368
Returns:
369
Array with strings parsed to new type
370
"""
371
372
def categories(array):
373
"""
374
Get categories from categorical array.
375
376
Parameters:
377
- array: Categorical Array
378
379
Returns:
380
Array containing the category values
381
"""
382
```
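
to_layout is mostly useful for inspecting how an array is physically represented. A quick sketch:

```python
import awkward as ak

layout = ak.to_layout(ak.Array([[1, 2], [3]]))
print(type(layout).__name__)  # e.g. "ListOffsetArray" for variable-length lists
```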

### Backend Management

Functions for managing computational backends and moving data between different execution environments.

```python { .api }
def backend(array):
    """
    Get the computational backend used by array.

    Parameters:
    - array: Array to check backend for

    Returns:
    str indicating backend ("cpu", "cuda", "jax", etc.)
    """

def to_backend(array, backend, highlevel=True, behavior=None):
    """
    Move array to specified computational backend.

    Parameters:
    - array: Array to move
    - backend: str, target backend ("cpu", "cuda", "jax")
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array moved to target backend
    """
```
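
A quick check-and-move sketch; the "cuda" backend additionally requires CuPy and a CUDA-capable device, so that call is left commented out:

```python
import awkward as ak

arr = ak.Array([[1, 2], [3]])
print(ak.backend(arr))  # "cpu"

same = ak.to_backend(arr, "cpu")   # no-op round trip on the CPU backend
# gpu = ak.to_backend(arr, "cuda")  # requires CuPy and a GPU
```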

### Specialized Formats

Support for domain-specific data formats common in scientific computing.

```python { .api }
def to_avro(array, file, schema=None):
    """
    Write array to Avro format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - schema: dict, Avro schema (inferred if None)
    """
```
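
A minimal usage sketch based on the to_avro signature documented above; with schema=None the Avro schema is inferred from the array's type:

```python
import awkward as ak

records = ak.Array([{"x": 1.1, "n": 1}, {"x": 2.2, "n": 2}])
ak.to_avro(records, "data.avro")  # schema inferred from the array's type
```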

## Usage Examples

### Basic Conversions

```python
import awkward as ak
import numpy as np

# Create nested array
data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Python lists
python_list = ak.to_list(data)  # [[1, 2, 3], [4], [5, 6]]

# Convert flat data to NumPy
flat_data = ak.Array([1, 2, 3, 4, 5])
numpy_array = ak.to_numpy(flat_data)  # np.array([1, 2, 3, 4, 5])
```

### File I/O

```python
import awkward as ak

# Create sample data
records = ak.Array([
    {"x": [1, 2], "y": 3.14, "name": "alice"},
    {"x": [4], "y": 2.71, "name": "bob"}
])

# Write to Parquet
ak.to_parquet(records, "data.parquet")

# Write to JSON
ak.to_json(records, "data.json", pretty=True)

# Write to Feather
ak.to_feather(records, "data.feather")
```

### Arrow Integration

```python
import awkward as ak
import pyarrow as pa

data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Arrow array
arrow_array = ak.to_arrow(data)

# Convert to Arrow table
table_data = {"numbers": data, "counts": ak.num(data)}
arrow_table = ak.to_arrow_table(table_data)
```
486
487
### DataFrame Conversion
488
489
```python
490
import awkward as ak
491
import pandas as pd
492
493
# Nested data
494
records = ak.Array([
495
{"a": [1, 2], "b": "x"},
496
{"a": [3, 4, 5], "b": "y"}
497
])
498
499
# Convert to DataFrame (flattens nested structure)
500
df = ak.to_dataframe(records)
501
print(df)
502
# a b
503
# 0 1 x
504
# 1 2 x
505
# 2 3 y
506
# 3 4 y
507
# 4 5 y
508
```

### ML Framework Integration

```python
import awkward as ak
import torch
import tensorflow as tf

# Regular (rectangular) data for ML frameworks
regular_data = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# Convert to PyTorch
torch_tensor = ak.to_torch(regular_data)  # torch.Tensor([[1, 2], [3, 4], [5, 6]])

# Convert to TensorFlow
tf_tensor = ak.to_tensorflow(regular_data)  # tf.Tensor([[1, 2], [3, 4], [5, 6]])

# Variable-length data for TensorFlow RaggedTensor
variable_data = ak.Array([[1, 2, 3], [4], [5, 6]])
ragged_tensor = ak.to_raggedtensor(variable_data)
```

### Type Conversion

```python
import awkward as ak
import numpy as np

# String to numeric conversion
strings = ak.Array(["1.5", "2.7", "3.14"])
parsed = ak.strings_astype(strings, np.float64)

# Change numeric type
integers = ak.Array([1, 2, 3])
floats32 = ak.values_astype(integers, np.float32)

# Convert to regular array (if possible)
data = ak.Array([[1, 2], [3, 4], [5, 6]])  # all lists have length 2
regular = ak.to_regular(data)  # RegularArray with size=2
```