
# Data Conversion and I/O

Extensive support for reading from and writing to various data formats, including Arrow, Parquet, JSON, and NumPy, as well as integration with popular frameworks such as PyTorch, TensorFlow, and JAX. These functions enable seamless interoperability with the broader data science ecosystem.

## Capabilities

### NumPy Integration

Conversion functions for seamless integration with NumPy arrays, the foundation of the Python scientific computing ecosystem.

```python { .api }
def to_numpy(array, allow_missing=True):
    """
    Convert array to NumPy format.

    Parameters:
    - array: Array to convert
    - allow_missing: bool, if False raise error for arrays with missing values

    Returns:
    numpy.ndarray containing the array data

    Raises:
    ValueError if array contains variable-length lists or missing values when allow_missing=False
    """

def to_list(array, behavior=None):
    """
    Convert array to Python list of nested objects.

    Parameters:
    - array: Array to convert
    - behavior: dict, custom behavior for conversion

    Returns:
    Python list/dict structure containing the array data
    """

def to_packed(array, highlevel=True, behavior=None):
    """
    Pack array into contiguous memory layout for efficient I/O.

    Parameters:
    - array: Array to pack
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with packed, contiguous memory layout
    """
```
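
One common use of `to_packed` is tightening an array after slicing, since a sliced array can keep referencing the original, larger buffers. A minimal sketch using only the functions above (data values are illustrative):

```python
import awkward as ak

data = ak.Array([[1, 2, 3], [4], [5, 6]])

# A slice may still reference the full original buffers.
tail = data[1:]

# Repack so that only the retained elements are carried
# into downstream serialization or I/O.
packed = ak.to_packed(tail)

assert ak.to_list(packed) == [[4], [5, 6]]
```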

### Apache Arrow Integration

Comprehensive support for Apache Arrow format, enabling high-performance data exchange and columnar analytics.

```python { .api }
def to_arrow(array, list_to32=False, string_to32=True, bytestring_to32=True,
             extension_array=True, count_nulls=True, extensionarray=None,
             categorical_as_dictionary=False):
    """
    Convert array to Apache Arrow format.

    Parameters:
    - array: Array to convert
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Array containing the converted data
    """

def to_arrow_table(arrays, list_to32=False, string_to32=True, bytestring_to32=True,
                   extension_array=True, count_nulls=True, extensionarray=None,
                   categorical_as_dictionary=False):
    """
    Convert arrays to Apache Arrow Table format.

    Parameters:
    - arrays: dict mapping column names to Arrays, or single Array
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - extensionarray: deprecated, use extension_array
    - categorical_as_dictionary: bool, if True convert categorical to Arrow dictionary

    Returns:
    pyarrow.Table containing the converted data
    """

def to_buffers(array):
    """
    Convert array to buffers format for serialization.

    Parameters:
    - array: Array to convert to buffers

    Returns:
    dict containing form, length, and container with named buffers
    """
```
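
As a quick sketch of a round trip through Arrow: `to_arrow` maps variable-length lists onto Arrow list arrays, and the complementary reader (assumed here as `ak.from_arrow`; it is not defined in this section) recovers the original structure:

```python
import awkward as ak

data = ak.Array([[1.1, 2.2], [], [3.3]])

# Variable-length lists become Arrow list arrays.
arrow_array = ak.to_arrow(data)
print(arrow_array.type)  # inspect the resulting pyarrow type

# Round trip back (ak.from_arrow is assumed, not defined above).
restored = ak.from_arrow(arrow_array)
assert ak.to_list(restored) == ak.to_list(data)
```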

### Parquet File Format

High-performance columnar storage with compression, metadata preservation, and chunked I/O support.

```python { .api }
def to_parquet(array, destination, list_to32=False, string_to32=True,
               bytestring_to32=True, extension_array=True, count_nulls=True,
               compression="zstd", compression_level=None, row_group_size=64*1024*1024,
               data_page_size=None, parquet_flavor=None, parquet_version="2.4",
               parquet_page_version="1.0", parquet_metadata_statistics=True,
               parquet_dictionary_encoding=True, parquet_byte_stream_split=False,
               parquet_coerce_timestamps=None, parquet_old_int96_timestamps=None,
               parquet_compliant_nested=False, parquet_extra_options=None):
    """
    Write array to Parquet file format.

    Parameters:
    - array: Array to write
    - destination: str, file path or file-like object
    - list_to32: bool, if True use 32-bit list offsets
    - string_to32: bool, if True use 32-bit string offsets
    - bytestring_to32: bool, if True use 32-bit bytestring offsets
    - extension_array: bool, if True use Arrow extension arrays
    - count_nulls: bool, if True include null count in metadata
    - compression: str, compression algorithm ("none", "snappy", "gzip", "lz4", "zstd", "brotli")
    - compression_level: int, compression level (algorithm-specific)
    - row_group_size: int, target row group size in bytes
    - data_page_size: int, target data page size in bytes
    - parquet_flavor: str, Parquet flavor ("spark", None)
    - parquet_version: str, Parquet format version
    - parquet_page_version: str, Parquet page format version
    - parquet_metadata_statistics: bool, include column statistics
    - parquet_dictionary_encoding: bool, use dictionary encoding
    - parquet_byte_stream_split: bool, use byte stream split encoding
    - parquet_coerce_timestamps: str, timestamp coercion behavior
    - parquet_old_int96_timestamps: bool, use old int96 timestamp format
    - parquet_compliant_nested: bool, use Parquet-compliant nested encoding
    - parquet_extra_options: dict, additional Parquet options
    """

def to_parquet_dataset(arrays, destination, **kwargs):
    """
    Write arrays as Parquet dataset with partitioning.

    Parameters:
    - arrays: dict mapping column names to Arrays
    - destination: str, directory path for dataset
    - kwargs: additional arguments passed to to_parquet
    """

def to_parquet_row_groups(arrays, destination, **kwargs):
    """
    Write arrays as Parquet file with multiple row groups.

    Parameters:
    - arrays: sequence of dicts, each containing Arrays for one row group
    - destination: str, file path
    - kwargs: additional arguments passed to to_parquet
    """
```
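
Compression and row-group sizing are the most commonly tuned knobs at write time. A sketch (the read-back uses `ak.from_parquet`, assumed here as the usual counterpart rather than defined in this section):

```python
import awkward as ak

events = ak.Array([
    {"pt": [12.3, 4.5], "run": 1},
    {"pt": [7.7], "run": 2},
])

# Higher zstd levels trade write speed for smaller files;
# smaller row groups enable finer-grained chunked reads.
ak.to_parquet(
    events,
    "events.parquet",
    compression="zstd",
    compression_level=9,
    row_group_size=1024 * 1024,
)

restored = ak.from_parquet("events.parquet")  # assumed counterpart reader
```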

### Feather/Arrow IPC Format

Fast binary columnar format for efficient data exchange between processes and languages.

```python { .api }
def to_feather(array, file, compression="zstd", compression_level=None):
    """
    Write array to Feather (Arrow IPC) format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - compression: str, compression algorithm ("none", "zstd", "lz4")
    - compression_level: int, compression level
    """
```
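
Because Feather is Arrow IPC on disk, files written this way can be read back through pyarrow. A sketch under that assumption (`pyarrow.feather.read_table` and `ak.from_arrow` are not part of this section):

```python
import awkward as ak
import pyarrow.feather

records = ak.Array([{"x": 1.0, "y": [1, 2]}, {"x": 2.0, "y": []}])

# lz4 decompresses faster than zstd at the cost of larger files.
ak.to_feather(records, "data.feather", compression="lz4")

# Read back via pyarrow, then convert (assumed helpers).
table = pyarrow.feather.read_table("data.feather")
restored = ak.from_arrow(table)
```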

### JSON Format

Human-readable text format supporting complex nested structures and mixed data types.

```python { .api }
def to_json(array, destination=None, pretty=False, maxdecimals=None,
            convert_bytes=None, convert_other=None):
    """
    Convert array to JSON format.

    Parameters:
    - array: Array to convert
    - destination: str or file-like, output destination (None for string return)
    - pretty: bool, if True format with indentation
    - maxdecimals: int, maximum decimal places for floats
    - convert_bytes: callable, function to convert bytes objects
    - convert_other: callable, function to convert unrecognized types

    Returns:
    str containing JSON data if destination is None
    """
```
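
With `destination=None`, the JSON text is returned directly, which is handy for small arrays and debugging:

```python
import awkward as ak

data = ak.Array([{"x": 1, "y": [1.5, 2.5]}, {"x": 2, "y": []}])

# No destination: return a JSON string.
text = ak.to_json(data)
# text looks like '[{"x":1,"y":[1.5,2.5]},{"x":2,"y":[]}]'

# With a destination: write an indented, human-readable file.
ak.to_json(data, "data.json", pretty=True)
```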

### DataFrame Integration

Conversion to Pandas DataFrames and ROOT RDataFrames for integration with data analysis workflows.

```python { .api }
def to_dataframe(array, how="inner", levelname="sublevel", anonymous="values"):
    """
    Convert array to Pandas DataFrame.

    Parameters:
    - array: Array to convert
    - how: str, how to handle nested structure ("inner", "outer")
    - levelname: str, name for MultiIndex levels
    - anonymous: str, name for arrays without field names

    Returns:
    pandas.DataFrame containing the array data
    """

def to_rdataframe(array):
    """
    Convert array to ROOT RDataFrame.

    Parameters:
    - array: Array to convert

    Returns:
    ROOT.RDataFrame containing the array data
    """
```
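
Flat records map directly onto DataFrame columns. A minimal sketch (names are illustrative; see the Usage Examples below for the nested case):

```python
import awkward as ak

people = ak.Array([
    {"name": "alice", "age": 30},
    {"name": "bob", "age": 25},
])

# One row per record, with columns "name" and "age".
df = ak.to_dataframe(people)
```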

240

241

### Machine Learning Framework Integration

242

243

Seamless conversion to and from popular ML frameworks for deep learning and numerical computing workflows.

244

245

```python { .api }

246

def to_torch(array, device=None):

247

"""

248

Convert array to PyTorch tensor.

249

250

Parameters:

251

- array: Array to convert (must be rectangular/regular)

252

- device: torch.device, target device for tensor

253

254

Returns:

255

torch.Tensor containing the array data

256

"""

257

258

def to_tensorflow(array):

259

"""

260

Convert array to TensorFlow tensor.

261

262

Parameters:

263

- array: Array to convert (must be rectangular/regular)

264

265

Returns:

266

tf.Tensor containing the array data

267

"""

268

269

def to_raggedtensor(array):

270

"""

271

Convert array to TensorFlow RaggedTensor.

272

273

Parameters:

274

- array: Array to convert

275

276

Returns:

277

tf.RaggedTensor containing the array data with nested structure

278

"""

279

280

def to_jax(array):

281

"""

282

Convert array to JAX array.

283

284

Parameters:

285

- array: Array to convert (must be rectangular/regular)

286

287

Returns:

288

jax.numpy.ndarray containing the array data

289

"""

290

291

def to_cupy(array):

292

"""

293

Convert array to CuPy array for GPU computation.

294

295

Parameters:

296

- array: Array to convert (must be rectangular/regular)

297

298

Returns:

299

cupy.ndarray containing the array data

300

"""

301

302

def to_cudf(array):

303

"""

304

Convert array to cuDF DataFrame for GPU-accelerated analytics.

305

306

Parameters:

307

- array: Array to convert

308

309

Returns:

310

cudf.DataFrame containing the array data

311

"""

312

```
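
A sketch of the JAX and GPU paths; these require the optional jax, cupy, and cudf dependencies, and the GPU conversions need a CUDA device, so they are shown commented out:

```python
import awkward as ak

regular = ak.Array([[1.0, 2.0], [3.0, 4.0]])

# Like to_torch and to_tensorflow, to_jax expects rectangular data.
jax_array = ak.to_jax(regular)

# GPU variants follow the same pattern (CUDA device required):
# gpu_array = ak.to_cupy(regular)
# gpu_frame = ak.to_cudf(ak.Array([{"x": 1}, {"x": 2}]))
```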

### Type and Layout Conversion

Functions for converting between different array representations and type systems.

```python { .api }
def to_layout(array):
    """
    Get low-level Content layout from high-level Array.

    Parameters:
    - array: Array to get layout from

    Returns:
    Content layout object representing array structure
    """

def to_regular(array, axis=1, highlevel=True, behavior=None):
    """
    Convert variable-length lists to regular (fixed-length) array.

    Parameters:
    - array: Array to convert
    - axis: int, axis along which to regularize
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with regular structure (fails if lists have different lengths)
    """

def values_astype(array, to, highlevel=True, behavior=None):
    """
    Cast array values to specified dtype.

    Parameters:
    - array: Array to cast
    - to: numpy.dtype or str, target data type
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with values cast to new type
    """

def strings_astype(array, to, highlevel=True, behavior=None):
    """
    Cast string array to specified type by parsing.

    Parameters:
    - array: Array of strings to parse
    - to: numpy.dtype or str, target data type
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array with strings parsed to new type
    """

def categories(array):
    """
    Get categories from categorical array.

    Parameters:
    - array: Categorical Array

    Returns:
    Array containing the category values
    """
```
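
For example, `to_layout` exposes the structural nodes that a high-level Array wraps, which helps when inspecting how data is physically stored. A sketch (the node names in the comment are typical examples, not guaranteed):

```python
import awkward as ak

data = ak.Array([[1, 2], [3]])

# Typically a list-type node wrapping a numeric leaf,
# e.g. a ListOffsetArray containing a NumpyArray.
layout = ak.to_layout(data)
print(type(layout).__name__)
```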

### Backend Management

Functions for managing computational backends and moving data between different execution environments.

```python { .api }
def backend(array):
    """
    Get the computational backend used by array.

    Parameters:
    - array: Array to check backend for

    Returns:
    str indicating backend ("cpu", "cuda", "jax", etc.)
    """

def to_backend(array, backend, highlevel=True, behavior=None):
    """
    Move array to specified computational backend.

    Parameters:
    - array: Array to move
    - backend: str, target backend ("cpu", "cuda", "jax")
    - highlevel: bool, if True return Array, if False return Content layout
    - behavior: dict, custom behavior for the result

    Returns:
    Array moved to target backend
    """
```
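
A short sketch; the "cuda" target requires the CuPy dependency and a GPU, so only the CPU round trip is exercised here:

```python
import awkward as ak

data = ak.Array([[1, 2], [3]])

print(ak.backend(data))  # "cpu" for NumPy-backed arrays

# Moving to "cuda" needs a GPU; moving to "cpu" always works.
on_cpu = ak.to_backend(data, "cpu")
```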

### Specialized Formats

Support for domain-specific data formats common in scientific computing.

```python { .api }
def to_avro(array, file, schema=None):
    """
    Write array to Avro format.

    Parameters:
    - array: Array to write
    - file: str, file path or file-like object
    - schema: dict, Avro schema (inferred if None)
    """
```
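
A minimal sketch using the signature above; with `schema=None`, the Avro schema is inferred from the array's type:

```python
import awkward as ak

events = ak.Array([
    {"id": 1, "hits": [0.1, 0.2]},
    {"id": 2, "hits": []},
])

# Schema is inferred from the array's record type.
ak.to_avro(events, "events.avro")
```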

## Usage Examples

### Basic Conversions

```python
import awkward as ak
import numpy as np

# Create nested array
data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Python lists
python_list = ak.to_list(data)  # [[1, 2, 3], [4], [5, 6]]

# Convert flat data to NumPy
flat_data = ak.Array([1, 2, 3, 4, 5])
numpy_array = ak.to_numpy(flat_data)  # np.array([1, 2, 3, 4, 5])
```
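
Missing values deserve a note: with the default `allow_missing=True` they survive `to_numpy` (typically as a NumPy masked array), while `allow_missing=False` makes the conversion fail loudly. A sketch:

```python
import awkward as ak

with_missing = ak.Array([1, 2, None, 4])

# Allowed by default; missing entries are masked, not dropped.
masked = ak.to_numpy(with_missing)

# Opt into strictness when downstream code cannot handle masks.
try:
    ak.to_numpy(with_missing, allow_missing=False)
except ValueError as err:
    print("conversion rejected:", err)
```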

### File I/O

```python
import awkward as ak

# Create sample data
records = ak.Array([
    {"x": [1, 2], "y": 3.14, "name": "alice"},
    {"x": [4], "y": 2.71, "name": "bob"}
])

# Write to Parquet
ak.to_parquet(records, "data.parquet")

# Write to JSON
ak.to_json(records, "data.json", pretty=True)

# Write to Feather
ak.to_feather(records, "data.feather")
```

### Arrow Integration

```python
import awkward as ak
import pyarrow as pa

data = ak.Array([[1, 2, 3], [4], [5, 6]])

# Convert to Arrow array
arrow_array = ak.to_arrow(data)

# Convert to Arrow table
table_data = {"numbers": data, "counts": ak.num(data)}
arrow_table = ak.to_arrow_table(table_data)
```

### DataFrame Conversion

```python
import awkward as ak
import pandas as pd

# Nested data
records = ak.Array([
    {"a": [1, 2], "b": "x"},
    {"a": [3, 4, 5], "b": "y"}
])

# Convert to DataFrame (flattens nested structure)
df = ak.to_dataframe(records)
print(df)
#    a  b
# 0  1  x
# 1  2  x
# 2  3  y
# 3  4  y
# 4  5  y
```

### ML Framework Integration

```python
import awkward as ak
import torch
import tensorflow as tf

# Regular (rectangular) data for ML frameworks
regular_data = ak.Array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])

# Convert to PyTorch
torch_tensor = ak.to_torch(regular_data)  # torch.Tensor([[1, 2], [3, 4], [5, 6]])

# Convert to TensorFlow
tf_tensor = ak.to_tensorflow(regular_data)  # tf.Tensor([[1, 2], [3, 4], [5, 6]])

# Variable-length data for TensorFlow RaggedTensor
variable_data = ak.Array([[1, 2, 3], [4], [5, 6]])
ragged_tensor = ak.to_raggedtensor(variable_data)
```

### Type Conversion

```python
import awkward as ak
import numpy as np

# String to numeric conversion
strings = ak.Array(["1.5", "2.7", "3.14"])
floats = ak.strings_astype(strings, np.float64)

# Change numeric type
integers = ak.Array([1, 2, 3])
floats32 = ak.values_astype(integers, np.float32)

# Convert to regular array (if possible)
data = ak.Array([[1, 2], [3, 4], [5, 6]])  # All lists length 2
regular = ak.to_regular(data)  # RegularArray with size=2
```