or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

docs/: config-utilities.md, core-data-structures.md, data-types.md, expressions.md, functions.md, index.md, io-operations.md, selectors.md, sql-interface.md

docs/core-data-structures.md

# Core Data Structures

Primary data structures for working with tabular data in Polars, including eager DataFrame/Series for immediate operations and LazyFrame for optimized query execution, with the 64-bit index variant supporting datasets exceeding 4.2 billion rows.

## Capabilities

### DataFrame

Two-dimensional labeled data structure with columns of potentially different types. The primary data structure for eager evaluation, where operations are executed immediately.

```python { .api }
class DataFrame:
    def __init__(
        self,
        data=None,
        schema=None,
        schema_overrides=None,
        orient=None,
        infer_schema_length=N_INFER_DEFAULT,
        nan_to_null=False
    ):
        """
        Create a DataFrame from various data sources.

        Parameters:
        - data: Data source (dict, list, numpy array, pandas DataFrame, etc.)
        - schema: Column names and types
        - schema_overrides: Override inferred types for specific columns
        - orient: Data orientation ("row" or "col")
        - infer_schema_length: Number of rows to scan for type inference
        - nan_to_null: Convert NaN values to null
        """

    @property
    def shape(self) -> tuple[int, int]:
        """Get the shape (rows, columns) of the DataFrame."""

    @property
    def height(self) -> int:
        """Get the number of rows."""

    @property
    def width(self) -> int:
        """Get the number of columns."""

    @property
    def columns(self) -> list[str]:
        """Get column names."""

    @property
    def dtypes(self) -> list[DataType]:
        """Get data types of all columns."""

    @property
    def schema(self) -> Schema:
        """Get the schema (column names and types)."""

    def select(self, *exprs, **named_exprs) -> DataFrame:
        """
        Select columns using expressions.

        Parameters:
        - exprs: Column expressions to select
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with selected columns
        """

    def filter(self, *predicates, **constraints) -> DataFrame:
        """
        Filter rows based on predicates.

        Parameters:
        - predicates: Boolean expressions for filtering
        - constraints: Named constraints

        Returns:
        Filtered DataFrame
        """

    def with_columns(self, *exprs, **named_exprs) -> DataFrame:
        """
        Add or modify columns.

        Parameters:
        - exprs: Column expressions to add/modify
        - named_exprs: Named expressions for new columns

        Returns:
        DataFrame with added/modified columns
        """

    def drop(self, *columns, strict=True) -> DataFrame:
        """
        Drop columns from DataFrame.

        Parameters:
        - columns: Column names to drop
        - strict: Whether to raise error if column doesn't exist

        Returns:
        DataFrame without dropped columns
        """

    def rename(self, mapping: dict[str, str] | Callable[[str], str]) -> DataFrame:
        """
        Rename columns.

        Parameters:
        - mapping: Dictionary mapping old to new names, or function

        Returns:
        DataFrame with renamed columns
        """

    def sort(
        self,
        by,
        *,
        descending=False,
        nulls_last=False,
        multithreaded=True
    ) -> DataFrame:
        """
        Sort DataFrame by columns.

        Parameters:
        - by: Column(s) to sort by
        - descending: Sort in descending order
        - nulls_last: Place nulls at end
        - multithreaded: Use multiple threads

        Returns:
        Sorted DataFrame
        """

    def group_by(self, *by, maintain_order=False, **named_by) -> GroupBy:
        """
        Group DataFrame for aggregation.

        Parameters:
        - by: Columns to group by
        - maintain_order: Maintain order of groups
        - named_by: Named grouping expressions

        Returns:
        GroupBy object for aggregation
        """

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None
    ) -> DataFrame:
        """
        Join with another DataFrame.

        Parameters:
        - other: DataFrame to join with
        - on: Column(s) to join on
        - how: Join type ("inner", "left", "outer", "cross", "anti", "semi")
        - left_on: Left DataFrame join columns
        - right_on: Right DataFrame join columns
        - suffix: Suffix for duplicate column names
        - validate: Join validation ("m:m", "1:m", "m:1", "1:1")
        - join_nulls: Join on null values
        - coalesce: Coalesce join columns

        Returns:
        Joined DataFrame
        """

    def concat(self, other, *, how="vertical", ignore_index=False) -> DataFrame:
        """
        Concatenate with other DataFrame(s).

        Parameters:
        - other: DataFrame(s) to concatenate
        - how: Concatenation method ("vertical", "horizontal", "diagonal")
        - ignore_index: Reset index after concatenation

        Returns:
        Concatenated DataFrame
        """

    def to_pandas(self, **kwargs) -> pd.DataFrame:
        """Convert to pandas DataFrame."""

    def to_numpy(self, structured=False, order="c") -> np.ndarray:
        """Convert to NumPy array."""

    def to_arrow(self, *, compat_level=None) -> pa.Table:
        """Convert to PyArrow Table."""

    def to_dict(self, *, as_series=True) -> dict[str, Series | list[Any]]:
        """Convert to dictionary."""

    def write_csv(self, file=None, **kwargs) -> str | None:
        """Write to CSV file."""

    def write_json(self, file=None, **kwargs) -> str | None:
        """Write to JSON file."""

    def write_parquet(self, file, **kwargs) -> None:
        """Write to Parquet file."""

    def write_ipc(self, file, **kwargs) -> None:
        """Write to IPC/Arrow file."""

    def lazy(self) -> LazyFrame:
        """Convert to LazyFrame for optimized operations."""

    def head(self, n=5) -> DataFrame:
        """Get first n rows."""

    def tail(self, n=5) -> DataFrame:
        """Get last n rows."""

    def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame:
        """Sample rows from DataFrame."""

    def null_count(self) -> DataFrame:
        """Count null values per column."""

    def is_empty(self) -> bool:
        """Check if DataFrame is empty."""

    def clone(self) -> DataFrame:
        """Create a copy of the DataFrame."""
```

### Series

One-dimensional labeled array with homogeneous data type. Similar to a column in a DataFrame, but can exist independently.

```python { .api }
class Series:
    def __init__(
        self,
        name=None,
        values=None,
        dtype=None,
        strict=True,
        nan_to_null=False,
        dtype_if_empty=Null
    ):
        """
        Create a Series.

        Parameters:
        - name: Series name
        - values: Data values
        - dtype: Data type
        - strict: Strict type checking
        - nan_to_null: Convert NaN to null
        - dtype_if_empty: Type when empty
        """

    @property
    def name(self) -> str:
        """Get Series name."""

    @property
    def dtype(self) -> DataType:
        """Get data type."""

    @property
    def shape(self) -> tuple[int]:
        """Get shape (length,)."""

    def len(self) -> int:
        """Get length."""

    def sum(self) -> Any:
        """Sum all values."""

    def mean(self) -> float | None:
        """Calculate mean."""

    def max(self) -> Any:
        """Get maximum value."""

    def min(self) -> Any:
        """Get minimum value."""

    def sort(self, *, descending=False, nulls_last=False) -> Series:
        """Sort Series values."""

    def filter(self, predicate) -> Series:
        """Filter values based on predicate."""

    def to_list(self) -> list[Any]:
        """Convert to Python list."""

    def to_numpy(self) -> np.ndarray:
        """Convert to NumPy array."""

    def to_pandas(self) -> pd.Series:
        """Convert to pandas Series."""

    def to_frame(self, name=None) -> DataFrame:
        """Convert to single-column DataFrame."""
```

### LazyFrame

Lazy evaluation version of DataFrame that builds a query plan without executing until `.collect()` is called. Enables query optimization and efficient processing of large datasets.

```python { .api }
class LazyFrame:
    def select(self, *exprs, **named_exprs) -> LazyFrame:
        """Select columns (lazy operation)."""

    def filter(self, *predicates, **constraints) -> LazyFrame:
        """Filter rows (lazy operation)."""

    def with_columns(self, *exprs, **named_exprs) -> LazyFrame:
        """Add/modify columns (lazy operation)."""

    def drop(self, *columns, strict=True) -> LazyFrame:
        """Drop columns (lazy operation)."""

    def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True) -> LazyFrame:
        """Sort by columns (lazy operation)."""

    def group_by(self, *by, maintain_order=False, **named_by) -> LazyGroupBy:
        """Group for aggregation (lazy operation)."""

    def join(
        self,
        other,
        on=None,
        how="inner",
        *,
        left_on=None,
        right_on=None,
        suffix="_right",
        validate="m:m",
        join_nulls=False,
        coalesce=None
    ) -> LazyFrame:
        """Join with another LazyFrame (lazy operation)."""

    def collect(
        self,
        *,
        type_coercion=True,
        predicate_pushdown=True,
        projection_pushdown=True,
        simplify_expression=True,
        slice_pushdown=True,
        comm_subplan_elim=True,
        comm_subexpr_elim=True,
        cluster_with_columns=True,
        no_optimization=False,
        streaming=False,
        background=False,
        _eager=False
    ) -> DataFrame:
        """
        Execute the lazy query and return DataFrame.

        Parameters:
        - type_coercion: Apply automatic type coercion
        - predicate_pushdown: Push filters down to scan level
        - projection_pushdown: Push column selection down
        - simplify_expression: Simplify expressions
        - slice_pushdown: Push limits/offsets down
        - comm_subplan_elim: Eliminate common subplans
        - comm_subexpr_elim: Eliminate common subexpressions
        - cluster_with_columns: Cluster with_columns operations
        - no_optimization: Disable all optimizations
        - streaming: Execute in streaming mode
        - background: Execute in background thread

        Returns:
        Executed DataFrame
        """

    def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False) -> str:
        """Get query execution plan."""

    def schema(self) -> Schema:
        """Get the expected schema."""

    def dtypes(self) -> list[DataType]:
        """Get expected column data types."""

    def columns(self) -> list[str]:
        """Get expected column names."""

    def head(self, n=5) -> LazyFrame:
        """Get first n rows (lazy operation)."""

    def tail(self, n=5) -> LazyFrame:
        """Get last n rows (lazy operation)."""

    def limit(self, n) -> LazyFrame:
        """Limit number of rows (lazy operation)."""

    def offset(self, n) -> LazyFrame:
        """Skip first n rows (lazy operation)."""

    def slice(self, offset, length=None) -> LazyFrame:
        """Slice rows (lazy operation)."""
```

### GroupBy Operations

GroupBy objects returned from `group_by()` operations on DataFrame and LazyFrame, used for aggregation.

```python { .api }
class GroupBy:
    def agg(self, *aggs, **named_aggs) -> DataFrame:
        """
        Aggregate grouped data.

        Parameters:
        - aggs: Aggregation expressions
        - named_aggs: Named aggregation expressions

        Returns:
        DataFrame with aggregated results
        """

    def sum(self) -> DataFrame:
        """Sum each group."""

    def mean(self) -> DataFrame:
        """Mean of each group."""

    def max(self) -> DataFrame:
        """Maximum of each group."""

    def min(self) -> DataFrame:
        """Minimum of each group."""

    def count(self) -> DataFrame:
        """Count rows in each group."""

    def first(self) -> DataFrame:
        """First value in each group."""

    def last(self) -> DataFrame:
        """Last value in each group."""

class LazyGroupBy:
    def agg(self, *aggs, **named_aggs) -> LazyFrame:
        """Aggregate grouped data (lazy operation)."""

    def sum(self) -> LazyFrame:
        """Sum each group (lazy operation)."""

    def mean(self) -> LazyFrame:
        """Mean of each group (lazy operation)."""

    def max(self) -> LazyFrame:
        """Maximum of each group (lazy operation)."""

    def min(self) -> LazyFrame:
        """Minimum of each group (lazy operation)."""

    def count(self) -> LazyFrame:
        """Count rows in each group (lazy operation)."""
```

## Usage Examples

### Creating DataFrames

```python
import polars as pl

# From dictionary
df = pl.DataFrame({
    "name": ["Alice", "Bob", "Charlie"],
    "age": [25, 30, 35],
    "salary": [50000, 60000, 70000]
})

# From list of dictionaries
data = [
    {"name": "Alice", "age": 25, "salary": 50000},
    {"name": "Bob", "age": 30, "salary": 60000},
    {"name": "Charlie", "age": 35, "salary": 70000}
]
df = pl.DataFrame(data)

# From NumPy array
import numpy as np
arr = np.array([[1, 2, 3], [4, 5, 6]])
df = pl.DataFrame(arr, schema=["a", "b", "c"])
```

### DataFrame Operations

```python
# Basic operations
result = (df
    .filter(pl.col("age") > 28)
    .select([
        pl.col("name"),
        pl.col("age"),
        (pl.col("salary") / 1000).alias("salary_k")
    ])
    .sort("age", descending=True)
)

# Grouping and aggregation
summary = (df
    .group_by("department")
    .agg([
        pl.col("salary").mean().alias("avg_salary"),
        pl.col("name").count().alias("employee_count"),
        pl.col("age").max().alias("max_age")
    ])
)
```

### Lazy Operations

```python
# Build query plan without execution
lazy_query = (pl
    .scan_csv("large_dataset.csv")
    .filter(pl.col("amount") > 1000)
    .group_by("category")
    .agg([
        pl.col("amount").sum().alias("total"),
        pl.col("id").count().alias("count")
    ])
    .sort("total", descending=True)
)

# Execute optimized query
result = lazy_query.collect()

# Check execution plan
print(lazy_query.explain())
```

### Working with Large Datasets (64-bit Index)

```python
# The u64-idx variant handles datasets > 4.2B rows
very_large_df = pl.scan_parquet("huge_dataset.parquet")

# Operations work the same but support more rows
result = (very_large_df
    .filter(pl.col("timestamp") > "2023-01-01")
    .group_by("user_id")
    .agg([
        pl.col("value").sum(),
        pl.col("event").count()
    ])
    .collect(streaming=True)  # Use streaming for memory efficiency
)
```