or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

column-selection.md configuration.md core-data-structures.md data-conversion.md data-types.md error-handling.md functions-expressions.md index.md io-operations.md sql-interface.md

docs/core-data-structures.md

0

# Core Data Structures

1

2

Polars is built on four fundamental data structures: DataFrame for eager evaluation, LazyFrame for lazy evaluation with query optimization, Series for one-dimensional data, and Expr for building complex column operations and transformations.

3

4

## Capabilities

5

6

### DataFrame

7

8

Primary data structure for eager evaluation providing immediate computation with comprehensive data manipulation methods including filtering, selection, aggregation, joining, and reshaping operations.

9

10

```python { .api }

11

class DataFrame:

12

def __init__(

13

self,

14

data=None,

15

schema=None,

16

*,

17

schema_overrides=None,

18

strict=True,

19

orient=None,

20

infer_schema_length=None,

21

nan_to_null=False

22

):

23

"""

24

Create a DataFrame from various data sources.

25

26

Parameters:

27

- data: Data source (dict, list, arrow table, pandas df, etc.)

28

- schema: Column names and types

29

- schema_overrides: Override specific column types

30

- strict: Strict schema validation

31

- orient: Data orientation ('row' or 'col')

32

- infer_schema_length: Rows to scan for type inference

33

- nan_to_null: Convert NaN to null values

34

"""

35

36

# Selection and Projection

37

def select(self, *exprs, **named_exprs) -> DataFrame: ...

38

def with_columns(self, *exprs, **named_exprs) -> DataFrame: ...

39

def drop(self, *columns) -> DataFrame: ...

40

def rename(self, mapping) -> DataFrame: ...

41

42

# Filtering and Sorting

43

def filter(self, *predicates) -> DataFrame: ...

44

def sort(self, by, *, descending=False, nulls_last=False) -> DataFrame: ...

45

def unique(self, subset=None, *, keep="any", maintain_order=False) -> DataFrame: ...

46

def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> DataFrame: ...

47

48

# Aggregation and Grouping

49

def group_by(self, *by, maintain_order=False) -> GroupBy: ...

50

def sum(self) -> DataFrame: ...

51

def mean(self) -> DataFrame: ...

52

def max(self) -> DataFrame: ...

53

def min(self) -> DataFrame: ...

54

def std(self, ddof=1) -> DataFrame: ...

55

def var(self, ddof=1) -> DataFrame: ...

56

57

# Reshaping and Transformation

58

def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> DataFrame: ...

59

def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> DataFrame: ...

60

def transpose(self, *, include_header=False, header_name="column", column_names=None) -> DataFrame: ...

61

def explode(self, columns, *, schema_overrides=None) -> DataFrame: ...

62

63

# Joining Operations

64

def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> DataFrame: ...

65

def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> DataFrame: ...

66

67

# Window Operations

68

def with_row_index(self, name="index", offset=0) -> DataFrame: ...

69

def rolling(self, index_column, *, period, offset=None, closed="right", by=None, check_sorted=True) -> RollingGroupBy: ...

70

71

# I/O Operations

72

def write_csv(self, file=None, **kwargs) -> str | None: ...

73

def write_parquet(self, file, **kwargs) -> None: ...

74

def write_json(self, file=None, **kwargs) -> str | None: ...

75

def write_excel(self, workbook=None, worksheet=None, **kwargs): ...

76

def write_database(self, table_name, connection, **kwargs) -> int: ...

77

78

# Conversion Methods

79

def to_pandas(self, **kwargs): ...

80

def to_numpy(self, structured=False, **kwargs): ...

81

def to_arrow(self) -> pa.Table: ...

82

def to_dict(self, as_series=True) -> dict: ...

83

def to_dicts(self) -> list[dict]: ...

84

85

# Utility Methods

86

def head(self, n=5) -> DataFrame: ...

87

def tail(self, n=5) -> DataFrame: ...

88

def slice(self, offset, length=None) -> DataFrame: ...

89

def glimpse(self, *, max_items_per_column=10, max_colname_length=50, return_as_string=False) -> str | None: ...

90

def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...

91

def is_empty(self) -> bool: ...

92

def lazy(self) -> LazyFrame: ...

93

94

# Properties

95

@property

96

def columns(self) -> list[str]: ...

97

@property

98

def dtypes(self) -> list[DataType]: ...

99

@property

100

def schema(self) -> Schema: ...

101

@property

102

def shape(self) -> tuple[int, int]: ...

103

@property

104

def height(self) -> int: ...

105

@property

106

def width(self) -> int: ...

107

@property

108

def flags(self) -> dict[str, dict[str, bool]]: ...

109

```

110

111

### LazyFrame

112

113

Lazy evaluation data structure that builds a computation graph for query optimization, predicate pushdown, and efficient memory usage with automatic query planning.

114

115

```python { .api }

116

class LazyFrame:

117

# Selection and Projection

118

def select(self, *exprs, **named_exprs) -> LazyFrame: ...

119

def with_columns(self, *exprs, **named_exprs) -> LazyFrame: ...

120

def drop(self, *columns) -> LazyFrame: ...

121

def rename(self, mapping) -> LazyFrame: ...

122

123

# Filtering and Sorting

124

def filter(self, *predicates) -> LazyFrame: ...

125

def sort(self, by, *, descending=False, nulls_last=False, multithreaded=True, maintain_order=False) -> LazyFrame: ...

126

def unique(self, subset=None, *, keep="any", maintain_order=False) -> LazyFrame: ...

127

def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> LazyFrame: ...

128

129

# Aggregation and Grouping

130

def group_by(self, *by, maintain_order=False) -> LazyGroupBy: ...

131

def sum(self) -> LazyFrame: ...

132

def mean(self) -> LazyFrame: ...

133

def max(self) -> LazyFrame: ...

134

def min(self) -> LazyFrame: ...

135

def std(self, ddof=1) -> LazyFrame: ...

136

def var(self, ddof=1) -> LazyFrame: ...

137

138

# Reshaping and Transformation

139

def pivot(self, *, on, index=None, values=None, aggregate_function="first", sort_columns=False) -> LazyFrame: ...

140

def unpivot(self, *, on=None, index=None, variable_name=None, value_name=None) -> LazyFrame: ...

141

def explode(self, columns, *, schema_overrides=None) -> LazyFrame: ...

142

143

# Joining Operations

144

def join(self, other, *, on=None, how="inner", left_on=None, right_on=None, suffix="_right", validate=None, join_nulls=False) -> LazyFrame: ...

145

def join_asof(self, other, *, left_on=None, right_on=None, on=None, by_left=None, by_right=None, by=None, strategy="backward") -> LazyFrame: ...

146

147

# Window Operations

148

def with_row_index(self, name="index", offset=0) -> LazyFrame: ...

149

def rolling(self, index_column, *, period, offset=None, closed="right", by=None) -> RollingGroupBy: ...

150

151

# Execution and Optimization

152

def collect(self, *, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, streaming=False, background=False, _eager=True) -> DataFrame: ...

153

def explain(self, *, optimized=True, type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True, format="plain") -> str: ...

154

def show_graph(self, *, optimized=True, show=True, output_path=None, raw_output=False, figsize=(16, 12), type_coercion=True, predicate_pushdown=True, projection_pushdown=True, simplify_expression=True, slice_pushdown=True, comm_subplan_elim=True, comm_subexpr_elim=True, cluster_with_columns=True) -> str | None: ...

155

156

# Utility Methods

157

def head(self, n=5) -> LazyFrame: ...

158

def tail(self, n=5) -> LazyFrame: ...

159

def slice(self, offset, length=None) -> LazyFrame: ...

160

def first(self) -> LazyFrame: ...

161

def last(self) -> LazyFrame: ...

162

def cache(self) -> LazyFrame: ...

163

164

# Properties

165

@property

166

def columns(self) -> list[str]: ...

167

@property

168

def dtypes(self) -> list[DataType]: ...

169

@property

170

def schema(self) -> Schema: ...

171

@property

172

def width(self) -> int: ...

173

```

174

175

### Series

176

177

One-dimensional data structure with vectorized operations, supporting element-wise transformations, aggregations, and integration with DataFrame operations.

178

179

```python { .api }

180

class Series:

181

def __init__(self, name=None, values=None, dtype=None, strict=True, nan_to_null=False):

182

"""

183

Create a Series from values.

184

185

Parameters:

186

- name: Series name

187

- values: Data values (list, array, etc.)

188

- dtype: Data type

189

- strict: Strict type checking

190

- nan_to_null: Convert NaN to null

191

"""

192

193

# Element Access and Slicing

194

def __getitem__(self, item): ...

195

def get(self, index, *, default=None): ...

196

def slice(self, offset, length=None) -> Series: ...

197

def head(self, n=5) -> Series: ...

198

def tail(self, n=5) -> Series: ...

199

def take(self, indices) -> Series: ...

200

def gather(self, indices) -> Series: ...

201

202

# Filtering and Selection

203

def filter(self, predicate) -> Series: ...

204

def unique(self, *, maintain_order=False) -> Series: ...

205

def sample(self, n=None, *, fraction=None, with_replacement=False, shuffle=False, seed=None) -> Series: ...

206

def sort(self, *, descending=False, nulls_last=False) -> Series: ...

207

208

# Transformations

209

def map_elements(self, function, return_dtype=None, *, skip_nulls=True) -> Series: ...

210

def cast(self, dtype, *, strict=True) -> Series: ...

211

def alias(self, name) -> Series: ...

212

def rename(self, name) -> Series: ...

213

214

# Aggregations

215

def sum(self) -> int | float: ...

216

def mean(self) -> float | None: ...

217

def median(self) -> float | None: ...

218

def max(self) -> Any: ...

219

def min(self) -> Any: ...

220

def std(self, ddof=1) -> float | None: ...

221

def var(self, ddof=1) -> float | None: ...

222

def count(self) -> int: ...

223

def len(self) -> int: ...

224

225

# String Operations (when dtype is String)

226

@property

227

def str(self) -> StringNameSpace: ...

228

229

# Datetime Operations (when dtype is temporal)

230

@property

231

def dt(self) -> DateTimeNameSpace: ...

232

233

# List Operations (when dtype is List)

234

@property

235

def list(self) -> ListNameSpace: ...

236

237

# Array Operations (when dtype is Array)

238

@property

239

def arr(self) -> ArrayNameSpace: ...

240

241

# Struct Operations (when dtype is Struct)

242

@property

243

def struct(self) -> StructNameSpace: ...

244

245

# Categorical Operations (when dtype is Categorical)

246

@property

247

def cat(self) -> CategoricalNameSpace: ...

248

249

# Binary Operations (when dtype is Binary)

250

@property

251

def bin(self) -> BinaryNameSpace: ...

252

253

# Conversion Methods

254

def to_list(self) -> list: ...

255

def to_numpy(self, *, zero_copy_only=False, writable=False) -> np.ndarray: ...

256

def to_arrow(self) -> pa.Array: ...

257

def to_pandas(self, **kwargs): ...

258

def to_frame(self, name=None) -> DataFrame: ...

259

260

# Utility Methods

261

def is_null(self) -> Series: ...

262

def is_not_null(self) -> Series: ...

263

def is_finite(self) -> Series: ...

264

def is_infinite(self) -> Series: ...

265

def is_nan(self) -> Series: ...

266

def is_not_nan(self) -> Series: ...

267

def is_empty(self) -> bool: ...

268

def describe(self, *, percentiles=None, interpolation="nearest") -> DataFrame: ...

269

270

# Properties

271

@property

272

def name(self) -> str: ...

273

@property

274

def dtype(self) -> DataType: ...

275

@property

276

def shape(self) -> tuple[int]: ...

277

@property

278

def flags(self) -> dict[str, bool]: ...

279

```

280

281

### Expr

282

283

Expression builder for column operations, transformations, and aggregations that can be used across DataFrame, LazyFrame, and various contexts for building complex data processing pipelines.

284

285

```python { .api }

286

class Expr:

287

# Aliasing and Naming

288

def alias(self, name: str) -> Expr: ...

289

def name(self) -> ExprNameNameSpace: ...

290

291

# Filtering and Selection

292

def filter(self, predicate) -> Expr: ...

293

def sort(self, *, descending=False, nulls_last=False) -> Expr: ...

294

def sort_by(self, by, *, descending=False, nulls_last=False) -> Expr: ...

295

def unique(self, *, maintain_order=False) -> Expr: ...

296

def slice(self, offset, length=None) -> Expr: ...

297

def head(self, n=5) -> Expr: ...

298

def tail(self, n=5) -> Expr: ...

299

def first(self) -> Expr: ...

300

def last(self) -> Expr: ...

301

def take(self, indices) -> Expr: ...

302

def gather(self, indices) -> Expr: ...

303

304

# Aggregations

305

def sum(self) -> Expr: ...

306

def mean(self) -> Expr: ...

307

def median(self) -> Expr: ...

308

def max(self) -> Expr: ...

309

def min(self) -> Expr: ...

310

def std(self, ddof=1) -> Expr: ...

311

def var(self, ddof=1) -> Expr: ...

312

def count(self) -> Expr: ...

313

def len(self) -> Expr: ...

314

def n_unique(self) -> Expr: ...

315

def null_count(self) -> Expr: ...

316

def quantile(self, quantile, interpolation="nearest") -> Expr: ...

317

318

# Window Functions

319

def over(self, partition_by=None, *, order_by=None, mapping_strategy="group_to_rows") -> Expr: ...

320

def rank(self, method="average", *, descending=False, seed=None) -> Expr: ...

321

def cum_sum(self, *, reverse=False) -> Expr: ...

322

def cum_count(self, *, reverse=False) -> Expr: ...

323

def cum_max(self, *, reverse=False) -> Expr: ...

324

def cum_min(self, *, reverse=False) -> Expr: ...

325

326

# Mathematical Operations

327

def abs(self) -> Expr: ...

328

def sqrt(self) -> Expr: ...

329

def log(self, base=None) -> Expr: ...

330

def log10(self) -> Expr: ...

331

def exp(self) -> Expr: ...

332

def pow(self, exponent) -> Expr: ...

333

def round(self, decimals=0) -> Expr: ...

334

def floor(self) -> Expr: ...

335

def ceil(self) -> Expr: ...

336

337

# Type Operations

338

def cast(self, dtype, *, strict=True) -> Expr: ...

339

def is_null(self) -> Expr: ...

340

def is_not_null(self) -> Expr: ...

341

def is_finite(self) -> Expr: ...

342

def is_infinite(self) -> Expr: ...

343

def is_nan(self) -> Expr: ...

344

def is_not_nan(self) -> Expr: ...

345

def is_duplicated(self) -> Expr: ...

346

def is_unique(self) -> Expr: ...

347

def is_first_distinct(self) -> Expr: ...

348

def is_last_distinct(self) -> Expr: ...

349

350

# Conditional Operations

351

def is_between(self, lower_bound, upper_bound, closed="both") -> Expr: ...

352

def is_in(self, other) -> Expr: ...

353

def when(self, condition) -> When: ...

354

355

# String Operations (when expression evaluates to String)

356

@property

357

def str(self) -> ExprStringNameSpace: ...

358

359

# Datetime Operations (when expression evaluates to temporal type)

360

@property

361

def dt(self) -> ExprDateTimeNameSpace: ...

362

363

# List Operations (when expression evaluates to List)

364

@property

365

def list(self) -> ExprListNameSpace: ...

366

367

# Array Operations (when expression evaluates to Array)

368

@property

369

def arr(self) -> ExprArrayNameSpace: ...

370

371

# Struct Operations (when expression evaluates to Struct)

372

@property

373

def struct(self) -> ExprStructNameSpace: ...

374

375

# Categorical Operations (when expression evaluates to Categorical)

376

@property

377

def cat(self) -> ExprCategoricalNameSpace: ...

378

379

# Binary Operations (when expression evaluates to Binary)

380

@property

381

def bin(self) -> ExprBinaryNameSpace: ...

382

383

# Meta Operations

384

@property

385

def meta(self) -> ExprMetaNameSpace: ...

386

```

387

388

## Usage Examples

389

390

### Basic DataFrame Operations

391

392

```python

393

import polars as pl

394

395

# Create DataFrame

396

df = pl.DataFrame({

397

"product": ["A", "B", "C", "A", "B"],

398

"sales": [100, 200, 150, 80, 250],

399

"region": ["North", "South", "North", "South", "North"]

400

})

401

402

# Chain operations

403

result = (

404

df

405

.filter(pl.col("sales") > 100)

406

.with_columns(

407

pl.col("sales").mul(1.1).alias("sales_with_tax"),

408

pl.col("product").str.to_lowercase().alias("product_lower")

409

)

410

.group_by("region")

411

.agg([

412

pl.col("sales").sum().alias("total_sales"),

413

pl.col("product").count().alias("product_count")

414

])

415

)

416

```

417

418

### Lazy Evaluation with Query Optimization

419

420

```python

421

# Build lazy computation

422

lazy_query = (

423

pl.scan_csv("large_dataset.csv")

424

.filter(pl.col("amount") > 1000)

425

.with_columns(

426

pl.col("date").str.to_date().alias("parsed_date"),

427

pl.col("category").str.to_uppercase()

428

)

429

.group_by(["category", pl.col("parsed_date").dt.month()])

430

.agg([

431

pl.col("amount").sum().alias("monthly_total"),

432

pl.col("transaction_id").count().alias("transaction_count")

433

])

434

.sort("monthly_total", descending=True)

435

)

436

437

# Execute optimized query

438

result = lazy_query.collect()

439

440

# View query plan

441

print(lazy_query.explain(optimized=True))

442

```

443

444

### Advanced Expressions

445

446

```python

447

# Complex expression building

448

complex_expr = (

449

pl.when(pl.col("score") >= 90)

450

.then(pl.lit("A"))

451

.when(pl.col("score") >= 80)

452

.then(pl.lit("B"))

453

.when(pl.col("score") >= 70)

454

.then(pl.lit("C"))

455

.otherwise(pl.lit("F"))

456

.alias("grade")

457

)

458

459

df = df.with_columns(complex_expr)

460

461

# Window functions

462

df = df.with_columns([

463

pl.col("sales").rank().over("region").alias("sales_rank"),

464

pl.col("sales").cum_sum().over("region").alias("running_total")

465

])

466

```