# Pandas API on Spark

Pandas-compatible API for running familiar pandas operations on distributed datasets. It lets existing pandas workflows scale to large data while keeping the pandas interface and behavior.
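
A minimal quick-start sketch, assuming `pyspark.pandas` as the module implementing this API and an active Spark session:

```python
import pyspark.pandas as ps

# A pandas-on-Spark DataFrame: pandas syntax, Spark execution
df = ps.DataFrame({"x": [1, 2, 3], "y": [4.0, 5.0, 6.0]})
print(df.head(2))
print(df["x"].mean())
```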

## Capabilities

### DataFrame Operations

Core DataFrame functionality with a pandas-compatible interface.

```python { .api }
class DataFrame:
    """Pandas-compatible DataFrame on Spark."""

    def head(self, n=5):
        """
        Return the first n rows.

        Parameters:
        - n (int): Number of rows

        Returns:
        DataFrame with the first n rows
        """

    def tail(self, n=5):
        """
        Return the last n rows.

        Parameters:
        - n (int): Number of rows

        Returns:
        DataFrame with the last n rows
        """

    def describe(self, percentiles=None, include=None, exclude=None):
        """
        Generate descriptive statistics.

        Parameters:
        - percentiles (list): Percentiles to include
        - include: Data types to include
        - exclude: Data types to exclude

        Returns:
        DataFrame with statistics
        """

    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, show_counts=None):
        """Print a concise summary of the DataFrame."""

    def count(self):
        """Count non-null values."""

    def sum(self, axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
        """Sum values."""

    def mean(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Calculate the mean."""

    def median(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Calculate the median."""

    def std(self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
        """Calculate the standard deviation."""

    def var(self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
        """Calculate the variance."""

    def min(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Return minimum values."""

    def max(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Return maximum values."""

    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True,
                group_keys=True, squeeze=False, observed=False, dropna=True):
        """
        Group DataFrame by columns.

        Parameters:
        - by: Columns to group by
        - axis (int): Axis to group along
        - level: Level(s) for a MultiIndex
        - as_index (bool): Use group keys as the index
        - sort (bool): Sort group keys
        - group_keys (bool): Add group keys to the index
        - squeeze (bool): Reduce dimensionality where possible
        - observed (bool): Show only observed values for categorical groupers
        - dropna (bool): Drop NA values from group keys

        Returns:
        GroupBy object
        """

    def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=False, suffixes=('_x', '_y')):
        """
        Merge DataFrames.

        Parameters:
        - right (DataFrame): DataFrame to merge with
        - how (str): Type of merge ('left', 'right', 'outer', 'inner')
        - on: Column names to join on
        - left_on: Left DataFrame column names
        - right_on: Right DataFrame column names
        - left_index (bool): Use the left index as the join key
        - right_index (bool): Use the right index as the join key
        - sort (bool): Sort the join keys
        - suffixes (tuple): Suffixes for overlapping column names

        Returns:
        Merged DataFrame
        """

    def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False):
        """Join DataFrames."""

    def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
             inplace=False, errors='raise'):
        """Drop specified labels."""

    def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
        """Remove missing values."""

    def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None):
        """Fill missing values."""

    def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'):
        """Sort by values."""

    def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                   kind='quicksort', na_position='last', sort_remaining=True):
        """Sort by index."""

class Series:
    """Pandas-compatible Series on Spark."""

    def head(self, n=5):
        """Return the first n elements."""

    def tail(self, n=5):
        """Return the last n elements."""

    def describe(self, percentiles=None, include=None, exclude=None):
        """Generate descriptive statistics."""

    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
        """Return counts of unique values."""

    def unique(self):
        """Return unique values."""

    def nunique(self, dropna=True):
        """Return the number of unique values."""

    def drop_duplicates(self, keep='first', inplace=False):
        """Remove duplicate values."""
```
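
A brief usage sketch of the operations above (column names are illustrative; `pyspark.pandas` is assumed as the implementation):

```python
import pyspark.pandas as ps

# Construct a small pandas-on-Spark DataFrame
df = ps.DataFrame({"dept": ["a", "a", "b"], "salary": [10, 20, 30]})

df.head(2)                    # first two rows
df.describe()                 # summary statistics
df.groupby("dept").mean()     # per-group means

other = ps.DataFrame({"dept": ["a", "b"], "region": ["us", "eu"]})
df.merge(other, on="dept", how="left")   # pandas-style merge, executed on Spark
```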

### Data I/O Functions

Functions for reading and writing data through a pandas-compatible interface.

```python { .api }
def read_csv(path, sep=',', header='infer', names=None, index_col=None,
             usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True,
             dtype=None, engine=None, converters=None, true_values=None,
             false_values=None, skipinitialspace=False, skiprows=None,
             skipfooter=0, nrows=None, na_values=None, keep_default_na=True,
             na_filter=True, verbose=False, skip_blank_lines=True,
             parse_dates=False, infer_datetime_format=False, keep_date_col=False,
             date_parser=None, dayfirst=False, cache_dates=True, iterator=False,
             chunksize=None, compression='infer', thousands=None, decimal='.',
             lineterminator=None, quotechar='"', quoting=0, doublequote=True,
             escapechar=None, comment=None, encoding=None, dialect=None,
             error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False,
             low_memory=True, memory_map=False, float_precision=None):
    """
    Read a CSV file into a DataFrame.

    Parameters:
    - path (str): File path
    - sep (str): Column separator
    - header: Row to use as column names
    - names (list): Column names
    - index_col: Column to use as row labels
    - usecols: Columns to read
    - dtype: Data type specification
    - parse_dates: Parse date columns
    - na_values: Additional strings to recognize as NA

    Returns:
    DataFrame
    """

def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Read a Parquet file into a DataFrame.

    Parameters:
    - path (str): File path
    - engine (str): Parquet library to use
    - columns (list): Columns to read

    Returns:
    DataFrame
    """

def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,
              convert_axes=None, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """Read a JSON file into a DataFrame."""

def read_excel(io, sheet_name=0, header=0, names=None, index_col=None,
               usecols=None, squeeze=False, dtype=None, engine=None,
               converters=None, true_values=None, false_values=None,
               skiprows=None, nrows=None, na_values=None, keep_default_na=True,
               na_filter=True, verbose=False, parse_dates=False,
               date_parser=None, thousands=None, comment=None, skipfooter=0,
               convert_float=True, mangle_dupe_cols=True):
    """Read an Excel file into a DataFrame."""
```
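
A hedged I/O example; the paths are placeholders, and any Spark-accessible location (local, HDFS, S3) should work:

```python
import pyspark.pandas as ps

# Hypothetical paths for illustration only
df = ps.read_csv("/data/events.csv", sep=",", parse_dates=["ts"])
cols = ps.read_parquet("/data/events.parquet", columns=["user_id", "ts"])
logs = ps.read_json("/data/logs.json", lines=True)   # newline-delimited JSON
```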

### Utility Functions

Top-level helpers for combining, reshaping, and converting data, plus interoperability with plain pandas.

```python { .api }
def concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
           levels=None, names=None, verify_integrity=False, sort=False, copy=True):
    """
    Concatenate pandas objects.

    Parameters:
    - objs: Objects to concatenate
    - axis (int): Axis to concatenate along
    - join (str): How to handle indexes ('inner' or 'outer')
    - ignore_index (bool): Ignore index values
    - keys: Construct a hierarchical index
    - sort (bool): Sort the non-concatenation axis

    Returns:
    Concatenated object
    """

def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value',
         col_level=None, ignore_index=True):
    """
    Unpivot a DataFrame from wide to long format.

    Parameters:
    - frame (DataFrame): DataFrame to melt
    - id_vars: Columns to use as identifier variables
    - value_vars: Columns to unpivot
    - var_name (str): Name for the variable column
    - value_name (str): Name for the value column

    Returns:
    Melted DataFrame
    """

def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False, dtype=None):
    """
    Convert categorical variables to dummy/indicator variables.

    Parameters:
    - data: Input data
    - prefix: String to prepend to column names
    - prefix_sep (str): Separator between prefix and category
    - dummy_na (bool): Include a column for NAs
    - columns: Columns to encode
    - drop_first (bool): Drop the first category to avoid collinearity
    - dtype: Data type for the new columns

    Returns:
    DataFrame with dummy variables
    """

def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                utc=None, format=None, exact=True, unit=None,
                infer_datetime_format=False, origin='unix', cache=True):
    """
    Convert an argument to datetime.

    Parameters:
    - arg: Object to convert
    - errors (str): Error handling ('raise', 'coerce', 'ignore')
    - format (str): strftime format
    - unit (str): Unit of numeric values

    Returns:
    Datetime object
    """

def date_range(start=None, end=None, periods=None, freq=None, tz=None,
               normalize=False, name=None, closed=None, **kwargs):
    """
    Generate a range of dates.

    Parameters:
    - start: Start date
    - end: End date
    - periods (int): Number of periods
    - freq (str): Frequency string
    - tz: Time zone
    - normalize (bool): Normalize start/end to midnight
    - name (str): Name for the resulting index

    Returns:
    DatetimeIndex
    """

def from_pandas(pdf):
    """
    Create a pandas-on-Spark DataFrame from a pandas DataFrame.

    Parameters:
    - pdf (pandas.DataFrame): pandas DataFrame

    Returns:
    pyspark.pandas.DataFrame
    """

def sql(query, **kwargs):
    """
    Execute a SQL query against pandas-on-Spark objects.

    Parameters:
    - query (str): SQL query string

    Returns:
    Query result as a DataFrame
    """
```
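
A short sketch of the reshaping helpers, using small illustrative frames:

```python
import pandas as pd
import pyspark.pandas as ps

pdf = pd.DataFrame({"id": [1, 2], "q1": [10, 20], "q2": [30, 40]})
df = ps.from_pandas(pdf)                 # distribute an existing pandas DataFrame

# Wide -> long, then stack two copies
long = ps.melt(df, id_vars=["id"], value_vars=["q1", "q2"], var_name="quarter")
both = ps.concat([long, long], ignore_index=True)

dates = ps.date_range(start="2024-01-01", periods=4, freq="D")
```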

### Configuration

Getters, setters, and a context manager for pandas-on-Spark configuration options.

```python { .api }
def get_option(pat):
    """
    Get a configuration option.

    Parameters:
    - pat (str): Option pattern

    Returns:
    Option value
    """

def set_option(pat, value):
    """
    Set a configuration option.

    Parameters:
    - pat (str): Option pattern
    - value: Option value
    """

def reset_option(pat):
    """
    Reset a configuration option to its default.

    Parameters:
    - pat (str): Option pattern
    """

def option_context(*args):
    """
    Context manager for temporarily setting options.

    Parameters:
    - args: Alternating option-name and value pairs

    Returns:
    Context manager
    """

class options:
    """Options configuration object."""
```
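
A short sketch of option handling; the option names (`display.max_rows`, `compute.ops_on_diff_frames`) are the `pyspark.pandas` ones and are illustrative here:

```python
import pyspark.pandas as ps

ps.set_option("display.max_rows", 50)        # cap rows shown in repr
current = ps.get_option("display.max_rows")

# Temporarily allow operations across different DataFrames
with ps.option_context("compute.ops_on_diff_frames", True):
    combined = ps.DataFrame({"a": [1]}) + ps.DataFrame({"a": [2]})

ps.reset_option("display.max_rows")
```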

## Types

```python { .api }
class Index:
    """Index for pandas objects."""

    def to_pandas(self):
        """Convert to a pandas Index."""

class MultiIndex(Index):
    """Multi-level index."""

    @classmethod
    def from_tuples(cls, tuples, sortorder=None, names=None):
        """Create a MultiIndex from tuples."""

    @classmethod
    def from_arrays(cls, arrays, sortorder=None, names=None):
        """Create a MultiIndex from arrays."""

class DatetimeIndex(Index):
    """Index for datetime data."""

    def strftime(self, date_format):
        """Format datetimes as strings."""

class CategoricalIndex(Index):
    """Index for categorical data."""

    @property
    def categories(self):
        """Categories of the index."""

class NamedAgg:
    """Named aggregation for groupby operations."""

    def __init__(self, column, aggfunc):
        """
        Create a named aggregation.

        Parameters:
        - column (str): Column name
        - aggfunc: Aggregation function
        """
```
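
A closing sketch of these types in use; the named-aggregation keyword form mirrors pandas and is assumed to be supported here:

```python
import pyspark.pandas as ps

# Build a two-level index from tuples
mi = ps.MultiIndex.from_tuples([("a", 1), ("b", 2)], names=["key", "num"])

df = ps.DataFrame({"dept": ["a", "a", "b"], "salary": [10, 20, 30]})
# Named aggregation: output column name -> (input column, aggregation function)
summary = df.groupby("dept").agg(total=ps.NamedAgg(column="salary", aggfunc="sum"))
```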