# Pandas API on Spark
A pandas-compatible API for working with distributed datasets. It scales existing pandas workflows to data too large for a single machine while preserving the familiar pandas interface and semantics.
## Capabilities
### DataFrame Operations
Core DataFrame functionality with a pandas-compatible interface.
```python { .api }
class DataFrame:
    """Pandas-compatible DataFrame on Spark."""

    def head(self, n=5):
        """
        Return first n rows.

        Parameters:
        - n (int): Number of rows

        Returns:
        DataFrame with first n rows
        """

    def tail(self, n=5):
        """
        Return last n rows.

        Parameters:
        - n (int): Number of rows

        Returns:
        DataFrame with last n rows
        """

    def describe(self, percentiles=None, include=None, exclude=None):
        """
        Generate descriptive statistics.

        Parameters:
        - percentiles (list): Percentiles to include
        - include: Data types to include
        - exclude: Data types to exclude

        Returns:
        DataFrame with statistics
        """

    def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, show_counts=None):
        """Print DataFrame info."""

    def count(self):
        """Count non-null values."""

    def sum(self, axis=None, skipna=True, level=None, numeric_only=None, min_count=0):
        """Sum values."""

    def mean(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Calculate mean."""

    def median(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Calculate median."""

    def std(self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
        """Calculate standard deviation."""

    def var(self, axis=None, skipna=True, level=None, ddof=1, numeric_only=None):
        """Calculate variance."""

    def min(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Return minimum values."""

    def max(self, axis=None, skipna=True, level=None, numeric_only=None):
        """Return maximum values."""

    def groupby(self, by=None, axis=0, level=None, as_index=True, sort=True, group_keys=True, squeeze=False, observed=False, dropna=True):
        """
        Group DataFrame by columns.

        Parameters:
        - by: Columns to group by
        - axis (int): Axis to group by
        - level: Level for MultiIndex
        - as_index (bool): Whether to use group keys as index
        - sort (bool): Sort group keys
        - group_keys (bool): Add group keys to index
        - squeeze (bool): Reduce dimensionality
        - observed (bool): Only show observed values for categorical groupers
        - dropna (bool): Drop NA values from groups

        Returns:
        GroupBy object
        """

    def merge(self, right, how='inner', on=None, left_on=None, right_on=None,
              left_index=False, right_index=False, sort=False, suffixes=('_x', '_y')):
        """
        Merge DataFrames.

        Parameters:
        - right (DataFrame): DataFrame to merge with
        - how (str): Type of merge ('left', 'right', 'outer', 'inner')
        - on: Column names to join on
        - left_on: Left DataFrame column names
        - right_on: Right DataFrame column names
        - left_index (bool): Use left index as join key
        - right_index (bool): Use right index as join key
        - sort (bool): Sort join keys
        - suffixes (tuple): Suffixes for overlapping column names

        Returns:
        Merged DataFrame
        """

    def join(self, other, on=None, how='left', lsuffix='', rsuffix='', sort=False):
        """Join DataFrames."""

    def drop(self, labels=None, axis=0, index=None, columns=None, level=None,
             inplace=False, errors='raise'):
        """Drop specified labels."""

    def dropna(self, axis=0, how='any', thresh=None, subset=None, inplace=False):
        """Remove missing values."""

    def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None, downcast=None):
        """Fill missing values."""

    def sort_values(self, by, axis=0, ascending=True, inplace=False, kind='quicksort', na_position='last'):
        """Sort by values."""

    def sort_index(self, axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True):
        """Sort by index."""


class Series:
    """Pandas-compatible Series on Spark."""

    def head(self, n=5):
        """Return first n elements."""

    def tail(self, n=5):
        """Return last n elements."""

    def describe(self, percentiles=None, include=None, exclude=None):
        """Generate descriptive statistics."""

    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
        """Return counts of unique values."""

    def unique(self):
        """Return unique values."""

    def nunique(self, dropna=True):
        """Return the number of unique values."""

    def drop_duplicates(self, keep='first', inplace=False):
        """Remove duplicate values."""
```
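
A minimal usage sketch of the operations above. The data and column names are illustrative only, not part of the API; `pyspark.pandas` creates or reuses a SparkSession implicitly.

```python
import pyspark.pandas as ps

# Illustrative data; construction mirrors pandas.
df = ps.DataFrame({
    "region": ["east", "west", "east", "west"],
    "units":  [10, 3, 7, None],
    "price":  [2.5, 4.0, 3.0, 4.5],
})

df.head(2)            # first two rows, as in pandas
df.describe()         # count, mean, std, min, percentiles, max
df.fillna(0).sort_values(by="units", ascending=False)

# Grouping and merging follow pandas semantics.
per_region = df.groupby("region").sum()
labels = ps.DataFrame({"region": ["east", "west"], "label": ["E", "W"]})
df.merge(labels, on="region", how="left")
```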
### Data I/O Functions
Functions for reading and writing data with a pandas-compatible interface.
```python { .api }
def read_csv(path, sep=',', header='infer', names=None, index_col=None,
             usecols=None, squeeze=False, prefix=None, mangle_dupe_cols=True,
             dtype=None, engine=None, converters=None, true_values=None,
             false_values=None, skipinitialspace=False, skiprows=None,
             skipfooter=0, nrows=None, na_values=None, keep_default_na=True,
             na_filter=True, verbose=False, skip_blank_lines=True,
             parse_dates=False, infer_datetime_format=False, keep_date_col=False,
             date_parser=None, dayfirst=False, cache_dates=True, iterator=False,
             chunksize=None, compression='infer', thousands=None, decimal='.',
             lineterminator=None, quotechar='"', quoting=0, doublequote=True,
             escapechar=None, comment=None, encoding=None, dialect=None,
             error_bad_lines=True, warn_bad_lines=True, delim_whitespace=False,
             low_memory=True, memory_map=False, float_precision=None):
    """
    Read CSV file into DataFrame.

    Parameters:
    - path (str): File path
    - sep (str): Column separator
    - header: Row to use as column names
    - names (list): Column names
    - index_col: Column to use as row labels
    - usecols: Columns to read
    - dtype: Data type specification
    - parse_dates: Parse date columns
    - na_values: Additional strings to recognize as NA

    Returns:
    DataFrame
    """

def read_parquet(path, engine='auto', columns=None, **kwargs):
    """
    Read Parquet file into DataFrame.

    Parameters:
    - path (str): File path
    - engine (str): Parquet library to use
    - columns (list): Columns to read

    Returns:
    DataFrame
    """

def read_json(path_or_buf=None, orient=None, typ='frame', dtype=None,
              convert_axes=None, convert_dates=True, keep_default_dates=True,
              numpy=False, precise_float=False, date_unit=None, encoding=None,
              lines=False, chunksize=None, compression='infer'):
    """Read JSON file into DataFrame."""

def read_excel(io, sheet_name=0, header=0, names=None, index_col=None,
               usecols=None, squeeze=False, dtype=None, engine=None,
               converters=None, true_values=None, false_values=None,
               skiprows=None, nrows=None, na_values=None, keep_default_na=True,
               na_filter=True, verbose=False, parse_dates=False,
               date_parser=None, thousands=None, comment=None, skipfooter=0,
               convert_float=True, mangle_dupe_cols=True):
    """Read Excel file into DataFrame."""
```
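
A brief sketch of the readers in use. The file paths are placeholders, and the keyword arguments shown are drawn from the signatures listed above.

```python
import pyspark.pandas as ps

# Placeholder paths; local, HDFS, and object-store URIs are all read by Spark.
events = ps.read_csv("/data/events.csv", sep=",", usecols=["user_id", "ts", "value"])
parq   = ps.read_parquet("/data/events.parquet", columns=["user_id", "ts"])
jsonl  = ps.read_json("/data/events.jsonl", lines=True)   # one JSON object per line
sheet  = ps.read_excel("/data/report.xlsx", sheet_name=0, header=0)
```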
### Utility Functions
```python { .api }
def concat(objs, axis=0, join='outer', ignore_index=False, keys=None,
           levels=None, names=None, verify_integrity=False, sort=False, copy=True):
    """
    Concatenate pandas objects.

    Parameters:
    - objs: Objects to concatenate
    - axis (int): Axis to concatenate along
    - join (str): How to handle indexes ('inner' or 'outer')
    - ignore_index (bool): Ignore index values
    - keys: Construct hierarchical index
    - sort (bool): Sort non-concatenation axis

    Returns:
    Concatenated object
    """

def melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value',
         col_level=None, ignore_index=True):
    """
    Unpivot DataFrame from wide to long format.

    Parameters:
    - frame (DataFrame): DataFrame to melt
    - id_vars: Columns to use as identifier variables
    - value_vars: Columns to unpivot
    - var_name (str): Name for variable column
    - value_name (str): Name for value column

    Returns:
    Melted DataFrame
    """

def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False,
                columns=None, sparse=False, drop_first=False, dtype=None):
    """
    Convert categorical variables to dummy/indicator variables.

    Parameters:
    - data: Input data
    - prefix: String to append to column names
    - prefix_sep (str): Separator between prefix and category
    - dummy_na (bool): Include column for NAs
    - columns: Columns to encode
    - drop_first (bool): Drop first category to avoid collinearity
    - dtype: Data type for new columns

    Returns:
    DataFrame with dummy variables
    """

def to_datetime(arg, errors='raise', dayfirst=False, yearfirst=False,
                utc=None, format=None, exact=True, unit=None,
                infer_datetime_format=False, origin='unix', cache=True):
    """
    Convert argument to datetime.

    Parameters:
    - arg: Object to convert
    - errors (str): Error handling ('raise', 'coerce', 'ignore')
    - format (str): strftime format
    - unit (str): Unit of numeric values

    Returns:
    Datetime object
    """

def date_range(start=None, end=None, periods=None, freq=None, tz=None,
               normalize=False, name=None, closed=None, **kwargs):
    """
    Generate range of dates.

    Parameters:
    - start: Start date
    - end: End date
    - periods (int): Number of periods
    - freq (str): Frequency string
    - tz: Time zone
    - normalize (bool): Normalize to midnight
    - name (str): Name for index

    Returns:
    DatetimeIndex
    """

def from_pandas(pdf):
    """
    Create a pandas-on-Spark DataFrame from a pandas DataFrame.

    Parameters:
    - pdf (pandas.DataFrame): pandas DataFrame

    Returns:
    pyspark.pandas.DataFrame
    """

def sql(query, **kwargs):
    """
    Execute SQL query on pandas-on-Spark objects.

    Parameters:
    - query (str): SQL query string

    Returns:
    Query result as DataFrame
    """
```
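
A sketch of the utility functions together. Data is illustrative; note that the `{name}` placeholder style accepted by `sql` has varied across Spark releases, so treat that line as version-dependent.

```python
import pandas as pd
import pyspark.pandas as ps

wide = ps.DataFrame({"id": [1, 2], "q1": [10, 20], "q2": [30, 40]})

# Wide -> long, then one-hot encode the new categorical column.
long_df = ps.melt(wide, id_vars=["id"], value_vars=["q1", "q2"],
                  var_name="quarter", value_name="sales")
encoded = ps.get_dummies(long_df, columns=["quarter"], drop_first=True)

# Stack two frames and parse strings into timestamps.
stacked = ps.concat([wide, wide], ignore_index=True)
ts = ps.to_datetime(ps.Series(["2024-01-01", "2024-02-01"]), format="%Y-%m-%d")
idx = ps.date_range(start="2024-01-01", periods=3, freq="D")

# Round-trip from pandas, then query with SQL.
psdf = ps.from_pandas(pd.DataFrame({"x": [1, 2, 3]}))
ps.sql("SELECT x FROM {psdf} WHERE x > 1", psdf=psdf)
```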
### Configuration
```python { .api }
def get_option(pat):
    """
    Get configuration option.

    Parameters:
    - pat (str): Option pattern

    Returns:
    Option value
    """

def set_option(pat, value):
    """
    Set configuration option.

    Parameters:
    - pat (str): Option pattern
    - value: Option value
    """

def reset_option(pat):
    """
    Reset configuration option to default.

    Parameters:
    - pat (str): Option pattern
    """

def option_context(*args):
    """
    Context manager for temporarily setting options.

    Parameters:
    - args: Option-value pairs

    Returns:
    Context manager
    """

class options:
    """Options configuration object."""
```
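
A short sketch of the configuration helpers. `display.max_rows` and `compute.ops_on_diff_frames` are existing pandas-on-Spark option names, used here for illustration.

```python
import pyspark.pandas as ps

ps.set_option("display.max_rows", 50)
assert ps.get_option("display.max_rows") == 50
ps.reset_option("display.max_rows")

# option_context takes alternating option/value pairs and restores
# the previous values on exit.
with ps.option_context("compute.ops_on_diff_frames", True):
    pass  # operations combining differently-indexed frames allowed here

# The options object offers equivalent attribute-style access.
ps.options.display.max_rows = 50
```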
## Types
```python { .api }
class Index:
    """Index for pandas objects."""

    def to_pandas(self):
        """Convert to pandas Index."""


class MultiIndex(Index):
    """Multi-level index."""

    @classmethod
    def from_tuples(cls, tuples, sortorder=None, names=None):
        """Create MultiIndex from tuples."""

    @classmethod
    def from_arrays(cls, arrays, sortorder=None, names=None):
        """Create MultiIndex from arrays."""


class DatetimeIndex(Index):
    """Index for datetime data."""

    def strftime(self, date_format):
        """Format datetime as strings."""


class CategoricalIndex(Index):
    """Index for categorical data."""

    @property
    def categories(self):
        """Categories of the index."""


class NamedAgg:
    """Named aggregation for groupby operations."""

    def __init__(self, column, aggfunc):
        """
        Create named aggregation.

        Parameters:
        - column (str): Column name
        - aggfunc: Aggregation function
        """
```
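
A sketch of the index types and `NamedAgg` in use; the data is illustrative only.

```python
import pyspark.pandas as ps

# Build a two-level index and materialize it as a pandas object.
mi = ps.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)],
                               names=["key", "num"])
mi.to_pandas()

# date_range yields a DatetimeIndex, which supports strftime formatting.
dti = ps.date_range("2024-01-01", periods=2, freq="D")
dti.strftime("%Y-%m-%d")

# NamedAgg pairs a source column with an aggregation function,
# giving the result column an explicit name in groupby.agg.
df = ps.DataFrame({"grp": ["x", "x", "y"], "val": [1, 2, 3]})
df.groupby("grp").agg(total=ps.NamedAgg(column="val", aggfunc="sum"))
```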