0
# Data Manipulation and Reshaping
1
2
Functions for combining, reshaping, and transforming data including merging, concatenation, pivoting, melting, and advanced data restructuring operations.
3
4
## Core Imports
5
6
```python
7
import pandas as pd
8
from pandas import concat, merge, pivot_table, melt
9
```
10
11
## Capabilities
12
13
### Combining Data
14
15
Functions to combine multiple DataFrames or Series through concatenation and merging operations.
16
17
```python { .api }
18
def concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True):
19
"""
20
Concatenate pandas objects along a particular axis.
21
22
Parameters:
23
- objs: sequence or mapping of Series or DataFrame objects
24
- axis: int, axis to concatenate along (0='index', 1='columns')
25
- join: str, how to handle indexes on other axis ('inner'/'outer')
26
- ignore_index: bool, do not use index values along concatenation axis
27
- keys: sequence, construct hierarchical index using passed keys
28
- levels: list of sequences, specific levels to use for constructing MultiIndex
29
- names: list, names for levels in hierarchical index
30
- verify_integrity: bool, check whether new concatenated axis contains duplicates
31
- sort: bool, sort non-concatenation axis if not already aligned
32
33
Returns:
34
object, type of objects being concatenated
35
"""
36
37
def merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
38
"""
39
Merge DataFrame or named Series objects with a database-style join.
40
41
Parameters:
42
- left: DataFrame or named Series
43
- right: DataFrame or named Series
44
- how: str, type of merge ('left', 'right', 'outer', 'inner', 'cross')
45
- on: label or list, column names to join on
46
- left_on: label or list, left DataFrame column names to join on
47
- right_on: label or list, right DataFrame column names to join on
48
- left_index: bool, use left DataFrame index as join key
49
- right_index: bool, use right DataFrame index as join key
50
- sort: bool, sort join keys lexicographically
51
- suffixes: list-like, suffix to apply to overlapping column names
52
- indicator: bool or str, add column to output indicating source of each row
53
- validate: str, validate uniqueness of merge keys
54
55
Returns:
56
DataFrame
57
"""
58
59
def merge_asof(left, right, on=None, left_on=None, right_on=None, left_index=False, right_index=False, by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, allow_exact_matches=True, direction='backward'):
60
"""
61
Perform a merge by key distance.
62
63
Parameters:
64
- left: DataFrame or named Series
65
- right: DataFrame or named Series
66
- on: label, column name to merge on (must be sorted)
67
- by: column name or list of column names, match on these columns before searching
68
- tolerance: int or Timedelta, select closest key within this distance
69
- allow_exact_matches: bool, allow matching with exact same key
70
- direction: str, search direction ('backward', 'forward', 'nearest')
71
72
Returns:
73
DataFrame
74
"""
75
76
def merge_ordered(left, right, on=None, left_on=None, right_on=None, left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y'), how='outer'):
77
"""
78
Perform merge with optional filling/interpolation.
79
80
Parameters:
81
- left: DataFrame or named Series
82
- right: DataFrame or named Series
83
- fill_method: str, interpolation method ('ffill')
84
- how: str, type of merge ('left', 'right', 'outer', 'inner')
85
86
Returns:
87
DataFrame
88
"""
89
```
90
91
### Reshaping Data
92
93
Functions to reshape data between wide and long formats, create pivot tables, and restructure DataFrames.
94
95
```python { .api }
96
def pivot(data, index=None, columns=None, values=None):
97
"""
98
Return reshaped DataFrame organized by given index/column values.
99
100
Parameters:
101
- data: DataFrame
102
- index: column to use to make new frame's index
103
- columns: column to use to make new frame's columns
104
- values: column(s) to use for populating new frame's values
105
106
Returns:
107
DataFrame
108
"""
109
110
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True):
111
"""
112
Create a spreadsheet-style pivot table as a DataFrame.
113
114
Parameters:
115
- data: DataFrame
116
- values: column or list of columns to aggregate
117
- index: column, Grouper, array, list of columns to use as index
118
- columns: column, Grouper, array, list of columns to use as columns
119
- aggfunc: function, str, list of functions, or dict, aggregation(s) to apply ('mean', 'sum', 'count', etc.)
120
- fill_value: scalar, value to replace missing values
121
- margins: bool, add row/column margins (subtotals)
122
- dropna: bool, do not include columns with all NaN values
123
- margins_name: str, name of row/column containing totals
124
- observed: bool, for categorical columns, consider only observed categories
125
126
Returns:
127
DataFrame
128
"""
129
130
def melt(data, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True):
131
"""
132
Unpivot a DataFrame from wide to long format.
133
134
Parameters:
135
- data: DataFrame
136
- id_vars: column(s) to use as identifier variables
137
- value_vars: column(s) to unpivot, defaults to all columns not in id_vars
138
- var_name: str, name to use for variable column
139
- value_name: str, name to use for value column
140
- col_level: int or str, level in columns to melt
141
- ignore_index: bool, ignore index in result
142
143
Returns:
144
DataFrame
145
"""
146
147
def wide_to_long(df, stubnames, i, j, sep='', suffix='\\d+'):
148
"""
149
Pivot a wide table to long (stacked) format.
150
151
Parameters:
152
- df: DataFrame
153
- stubnames: str or list, stub name(s)
154
- i: column(s) to use as id variable(s)
155
- j: str, name of the sub-observation variable (the new column that holds the suffixes in long format)
156
- sep: str, separator between stub names and suffix
157
- suffix: str, regular expression for suffix
158
159
Returns:
160
DataFrame
161
"""
162
163
def lreshape(data, groups, dropna=True):
164
"""
165
Reshape wide-format data to long.
166
167
Parameters:
168
- data: DataFrame
169
- groups: dict, mapping of each new column name to the list of existing columns to stack ({new_name: list_of_columns})
170
- dropna: bool, drop rows containing missing values
171
172
Returns:
173
DataFrame
174
"""
175
```
176
177
### Categorical Data
178
179
Functions for working with categorical data and creating dummy variables.
180
181
```python { .api }
182
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None):
183
"""
184
Convert categorical variable(s) into dummy/indicator variables.
185
186
Parameters:
187
- data: array-like, Series, or DataFrame
188
- prefix: str, list of str, or dict of str, string(s) to prepend to the encoded column names
189
- prefix_sep: str, separator/delimiter to use when adding prefix
190
- dummy_na: bool, add column to indicate NaNs
191
- columns: list-like, column names in DataFrame to encode
192
- sparse: bool, return SparseArray (True) or NumPy array (False)
193
- drop_first: bool, remove first level of categorical variable
194
- dtype: dtype, data type for new columns
195
196
Returns:
197
DataFrame
198
"""
199
200
def from_dummies(data, sep=None, default_category=None):
201
"""
202
Create a categorical DataFrame from a DataFrame of dummy variables.
203
204
Parameters:
205
- data: DataFrame, dummy-coded data (integer or boolean columns of 1s and 0s) to convert back to categories
206
- sep: str, separator used in column names of dummy DataFrame
207
- default_category: None, str, or dict of str, name of the category implied when all dummies in a row are zero
208
209
Returns:
210
DataFrame
211
"""
212
213
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False):
214
"""
215
Compute a simple cross-tabulation of two (or more) factors.
216
217
Parameters:
218
- index: array-like, values to group by in rows
219
- columns: array-like, values to group by in columns
220
- values: array-like, array of values to aggregate according to factors
221
- rownames: sequence, names for row index
222
- colnames: sequence, names for column index
223
- aggfunc: function, aggregation function to use
224
- margins: bool, add row/column margins (subtotals)
225
- dropna: bool, do not include columns with all NaN values
226
- normalize: bool or {'all', 'index', 'columns'}, normalize by dividing values by the overall, row, or column sum
227
228
Returns:
229
DataFrame
230
"""
231
```
232
233
### Binning and Discretization
234
235
Functions to bin continuous data into discrete intervals or quantile-based groups.
236
237
```python { .api }
238
def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True):
239
"""
240
Bin values into discrete intervals.
241
242
Parameters:
243
- x: array-like, input array to be binned
244
- bins: int, sequence of scalars, or IntervalIndex, criteria to bin by
245
- right: bool, indicates whether bins include rightmost edge
246
- labels: array or bool, specifies labels for returned bins
247
- retbins: bool, return bins or not
248
- precision: int, precision at which to store and display bins labels
249
- include_lowest: bool, whether first interval should be left-inclusive
250
- duplicates: str, behavior when bin edges are not unique ('raise' or 'drop')
251
- ordered: bool, whether returned Categorical will be ordered
252
253
Returns:
254
Categorical, Series, or array of intervals
255
"""
256
257
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
258
"""
259
Quantile-based discretization function.
260
261
Parameters:
262
- x: array-like, input array to be binned
263
- q: int or list-like of float, number of quantiles or quantile boundaries
264
- labels: array or bool, used as labels for resulting bins
265
- retbins: bool, whether to also return the computed bin edges
266
- precision: int, precision at which to store and display bins labels
267
- duplicates: str, behavior when bin edges are not unique ('raise' or 'drop')
268
269
Returns:
270
Categorical, Series, or array of intervals
271
"""
272
```
273
274
### DataFrame and Series Transformation
275
276
Core methods for transforming and manipulating individual DataFrames and Series.
277
278
```python { .api }
279
# DataFrame transformation methods (already covered in core-data-structures.md)
280
# These are methods of DataFrame/Series classes:
281
282
# DataFrame.pivot(index=None, columns=None, values=None)
283
# DataFrame.transpose() / DataFrame.T
284
# DataFrame.stack(level=-1, dropna=True)
285
# DataFrame.unstack(level=-1, fill_value=None)
286
# DataFrame.explode(column, ignore_index=False)
287
# Series.explode(ignore_index=False)
288
289
# Additional utility functions
290
def eval(expr, parser='pandas', engine=None, local_dict=None, global_dict=None, resolvers=(), level=0, target=None, inplace=False):
291
"""
292
Evaluate a Python expression as a string using pandas objects.
293
294
Parameters:
295
- expr: str, expression to evaluate
296
- parser: str, parser to use ('pandas' or 'python')
297
- engine: str, engine to use ('python', 'numexpr')
298
- local_dict: dict, local variable scope
299
- global_dict: dict, global variable scope
300
- level: int, number of prior stack frames to traverse
301
- target: object, assign result to this variable
302
- inplace: bool, perform operation in-place
303
304
Returns:
305
ndarray, numeric scalar, DataFrame, Series
306
"""
307
```
308
309
## Advanced Reshaping Patterns
310
311
### MultiIndex Operations
312
313
```python { .api }
314
# MultiIndex creation and manipulation
315
def MultiIndex.from_arrays(arrays, sortorder=None, names=None):
316
"""Create MultiIndex from arrays."""
317
318
def MultiIndex.from_tuples(tuples, sortorder=None, names=None):
319
"""Create MultiIndex from list of tuples."""
320
321
def MultiIndex.from_product(iterables, sortorder=None, names=None):
322
"""Create MultiIndex from cartesian product of iterables."""
323
324
def MultiIndex.from_frame(df, sortorder=None, names=None):
325
"""Create MultiIndex from DataFrame."""
326
327
# These are methods of DataFrames with MultiIndex:
328
# df.stack() - pivot columns to rows (wide to long)
329
# df.unstack() - pivot rows to columns (long to wide)
330
# df.swaplevel() - swap levels in MultiIndex
331
# df.reorder_levels() - rearrange index levels
332
```
333
334
### Advanced Merging
335
336
```python { .api }
337
# Values accepted by the `validate` parameter of merge()
338
class MergeValidation:
339
ONE_TO_ONE = '1:1'
340
ONE_TO_MANY = '1:m'
341
MANY_TO_ONE = 'm:1'
342
MANY_TO_MANY = 'm:m'
343
344
# Join methods (these are DataFrame methods)
345
# df.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
346
# df.combine(other, func, fill_value=None, overwrite=True)
347
# df.combine_first(other) - fill null values in the calling DataFrame with values from other
348
# df.update(other, join='left', overwrite=True, filter_func=None, errors='ignore')
349
```
350
351
## Types
352
353
```python { .api }
354
# Merge and join types
355
MergeHow = Literal['left', 'right', 'outer', 'inner', 'cross']
356
JoinHow = Literal['left', 'right', 'outer', 'inner']
357
358
# Pivot aggregation functions
359
AggFunc = Union[str, Callable, List[Union[str, Callable]], Dict[str, Union[str, Callable]]]
360
361
# Binning edge behavior
362
BinEdge = Literal['left', 'right']
363
364
# Categorical ordering
365
CategoricalOrdered = bool
366
367
# Cross-tabulation normalization
368
NormalizeOptions = Union[bool, Literal['all', 'index', 'columns']]
369
```