"""Powerful data structures for data analysis, time series, and statistics.

Quality: pending review (does it follow best practices?).
Impact: pending (no eval scenarios have been run).

Functions for combining, reshaping, and transforming data including merging,
concatenation, pivoting, melting, and advanced data restructuring operations.
"""
from typing import Callable, Dict, List, Literal, Union

import pandas as pd
from pandas import concat, melt, merge, pivot_table

# Functions to combine multiple DataFrames or Series through concatenation and merging operations.
def concat(objs, axis=0, join='outer', ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=False, copy=True):
    """Concatenate pandas objects along a particular axis.

    Args:
        objs: Sequence or mapping of Series/DataFrame objects to combine.
        axis: Axis to concatenate along (0='index', 1='columns').
        join: How to handle indexes on the other axis ('inner' or 'outer').
        ignore_index: If True, do not use index values along the concatenation axis.
        keys: Sequence used to construct a hierarchical index on the concatenation axis.
        levels: Specific levels (list of sequences) for constructing a MultiIndex.
        names: Names for the levels of the hierarchical index.
        verify_integrity: If True, check whether the new concatenated axis contains duplicates.
        sort: Sort the non-concatenation axis if it is not already aligned.
        copy: If False, avoid copying data unnecessarily.

    Returns:
        Object of the same type as the objects being concatenated.
    """
def merge(left, right, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, sort=False, suffixes=('_x', '_y'), copy=True, indicator=False, validate=None):
    """Merge DataFrame or named Series objects with a database-style join.

    Args:
        left: Left DataFrame or named Series.
        right: Right DataFrame or named Series.
        how: Join type ('left', 'right', 'outer', 'inner', 'cross').
        on: Column name(s) to join on.
        left_on: Column name(s) from the left side to join on.
        right_on: Column name(s) from the right side to join on.
        left_index: If True, use the left index as the join key.
        right_index: If True, use the right index as the join key.
        sort: If True, sort the join keys lexicographically.
        suffixes: Suffixes applied to overlapping column names.
        copy: If False, avoid copying data when possible.
        indicator: If True (or a str column name), add a column showing each row's source.
        validate: If given, check the uniqueness of the merge keys (e.g. '1:1').

    Returns:
        DataFrame.
    """
def merge_asof(left, right, on=None, left_on=None, right_on=None, left_index=False, right_index=False, by=None, left_by=None, right_by=None, suffixes=('_x', '_y'), tolerance=None, allow_exact_matches=True, direction='backward'):
    """Perform a merge by key distance (an "as-of" merge).

    Args:
        left: Left DataFrame or named Series.
        right: Right DataFrame or named Series.
        on: Column name to merge on; must be sorted.
        left_on: Merge column(s) on the left side.
        right_on: Merge column(s) on the right side.
        left_index: If True, use the left index as the key.
        right_index: If True, use the right index as the key.
        by: Column name(s) to match on before searching by distance.
        left_by: Exact-match column(s) on the left side.
        right_by: Exact-match column(s) on the right side.
        suffixes: Suffixes applied to overlapping column names.
        tolerance: Select the closest key within this distance (int or Timedelta).
        allow_exact_matches: If True, allow matching with the exact same key.
        direction: Search direction ('backward', 'forward', 'nearest').

    Returns:
        DataFrame.
    """
def merge_ordered(left, right, on=None, left_on=None, right_on=None, left_by=None, right_by=None, fill_method=None, suffixes=('_x', '_y'), how='outer'):
    """
    Perform merge with optional filling/interpolation.

    Parameters:
    - left: DataFrame or named Series
    - right: DataFrame or named Series
    - on: label or list, column names to join on
    - left_on: label or list, left-side column names to join on
    - right_on: label or list, right-side column names to join on
    - left_by: column name(s) to group the left side by before merging
    - right_by: column name(s) to group the right side by before merging
    - fill_method: str, interpolation method ('ffill')
    - suffixes: list-like, suffixes applied to overlapping column names
    - how: str, type of merge ('left', 'right', 'outer', 'inner')

    Returns:
    DataFrame
    """

# Functions to reshape data between wide and long formats, create pivot tables,
# and restructure DataFrames.
def pivot(data, index=None, columns=None, values=None):
    """Return a reshaped DataFrame organized by given index/column values.

    Args:
        data: DataFrame to reshape.
        index: Column whose values become the new frame's index.
        columns: Column whose values become the new frame's columns.
        values: Column(s) used to populate the new frame's values.

    Returns:
        DataFrame.
    """
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', fill_value=None, margins=False, dropna=True, margins_name='All', observed=False, sort=True):
    """Create a spreadsheet-style pivot table as a DataFrame.

    Args:
        data: DataFrame to summarize.
        values: Column(s) to aggregate.
        index: Column, Grouper, array, or list of columns for the result's rows.
        columns: Column, Grouper, array, or list of columns for the result's columns.
        aggfunc: Aggregation function(s) to apply ('mean', 'sum', 'count', etc.).
        fill_value: Scalar used to replace missing values in the result.
        margins: If True, add row/column margins (subtotals).
        dropna: If True, do not include columns whose entries are all NaN.
        margins_name: Label of the row/column containing the totals.
        observed: For categorical columns, consider only observed categories.
        sort: If True, sort the result keys.

    Returns:
        DataFrame.
    """
def melt(data, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None, ignore_index=True):
    """Unpivot a DataFrame from wide format to long format.

    Args:
        data: DataFrame to unpivot.
        id_vars: Column(s) kept as identifier variables.
        value_vars: Column(s) to unpivot; defaults to all columns not in id_vars.
        var_name: Name to use for the variable column.
        value_name: Name to use for the value column.
        col_level: Level in the columns to melt (int or str).
        ignore_index: If True, ignore the original index in the result.

    Returns:
        DataFrame.
    """
def wide_to_long(df, stubnames, i, j, sep='', suffix='\\d+'):
    """Pivot a wide table to long (stacked) format.

    Args:
        df: Wide-format DataFrame.
        stubnames: Stub name(s), as a str or list.
        i: Column(s) to use as id variable(s).
        j: Name for the column holding the suffix of the wide variables.
        sep: Separator between stub names and suffix.
        suffix: Regular expression matching the suffix.

    Returns:
        DataFrame.
    """
def lreshape(data, groups, dropna=True):
    """
    Reshape wide-format data to long.

    Parameters:
    - data: DataFrame
    - groups: dict, mapping of column names to group labels
    - dropna: bool, drop rows containing missing values

    Returns:
    DataFrame
    """

# Functions for working with categorical data and creating dummy variables.
def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False, drop_first=False, dtype=None):
    """Convert categorical variable(s) into dummy/indicator variables.

    Args:
        data: Array-like, Series, or DataFrame to encode.
        prefix: String(s) (str, list of str, or dict of str) appended to column names.
        prefix_sep: Separator/delimiter placed between prefix and value.
        dummy_na: If True, add a column indicating NaNs.
        columns: Column names in the DataFrame to encode.
        sparse: If True, return SparseArray-backed columns; otherwise NumPy arrays.
        drop_first: If True, remove the first level of each categorical variable.
        dtype: Data type for the new columns.

    Returns:
        DataFrame.
    """
def from_dummies(data, sep=None, default_category=None):
    """Create a categorical DataFrame from a DataFrame of dummy variables.

    Args:
        data: DataFrame containing the dummy indicators.
        sep: Separator used in the column names of the dummy DataFrame.
        default_category: Default category to use (None, str, or dict of str).

    Returns:
        DataFrame.
    """
def crosstab(index, columns, values=None, rownames=None, colnames=None, aggfunc=None, margins=False, margins_name='All', dropna=True, normalize=False):
    """
    Compute a simple cross-tabulation of two (or more) factors.

    Parameters:
    - index: array-like, values to group by in rows
    - columns: array-like, values to group by in columns
    - values: array-like, array of values to aggregate according to the factors
    - rownames: sequence, names for the row index
    - colnames: sequence, names for the column index
    - aggfunc: function, aggregation function to use
    - margins: bool, add row/column margins (subtotals)
    - margins_name: str, name of the row/column containing totals
    - dropna: bool, do not include columns with all NaN values
    - normalize: bool or str, normalize by dividing all values by their sum

    Returns:
    DataFrame
    """

# Functions to bin continuous data into discrete intervals or quantile-based groups.
def cut(x, bins, right=True, labels=None, retbins=False, precision=3, include_lowest=False, duplicates='raise', ordered=True):
    """Bin values into discrete intervals.

    Args:
        x: Input array to be binned.
        bins: Criteria to bin by (int, sequence of scalars, or IntervalIndex).
        right: If True, bins include their rightmost edge.
        labels: Labels for the returned bins (array or bool).
        retbins: If True, also return the bins.
        precision: Precision at which to store and display bin labels.
        include_lowest: If True, the first interval is left-inclusive.
        duplicates: Behavior when bin edges are not unique ('raise' or 'drop').
        ordered: If True, the returned Categorical is ordered.

    Returns:
        Categorical, Series, or array of intervals.
    """
def qcut(x, q, labels=None, retbins=False, precision=3, duplicates='raise'):
    """
    Quantile-based discretization function.

    Parameters:
    - x: array-like, input array to be binned
    - q: int or list-like of float, number of quantiles or quantile boundaries
    - labels: array or bool, used as labels for resulting bins
    - retbins: bool, return (bins, labels) or not
    - precision: int, precision at which to store and display bin labels
    - duplicates: str, behavior when bin edges are not unique ('raise' or 'drop')

    Returns:
    Categorical, Series, or array of intervals
    """

# Core methods for transforming and manipulating individual DataFrames and Series.
# DataFrame transformation methods (already covered in core-data-structures.md)
# These are methods of DataFrame/Series classes:
# DataFrame.pivot(index=None, columns=None, values=None)
# DataFrame.transpose() / DataFrame.T
# DataFrame.stack(level=-1, dropna=True)
# DataFrame.unstack(level=-1, fill_value=None)
# DataFrame.explode(column, ignore_index=False)
# Series.explode(ignore_index=False)
# Additional utility functions
# NOTE: documents pandas.eval; the name intentionally shadows the builtin here.
def eval(expr, parser='pandas', engine=None, local_dict=None, global_dict=None, resolvers=(), level=0, target=None, inplace=False):
    """Evaluate a Python expression string using pandas objects.

    Args:
        expr: Expression to evaluate, as a string.
        parser: Parser to use ('pandas' or 'python').
        engine: Engine to use ('python' or 'numexpr').
        local_dict: Local variable scope.
        global_dict: Global variable scope.
        resolvers: Additional namespace resolvers.
        level: Number of prior stack frames to traverse.
        target: Object to assign the result to.
        inplace: If True, perform the operation in place.

    Returns:
        ndarray, numeric scalar, DataFrame, or Series.
    """

# MultiIndex creation and manipulation
# MultiIndex construction (classmethods of pandas.MultiIndex). Listed as
# comments because `def MultiIndex.from_arrays(...)` is not valid Python syntax:
# MultiIndex.from_arrays(arrays, sortorder=None, names=None) - create MultiIndex from arrays
# MultiIndex.from_tuples(tuples, sortorder=None, names=None) - create MultiIndex from list of tuples
# MultiIndex.from_product(iterables, sortorder=None, names=None) - create MultiIndex from cartesian product of iterables
# MultiIndex.from_frame(df, sortorder=None, names=None) - create MultiIndex from DataFrame
# These are methods of DataFrames with MultiIndex:
# df.stack() - pivot columns to rows (wide to long)
# df.unstack() - pivot rows to columns (long to wide)
# df.swaplevel() - swap levels in MultiIndex
# df.reorder_levels() - rearrange index levels

# Additional merge validation options
class MergeValidation:
    """String codes accepted by the `validate` argument of merge()."""

    ONE_TO_ONE = '1:1'
    ONE_TO_MANY = '1:m'
    MANY_TO_ONE = 'm:1'
    MANY_TO_MANY = 'm:m'
# Join methods (these are DataFrame methods)
# df.join(other, on=None, how='left', lsuffix='', rsuffix='', sort=False)
# df.combine(other, func, fill_value=None, overwrite=True)
# df.combine_first(other) - combine with other, using non-null values from calling DataFrame
# df.update(other, join='left', overwrite=True, filter_func=None, errors='ignore')

# Merge and join types
MergeHow = Literal['left', 'right', 'outer', 'inner', 'cross']  # merge() `how` values
JoinHow = Literal['left', 'right', 'outer', 'inner']  # join() `how` values

# Pivot aggregation functions
AggFunc = Union[str, Callable, List[Union[str, Callable]], Dict[str, Union[str, Callable]]]

# Binning edge behavior
BinEdge = Literal['left', 'right']

# Categorical ordering
CategoricalOrdered = bool

# Cross-tabulation normalization
NormalizeOptions = Union[bool, Literal['all', 'index', 'columns']]

# Install with the Tessl CLI:
#   npx tessl i tessl/pypi-pandas