Powerful data structures for data analysis, time series, and statistics
—
Quality
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Extension data types, missing data handling, and type conversion utilities including nullable integer/boolean types, categorical data, and advanced missing value operations.
import pandas as pd
from pandas import isna, notna, Categorical, NA

Functions to detect and handle missing values in pandas data structures.
def isna(obj):
    """
    Detect missing values for an array-like object.

    Parameters:
    - obj: scalar or array-like, object to check for null or missing values

    Returns:
    bool or array-like of bool, boolean mask indicating missing values
    """
def isnull(obj):
    """
    Detect missing values for an array-like object.

    Alias for isna().

    Parameters:
    - obj: scalar or array-like, object to check for null or missing values

    Returns:
    bool or array-like of bool, boolean mask indicating missing values
    """
def notna(obj):
    """
    Detect existing (non-missing) values.

    Parameters:
    - obj: scalar or array-like, object to check for non-null values

    Returns:
    bool or array-like of bool, boolean mask indicating non-missing values
    """
def notnull(obj):
"""
Detect existing (non-missing) values.
Alias for notna().
Parameters:
- obj: scalar or array-like, object to check for non-null values
Returns:
bool or array-like of bool, boolean mask indicating non-missing values
"""

Categorical data type for efficient storage and computation of repetitive data.
class Categorical:
    # NOTE(review): the `inplace` parameters below appear to match pandas 1.x;
    # pandas 2.0 is documented as having removed `inplace` from these methods.
    # Confirm which pandas version this reference targets.

    def __init__(self, values, categories=None, ordered=None, dtype=None, fastpath=False):
        """
        Represent a categorical variable in classic R / S-plus fashion.

        Parameters:
        - values: list-like, values for the categorical
        - categories: Index-like, unique categories for this categorical
        - ordered: bool, whether categories have meaningful order
        - dtype: CategoricalDtype, dtype for the categorical
        - fastpath: bool, not described in the original reference
          (NOTE(review): presumably skips input validation; confirm against
          the pandas documentation)
        """

    def add_categories(self, new_categories, inplace=False):
        """Add new categories."""

    def remove_categories(self, removals, inplace=False):
        """Remove categories."""

    def rename_categories(self, new_categories, inplace=False):
        """Rename categories."""

    def reorder_categories(self, new_categories, ordered=None, inplace=False):
        """Reorder categories."""

    def remove_unused_categories(self, inplace=False):
        """Remove categories not in use."""

    def set_categories(self, new_categories, ordered=None, rename=False, inplace=False):
        """Set categories to specified new_categories."""

    def as_ordered(self, inplace=False):
        """Set Categorical to be ordered."""

    def as_unordered(self, inplace=False):
        """Set Categorical to be unordered."""

    @property
    def categories(self):
        """The categories of this categorical."""

    @property
    def ordered(self):
        """Whether the categories have an ordered relationship."""

    @property
    def codes(self):
        """The category codes of this categorical."""

    def value_counts(self, sort=True, ascending=False, dropna=True):
        """Return counts of each category."""
class CategoricalDtype:
def __init__(self, categories=None, ordered=None):
"""
Type for categorical data with categories and ordered attributes.
Parameters:
- categories: sequence, categories for the dtype
- ordered: bool, whether the categories are ordered
"""
@property
def categories(self):
"""Categorical categories."""
@property
def ordered(self):
"""Whether categories are ordered."""

Specialized data types that extend pandas' capabilities beyond NumPy types.
class StringDtype:
    def __init__(self, storage=None):
        """
        Extension dtype for string data.

        Parameters:
        - storage: str, storage type ('python' or 'pyarrow')
        """
class BooleanDtype:
    def __init__(self):
        """Extension dtype for boolean data with missing value support."""
class Int8Dtype:
    def __init__(self):
        """Extension dtype for nullable 8-bit integer data."""
class Int16Dtype:
    def __init__(self):
        """Extension dtype for nullable 16-bit integer data."""
class Int32Dtype:
    def __init__(self):
        """Extension dtype for nullable 32-bit integer data."""
class Int64Dtype:
    def __init__(self):
        """Extension dtype for nullable 64-bit integer data."""
class UInt8Dtype:
    def __init__(self):
        """Extension dtype for nullable 8-bit unsigned integer data."""
class UInt16Dtype:
    def __init__(self):
        """Extension dtype for nullable 16-bit unsigned integer data."""
class UInt32Dtype:
    def __init__(self):
        """Extension dtype for nullable 32-bit unsigned integer data."""
class UInt64Dtype:
    def __init__(self):
        """Extension dtype for nullable 64-bit unsigned integer data."""
class Float32Dtype:
    def __init__(self):
        """Extension dtype for nullable 32-bit floating point data."""
class Float64Dtype:
    def __init__(self):
        """Extension dtype for nullable 64-bit floating point data."""
class PeriodDtype:
    def __init__(self, freq=None):
        """
        Extension dtype for Period data.

        Parameters:
        - freq: str or DateOffset, frequency of the Period
        """
class IntervalDtype:
    def __init__(self, subtype=None, closed=None):
        """
        Extension dtype for Interval data.

        Parameters:
        - subtype: str or numpy dtype, subtype of interval
        - closed: str, whether intervals are closed ('left', 'right', 'both', 'neither')
        """
class DatetimeTZDtype:
    def __init__(self, tz=None, unit='ns'):
        """
        Extension dtype for timezone-aware datetime data.

        Parameters:
        - tz: str or tzinfo, timezone information
        - unit: str, unit of precision ('ns', 'us', 'ms', 's')
        """
class SparseDtype:
def __init__(self, dtype=numpy.float64, fill_value=None):
"""
Extension dtype for sparse data.
Parameters:
- dtype: str, numpy.dtype, ExtensionDtype, the dtype of non-sparse values
- fill_value: scalar, value used for sparse locations
"""

Apache Arrow-backed data types for improved performance and interoperability.
class ArrowDtype:
def __init__(self, pyarrow_dtype):
"""
Extension dtype for PyArrow data types.
Parameters:
- pyarrow_dtype: pyarrow.DataType, PyArrow data type
"""
@property
def pyarrow_dtype(self):
"""Return the PyArrow data type."""
@property
def name(self):
"""Return the name of the data type."""
@property
def type(self):
"""Return the scalar type for the array."""

Functions to create pandas arrays and convert between different array types.
def array(data, dtype=None, copy=True):
    """
    Create an ExtensionArray from the input data.

    Parameters:
    - data: Sequence, 1-dimensional list, Series, Index, or ExtensionArray
    - dtype: str, np.dtype, or ExtensionDtype, dtype for the array
    - copy: bool, whether to copy the data

    Returns:
    ExtensionArray, newly created array
    """
def factorize(values, sort=False, na_sentinel=-1, use_na_sentinel=True, size_hint=None):
    """
    Encode the object as an enumerated type or categorical variable.

    Parameters:
    - values: sequence, 1-d array-like
    - sort: bool, sort uniques
    - na_sentinel: int, value to mark missing values
      (NOTE(review): pandas 2.0 is documented as having removed this
      parameter in favour of use_na_sentinel; confirm against the targeted
      pandas version)
    - use_na_sentinel: bool, use na_sentinel for missing values
    - size_hint: int, hint to the hashtable sizer

    Returns:
    tuple of (codes, uniques)
    """
def unique(values):
    """
    Return unique values based on a hash table.

    Parameters:
    - values: 1d array-like

    Returns:
    ndarray or ExtensionArray, unique values
    """
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None, dropna=True):
"""
Compute a histogram of the 1D array values.
Parameters:
- values: 1d array-like
- sort: bool, sort by values
- ascending: bool, sort in ascending order
- normalize: bool, return relative frequencies
- bins: int, rather than count values, group them into half-open bins
- dropna: bool, don't include counts of NaN
Returns:
Series
"""

Functions to check data types and properties of pandas objects.
# Available in pandas.api.types
def infer_dtype(value, skipna=True):
    """
    Efficiently infer the type of the passed value.

    Parameters:
    - value: object, object whose type is to be inferred
    - skipna: bool, ignore NaN values when inferring type

    Returns:
    str, type of the object
    """
def is_any_real_numeric_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a real number data type."""
def is_bool_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a boolean data type."""
def is_categorical_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is Categorical data type."""
def is_complex_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a complex data type."""
def is_datetime64_any_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is any datetime64 data type (timezone-naive or timezone-aware)."""
def is_datetime64_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a (timezone-naive) datetime64 data type."""
    # The nanosecond-only check is is_datetime64_ns_dtype; this one is not
    # restricted to the [ns] unit.
def is_datetime64_ns_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is datetime64[ns] data type."""
def is_datetime64tz_dtype(arr_or_dtype):
    """Check whether the provided array or dtype has a timezone-aware datetime64 data type."""
def is_extension_array_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is an extension data type."""
def is_float_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a float data type."""
def is_integer_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is an integer data type."""
def is_interval_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is Interval data type."""
def is_numeric_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a numeric data type."""
def is_object_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is object data type."""
def is_period_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is Period data type."""
def is_signed_integer_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a signed integer data type."""
def is_string_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is a string data type."""
def is_timedelta64_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is timedelta64 data type."""
def is_timedelta64_ns_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is timedelta64[ns] data type."""
def is_unsigned_integer_dtype(arr_or_dtype):
    """Check whether the provided array or dtype is an unsigned integer data type."""
def pandas_dtype(dtype):
"""
Convert input into a pandas only dtype object or a numpy dtype object.
Parameters:
- dtype: object to be converted
Returns:
np.dtype or pandas dtype
"""

Specialized array classes that provide the foundation for extension data types.
class BooleanArray:
    def __init__(self, values, mask, copy=False):
        """
        Array of boolean (True/False) data with missing values.

        Parameters:
        - values: numpy.ndarray, boolean array
        - mask: numpy.ndarray, boolean array indicating missing values
        - copy: bool, copy the input arrays
        """
class IntegerArray:
    def __init__(self, values, mask, copy=False):
        """
        Array of integer values with missing value support.

        Parameters:
        - values: numpy.ndarray, integer array
        - mask: numpy.ndarray, boolean array indicating missing values
        - copy: bool, copy the input arrays
        """
class FloatingArray:
    def __init__(self, values, mask, copy=False):
        """
        Array of floating point values with missing value support.

        Parameters:
        - values: numpy.ndarray, float array
        - mask: numpy.ndarray, boolean array indicating missing values
        - copy: bool, copy the input arrays
        """
class StringArray:
    def __init__(self, values, copy=False):
        """
        Extension array for string data in a pandas Series or DataFrame.

        Parameters:
        - values: array-like, sequence of strings
        - copy: bool, copy the input array
        """
class IntervalArray:
    def __init__(self, data, closed=None, dtype=None, copy=False, verify_integrity=True):
        """
        Pandas array for interval data that are closed on the same side.

        Parameters:
        - data: array-like (1-dimensional), array of Interval objects
        - closed: str, whether intervals are closed ('left', 'right', 'both', 'neither')
        - dtype: IntervalDtype, dtype for the IntervalArray
        - copy: bool, copy the input data
        - verify_integrity: bool, verify data integrity
        """
class PeriodArray:
    def __init__(self, values, dtype=None, freq=None, copy=False):
        """
        Pandas array for storing Period data.

        Parameters:
        - values: Union[PeriodArray, Series[period], ndarray[int], PeriodIndex]
        - dtype: PeriodDtype, optional
        - freq: str or period object, frequency
        - copy: bool, copy the input data
        """
class DatetimeArray:
    def __init__(self, values, dtype=None, freq=None, copy=False):
        """
        Pandas array for datetime64 data.

        Parameters:
        - values: Series, Index, DatetimeArray, ndarray
        - dtype: numpy.dtype or DatetimeTZDtype
        - freq: str or Offset
        - copy: bool, copy the input data
        """
class TimedeltaArray:
    def __init__(self, values, dtype=None, freq=None, copy=False):
        """
        Pandas array for timedelta64 data.

        Parameters:
        - values: array-like, sequence of timedelta-like objects
        - dtype: numpy.dtype
        - freq: str or Offset
        - copy: bool, copy the input data
        """
class SparseArray:
def __init__(self, data, sparse_index=None, fill_value=None, kind='integer', dtype=None, copy=False):
"""
An ExtensionArray for storing sparse data.
Parameters:
- data: array-like or scalar
- sparse_index: SparseIndex, locations of non-fill_value entries
- fill_value: scalar, entries matching this value are omitted from representation
- kind: str, sparse index kind ('integer' or 'block')
- dtype: numpy.dtype
- copy: bool, copy the input data
"""

def union_categoricals(to_union, sort_categories=False, ignore_order=False):
"""
Combine list-like of Categorical-like into a single Categorical.
Parameters:
- to_union: list-like, Categorical, CategoricalIndex, or Series with categorical dtype
- sort_categories: bool, sort resulting categories
- ignore_order: bool, ignore category order
Returns:
Categorical
"""
def concat_categoricals(to_concat, axis=0, join='outer', ignore_index=False):
"""
Concatenate Categoricals.
Parameters:
- to_concat: list of Categoricals
- axis: int, axis to concatenate along
- join: str, join method for categories
- ignore_index: bool, reset index in result
Returns:
Categorical
"""

# Constructor functions for nullable integer arrays
def Int8Array(values, mask=None, copy=False):
    """Construct Int8Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with an Int8Dtype) -
    # confirm this helper exists before relying on it.
def Int16Array(values, mask=None, copy=False):
    """Construct Int16Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with an Int16Dtype) -
    # confirm this helper exists before relying on it.
def Int32Array(values, mask=None, copy=False):
    """Construct Int32Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with an Int32Dtype) -
    # confirm this helper exists before relying on it.
def Int64Array(values, mask=None, copy=False):
    """Construct Int64Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with an Int64Dtype) -
    # confirm this helper exists before relying on it.
def UInt8Array(values, mask=None, copy=False):
    """Construct UInt8Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with a UInt8Dtype) -
    # confirm this helper exists before relying on it.
def UInt16Array(values, mask=None, copy=False):
    """Construct UInt16Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with a UInt16Dtype) -
    # confirm this helper exists before relying on it.
def UInt32Array(values, mask=None, copy=False):
    """Construct UInt32Array."""
    # NOTE(review): this name is not listed in the pandas.arrays API reference
    # (nullable integers are exposed as IntegerArray with a UInt32Dtype) -
    # confirm this helper exists before relying on it.
def UInt64Array(values, mask=None, copy=False):
"""Construct UInt64Array."""

# Missing value sentinels
NA: object # Pandas missing value for extension dtypes
NaT: object # Not-a-Time for datetime/timedelta
# Extension dtype base classes
class ExtensionDtype:
    """Base class for custom data types."""

    @property
    def name(self):
        """Return a string representation of the dtype."""

    @property
    def type(self):
        """Return the scalar type for the array."""

    @classmethod
    def construct_from_string(cls, string):
        """Construct this type from a string."""
# Categorical ordering
CategoricalOrdering = bool
# Dtype inference results
InferredType = Literal[
'boolean', 'integer', 'floating', 'complex', 'string', 'unicode',
'mixed', 'mixed-integer', 'mixed-integer-float', 'decimal',
'datetime', 'datetime64', 'timedelta', 'timedelta64',
'period', 'categorical', 'interval', 'bytes', 'empty'
]
# Arrow dtype string representations
ArrowDtypeStr = str # PyArrow dtype string like 'int64[pyarrow]'
# Sparse array kinds
SparseKind = Literal['integer', 'block']

Install with Tessl CLI
npx tessl i tessl/pypi-pandas