Comprehensive DataFrame comparison library providing functionality equivalent to SAS's PROC COMPARE for Python with support for Pandas, Spark, Polars, Snowflake, and distributed computing
npx @tessl/cli install tessl/pypi-datacompy@0.18.0

DataComPy is a comprehensive DataFrame comparison library that provides functionality equivalent to SAS's PROC COMPARE for Python data analysis workflows. It supports comparison across multiple DataFrame backends including Pandas, Spark, Polars, Snowflake (via Snowpark), Dask (via Fugue), and DuckDB (via Fugue), making it a versatile tool for data validation and quality assurance.
pip install datacompy
pip install datacompy[spark]
pip install datacompy[fugue]
pip install datacompy[snowflake]

import datacompy

Specific comparison classes:
from datacompy import Compare, PolarsCompare, SparkSQLCompare, SnowflakeCompare

Utility functions:
from datacompy import columns_equal, is_match, report, all_columns_match

import pandas as pd
import datacompy

# Create sample DataFrames: both share the key column 'id', but differ in
# one key (4 vs 5), one name and two scores.
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['Alice', 'Bob', 'Charlie', 'David'],
    'score': [85.5, 92.0, 78.5, 91.0],
})
df2 = pd.DataFrame({
    'id': [1, 2, 3, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'Eve'],
    'score': [85.5, 92.1, 78.5, 89.0],
})

# Compare DataFrames, joining rows on the 'id' column
compare = datacompy.Compare(df1, df2, join_columns=['id'])

# Check if DataFrames match; print the full report only when they do not
if compare.matches():
    print("DataFrames are identical")
else:
    print("DataFrames differ")
    print(compare.report())

# Access comparison results
print(f"Rows in df1 only: {len(compare.df1_unq_rows)}")
print(f"Rows in df2 only: {len(compare.df2_unq_rows)}")
print(f"Shared rows: {len(compare.intersect_rows)}")

DataComPy uses a consistent architecture across all DataFrame backends:
This design allows seamless switching between DataFrame libraries while maintaining identical functionality and API consistency.
Core DataFrame comparison functionality for Pandas, including detailed statistical reporting, tolerance-based numeric comparisons, and comprehensive mismatch analysis.
class Compare(BaseCompare):
    def __init__(
        self,
        df1: pd.DataFrame,
        df2: pd.DataFrame,
        join_columns: List[str] | str | None = None,
        on_index: bool = False,
        abs_tol: float | Dict[str, float] = 0,
        rel_tol: float | Dict[str, float] = 0,
        df1_name: str = "df1",
        df2_name: str = "df2",
        ignore_spaces: bool = False,
        ignore_case: bool = False,
        cast_column_names_lower: bool = True
    ): ...

    def matches(self, ignore_extra_columns: bool = False) -> bool: ...

    def report(
        self,
        sample_count: int = 10,
        column_count: int = 10,
        html_file: str | None = None,
        template_path: str | None = None
    ) -> str: ...

Comparison classes for Polars, Spark, and Snowflake DataFrames, providing the same functionality as Pandas comparison but optimized for each backend's specific characteristics and capabilities.
class PolarsCompare(BaseCompare):
    """Comparison class for Polars DataFrames (signature stub; members not shown here)."""
class SparkSQLCompare(BaseCompare):
    """Comparison class for Spark DataFrames (signature stub; members not shown here)."""
class SnowflakeCompare(BaseCompare): ...

Fugue-powered distributed comparison functions that work across multiple backends including Dask, DuckDB, Ray, and Arrow, enabling scalable comparison of large datasets.
def is_match(
    df1: AnyDataFrame,
    df2: AnyDataFrame,
    join_columns: str | List[str],
    abs_tol: float = 0,
    rel_tol: float = 0,
    df1_name: str = "df1",
    df2_name: str = "df2",
    ignore_spaces: bool = False,
    ignore_case: bool = False,
    cast_column_names_lower: bool = True,
    parallelism: int | None = None,
    strict_schema: bool = False,
) -> bool:
    """Signature stub: report whether two DataFrames match.

    Takes two DataFrames and the column name(s) to join on, and is
    annotated to return a ``bool``. Only the signature is given here; the
    implementation is elided.

    NOTE(review): the meaning of the tolerance, naming, ``ignore_*``,
    ``parallelism`` and ``strict_schema`` options is not defined by this
    stub -- confirm against the library documentation before relying on it.
    """
    ...
def report(
    df1: AnyDataFrame,
    df2: AnyDataFrame,
    join_columns: str | List[str],
    abs_tol: float = 0,
    rel_tol: float = 0,
    df1_name: str = "df1",
    df2_name: str = "df2",
    ignore_spaces: bool = False,
    ignore_case: bool = False,
    cast_column_names_lower: bool = True,
    sample_count: int = 10,
    column_count: int = 10,
    html_file: str | None = None,
    parallelism: int | None = None
) -> str: ...

Low-level functions for comparing individual columns and performing specialized comparisons, useful for custom comparison logic and integration with other data processing workflows.
def columns_equal(
    col_1: pd.Series[Any],
    col_2: pd.Series[Any],
    rel_tol: float = 0,
    abs_tol: float = 0,
    ignore_spaces: bool = False,
    ignore_case: bool = False,
) -> pd.Series[bool]:
    """Signature stub: compare two columns.

    Takes two pandas Series and is annotated to return a boolean Series.
    Only the signature is given here; the implementation is elided.

    NOTE(review): presumably the result is an element-wise equality mask
    and the tolerances apply to numeric values -- confirm against the
    library documentation.
    """
    ...
def calculate_max_diff(col_1: pd.Series[Any], col_2: pd.Series[Any]) -> float: ...

Template-based reporting system with customizable HTML and text output, providing detailed comparison statistics, mismatch samples, and publication-ready reports.
def render(template_name: str, **context: Any) -> str:
    """Signature stub: takes a template name plus keyword context; annotated to return ``str``.

    NOTE(review): presumably renders the named template with ``context`` --
    the implementation is elided here, confirm against the library.
    """
    ...
def save_html_report(report: str, html_file: str | Path) -> None:
    """Signature stub: takes report text and a target path (``str`` or ``Path``); returns ``None``.

    NOTE(review): presumably writes ``report`` to ``html_file`` -- the
    implementation is elided here, confirm against the library.
    """
    ...
def df_to_str(df: Any, sample_count: int | None, on_index: bool) -> str:
    """Signature stub: takes a DataFrame-like object, a sample count and an index flag; annotated to return ``str``.

    NOTE(review): presumably formats up to ``sample_count`` rows of ``df``
    as text -- the implementation is elided here, confirm against the library.
    """
    ...