Comprehensive DataFrame comparison library providing functionality equivalent to SAS's PROC COMPARE for Python with support for Pandas, Spark, Polars, Snowflake, and distributed computing
Multi-backend comparison classes
Comparison classes for Polars, Spark, and Snowflake DataFrames, providing the same functionality as Pandas comparison but optimized for each backend's specific characteristics and capabilities.
High-performance DataFrame comparison for Polars, leveraging Polars' optimized computation engine while maintaining the same API as Pandas comparison.
class PolarsCompare(BaseCompare):
"""Comparison class for Polars DataFrames."""
def __init__(
self,
df1: pl.DataFrame,
df2: pl.DataFrame,
join_columns: List[str] | str,
abs_tol: float | Dict[str, float] = 0,
rel_tol: float | Dict[str, float] = 0,
df1_name: str = "df1",
df2_name: str = "df2",
ignore_spaces: bool = False,
ignore_case: bool = False,
cast_column_names_lower: bool = True
):
"""
Parameters:
- df1: First Polars DataFrame to compare
- df2: Second Polars DataFrame to compare
- join_columns: Column(s) to join dataframes on
- abs_tol: Absolute tolerance for numeric comparisons
- rel_tol: Relative tolerance for numeric comparisons
- df1_name: Display name for first DataFrame
- df2_name: Display name for second DataFrame
- ignore_spaces: Strip whitespace from string columns
- ignore_case: Ignore case in string comparisons
- cast_column_names_lower: Convert column names to lowercase
"""@property
def df1(self) -> pl.DataFrame:
"""Get the first Polars dataframe."""
@property
def df2(self) -> pl.DataFrame:
"""Get the second Polars dataframe."""
# Attributes
df1_unq_rows: pl.DataFrame # Rows only in df1
df2_unq_rows: pl.DataFrame # Rows only in df2
intersect_rows: pl.DataFrame # Shared rows with match indicators
column_stats: List[Dict[str, Any]] # Column comparison statisticsDistributed DataFrame comparison for Spark SQL DataFrames, enabling comparison of large-scale datasets with Spark's distributed computing capabilities.
class SparkSQLCompare(BaseCompare):
"""Comparison class for Spark SQL DataFrames."""
def __init__(
self,
spark_session: pyspark.sql.SparkSession,
df1: pyspark.sql.DataFrame,
df2: pyspark.sql.DataFrame,
join_columns: List[str] | str,
abs_tol: float | Dict[str, float] = 0,
rel_tol: float | Dict[str, float] = 0,
df1_name: str = "df1",
df2_name: str = "df2",
ignore_spaces: bool = False,
ignore_case: bool = False,
cast_column_names_lower: bool = True
):
"""
Parameters:
- spark_session: Active Spark session
- df1: First Spark DataFrame to compare
- df2: Second Spark DataFrame to compare
- join_columns: Column(s) to join dataframes on
- abs_tol: Absolute tolerance for numeric comparisons
- rel_tol: Relative tolerance for numeric comparisons
- df1_name: Display name for first DataFrame
- df2_name: Display name for second DataFrame
- ignore_spaces: Strip whitespace from string columns
- ignore_case: Ignore case in string comparisons
- cast_column_names_lower: Convert column names to lowercase
"""@property
def df1(self) -> pyspark.sql.DataFrame:
"""Get the first Spark dataframe."""
@property
def df2(self) -> pyspark.sql.DataFrame:
"""Get the second Spark dataframe."""
# Attributes
df1_unq_rows: pyspark.sql.DataFrame # Rows only in df1
df2_unq_rows: pyspark.sql.DataFrame # Rows only in df2
intersect_rows: pyspark.sql.DataFrame # Shared rows with match indicators
column_stats: List # Column comparison statisticsCloud-native DataFrame comparison for Snowflake DataFrames via Snowpark, enabling comparison of data directly in Snowflake's cloud data platform.
class SnowflakeCompare(BaseCompare):
"""Comparison class for Snowflake DataFrames."""
def __init__(
self,
session: sp.Session,
df1: Union[str, sp.DataFrame],
df2: Union[str, sp.DataFrame],
join_columns: List[str] | str | None = None,
abs_tol: float | Dict[str, float] = 0,
rel_tol: float | Dict[str, float] = 0,
df1_name: str | None = None,
df2_name: str | None = None,
ignore_spaces: bool = False
):
"""
Parameters:
- session: Snowflake session object
- df1: First DataFrame or table name
- df2: Second DataFrame or table name
- join_columns: Column(s) to join dataframes on
- abs_tol: Absolute tolerance for numeric comparisons
- rel_tol: Relative tolerance for numeric comparisons
- df1_name: Display name for first DataFrame
- df2_name: Display name for second DataFrame
- ignore_spaces: Strip whitespace from string columns
"""@property
def df1(self) -> sp.DataFrame:
"""Get the first Snowpark dataframe."""
@property
def df2(self) -> sp.DataFrame:
"""Get the second Snowpark dataframe."""
# Attributes
df1_unq_rows: sp.DataFrame # Rows only in df1
df2_unq_rows: sp.DataFrame # Rows only in df2
intersect_rows: sp.DataFrame # Shared rows with match indicators
column_stats: List[Dict[str, Any]] # Column comparison statisticsAll multi-backend comparison classes share the same method signatures as the Pandas Compare class:
# Column analysis
def df1_unq_columns(self) -> OrderedSet[str]: ...   # Columns only in df1
def df2_unq_columns(self) -> OrderedSet[str]: ...   # Columns only in df2
def intersect_columns(self) -> OrderedSet[str]: ... # Columns present in both
def all_columns_match(self) -> bool: ...

# Row analysis
def all_rows_overlap(self) -> bool: ...
def count_matching_rows(self) -> int: ...
def intersect_rows_match(self) -> bool: ...

# Matching validation
def matches(self, ignore_extra_columns: bool = False) -> bool: ...
def subset(self) -> bool: ...  # True if df2 is a subset of df1

# Mismatch analysis
def sample_mismatch(self, column: str, sample_count: int = 10, for_display: bool = False) -> Any: ...
def all_mismatch(self, ignore_matching_cols: bool = False) -> Any: ...

# Reporting
def report(
    self,
    sample_count: int = 10,
    column_count: int = 10,
    html_file: str | None = None,
    template_path: str | None = None,
) -> str: ...

import polars as pl
import datacompy

# Create Polars DataFrames
df1 = pl.DataFrame({
    'id': [1, 2, 3, 4],
    'value': [10.0, 20.0, 30.0, 40.0],
    'status': ['active', 'active', 'inactive', 'active']
})
df2 = pl.DataFrame({
    'id': [1, 2, 3, 5],
    'value': [10.1, 20.0, 30.0, 50.0],
    'status': ['active', 'active', 'inactive', 'pending']
})

# Compare with Polars; abs_tol=0.1 lets the 10.0 vs 10.1 values match
compare = datacompy.PolarsCompare(
    df1, df2,
    join_columns=['id'],
    abs_tol=0.1
)
print(f"DataFrames match: {compare.matches()}")
print(compare.report())

from pyspark.sql import SparkSession
import datacompy

# Initialize Spark session
spark = SparkSession.builder.appName("DataComPy").getOrCreate()

# Create Spark DataFrames
df1 = spark.createDataFrame([
    (1, 10.0, 'active'),
    (2, 20.0, 'active'),
    (3, 30.0, 'inactive'),
    (4, 40.0, 'active')
], ['id', 'value', 'status'])
df2 = spark.createDataFrame([
    (1, 10.1, 'active'),
    (2, 20.0, 'active'),
    (3, 30.0, 'inactive'),
    (5, 50.0, 'pending')
], ['id', 'value', 'status'])

# Compare with Spark; abs_tol=0.1 lets the 10.0 vs 10.1 values match
compare = datacompy.SparkSQLCompare(
    spark, df1, df2,
    join_columns=['id'],
    abs_tol=0.1
)
print(f"DataFrames match: {compare.matches()}")
print(compare.report())

from snowflake.snowpark import Session
import datacompy
# Create Snowflake session
session = Session.builder.configs({
'account': 'your_account',
'user': 'your_user',
'password': 'your_password',
'database': 'your_database',
'schema': 'your_schema'
}).create()
# Compare tables directly by name
compare = datacompy.SnowflakeCompare(
session,
df1='table1', # Table name
df2='table2', # Table name
join_columns=['id'],
abs_tol=0.1
)
# Or compare DataFrame objects
df1 = session.table('table1')
df2 = session.table('table2')
compare = datacompy.SnowflakeCompare(
session, df1, df2,
join_columns=['id'],
abs_tol=0.1
)
print(f"DataFrames match: {compare.matches()}")
print(compare.report())Install with Tessl CLI
npx tessl i tessl/pypi-datacompy