Comprehensive DataFrame comparison library providing functionality equivalent to SAS's PROC COMPARE for Python with support for Pandas, Spark, Polars, Snowflake, and distributed computing
---
Low-level functions for comparing individual columns and performing specialized comparisons, useful for custom comparison logic and integration with other data processing workflows.
Functions for comparing individual Pandas Series with tolerance support and specialized handling for different data types.
def columns_equal(
col_1: pd.Series[Any],
col_2: pd.Series[Any],
rel_tol: float = 0,
abs_tol: float = 0,
ignore_spaces: bool = False,
ignore_case: bool = False
) -> pd.Series[bool]:
"""
Compare two Pandas Series element-wise.
Parameters:
- col_1: First Series to compare
- col_2: Second Series to compare
- rel_tol: Relative tolerance for numeric comparisons
- abs_tol: Absolute tolerance for numeric comparisons
- ignore_spaces: Strip whitespace from string values
- ignore_case: Ignore case in string comparisons
Returns:
Boolean Series indicating element-wise equality
"""
def compare_string_and_date_columns(
col_1: pd.Series[Any],
col_2: pd.Series[Any]
) -> pd.Series[bool]:
"""
Compare string and date columns with specialized logic.
Parameters:
- col_1: First Series (string or date type)
- col_2: Second Series (string or date type)
Returns:
Boolean Series indicating element-wise equality
"""
def calculate_max_diff(col_1: pd.Series[Any], col_2: pd.Series[Any]) -> float:
"""
Calculate maximum absolute difference between numeric columns.
Parameters:
- col_1: First numeric Series
- col_2: Second numeric Series
Returns:
Maximum absolute difference as float
"""Helper functions for DataFrame manipulation and analysis during comparison operations.
def get_merged_columns(
original_df: pd.DataFrame,
merged_df: pd.DataFrame,
suffix: str
) -> List[str]:
"""
Get column names from merged DataFrame with specific suffix.
Parameters:
- original_df: Original DataFrame before merge
- merged_df: DataFrame after merge operation
- suffix: Suffix used in merge operation
Returns:
List of column names with the specified suffix
"""
def generate_id_within_group(
dataframe: pd.DataFrame,
join_columns: List[str]
) -> pd.Series[int]:
"""
Generate unique identifiers within groups for deduplication.
Parameters:
- dataframe: DataFrame to generate IDs for
- join_columns: Columns defining the groups
Returns:
Series of integer IDs unique within each group
"""Functions for normalizing string data during comparisons.
def normalize_string_column(
column: pd.Series,
ignore_spaces: bool,
ignore_case: bool
) -> pd.Series:
"""
Normalize string column for comparison.
Parameters:
- column: String Series to normalize
- ignore_spaces: Strip leading/trailing whitespace
- ignore_case: Convert to lowercase
Returns:
Normalized string Series
"""Specialized functions for comparing Polars Series with optimized performance.
def columns_equal(
col_1: pl.Series,
col_2: pl.Series,
rel_tol: float = 0,
abs_tol: float = 0,
ignore_spaces: bool = False,
ignore_case: bool = False
) -> pl.Series:
"""
Compare two Polars Series element-wise.
Parameters:
- col_1: First Polars Series to compare
- col_2: Second Polars Series to compare
- rel_tol: Relative tolerance for numeric comparisons
- abs_tol: Absolute tolerance for numeric comparisons
- ignore_spaces: Strip whitespace from string values
- ignore_case: Ignore case in string comparisons
Returns:
Boolean Polars Series indicating element-wise equality
"""
def calculate_max_diff(col_1: pl.Series, col_2: pl.Series) -> float:
"""
Calculate maximum absolute difference between numeric Polars Series.
Parameters:
- col_1: First numeric Polars Series
- col_2: Second numeric Polars Series
Returns:
Maximum absolute difference as float
"""
def normalize_string_column(
column: pl.Series,
ignore_spaces: bool,
ignore_case: bool
) -> pl.Series:
"""
Normalize Polars string Series for comparison.
Parameters:
- column: String Polars Series to normalize
- ignore_spaces: Strip leading/trailing whitespace
- ignore_case: Convert to lowercase
Returns:
Normalized string Polars Series
"""Functions for comparing Spark DataFrame columns using SQL expressions.
def columns_equal(
dataframe: pyspark.sql.DataFrame,
col_1: str,
col_2: str,
rel_tol: float = 0,
abs_tol: float = 0,
ignore_spaces: bool = False,
ignore_case: bool = False
) -> pyspark.sql.Column:
"""
Create Spark SQL Column expression for comparing two columns.
Parameters:
- dataframe: Spark DataFrame containing the columns
- col_1: Name of first column to compare
- col_2: Name of second column to compare
- rel_tol: Relative tolerance for numeric comparisons
- abs_tol: Absolute tolerance for numeric comparisons
- ignore_spaces: Strip whitespace from string values
- ignore_case: Ignore case in string comparisons
Returns:
Spark Column expression evaluating to boolean
"""
def calculate_max_diff(
dataframe: pyspark.sql.DataFrame,
col_1: str,
col_2: str
) -> float:
"""
Calculate maximum absolute difference between numeric columns.
Parameters:
- dataframe: Spark DataFrame containing the columns
- col_1: Name of first column
- col_2: Name of second column
Returns:
Maximum absolute difference as float
"""
def calculate_null_diff(
dataframe: pyspark.sql.DataFrame,
col_1: str,
col_2: str
) -> int:
"""
Calculate differences in null values between columns.
Parameters:
- dataframe: Spark DataFrame containing the columns
- col_1: Name of first column
- col_2: Name of second column
Returns:
Count of null value differences as int
"""Functions for comparing Snowflake DataFrame columns with cloud-optimized processing.
def columns_equal(
dataframe: sp.DataFrame,
col_1: str,
col_2: str,
col_match: str,
rel_tol: float = 0,
abs_tol: float = 0,
ignore_spaces: bool = False
) -> sp.DataFrame:
"""
Compare columns in Snowflake DataFrame.
Parameters:
- dataframe: Snowpark DataFrame containing the columns
- col_1: Name of first column to compare
- col_2: Name of second column to compare
- col_match: Name of column to store match results
- rel_tol: Relative tolerance for numeric comparisons
- abs_tol: Absolute tolerance for numeric comparisons
- ignore_spaces: Strip whitespace from string values
Returns:
Snowpark DataFrame with comparison results
"""
def calculate_max_diff(
dataframe: sp.DataFrame,
col_1: str,
col_2: str
) -> float:
"""
Calculate maximum absolute difference between numeric columns.
Parameters:
- dataframe: Snowpark DataFrame containing the columns
- col_1: Name of first column
- col_2: Name of second column
Returns:
Maximum absolute difference as float
"""
def calculate_null_diff(
dataframe: sp.DataFrame,
col_1: str,
col_2: str
) -> int:
"""
Calculate differences in null values between columns.
Parameters:
- dataframe: Snowpark DataFrame containing the columns
- col_1: Name of first column
- col_2: Name of second column
Returns:
Count of null value differences as int
"""import pandas as pd
import datacompy
# Create test Series
col1 = pd.Series([1.0, 2.0, 3.0, 4.0, None])
col2 = pd.Series([1.1, 2.0, 3.2, 4.0, None])
# Compare with tolerance
matches = datacompy.columns_equal(col1, col2, abs_tol=0.1)
print(matches) # [True, True, False, True, True]
# Calculate maximum difference
max_diff = datacompy.calculate_max_diff(col1, col2)
print(f"Maximum difference: {max_diff}") # 0.2import pandas as pd
import datacompy
# String data with case and space variations
col1 = pd.Series(['Alice', 'Bob ', 'CHARLIE', 'david'])
col2 = pd.Series(['alice', 'Bob', 'charlie', 'David'])
# Case-sensitive comparison
strict_matches = datacompy.columns_equal(col1, col2)
print(strict_matches) # [False, False, False, False]
# Case-insensitive with space normalization
flexible_matches = datacompy.columns_equal(
col1, col2,
ignore_case=True,
ignore_spaces=True
)
print(flexible_matches)  # [True, True, True, True]

import polars as pl
import datacompy
# Create Polars Series
col1 = pl.Series([1.0, 2.0, 3.0, 4.0])
col2 = pl.Series([1.1, 2.0, 3.2, 4.0])
# Compare with tolerance
matches = datacompy.columns_equal(col1, col2, abs_tol=0.1)
print(matches)
# String normalization
str_col = pl.Series([' Alice ', 'BOB', 'charlie'])
normalized = datacompy.normalize_string_column(
str_col,
ignore_spaces=True,
ignore_case=True
)
print(normalized)  # ['alice', 'bob', 'charlie']

from pyspark.sql import SparkSession
import datacompy
spark = SparkSession.builder.appName("DataComPy").getOrCreate()
# Create DataFrame
df = spark.createDataFrame([
(1, 1.0, 1.1),
(2, 2.0, 2.0),
(3, 3.0, 3.2)
], ['id', 'col1', 'col2'])
# Create comparison expression
comparison_expr = datacompy.columns_equal(
df, 'col1', 'col2',
abs_tol=0.1
)
# Apply comparison
result = df.withColumn('matches', comparison_expr)
result.show()
# Calculate maximum difference
max_diff = datacompy.calculate_max_diff(df, 'col1', 'col2')
print(f"Max difference: {max_diff}")import pandas as pd
import datacompy
# Example DataFrame with duplicates
df = pd.DataFrame({
'group': ['A', 'A', 'B', 'B', 'A'],
'value': [1, 2, 3, 4, 5]
})
# Generate unique IDs within each group
ids = datacompy.generate_id_within_group(df, ['group'])
print(ids) # [0, 1, 0, 1, 2]
# Example of merged column extraction
df1 = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
df2 = pd.DataFrame({'a': [1, 2], 'c': [5, 6]})
merged = df1.merge(df2, on='a', suffixes=('_left', '_right'))
left_cols = datacompy.get_merged_columns(df1, merged, '_left')
print(left_cols)  # ['b_left']

# Polars type identifiers
STRING_TYPE: List[str] = ["String", "Utf8"]
LIST_TYPE: List[str] = ["List", "Array"]
# Spark numeric types (internal)
NUMERIC_SPARK_TYPES: List # Internal Spark numeric type list
# Snowflake numeric types (internal)
NUMERIC_SNOWPARK_TYPES: List  # Internal Snowpark numeric type list

These utility functions provide the building blocks for custom comparison logic and can be combined to create specialized comparison workflows tailored to specific data analysis needs.
Install with Tessl CLI
npx tessl i tessl/pypi-datacompy