tessl/pypi-datacompy

Comprehensive DataFrame comparison library providing functionality equivalent to SAS's PROC COMPARE for Python with support for Pandas, Spark, Polars, Snowflake, and distributed computing

—

Pending

Overview

Eval results

Files

Pandas DataFrame Comparison

Name: tessl/pypi-datacompy
Author: tessl

Core DataFrame comparison functionality for Pandas DataFrames, providing detailed statistical reporting, tolerance-based numeric comparisons, and comprehensive mismatch analysis. This is the primary comparison class in DataComPy.

Capabilities

Compare Class

The main comparison class for Pandas DataFrames that performs comprehensive comparison analysis with configurable tolerance settings and detailed reporting.

class Compare(BaseCompare):
    """Comparison class for Pandas DataFrames.
    
    Both df1 and df2 should be dataframes containing all of the join_columns,
    with unique column names. Differences between values are compared to
    abs_tol + rel_tol * abs(df2['value']).
    """
    
    def __init__(
        self,
        df1: pd.DataFrame,
        df2: pd.DataFrame,
        join_columns: List[str] | str | None = None,
        on_index: bool = False,
        abs_tol: float | Dict[str, float] = 0,
        rel_tol: float | Dict[str, float] = 0,
        df1_name: str = "df1",
        df2_name: str = "df2",
        ignore_spaces: bool = False,
        ignore_case: bool = False,
        cast_column_names_lower: bool = True
    ):
        """
        Parameters:
        - df1: First DataFrame to compare
        - df2: Second DataFrame to compare
        - join_columns: Column(s) to join dataframes on
        - on_index: If True, join on DataFrame index instead of columns
        - abs_tol: Absolute tolerance for numeric comparisons (float or dict)
        - rel_tol: Relative tolerance for numeric comparisons (float or dict)
        - df1_name: Display name for first DataFrame
        - df2_name: Display name for second DataFrame
        - ignore_spaces: Strip whitespace from string columns
        - ignore_case: Ignore case in string comparisons
        - cast_column_names_lower: Convert column names to lowercase
        """

Properties and Attributes

Access to comparison results and DataFrame metadata.

# Properties
@property
def df1(self) -> pd.DataFrame:
    """Get the first dataframe."""

@property
def df2(self) -> pd.DataFrame:
    """Get the second dataframe."""

# Attributes (available after comparison)
df1_unq_rows: pd.DataFrame  # Rows only in df1
df2_unq_rows: pd.DataFrame  # Rows only in df2
intersect_rows: pd.DataFrame  # Shared rows with match indicators
column_stats: List[Dict[str, Any]]  # Column-by-column comparison statistics

Column Information Methods

Methods to analyze column structure and relationships between DataFrames.

def df1_unq_columns(self) -> OrderedSet[str]:
    """Get columns that are unique to df1."""

def df2_unq_columns(self) -> OrderedSet[str]:
    """Get columns that are unique to df2."""

def intersect_columns(self) -> OrderedSet[str]:
    """Get columns that are shared between the two dataframes."""

def all_columns_match(self) -> bool:
    """Check if all columns match between DataFrames."""

Row Comparison Methods

Methods to analyze row-level differences and overlaps.

def all_rows_overlap(self) -> bool:
    """Check if all rows are present in both DataFrames."""

def count_matching_rows(self) -> int:
    """Count the number of matching rows."""

def intersect_rows_match(self) -> bool:
    """Check if rows that exist in both DataFrames have matching values."""

Matching and Validation Methods

High-level methods to determine if DataFrames match according to various criteria.

def matches(self, ignore_extra_columns: bool = False) -> bool:
    """
    Check if DataFrames match completely.
    
    Parameters:
    - ignore_extra_columns: If True, ignore columns that exist in only one DataFrame
    
    Returns:
    True if DataFrames match, False otherwise
    """

def subset(self) -> bool:
    """
    Check if df2 is a subset of df1.
    
    Returns:
    True if df2 is a subset of df1, False otherwise
    """

Mismatch Analysis Methods

Methods to identify and analyze specific differences between DataFrames.

def sample_mismatch(
    self,
    column: str,
    sample_count: int = 10,
    for_display: bool = False
) -> pd.DataFrame | None:
    """
    Get a sample of mismatched values for a specific column.
    
    Parameters:
    - column: Name of column to sample
    - sample_count: Number of mismatched rows to return
    - for_display: Format output for display purposes
    
    Returns:
    DataFrame with sample of mismatched rows, or None if no mismatches
    """

def all_mismatch(self, ignore_matching_cols: bool = False) -> pd.DataFrame:
    """
    Get all mismatched rows.
    
    Parameters:
    - ignore_matching_cols: If True, exclude columns that match completely
    
    Returns:
    DataFrame containing all rows with mismatches
    """

Report Generation

Comprehensive reporting functionality with customizable output formats.

def report(
    self,
    sample_count: int = 10,
    column_count: int = 10,
    html_file: str | None = None,
    template_path: str | None = None
) -> str:
    """
    Generate comprehensive comparison report.
    
    Parameters:
    - sample_count: Number of sample mismatches to include
    - column_count: Number of columns to include in detailed stats
    - html_file: Path to save HTML report (optional)
    - template_path: Custom template path (optional)
    
    Returns:
    String containing the formatted comparison report
    """

Usage Examples

Basic Comparison

import pandas as pd
import datacompy

# Create test data
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'value': [10.0, 20.0, 30.0, 40.0],
    'status': ['active', 'active', 'inactive', 'active']
})

df2 = pd.DataFrame({
    'id': [1, 2, 3, 5],
    'value': [10.1, 20.0, 30.0, 50.0],
    'status': ['active', 'active', 'inactive', 'pending']
})

# Perform comparison
compare = datacompy.Compare(df1, df2, join_columns=['id'])

# Check results
print(f"DataFrames match: {compare.matches()}")
print(f"Rows only in df1: {len(compare.df1_unq_rows)}")
print(f"Rows only in df2: {len(compare.df2_unq_rows)}")

Tolerance-Based Comparison

# Compare with tolerance for numeric columns
compare = datacompy.Compare(
    df1, df2, 
    join_columns=['id'],
    abs_tol=0.1,  # Allow 0.1 absolute difference
    rel_tol=0.05  # Allow 5% relative difference
)

# Per-column tolerance
compare = datacompy.Compare(
    df1, df2,
    join_columns=['id'],
    abs_tol={'value': 0.2, 'default': 0.1},
    rel_tol={'value': 0.1, 'default': 0.05}
)

String Comparison Options

# Ignore case and whitespace in string comparisons
compare = datacompy.Compare(
    df1, df2,
    join_columns=['id'],
    ignore_case=True,
    ignore_spaces=True
)

Detailed Reporting

# Generate detailed report
report = compare.report(sample_count=20, column_count=15)
print(report)

# Save HTML report
compare.report(html_file='comparison_report.html')

# Get specific mismatch samples
value_mismatches = compare.sample_mismatch('value', sample_count=5)
print(value_mismatches)

Install with Tessl CLI