tessl/pypi-datacompy

Comprehensive DataFrame comparison library for Python that provides functionality equivalent to SAS's PROC COMPARE, with support for Pandas, Spark, Polars, Snowflake, and distributed computing.

—

Pending

Overview

Eval results

Files

Reporting and Output

Name: tessl/pypi-datacompy
Author: tessl

Template-based reporting system with customizable HTML and text output, providing detailed comparison statistics, mismatch samples, and publication-ready reports.

Capabilities

Template Rendering System

Jinja2-based template system for generating customizable comparison reports with flexible formatting options.

def render(template_name: str, **context: Any) -> str:
    """
    Render a Jinja2 template with the provided context and return the result.

    Parameters:
    - template_name: Name of the template file to render
    - **context: Template variables, passed as keyword arguments

    Returns:
    The rendered template as a string
    """

HTML Report Generation

Generate and save HTML reports with interactive features and professional formatting.

def save_html_report(report: str, html_file: str | Path) -> None:
    """
    Save a comparison report as an HTML file.

    Parameters:
    - report: Report content as a string
    - html_file: Location (str or Path) where the HTML file should be saved

    Returns:
    None
    """

DataFrame String Conversion

Convert DataFrames to formatted string representations for display and logging purposes.

def df_to_str(df: Any, sample_count: int | None, on_index: bool) -> str:
    """
    Convert a DataFrame to a formatted string representation.

    Parameters:
    - df: DataFrame to convert (any supported backend)
    - sample_count: Number of rows to include, or None to include all rows
    - on_index: Whether to include the index in the output

    Returns:
    Formatted string representation of the DataFrame
    """

Utility Functions

Helper functions for report generation and data formatting.

def temp_column_name(*dataframes) -> str:
    """
    Generate a unique temporary column name that does not conflict with
    any existing column of the given DataFrames.

    Parameters:
    - *dataframes: Variable number of DataFrames to check for column conflicts

    Returns:
    Unique temporary column name as a string
    """

Template System

Default Template Variables

The default report template (report_template.j2) supports the following variables:

# Template context variables supported by the default template
# (report_template.j2), listed as  name: type  # meaning
df1_name: str           # Name of first DataFrame
df2_name: str           # Name of second DataFrame
df1_shape: tuple        # Shape of first DataFrame (rows, columns)
df2_shape: tuple        # Shape of second DataFrame (rows, columns)
column_summary: dict    # Summary of column differences
row_summary: dict       # Summary of row differences
column_comparison: list # Detailed column-by-column statistics
mismatch_stats: dict    # Statistics about mismatched values
df1_unique_rows: Any    # Rows unique to first DataFrame
df2_unique_rows: Any    # Rows unique to second DataFrame
column_count: int       # Number of columns to include in detailed output

Custom Templates

Create custom templates for specialized reporting needs:

# Use custom template (`compare` is a datacompy.Compare instance)
custom_report = compare.report(
    template_path='/path/to/custom/templates',
    sample_count=20
)

# Available in custom templates
# NOTE(review): these names differ from the default template variables
# (df1_name, column_summary, row_summary, ...) — confirm which names a custom
# template actually receives.
template_vars = {
    'comparison_summary': '...',
    'detailed_stats': [...],
    'sample_mismatches': {...},
    'metadata': {...}
}

Usage Examples

Basic Report Generation

import pandas as pd
import datacompy

# Create test DataFrames
df1 = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'value': [10.0, 20.0, 30.0, 40.0],
    'status': ['active', 'active', 'inactive', 'active']
})

df2 = pd.DataFrame({
    'id': [1, 2, 3, 5],
    'value': [10.1, 20.0, 30.0, 50.0],
    'status': ['active', 'active', 'inactive', 'pending']
})

# Create comparison
compare = datacompy.Compare(df1, df2, join_columns=['id'])

# Generate basic text report
text_report = compare.report()
print(text_report)

# Generate HTML report; the HTML is written to the file, so the return value
# is not kept here
compare.report(html_file='comparison_report.html')
print("HTML report saved to comparison_report.html")

Customized Report Parameters

# Detailed report: show 25 sample mismatches, include up to 20 columns in stats
detailed_report = compare.report(sample_count=25, column_count=20)

# Minimal report: show only 5 sample mismatches, include only 5 columns in stats
minimal_report = compare.report(sample_count=5, column_count=5)

Custom HTML Styling

# Extra values a custom template could display (title, analyst, date)
# NOTE(review): custom_context is defined here but is not passed to
# compare.report() below — confirm how these values are meant to reach the
# template.
custom_context = {
    'title': 'Quarterly Data Comparison',
    'analyst': 'Data Team',
    'date': '2024-01-15'
}

# Create custom template that includes these variables
custom_report = compare.report(
    html_file='quarterly_report.html',
    template_path='/path/to/custom/templates'
)

DataFrame Display Utilities

import datacompy

# Render df1 as text for logging: 10 rows, index included
preview = datacompy.df_to_str(df1, sample_count=10, on_index=True)
print("DataFrame preview:")
print(preview)

# Ask for a temporary column name that does not clash with df1 or df2
safe_name = datacompy.temp_column_name(df1, df2)
print(f"Safe temporary column name: {safe_name}")

Programmatic Report Processing

# Generate report and extract specific information
report = compare.report()

# Parse report sections (example). next() with a default is used so that a
# report without a matching line yields None instead of raising IndexError.
summary_line = next(
    (line for line in report.splitlines() if 'DataFrames match' in line),
    None,
)
if summary_line is not None:
    print(f"Match status: {summary_line}")
else:
    print("Match status: summary line not found in report")

# Access structured comparison data
print(f"Unique rows in df1: {len(compare.df1_unq_rows)}")
print(f"Unique rows in df2: {len(compare.df2_unq_rows)}")
print(f"Column statistics: {compare.column_stats}")

Batch Report Generation

import json
import os
from datetime import datetime

# Generate multiple reports with timestamps
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Text report (explicit encoding so the output does not depend on the
# platform default)
text_file = f"comparison_report_{timestamp}.txt"
with open(text_file, 'w', encoding='utf-8') as f:
    f.write(compare.report())

# HTML report
html_file = f"comparison_report_{timestamp}.html"
compare.report(html_file=html_file)

# Summary report for dashboard
summary = {
    'timestamp': timestamp,
    'matches': compare.matches(),
    'total_rows_df1': len(compare.df1),
    'total_rows_df2': len(compare.df2),
    'unique_rows_df1': len(compare.df1_unq_rows),
    'unique_rows_df2': len(compare.df2_unq_rows),
    'shared_columns': len(compare.intersect_columns()),
    'unique_columns_df1': len(compare.df1_unq_columns()),
    'unique_columns_df2': len(compare.df2_unq_columns())
}

with open(f"comparison_summary_{timestamp}.json", 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2)

Template Customization

from pathlib import Path

# Create custom template directory structure
# /custom_templates/
#   └── custom_report.j2

custom_template_content = """
<!DOCTYPE html>
<html>
<head>
    <title>{{ title | default('DataComPy Comparison Report') }}</title>
    <style>
        .summary { background-color: #f0f0f0; padding: 10px; }
        .mismatch { background-color: #ffe6e6; }
        .match { background-color: #e6ffe6; }
    </style>
</head>
<body>
    <h1>Comparison: {{ df1_name }} vs {{ df2_name }}</h1>
    
    <div class="summary">
        <h2>Summary</h2>
        <p>{{ df1_name }}: {{ df1_shape[0] }} rows, {{ df1_shape[1] }} columns</p>
        <p>{{ df2_name }}: {{ df2_shape[0] }} rows, {{ df2_shape[1] }} columns</p>
    </div>
    
    <!-- Custom sections here -->
    
</body>
</html>
"""

# Save custom template (pathlib keeps this snippet self-contained; the
# explicit encoding avoids depending on the platform default)
template_dir = Path('/custom_templates')
template_dir.mkdir(parents=True, exist_ok=True)
(template_dir / 'custom_report.j2').write_text(
    custom_template_content, encoding='utf-8'
)

# Use custom template
# NOTE(review): confirm whether template_path expects the template directory
# (as passed here) or the path to the .j2 file itself.
custom_report = compare.report(
    html_file='custom_comparison.html',
    template_path='/custom_templates'
)

Integration with Jupyter Notebooks

import html

from IPython.display import HTML, display
import datacompy

# Generate comparison
compare = datacompy.Compare(df1, df2, join_columns=['id'])

# Display the report inline in Jupyter. compare.report() returns the text
# report, so escape it and wrap it in <pre>; otherwise HTML rendering would
# collapse the whitespace that aligns the report's columns.
text_report = compare.report()
display(HTML(f"<pre>{html.escape(text_report)}</pre>"))

# Or save and display file
compare.report(html_file='notebook_report.html')
display(HTML(filename='notebook_report.html'))