Comprehensive DataFrame comparison library providing functionality equivalent to SAS's PROC COMPARE for Python with support for Pandas, Spark, Polars, Snowflake, and distributed computing
—
Template-based reporting system with customizable HTML and text output, providing detailed comparison statistics, mismatch samples, and publication-ready reports.
Jinja2-based template system for generating customizable comparison reports with flexible formatting options.
def render(template_name: str, **context: Any) -> str:
"""
Render Jinja2 template with provided context.
Parameters:
- template_name: Name of template file to render
- **context: Template variables as keyword arguments
Returns:
Rendered template as string
"""

Generate and save HTML reports with interactive features and professional formatting.
def save_html_report(report: str, html_file: str | Path) -> None:
"""
Save comparison report as HTML file.
Parameters:
- report: Report content as string
- html_file: Path where HTML file should be saved
"""

Convert DataFrames to formatted string representations for display and logging purposes.
def df_to_str(df: Any, sample_count: int | None, on_index: bool) -> str:
"""
Convert DataFrame to formatted string representation.
Parameters:
- df: DataFrame to convert (any supported backend)
- sample_count: Number of rows to include (None for all)
- on_index: Whether to include index in output
Returns:
Formatted string representation of DataFrame
"""

Helper functions for report generation and data formatting.
def temp_column_name(*dataframes) -> str:
"""
Generate unique temporary column name that doesn't conflict with existing columns.
Parameters:
- *dataframes: Variable number of DataFrames to check for column conflicts
Returns:
Unique temporary column name as string
"""

The default report template (report_template.j2) supports the following variables:
# Template context variables
df1_name: str # Name of first DataFrame
df2_name: str # Name of second DataFrame
df1_shape: tuple # Shape of first DataFrame (rows, columns)
df2_shape: tuple # Shape of second DataFrame (rows, columns)
column_summary: dict # Summary of column differences
row_summary: dict # Summary of row differences
column_comparison: list # Detailed column-by-column statistics
mismatch_stats: dict # Statistics about mismatched values
df1_unique_rows: Any # Rows unique to first DataFrame
df2_unique_rows: Any # Rows unique to second DataFrame
column_count: int # Number of columns to include in detailed output

Create custom templates for specialized reporting needs:
# Use custom template
custom_report = comparison.report(
template_path='/path/to/custom/templates',
sample_count=20
)
# Available in custom templates
template_vars = {
'comparison_summary': '...',
'detailed_stats': [...],
'sample_mismatches': {...},
'metadata': {...}
}

import pandas as pd
import datacompy
# Create test DataFrames
df1 = pd.DataFrame({
'id': [1, 2, 3, 4],
'value': [10.0, 20.0, 30.0, 40.0],
'status': ['active', 'active', 'inactive', 'active']
})
df2 = pd.DataFrame({
'id': [1, 2, 3, 5],
'value': [10.1, 20.0, 30.0, 50.0],
'status': ['active', 'active', 'inactive', 'pending']
})
# Create comparison
compare = datacompy.Compare(df1, df2, join_columns=['id'])
# Generate basic text report
text_report = compare.report()
print(text_report)
# Generate HTML report
html_report = compare.report(html_file='comparison_report.html')
print("HTML report saved to comparison_report.html")

# Detailed report with more samples and columns
detailed_report = compare.report(
sample_count=25, # Show 25 sample mismatches
column_count=20 # Include up to 20 columns in stats
)
# Minimal report
minimal_report = compare.report(
sample_count=5, # Show only 5 sample mismatches
column_count=5 # Include only 5 columns in stats
)

# Generate report with additional context
custom_context = {
'title': 'Quarterly Data Comparison',
'analyst': 'Data Team',
'date': '2024-01-15'
}
# Create custom template that includes these variables
custom_report = compare.report(
html_file='quarterly_report.html',
template_path='/path/to/custom/templates'
)

import datacompy
# Convert DataFrame to string for logging
df_string = datacompy.df_to_str(
df1,
sample_count=10, # Show first 10 rows
on_index=True # Include index
)
print("DataFrame preview:")
print(df_string)
# Generate temporary column name
temp_col = datacompy.temp_column_name(df1, df2)
print(f"Safe temporary column name: {temp_col}")

# Generate report and extract specific information
report = compare.report()
# Parse report sections (example)
lines = report.split('\n')
summary_line = [line for line in lines if 'DataFrames match' in line][0]
print(f"Match status: {summary_line}")
# Access structured comparison data
print(f"Unique rows in df1: {len(compare.df1_unq_rows)}")
print(f"Unique rows in df2: {len(compare.df2_unq_rows)}")
print(f"Column statistics: {compare.column_stats}")

import os
from datetime import datetime
# Generate multiple reports with timestamps
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Text report
text_file = f"comparison_report_{timestamp}.txt"
with open(text_file, 'w') as f:
f.write(compare.report())
# HTML report
html_file = f"comparison_report_{timestamp}.html"
compare.report(html_file=html_file)
# Summary report for dashboard
summary = {
'timestamp': timestamp,
'matches': compare.matches(),
'total_rows_df1': len(compare.df1),
'total_rows_df2': len(compare.df2),
'unique_rows_df1': len(compare.df1_unq_rows),
'unique_rows_df2': len(compare.df2_unq_rows),
'shared_columns': len(compare.intersect_columns()),
'unique_columns_df1': len(compare.df1_unq_columns()),
'unique_columns_df2': len(compare.df2_unq_columns())
}
import json
with open(f"comparison_summary_{timestamp}.json", 'w') as f:
json.dump(summary, f, indent=2)

# Create custom template directory structure
# /custom_templates/
# └── custom_report.j2
custom_template_content = """
<!DOCTYPE html>
<html>
<head>
<title>{{ title | default('DataComPy Comparison Report') }}</title>
<style>
.summary { background-color: #f0f0f0; padding: 10px; }
.mismatch { background-color: #ffe6e6; }
.match { background-color: #e6ffe6; }
</style>
</head>
<body>
<h1>Comparison: {{ df1_name }} vs {{ df2_name }}</h1>
<div class="summary">
<h2>Summary</h2>
<p>{{ df1_name }}: {{ df1_shape[0] }} rows, {{ df1_shape[1] }} columns</p>
<p>{{ df2_name }}: {{ df2_shape[0] }} rows, {{ df2_shape[1] }} columns</p>
</div>
<!-- Custom sections here -->
</body>
</html>
"""
# Save custom template
os.makedirs('/custom_templates', exist_ok=True)
with open('/custom_templates/custom_report.j2', 'w') as f:
f.write(custom_template_content)
# Use custom template
custom_report = compare.report(
html_file='custom_comparison.html',
template_path='/custom_templates'
)

from IPython.display import HTML, display
import datacompy
# Generate comparison
compare = datacompy.Compare(df1, df2, join_columns=['id'])
# Display HTML report inline in Jupyter
html_report = compare.report()
display(HTML(html_report))
# Or save and display file
compare.report(html_file='notebook_report.html')
display(HTML(filename='notebook_report.html'))

The default report includes the following sections:
Each section can be customized through template modification or by using different template files for specific reporting needs.
Install with Tessl CLI
npx tessl i tessl/pypi-datacompy