Generate comprehensive profile reports for pandas DataFrames with automated exploratory data analysis
Direct integration with pandas DataFrames through monkey patching: when ydata_profiling is imported, a .profile_report() method is attached to the pandas DataFrame class, so any DataFrame can generate a profiling report directly, without explicitly constructing a ProfileReport. This keeps profiling inside ordinary pandas workflows.
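Conceptually, the patch assigns a module-level function to the DataFrame class. A minimal sketch of the idea follows (illustrative only; the function name _profile_report is hypothetical and the library's actual internals may differ):
import pandas as pd
from ydata_profiling import ProfileReport
def _profile_report(df: pd.DataFrame, **kwargs) -> ProfileReport:
    # Illustrative stand-in: wrap the DataFrame in a ProfileReport
    return ProfileReport(df, **kwargs)
# Assigning the function to the class makes it available as a bound
# method on every existing and future DataFrame instance
pd.DataFrame.profile_report = _profile_report
This is why the method only appears after the import has run (see the second usage example below). The patched method's signature: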
def profile_report(
self,
minimal: bool = False,
tsmode: bool = False,
sortby: Optional[str] = None,
sensitive: bool = False,
explorative: bool = False,
sample: Optional[dict] = None,
config_file: Optional[Union[Path, str]] = None,
lazy: bool = True,
typeset: Optional[VisionsTypeset] = None,
summarizer: Optional[BaseSummarizer] = None,
config: Optional[Settings] = None,
type_schema: Optional[dict] = None,
**kwargs
) -> ProfileReport:
"""
Generate a comprehensive profiling report for this DataFrame.
This method is automatically added to pandas DataFrame instances
when ydata_profiling is imported via monkey patching.
Parameters:
- minimal: use minimal computation mode for faster processing
- tsmode: enable time-series analysis for numerical variables
- sortby: column to sort by for time-series analysis
- sensitive: enable privacy mode hiding sensitive values
- explorative: enable additional exploratory features
- sample: sampling configuration dictionary
- config_file: path to YAML configuration file
- lazy: defer computation until needed
- typeset: custom type inference system
- summarizer: custom statistical summarizer
- config: Settings object for configuration
- type_schema: manual type specifications
- **kwargs: additional configuration parameters
Returns:
ProfileReport instance containing comprehensive analysis
"""Usage Example:
import pandas as pd
from ydata_profiling import ProfileReport
# Load data
df = pd.read_csv('data.csv')
# Generate report using the decorator method
report = df.profile_report(title="My Dataset Report")
# Export report
report.to_file("report.html")
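# The method also accepts less common options; for example, forcing a
# semantic type per column via type_schema (the column name 'category'
# below is hypothetical; verify supported type names for your
# ydata-profiling version)
typed_report = df.profile_report(
    title="Typed Report",
    type_schema={"category": "categorical"}
)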
# Generate with custom configuration
report = df.profile_report(
title="Detailed Analysis",
explorative=True,
minimal=False
)

When ydata_profiling is imported, the profile_report() method is automatically added to all pandas DataFrame instances.
Usage Example:
import pandas as pd
# This will NOT work - profile_report method not available yet
# df = pd.read_csv('data.csv')
# report = df.profile_report() # AttributeError
# Import ydata_profiling to add the method
from ydata_profiling import ProfileReport
# Now the method is available on all DataFrames
df = pd.read_csv('data.csv')
report = df.profile_report() # Works!
# The method is available on any DataFrame
df2 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
report2 = df2.profile_report(title="Simple DataFrame")

Seamless integration with common pandas data analysis workflows.
Data Cleaning Workflow:
import pandas as pd
from ydata_profiling import ProfileReport
# Load and explore data
df = pd.read_csv('messy_data.csv')
# Initial profiling
initial_report = df.profile_report(title="Initial Data Assessment")
initial_report.to_file("initial_analysis.html")
# Clean data based on profiling insights
df_cleaned = df.dropna(subset=['important_column'])
df_cleaned = df_cleaned[df_cleaned['age'] >= 0] # Remove negative ages
df_cleaned = df_cleaned.drop_duplicates()
# Profile cleaned data
cleaned_report = df_cleaned.profile_report(title="Cleaned Data")
cleaned_report.to_file("cleaned_analysis.html")
# Compare before and after
comparison = initial_report.compare(cleaned_report)
comparison.to_file("cleaning_impact.html")

Exploratory Data Analysis Workflow:
import pandas as pd
from ydata_profiling import ProfileReport
# Load data
df = pd.read_csv('customer_data.csv')
# Quick exploration with minimal mode for large datasets
quick_profile = df.profile_report(
title="Quick Customer Data Overview",
minimal=True
)
# Detailed analysis after initial insights
detailed_profile = df.profile_report(
title="Comprehensive Customer Analysis",
explorative=True,
tsmode='timestamp' in df.columns,
sortby='timestamp' if 'timestamp' in df.columns else None
)
detailed_profile.to_file("customer_analysis.html")
# Access specific insights
duplicates = detailed_profile.get_duplicates()
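# Note: get_duplicates() can return None when duplicate detection is
# disabled, so guard before calling len() in production code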
print(f"Found {len(duplicates)} duplicate customers")The pandas integration supports method chaining for fluid data analysis workflows.
Usage Example:
import pandas as pd
from ydata_profiling import ProfileReport
# Method chaining with profiling
report = (pd.read_csv('data.csv')
.dropna()
.reset_index(drop=True)
.profile_report(title="Processed Data Analysis"))
# Chain with other pandas operations on an already-loaded DataFrame
df = pd.read_csv('data.csv')
processed_report = (df
.query('age >= 18')
.groupby('category')
.first()
.reset_index()
.profile_report(title="Adult Customers by Category"))
# Export results
report.to_file("processed_analysis.html")
processed_report.to_file("category_analysis.html")

Enhanced integration with Jupyter notebooks through the pandas decorator.
Usage Example:
import pandas as pd
from ydata_profiling import ProfileReport
# Load data in notebook
df = pd.read_csv('analysis_data.csv')
# Generate and display report inline
report = df.profile_report(title="Notebook Analysis")
# Display directly in notebook cell
report.to_notebook_iframe()
# Or use widgets for interactive exploration
report.to_widgets()
# Quick minimal analysis for fast iteration
df.profile_report(minimal=True).to_notebook_iframe()

Using pandas integration in data processing pipelines.
Usage Example:
import pandas as pd
from ydata_profiling import ProfileReport
def analyze_dataset(file_path: str, output_dir: str) -> dict:
"""
Analyze dataset and return summary metrics.
"""
# Load data
df = pd.read_csv(file_path)
# Generate profile
report = df.profile_report(
title=f"Analysis of {file_path}",
explorative=True
)
# Save report
report_path = f"{output_dir}/analysis.html"
report.to_file(report_path)
# Extract key metrics
description = report.get_description()
return {
'rows': description.table['n'],
'columns': description.table['n_var'],
'missing_cells': description.table['n_cells_missing'],
'duplicates': description.table['n_duplicates'],
'report_path': report_path
}
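# Note: the exact keys in description.table can vary across
# ydata-profiling versions; inspect report.get_description() on your
# install before relying on specific names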
# Use in pipeline
metrics = analyze_dataset('input/data.csv', 'output/')
print(f"Dataset has {metrics['rows']} rows and {metrics['columns']} columns")Optimized usage patterns for large datasets using pandas integration.
Usage Example:
import pandas as pd
from ydata_profiling import ProfileReport
# For large datasets, use minimal mode initially
large_df = pd.read_csv('large_dataset.csv')
# Quick assessment with minimal resources
quick_report = large_df.profile_report(
minimal=True,
title="Large Dataset - Quick Assessment"
)
# Sample subset for detailed analysis if needed
sample_df = large_df.sample(n=10000, random_state=42)
detailed_report = sample_df.profile_report(
title="Detailed Analysis - Sample",
explorative=True
)
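# The sample constructor argument (see the signature above) can embed
# the profiled rows into the report for provenance; the dict keys below
# follow the library's documented usage, but verify them for your version
annotated_report = sample_df.profile_report(
    title="Detailed Analysis - Sample With Provenance",
    sample={
        "name": "10,000-row sample",
        "data": sample_df.head(10),
        "caption": "Drawn with random_state=42"
    }
)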
# Process in chunks for very large datasets
chunk_reports = []
for chunk in pd.read_csv('very_large_dataset.csv', chunksize=5000):
chunk_report = chunk.profile_report(minimal=True)
chunk_reports.append(chunk_report)
# Compare the first two chunks as a quick data consistency check
if len(chunk_reports) >= 2:
chunk_comparison = chunk_reports[0].compare(chunk_reports[1])
chunk_comparison.to_file("chunk_consistency.html")

Install with Tessl CLI
npx tessl i tessl/pypi-ydata-profiling