Generate comprehensive profile reports for pandas DataFrames with exploratory data analysis
npx @tessl/cli install tessl/pypi-pandas-profiling@3.6.0
A Python library that provides comprehensive one-line Exploratory Data Analysis (EDA) for pandas DataFrames. It generates detailed profile reports including statistical summaries, data quality warnings, visualizations, and insights that go far beyond basic df.describe() functionality.
pip install pandas-profiling
pip install pandas-profiling[notebook,unicode]
from pandas_profiling import ProfileReport
For dataset comparison:
from pandas_profiling import compare
To enable the pandas DataFrame.profile_report() method:
import pandas_profiling # Adds profile_report() method to DataFrames
For configuration:
from pandas_profiling.config import Settings
import pandas as pd
from pandas_profiling import ProfileReport
# Load your data
df = pd.read_csv('your_data.csv')
# Generate profile report
profile = ProfileReport(df, title="Data Profile Report")
# View in Jupyter notebook
profile.to_widgets()
# Or export to HTML file
profile.to_file("profile_report.html")
# Or get as JSON
json_data = profile.to_json()
pandas-profiling is built around a modular architecture:
from typing import Any, Dict, List, Optional, Union, Tuple
from pathlib import Path
import pandas as pd
from visions import VisionsTypeset
# Key classes from pandas_profiling
class Settings: ... # Configuration management class
class BaseSummarizer: ... # Summary generation interface
The core functionality for creating comprehensive data analysis reports from pandas DataFrames.
class ProfileReport:
def __init__(
self,
df: Optional[pd.DataFrame] = None,
minimal: bool = False,
explorative: bool = False,
sensitive: bool = False,
dark_mode: bool = False,
orange_mode: bool = False,
tsmode: bool = False,
sortby: Optional[str] = None,
sample: Optional[dict] = None,
config_file: Union[Path, str] = None,
lazy: bool = True,
typeset: Optional[VisionsTypeset] = None,
summarizer: Optional[BaseSummarizer] = None,
config: Optional[Settings] = None,
**kwargs
):
"""
Generate a ProfileReport based on a pandas DataFrame.
Parameters:
- df: pandas DataFrame to analyze
- minimal: use minimal computation mode for faster processing
- explorative: enable advanced analysis features
- sensitive: enable privacy-aware mode for sensitive data
- dark_mode: apply dark theme styling
- orange_mode: apply orange theme styling
- tsmode: enable time series analysis mode
- sortby: column name for time series sorting
- sample: optional sample data dict with name, caption, data
- config_file: path to YAML configuration file
- lazy: compute analysis when needed (default True)
- typeset: custom type inference system
- summarizer: custom summary generation system
- config: Settings object for configuration
- **kwargs: additional configuration options
"""
Methods for outputting and displaying the generated profile report.
class ProfileReport:
def to_file(self, output_file: Union[str, Path], silent: bool = True) -> None:
"""
Export report to HTML or JSON file.
Parameters:
- output_file: path for output file (.html or .json extension)
- silent: suppress progress output
"""
def to_html(self) -> str:
"""
Get HTML representation of the report.
Returns:
str: Complete HTML report as string
"""
def to_json(self) -> str:
"""
Get JSON representation of the report.
Returns:
str: Complete report data as JSON string
"""
def to_widgets(self) -> Any:
"""
Display report as interactive Jupyter widgets.
Returns:
Widget object for Jupyter notebook display
"""
def to_notebook_iframe(self) -> None:
"""
Display report as embedded HTML iframe in Jupyter notebook.
"""
Methods for accessing specific analysis results and data insights.
class ProfileReport:
def get_description(self) -> dict:
"""
Get the complete analysis description dictionary.
Returns:
dict: Complete analysis results and metadata
"""
def get_duplicates(self) -> Optional[pd.DataFrame]:
"""
Get DataFrame containing duplicate rows.
Returns:
DataFrame or None: Duplicate rows if any exist
"""
def get_sample(self) -> dict:
"""
Get sample data information.
Returns:
dict: Sample data with metadata
"""
def get_rejected_variables(self) -> set:
"""
Get set of variable names that were rejected from analysis.
Returns:
set: Variable names excluded from the report
"""
Functionality for comparing multiple datasets and generating comparison reports.
def compare(
reports: List[ProfileReport],
config: Optional[Settings] = None,
compute: bool = False
) -> ProfileReport:
"""
Compare multiple ProfileReport objects.
Parameters:
- reports: list of ProfileReport objects to compare
- config: optional Settings object for the merged report
- compute: recompute profiles using config (recommended for different settings)
Returns:
ProfileReport: Comparison report highlighting differences and similarities
"""
class ProfileReport:
def compare(
self,
other: ProfileReport,
config: Optional[Settings] = None
) -> ProfileReport:
"""
Compare this report with another ProfileReport.
Parameters:
- other: ProfileReport object to compare against
- config: optional Settings object for the merged report
Returns:
ProfileReport: Comparison report
"""
Comprehensive configuration system for customizing analysis and report generation.
class Settings:
def __init__(self):
"""
Create new Settings configuration object with default values.
"""
def update(self, updates: dict) -> Settings:
"""
Update configuration with new values.
Parameters:
- updates: dictionary of configuration updates
Returns:
Settings: New Settings object with updated values
"""
@classmethod
def from_file(cls, config_file: Union[Path, str]) -> Settings:
"""
Load configuration from YAML file.
Parameters:
- config_file: path to YAML configuration file
Returns:
Settings: Configuration loaded from file
"""
class Config:
@staticmethod
def get_arg_groups(key: str) -> dict:
"""
Get predefined configuration group.
Parameters:
- key: configuration group name ('sensitive', 'explorative', 'dark_mode', 'orange_mode')
Returns:
dict: Configuration dictionary for the specified group
"""
@staticmethod
def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
"""
Process configuration shortcuts and expand them.
Parameters:
- kwargs: configuration dictionary with potential shortcuts
- split: whether to split into shorthand and regular configs
Returns:
tuple: (shorthand_config, regular_config) dictionaries
"""
Automatic extension of pandas DataFrame with profiling functionality.
# Automatically available after importing pandas_profiling
class DataFrame:
def profile_report(self, **kwargs) -> ProfileReport:
"""
Generate a ProfileReport for this DataFrame.
Parameters:
- **kwargs: arguments passed to ProfileReport constructor
Returns:
ProfileReport: Analysis report for this DataFrame
"""
Methods for managing analysis computation caching.
class ProfileReport:
def invalidate_cache(self, subset: Optional[str] = None) -> None:
"""
Clear cached computations to force recomputation.
Parameters:
- subset: optional cache subset to clear (None clears all)
"""
The Settings class provides extensive configuration through nested models:
from enum import Enum
class Theme(Enum):
"""Available visual themes for reports."""
flatly = "flatly"
united = "united"
# Additional theme values available
class ImageType(Enum):
"""Supported image output formats."""
png = "png"
svg = "svg"
class IframeAttribute(Enum):
"""HTML iframe attribute options."""
srcdoc = "srcdoc"
src = "src"
import pandas as pd
from pandas_profiling import ProfileReport
# Load time series data
df = pd.read_csv('timeseries_data.csv')
df['date'] = pd.to_datetime(df['date'])
# Generate time series report
profile = ProfileReport(
df,
title="Time Series Analysis",
tsmode=True,
sortby='date'
)
profile.to_file("timeseries_report.html")
from pandas_profiling import ProfileReport
# Generate privacy-aware report
profile = ProfileReport(
df,
title="Sensitive Data Report",
sensitive=True # Redacts potentially sensitive information
)
profile.to_widgets()
from pandas_profiling import ProfileReport
from pandas_profiling.config import Settings
# Create custom configuration
config = Settings()
config = config.update({
'vars': {
'num': {'quantiles': [0.1, 0.5, 0.9]},
'cat': {'characters': True, 'words': True}
},
'correlations': {
'pearson': {'threshold': 0.8}
}
})
profile = ProfileReport(df, config=config)
profile.to_file("custom_report.html")
from pandas_profiling import ProfileReport, compare
# Create reports for different datasets
report1 = ProfileReport(df_before, title="Before Processing")
report2 = ProfileReport(df_after, title="After Processing")
# Generate comparison report
comparison = compare([report1, report2])
comparison.to_file("comparison_report.html")
# Generate report from CSV file
pandas_profiling --title "My Report" data.csv report.html
# Use custom configuration
pandas_profiling --config_file config.yaml data.csv report.html