Generate comprehensive profile reports for pandas DataFrames with automated exploratory data analysis
```
npx @tessl/cli install tessl/pypi-ydata-profiling@4.16.0
```

A comprehensive Python library that provides one-line Exploratory Data Analysis (EDA) for pandas DataFrames. YData Profiling generates detailed profile reports with statistical analysis, data quality warnings, correlations, missing-data patterns, and interactive visualizations, turning hours of manual data exploration into automated, publication-ready reports.
Install with pip:

```
pip install ydata-profiling
```

The legacy package name `pandas-profiling` is deprecated. Basic import:

```python
from ydata_profiling import ProfileReport
```

Common imports for advanced usage:

```python
from ydata_profiling import ProfileReport, compare, __version__
from ydata_profiling.config import Settings, SparkSettings
```

Quickstart:

```python
import pandas as pd
from ydata_profiling import ProfileReport
# Load your data
df = pd.read_csv('your_data.csv')
# Generate comprehensive report with one line
report = ProfileReport(df, title='Dataset Analysis Report')
# Export report
report.to_file('data_report.html')
# Display in Jupyter notebook
report.to_notebook_iframe()
# Get interactive widgets
report.to_widgets()
```

YData Profiling uses a modular architecture for extensible data analysis. This design enables automated EDA workflows, integration with data pipelines, and customization for domain-specific analysis requirements across data science and analytics teams.
Primary functionality for generating comprehensive data profile reports from DataFrames, including statistical analysis, data quality assessment, and automated report generation.
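For example, a hedged sketch of common constructor options ahead of the full signature below (the file and column names are illustrative):

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("sensor_data.csv")  # illustrative file

# Lightweight profile for large datasets: skips the most expensive computations
quick = ProfileReport(df, minimal=True, title="Quick Scan")

# Time-series mode, ordered by a timestamp column
ts = ProfileReport(df, tsmode=True, sortby="timestamp", title="Time Series Profile")

# Sensitive mode avoids exposing raw sample values in the rendered report
safe = ProfileReport(df, sensitive=True, title="Privacy-Aware Profile")
```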
```python
class ProfileReport:
    def __init__(
        self,
        df: Optional[Union[pd.DataFrame, sDataFrame]] = None,
        minimal: bool = False,
        tsmode: bool = False,
        sortby: Optional[str] = None,
        sensitive: bool = False,
        explorative: bool = False,
        sample: Optional[dict] = None,
        config_file: Optional[Union[Path, str]] = None,
        lazy: bool = True,
        typeset: Optional[VisionsTypeset] = None,
        summarizer: Optional[BaseSummarizer] = None,
        config: Optional[Settings] = None,
        type_schema: Optional[dict] = None,
        **kwargs
    ): ...

    def to_file(self, output_file: Union[str, Path], silent: bool = True): ...
    def to_html(self) -> str: ...
    def to_json(self) -> str: ...
    def to_notebook_iframe(self): ...
    def to_widgets(self): ...
```

Compare multiple data profiling reports to identify differences, changes over time, or variations between datasets.
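For example, a minimal sketch comparing profiles of two dataset splits (the file name and split are illustrative):

```python
import pandas as pd
from ydata_profiling import ProfileReport, compare

df = pd.read_csv("your_data.csv")
train_df, test_df = df.iloc[: len(df) // 2], df.iloc[len(df) // 2 :]

# compare() merges the individual profiles into a single side-by-side report
comparison = compare([
    ProfileReport(train_df, title="Train"),
    ProfileReport(test_df, title="Test"),
])
comparison.to_file("comparison_report.html")
```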
```python
def compare(
    reports: Union[List[ProfileReport], List[BaseDescription]],
    config: Optional[Settings] = None,
    compute: bool = False
) -> ProfileReport: ...
```

Comprehensive configuration system for customizing analysis depth, statistical computations, visualizations, and report output formats.
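A sketch of the two common customization paths, keyword overrides at construction time and mutation of the settings object afterwards (the specific option names are assumptions to verify against your installed version):

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("your_data.csv")

# Keyword arguments map onto the Settings tree as shorthand overrides
report = ProfileReport(
    df,
    title="Custom Report",
    correlations={"spearman": {"calculate": False}},  # assumed option path
    missing_diagrams={"heatmap": False},              # assumed option path
)

# Settings can also be adjusted on the report object before rendering
report.config.html.minify_html = True  # assumed attribute path
```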
```python
class Settings:
    def __init__(self, **kwargs): ...

class SparkSettings:
    def __init__(self, **kwargs): ...

class Config:
    @staticmethod
    def get_config() -> Settings: ...
```

Detailed statistical analysis components including correlation analysis, missing data patterns, duplicate detection, and specialized analysis for different data types.
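For instance, a sketch of retrieving the underlying description object rather than a rendered report (the attribute names on the result are assumptions to check against your version):

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("your_data.csv")
report = ProfileReport(df, title="Analysis")

description = report.get_description()  # returns a BaseDescription
print(description.alerts)  # data quality alerts (assumed attribute)
```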
```python
class BaseDescription: ...

class BaseSummarizer: ...

class ProfilingSummarizer: ...

def format_summary(description: BaseDescription) -> dict: ...
```

Direct integration with pandas through monkey patching, which adds a `profile_report()` method to pandas DataFrames.
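A minimal sketch of the patched accessor; importing the package is what registers the method on the DataFrame class:

```python
import pandas as pd
import ydata_profiling  # importing registers DataFrame.profile_report

df = pd.read_csv("your_data.csv")
report = df.profile_report(title="Pandas Integration", explorative=True)
report.to_file("report.html")
```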
```python
def profile_report(
    self,
    minimal: bool = False,
    tsmode: bool = False,
    sortby: Optional[str] = None,
    sensitive: bool = False,
    explorative: bool = False,
    **kwargs
) -> ProfileReport: ...
```

Save and load ProfileReport objects for reuse, storage, and sharing across sessions.
```python
def dumps(self) -> bytes: ...
def loads(data: bytes) -> Union['ProfileReport', 'SerializeReport']: ...
def dump(self, output_file: Union[Path, str]) -> None: ...
def load(load_file: Union[Path, str]) -> Union['ProfileReport', 'SerializeReport']: ...
```

Capabilities: Report serialization, persistent storage, cross-session report sharing, and efficient report caching for large datasets.
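For example, a sketch of the dump/load round trip (the `.pp` extension follows the project's documentation examples):

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("your_data.csv")
report = ProfileReport(df, minimal=True)

report.dump("dataset_profile.pp")  # persist the computed profile to disk
payload: bytes = report.dumps()    # in-memory equivalent

# Later, or in another session: bind the same data, then load the cached results
cached = ProfileReport(df).load("dataset_profile.pp")
cached.to_file("report.html")
```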
Generate data validation expectations directly from profiling results for ongoing data quality monitoring.
```python
def to_expectation_suite(
    self,
    suite_name: Optional[str] = None,
    data_context: Optional[Any] = None,
    save_suite: bool = True,
    run_validation: bool = True,
    build_data_docs: bool = True,
    handler: Optional[Handler] = None
) -> Any: ...
```

Capabilities: Automated expectation generation, data validation pipeline integration, and continuous data quality monitoring.
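A hedged sketch, assuming the optional Great Expectations dependency is installed (the suite name is illustrative):

```python
import pandas as pd
from ydata_profiling import ProfileReport

df = pd.read_csv("your_data.csv")
report = ProfileReport(df, title="Quality Baseline")

# Derives a data validation expectation suite from the profiling results
suite = report.to_expectation_suite(
    suite_name="baseline_expectations",
    run_validation=False,
    build_data_docs=False,
)
```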
Access package version and metadata for compatibility and debugging purposes.
```python
__version__: str  # Package version string
```

Usage: Version checking, compatibility validation, and debugging support.
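For example:

```python
import ydata_profiling

print(ydata_profiling.__version__)  # e.g. "4.16.0"
```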
Command-line interface for generating profiling reports directly from CSV files without writing Python code.
```
ydata_profiling [OPTIONS] INPUT_FILE OUTPUT_FILE
```

Capabilities: Direct CSV profiling, automated report generation, CI/CD pipeline integration, and shell script automation.
Type definitions referenced throughout the API:

```python
from typing import Optional, Union, List, Dict, Any
from pathlib import Path
from enum import Enum

import pandas as pd

# Core DataFrame types
try:
    from pyspark.sql import DataFrame as sDataFrame
except ImportError:
    from typing import TypeVar
    sDataFrame = TypeVar("sDataFrame")

# Configuration types
class Settings:
    dataset: DatasetConfig
    variables: VariablesConfig
    correlations: CorrelationsConfig
    plot: PlotConfig
    html: HtmlConfig
    style: StyleConfig

class SparkSettings(Settings):
    """Specialized Settings for Spark DataFrames with performance optimizations"""
    pass

# Analysis result types
class BaseDescription:
    """Complete dataset description with analysis results"""
    pass

class BaseAnalysis:
    """Base analysis metadata"""
    pass

# Summarizer types
class BaseSummarizer:
    """Base statistical summarizer interface"""
    pass

class ProfilingSummarizer(BaseSummarizer):
    """Default profiling summarizer implementation"""
    pass

# Alert system types
class AlertType(Enum):
    """Types of data quality alerts"""
    pass

class Alert:
    """Individual data quality alert"""
    pass
```