tessl/pypi-ydata-profiling

Generate comprehensive profile reports for pandas DataFrame with automated exploratory data analysis

Overview

Eval results

Files

Core Profiling

Name: tessl/pypi-ydata-profiling
Author: tessl

Primary functionality for generating comprehensive data profile reports from DataFrames, including statistical analysis, data quality assessment, and automated report generation with customizable analysis depth and output formats.

Capabilities

ProfileReport Class

Main class for creating comprehensive data profiling reports from pandas or Spark DataFrames with extensive customization options.

class ProfileReport:
    def __init__(
        self,
        df: Optional[Union[pd.DataFrame, sDataFrame]] = None,
        minimal: bool = False,
        tsmode: bool = False,
        sortby: Optional[str] = None,
        sensitive: bool = False,
        explorative: bool = False,
        sample: Optional[dict] = None,
        config_file: Optional[Union[Path, str]] = None,
        lazy: bool = True,
        typeset: Optional[VisionsTypeset] = None,
        summarizer: Optional[BaseSummarizer] = None,
        config: Optional[Settings] = None,
        type_schema: Optional[dict] = None,
        **kwargs
    ):
        """
        Generate a ProfileReport based on a pandas or spark.sql DataFrame.

        Parameters:
        - df: pandas or spark.sql DataFrame to analyze
        - minimal: use minimal computation mode for faster processing
        - tsmode: activate time-series analysis for numerical variables
        - sortby: column name to sort dataset by (for time-series mode)
        - sensitive: hide values for categorical/text variables for privacy
        - explorative: enable additional analysis features
        - sample: sampling configuration dictionary
        - config_file: path to YAML configuration file
        - lazy: defer computation until report generation
        - typeset: custom visions typeset for type inference
        - summarizer: custom statistical summarizer
        - config: Settings object for configuration
        - type_schema: manual type specification dictionary
        - **kwargs: additional configuration parameters
        """

Usage Example:

import pandas as pd
from ydata_profiling import ProfileReport

# Basic usage
df = pd.read_csv('data.csv')
report = ProfileReport(df, title="My Dataset Report")

# Minimal mode for large datasets
report = ProfileReport(df, minimal=True)

# Time-series analysis
report = ProfileReport(df, tsmode=True, sortby='timestamp')

# Custom configuration
report = ProfileReport(
    df,
    explorative=True,
    sensitive=False,
    title="Detailed Analysis",
    pool_size=4
)

Report Generation Methods

Methods for generating and exporting profiling reports in various formats.

def to_file(self, output_file: Union[str, Path], silent: bool = True) -> None:
    """
    Save the report to an HTML file.

    Parameters:
    - output_file: path where to save the report
    - silent: suppress progress information
    """

def to_html(self) -> str:
    """
    Generate HTML report content as string.

    Returns:
    Complete HTML report as string
    """

def to_json(self) -> str:
    """
    Generate JSON representation of the report.

    Returns:
    JSON string containing all analysis results
    """

def to_notebook_iframe(self) -> None:
    """
    Display the report in a Jupyter notebook iframe.
    """

def to_widgets(self) -> Any:
    """
    Generate interactive Jupyter widgets for the report.

    Returns:
    Widget object for interactive exploration
    """

Usage Example:

# Generate report
report = ProfileReport(df)

# Export to HTML file
report.to_file("my_report.html")

# Get HTML content as string
html_content = report.to_html()

# Get JSON representation
json_data = report.to_json()

# Display in Jupyter notebook
report.to_notebook_iframe()

# Create interactive widgets
widgets = report.to_widgets()

Data Access Methods

Methods for accessing underlying data and analysis results.

def get_description(self) -> BaseDescription:
    """
    Get the complete dataset description with all analysis results.

    Returns:
    BaseDescription object containing statistical summaries,
    correlations, missing data patterns, and data quality alerts
    """

def get_duplicates(self) -> Optional[pd.DataFrame]:
    """
    Get duplicate rows from the dataset.

    Returns:
    DataFrame containing all duplicate rows, or None if no duplicates
    """

def get_sample(self) -> dict:
    """
    Get data samples from the dataset.

    Returns:
    Dictionary containing head, tail, and random samples
    """

def get_rejected_variables(self) -> set:
    """
    Get variables that were rejected during analysis.

    Returns:
    Set of column names that were rejected
    """

Usage Example:

report = ProfileReport(df)

# Get complete analysis description
description = report.get_description()

# Access duplicate rows
duplicates = report.get_duplicates()
print(f"Found {len(duplicates)} duplicate rows")

# Get data samples
samples = report.get_sample()
print("Sample data:", samples['head'])

# Check rejected variables
rejected = report.get_rejected_variables()
if rejected:
    print(f"Rejected variables: {rejected}")

Report Management Methods

Methods for managing report state and comparisons.

def invalidate_cache(self, subset: Optional[str] = None) -> None:
    """
    Clear cached analysis results to force recomputation.

    Parameters:
    - subset: cache subset to invalidate ("rendering", "report", or None for all)
    """

def compare(self, other: 'ProfileReport', config: Optional[Settings] = None) -> 'ProfileReport':
    """
    Compare this report with another ProfileReport.

    Parameters:
    - other: another ProfileReport to compare against
    - config: configuration for comparison analysis

    Returns:
    New ProfileReport containing comparison results
    """

Usage Example:

# Create reports for two datasets
report1 = ProfileReport(df1, title="Dataset 1")
report2 = ProfileReport(df2, title="Dataset 2")

# Compare reports
comparison = report1.compare(report2)
comparison.to_file("comparison_report.html")

# Force recomputation
report1.invalidate_cache()
updated_html = report1.to_html()

Properties

Key properties for accessing report components and metadata.

@property
def typeset(self) -> VisionsTypeset:
    """Get the typeset used for data type inference."""

@property
def summarizer(self) -> BaseSummarizer:
    """Get the statistical summarizer used for analysis."""

@property
def description_set(self) -> BaseDescription:
    """Get the complete dataset description."""

@property
def df_hash(self) -> str:
    """Get hash of the source DataFrame."""

@property
def report(self) -> Root:
    """Get the report structure object."""

@property
def html(self) -> str:
    """Get HTML report content."""

@property
def json(self) -> str:
    """Get JSON report content."""

@property
def widgets(self) -> Any:
    """Get report widgets."""

Usage Example:

report = ProfileReport(df)

# Access report properties
print(f"Report title: {report.config.title}")
print(f"DataFrame hash: {report.df_hash}")

# Access analysis components
typeset = report.typeset
summarizer = report.summarizer
description = report.description_set

# Get report content
html_report = report.html
json_report = report.json

Serialization Methods

Methods for serializing and deserializing ProfileReport objects for storage and transmission.

def dumps(self) -> bytes:
    """
    Serialize ProfileReport to bytes.
    
    Returns:
    Serialized ProfileReport as bytes
    """

def loads(data: bytes) -> Union['ProfileReport', 'SerializeReport']:
    """
    Deserialize ProfileReport from bytes.
    
    Parameters:
    - data: serialized ProfileReport bytes
    
    Returns:
    Deserialized ProfileReport instance
    """

def dump(self, output_file: Union[Path, str]) -> None:
    """
    Save serialized ProfileReport to file.
    
    Parameters:
    - output_file: path where to save the serialized report
    """

def load(load_file: Union[Path, str]) -> Union['ProfileReport', 'SerializeReport']:
    """
    Load ProfileReport from serialized file.
    
    Parameters:
    - load_file: path to serialized report file
    
    Returns:
    Loaded ProfileReport instance
    """

Usage Example:

import pickle
from pathlib import Path

# Create and serialize report
report = ProfileReport(df, title="My Dataset")

# Serialize to bytes
serialized_bytes = report.dumps()

# Save to file
report.dump("my_report.pkl")

# Load from file
loaded_report = ProfileReport.load("my_report.pkl")

# Deserialize from bytes
restored_report = ProfileReport.loads(serialized_bytes)

# Use loaded report
restored_report.to_file("restored_report.html")

Great Expectations Integration

Integration with Great Expectations for automated data validation and expectation suite generation.

def to_expectation_suite(
    self,
    suite_name: Optional[str] = None,
    data_context: Optional[Any] = None,
    save_suite: bool = True,
    run_validation: bool = True,
    build_data_docs: bool = True,
    handler: Optional[Handler] = None
) -> Any:
    """
    Generate Great Expectations expectation suite from profiling results.
    
    Parameters:
    - suite_name: name for the expectation suite
    - data_context: Great Expectations data context
    - save_suite: whether to save the suite to the data context
    - run_validation: whether to run validation after creating suite
    - build_data_docs: whether to build data docs after suite creation
    - handler: custom handler for expectation generation
    
    Returns:
    Great Expectations expectation suite object
    """

Usage Example:

import great_expectations as ge
from ydata_profiling import ProfileReport

# Create ProfileReport
report = ProfileReport(df, title="Data Validation")

# Generate Great Expectations suite
suite = report.to_expectation_suite(
    suite_name="my_dataset_expectations",
    save_suite=True,
    run_validation=True
)

# The suite can now be used for ongoing data validation
print(f"Created expectation suite with {len(suite.expectations)} expectations")

Install with Tessl CLI