Generate comprehensive profile reports for pandas DataFrames with automated exploratory data analysis.
Detailed statistical analysis components including correlation analysis, missing data patterns, duplicate detection, and specialized analysis for different data types. These components form the analytical engine behind YData Profiling's comprehensive data understanding capabilities.
Core data structure containing complete dataset analysis results and statistical summaries.
class BaseDescription:
    """
    Complete dataset description containing all analysis results.

    Aggregates statistical summaries, data quality metrics, correlations,
    missing data patterns, duplicate analysis, alerts, and per-variable
    insights produced by a profiling run.

    Usage example::

        from ydata_profiling import ProfileReport

        report = ProfileReport(df)
        description = report.get_description()

        # Access analysis components
        print(f"Dataset shape: {description.table['n']}, {description.table['p']}")
        print(f"Missing cells: {description.table['n_cells_missing']}")
        print(f"Duplicate rows: {description.table['n_duplicates']}")

        # Access variable-specific analysis
        for var_name, var_data in description.variables.items():
            print(f"Variable {var_name}: {var_data['type']}")
    """

    # Core properties (string annotations are forward references to types
    # declared elsewhere in the package).
    analysis: "BaseAnalysis"   # run-level metadata (title, timing, ...)
    table: dict                # table-level statistics
    variables: dict            # per-variable statistics keyed by column name
    correlations: dict         # correlation matrices keyed by method name
    missing: dict              # missing-data pattern results
    alerts: "List[Alert]"      # data quality alerts raised during analysis
    package: dict              # package/version metadata

    def __init__(self, analysis: "BaseAnalysis", table: dict, variables: dict, **kwargs):
        """
        Initialize BaseDescription with analysis results.

        Parameters:
        - analysis: run-level metadata for this profiling pass
        - table: table-level statistics
        - variables: per-variable analysis results
        - **kwargs: optional sections (correlations, missing, alerts, package)
        """
        self.analysis = analysis
        self.table = table
        self.variables = variables
        # Optional sections default to empty containers so attribute access
        # is always safe even for partial descriptions.
        self.correlations = kwargs.get("correlations", {})
        self.missing = kwargs.get("missing", {})
        self.alerts = kwargs.get("alerts", [])
        self.package = kwargs.get("package", {})


# Statistical computation engines that perform the actual analysis of datasets.
class BaseSummarizer:
    """
    Base interface for statistical summarizers.

    Defines the contract for implementing custom analysis engines
    for different data backends (pandas, Spark, etc.).
    """

    def summarize(self, config: "Settings", df: "Union[pd.DataFrame, Any]") -> "BaseDescription":
        """
        Perform statistical analysis on the dataset.

        Parameters:
        - config: configuration settings for analysis
        - df: dataset to analyze

        Returns:
        BaseDescription containing complete analysis results

        Raises:
        NotImplementedError: this base class only declares the contract;
        concrete summarizers must override this method.
        """
        # A silent `None` return would hide missing implementations from
        # callers, so fail loudly instead.
        raise NotImplementedError(
            f"{type(self).__name__} must implement summarize()"
        )
class ProfilingSummarizer(BaseSummarizer):
    """
    Default profiling summarizer with comprehensive statistical analysis.

    Implements univariate analysis, correlation analysis, missing data
    patterns, duplicate detection, and data quality assessment.

    Usage example::

        from ydata_profiling.model.summarizer import ProfilingSummarizer
        from ydata_profiling.config import Settings
        from ydata_profiling.model.typeset import ProfilingTypeSet

        # Create custom summarizer
        typeset = ProfilingTypeSet()
        summarizer = ProfilingSummarizer(typeset=typeset)

        # Use with ProfileReport
        config = Settings()
        report = ProfileReport(df, summarizer=summarizer, config=config)

        # Access summarizer results
        description = report.get_description()
    """

    def __init__(self, typeset: "Optional[VisionsTypeset]" = None):
        """
        Initialize ProfilingSummarizer.

        Parameters:
        - typeset: custom type system for variable classification;
          None means the package default is selected at analysis time.
        """
        # Keep the typeset so summarize() can classify variables with it.
        self.typeset = typeset


# Functions for formatting and processing analysis results.
def format_summary(description: BaseDescription) -> dict:
    """
    Convert a BaseDescription into a display-ready dictionary.

    Parameters:
    - description: complete analysis results to format

    Returns:
    Dictionary of human-readable summaries suitable for display and export
    """
def redact_summary(description_dict: dict, config: "Settings") -> dict:
    """
    Redact sensitive information from analysis summary.

    Parameters:
    - description_dict: dictionary containing analysis results
    - config: configuration specifying redaction rules

    Returns:
    Dictionary with sensitive information redacted

    Usage example::

        from ydata_profiling.model.summarizer import format_summary, redact_summary

        report = ProfileReport(df)
        description = report.get_description()

        # Format summary for display
        formatted = format_summary(description)
        print(formatted['table'])

        # Redact sensitive information
        config = Settings()
        config.variables.text.redact = True
        redacted = redact_summary(description.__dict__, config)
    """
    # NOTE(review): stub — the upstream implementation walks the summary
    # dictionary and masks values for variables flagged by the redaction
    # configuration; confirm behavior against the real package.


# Data quality alert system for identifying potential issues and anomalies
# in datasets.
from enum import Enum
class AlertType(Enum):
    """Enumeration of data quality alert categories a profile can raise."""

    CONSTANT = "CONSTANT"                  # a single constant value
    ZEROS = "ZEROS"                        # zero values present
    HIGH_CORRELATION = "HIGH_CORRELATION"  # strong correlation with another variable
    HIGH_CARDINALITY = "HIGH_CARDINALITY"  # very many distinct values
    IMBALANCE = "IMBALANCE"                # imbalanced value distribution
    MISSING = "MISSING"                    # missing values present
    INFINITE = "INFINITE"                  # infinite values present
    SKEWED = "SKEWED"                      # skewed distribution
    UNIQUE = "UNIQUE"                      # all values distinct
    UNIFORM = "UNIFORM"                    # near-uniform distribution
    DUPLICATES = "DUPLICATES"              # duplicate rows present
class Alert:
    """
    Individual data quality alert with details and recommendations.

    Usage example::

        from ydata_profiling.model.alerts import Alert, AlertType

        report = ProfileReport(df)
        description = report.get_description()

        # Access all alerts
        alerts = description.alerts
        print(f"Found {len(alerts)} data quality alerts")

        # Filter alerts by type
        missing_alerts = [a for a in alerts if a.alert_type == AlertType.MISSING]
        correlation_alerts = [a for a in alerts if a.alert_type == AlertType.HIGH_CORRELATION]

        # Examine specific alerts
        for alert in alerts:
            print(f"Alert: {alert.alert_type.value}")
            print(f"Column: {alert.column_name}")
            print(f"Description: {alert.description}")
    """

    # Attributes (assigned in __init__):
    alert_type: "AlertType"   # category of the detected issue
    column_name: str          # column that triggered the alert
    description: str          # human-readable explanation
    values: dict              # additional alert metadata

    def __init__(
        self,
        alert_type: "AlertType",
        column_name: str,
        description: str,
        **kwargs,
    ):
        """
        Create a data quality alert.

        Parameters:
        - alert_type: type of alert from AlertType enum
        - column_name: name of column triggering alert
        - description: human-readable description of issue
        - **kwargs: additional alert metadata
        """
        self.alert_type = alert_type
        self.column_name = column_name
        self.description = description
        # NOTE(review): extra keyword metadata is kept as the `values`
        # mapping here — confirm against the upstream Alert implementation.
        self.values = dict(kwargs)


# Comprehensive correlation analysis supporting multiple correlation methods
# and backends.
class CorrelationBackend:
    """Base class for correlation computation backends."""

    def compute(self, df: "pd.DataFrame", config: "Settings") -> dict:
        """
        Compute correlations for the dataset.

        Parameters:
        - df: dataset to analyze
        - config: correlation configuration

        Returns:
        Dictionary containing correlation matrices and metadata

        Raises:
        NotImplementedError: concrete backends must override this method.
        """
        # The base class only declares the contract; returning None silently
        # would mask an unimplemented backend, so fail loudly.
        raise NotImplementedError(
            f"{type(self).__name__} must implement compute()"
        )
class Correlation:
    """Base correlation analysis class; concrete methods subclass this."""


class Auto(Correlation):
    """Automatic correlation method selection based on data types."""


class Spearman(Correlation):
    """Spearman rank correlation analysis."""


class Pearson(Correlation):
    """Pearson product-moment correlation analysis."""


class Kendall(Correlation):
    """Kendall tau correlation analysis."""


class Cramers(Correlation):
    """Cramer's V correlation for categorical variables."""


class PhiK(Correlation):
    """PhiK correlation analysis for mixed data types."""


# Usage example:
#
#     from ydata_profiling.model.correlations import Pearson, Spearman, PhiK
#
#     report = ProfileReport(df)
#     description = report.get_description()
#
#     # Access correlation results
#     correlations = description.correlations
#
#     # Check available correlation methods
#     for method, results in correlations.items():
#         if results is not None:
#             print(f"{method} correlation matrix shape: {results['matrix'].shape}")
#
#     # Access specific correlation matrix
#     if 'pearson' in correlations:
#         pearson_matrix = correlations['pearson']['matrix']
#         print("Pearson correlation matrix:")
#         print(pearson_matrix.head())


# Custom type system for intelligent data type inference and variable
# classification.
class ProfilingTypeSet:
    """
    Custom visions typeset optimized for data profiling.

    Extends base visions typeset with profiling-specific type
    inference rules and variable classification logic.

    Usage example::

        from ydata_profiling.model.typeset import ProfilingTypeSet
        import pandas as pd

        # Create custom typeset
        typeset = ProfilingTypeSet()

        # Use with ProfileReport
        report = ProfileReport(df, typeset=typeset)

        # Access type inference results
        description = report.get_description()
        for var_name, var_info in description.variables.items():
            print(f"{var_name}: {var_info['type']}")
    """

    def __init__(self):
        """Initialize ProfilingTypeSet with profiling-specific types."""
        # NOTE(review): the upstream implementation builds a visions typeset
        # here; nothing is required for the dtype-based fallback below.

    def infer_type(self, series: "pd.Series") -> str:
        """
        Infer the profiling type of a pandas Series.

        Parameters:
        - series: pandas Series to analyze

        Returns:
        String representing the inferred profiling type
        ("Boolean", "DateTime", "Numeric", or "Categorical")
        """
        # NOTE(review): upstream uses visions inference rules; this is a
        # dtype-based approximation — confirm against the real typeset.
        from pandas.api import types as ptypes

        if ptypes.is_bool_dtype(series):
            return "Boolean"
        if ptypes.is_datetime64_any_dtype(series):
            return "DateTime"
        if ptypes.is_numeric_dtype(series):
            return "Numeric"
        return "Categorical"


# Data sampling functionality for handling large datasets and providing
# representative samples.
class Sample:
    """
    Data sampling functionality for report generation.

    Provides head, tail, and random sampling strategies
    for including representative data in reports.

    Usage example::

        # Configure sampling in ProfileReport
        sample_config = {
            "head": 10,
            "tail": 10,
            "random": 10
        }
        report = ProfileReport(df, sample=sample_config)

        # Access samples
        samples = report.get_sample()
        print("Head sample:")
        print(samples['head'])
        print("Random sample:")
        print(samples['random'])
    """

    def __init__(self, sample_config: dict):
        """
        Initialize Sample with configuration.

        Parameters:
        - sample_config: dictionary with optional integer entries "head",
          "tail", and "random" giving the number of rows per sample type
        """
        # Copy defensively so later mutation of the caller's dict cannot
        # change sampling behavior.
        self._config = dict(sample_config)

    def get_sample(self, df: "pd.DataFrame") -> dict:
        """
        Generate samples from the dataset.

        Parameters:
        - df: dataset to sample

        Returns:
        Dictionary with up to three entries ("head", "tail", "random"),
        each a DataFrame slice; sample types configured as 0 or omitted
        are skipped.
        """
        samples = {}
        n_head = int(self._config.get("head", 0))
        if n_head > 0:
            samples["head"] = df.head(n_head)
        n_tail = int(self._config.get("tail", 0))
        if n_tail > 0:
            samples["tail"] = df.tail(n_tail)
        n_random = int(self._config.get("random", 0))
        if n_random > 0:
            # Never request more rows than exist; DataFrame.sample raises
            # when n exceeds the population without replacement.
            samples["random"] = df.sample(n=min(n_random, len(df)))
        return samples


# Base analysis metadata containing dataset-level information and processing
# details.
class BaseAnalysis:
    """
    Base analysis metadata containing dataset-level information.

    Stores metadata about the analysis process, data source,
    and processing configuration.

    Usage example::

        report = ProfileReport(df, tsmode=True, sortby='timestamp')
        description = report.get_description()

        # Access analysis metadata
        analysis = description.analysis
        print(f"Analysis duration: {analysis.duration}s")
        print(f"Analysis started: {analysis.date_start}")

        # For time series analysis
        if hasattr(analysis, 'time_index'):
            print(f"Time index column: {analysis.time_index}")
    """

    # Analysis metadata (populated while the profiling run executes).
    title: str
    date_start: "datetime"
    date_end: "datetime"
    duration: float

    def __init__(self, df: "pd.DataFrame", sample: dict):
        """
        Initialize BaseAnalysis with dataset metadata.

        Parameters:
        - df: source dataset
        - sample: sampling configuration
        """
        # NOTE(review): timing fields are filled in by the profiling engine
        # during the run; initialize placeholders so attribute access never
        # raises before the run completes.
        self.title = ""
        self.date_start = None
        self.date_end = None
        self.duration = None


class TimeIndexAnalysis(BaseAnalysis):
    """
    Time series analysis metadata for time-indexed datasets.

    Extends BaseAnalysis with time series specific metadata
    including temporal patterns and seasonality detection.
    """

    def __init__(self, df: "pd.DataFrame", sample: dict, time_index: str):
        """
        Initialize TimeIndexAnalysis.

        Parameters:
        - df: time-indexed dataset
        - sample: sampling configuration
        - time_index: name of time index column
        """
        super().__init__(df, sample)
        # Remember which column orders the series for time-based analysis.
        self.time_index = time_index


# Install with Tessl CLI
npx tessl i tessl/pypi-ydata-profiling