Generate comprehensive profile reports for pandas DataFrame with automated exploratory data analysis
Comprehensive configuration system for customizing analysis depth, statistical computations, visualizations, and report output formats. The configuration system provides fine-grained control over every aspect of the profiling process.
Main configuration class providing comprehensive control over profiling behavior and report generation.
class Settings:
def __init__(self, **kwargs):
"""
Initialize Settings with configuration parameters.
Parameters:
- **kwargs: configuration parameters for various analysis components
"""
# Core configuration sections
dataset: DatasetConfig
variables: VariablesConfig
correlations: CorrelationsConfig
interactions: InteractionsConfig
plot: PlotConfig
html: HtmlConfig
style: StyleConfig
# Global settings
title: str = "Profiling Report"
pool_size: int = 0
progress_bar: bool = True
lazy: bool = True
Usage Example:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
# Create custom configuration
config = Settings()
config.title = "Custom Dataset Analysis"
config.pool_size = 4
config.progress_bar = True
# Apply configuration to report
report = ProfileReport(df, config=config)
report.to_file("custom_report.html")
Load configuration from files or preset configurations.
class Config:
@staticmethod
def get_config(config_file: Optional[Union[str, Path]] = None) -> Settings:
"""
Load configuration from file or return default configuration.
Parameters:
- config_file: path to YAML configuration file
Returns:
Settings object with loaded configuration
"""
Usage Example:
from ydata_profiling.config import Config
from ydata_profiling import ProfileReport
# Load from configuration file
config = Config.get_config("my_config.yaml")
report = ProfileReport(df, config=config)
# Use preset configurations
minimal_report = ProfileReport(df, minimal=True)
explorative_report = ProfileReport(df, explorative=True)
sensitive_report = ProfileReport(df, sensitive=True)
Configuration for dataset-level metadata and processing options.
class DatasetConfig:
"""Configuration for dataset-level settings."""
# Dataset metadata
description: str = ""
creator: str = ""
author: str = ""
copyright_holder: str = ""
copyright_year: str = ""
url: str = ""
# Processing options
sample: Optional[dict] = None
duplicates: Optional[dict] = None
Usage Example:
config = Settings()
config.dataset.description = "Customer transaction data for Q4 2023"
config.dataset.creator = "Data Science Team"
config.dataset.author = "John Doe"
report = ProfileReport(df, config=config)
Configuration for variable-specific analysis settings across different data types.
class VariablesConfig:
"""Configuration for variable-specific analysis."""
# Variable type configurations
descriptions: dict = {}
# Type-specific settings
num: NumVarsConfig
cat: CatVarsConfig
bool: BoolVarsConfig
text: TextVarsConfig
file: FileVarsConfig
path: PathVarsConfig
image: ImageVarsConfig
url: UrlVarsConfig
timeseries: TimeseriesVarsConfig
class NumVarsConfig:
"""Numeric variables configuration."""
low_categorical_threshold: int = 5
chi_squared_threshold: float = 0.999
skewness_threshold: int = 20
kurtosis_threshold: int = 20
class CatVarsConfig:
"""Categorical variables configuration."""
length: bool = True
characters: bool = True
words: bool = True
cardinality_threshold: int = 50
class TextVarsConfig:
"""Text variables configuration."""
length: bool = True
characters: bool = True
words: bool = True
redact: bool = False
Usage Example:
config = Settings()
# Configure numeric variables
config.variables.num.low_categorical_threshold = 10
config.variables.num.skewness_threshold = 15
# Configure categorical variables
config.variables.cat.cardinality_threshold = 100
config.variables.cat.length = True
# Configure text variables
config.variables.text.redact = True # Hide sensitive text
report = ProfileReport(df, config=config)
Configuration for correlation analysis and visualization.
class CorrelationsConfig:
"""Configuration for correlation analysis."""
pearson: CorrelationConfig
spearman: CorrelationConfig
kendall: CorrelationConfig
cramers: CorrelationConfig
phik: CorrelationConfig
auto: CorrelationConfig
class CorrelationConfig:
"""Individual correlation method configuration."""
calculate: bool = True
warn_high_cardinality: bool = True
threshold: float = 0.9
Usage Example:
config = Settings()
# Enable/disable specific correlation methods
config.correlations.pearson.calculate = True
config.correlations.spearman.calculate = True
config.correlations.kendall.calculate = False
# Set correlation thresholds
config.correlations.pearson.threshold = 0.8
config.correlations.auto.warn_high_cardinality = True
report = ProfileReport(df, config=config)
Configuration for visualizations and plotting options.
class PlotConfig:
"""Configuration for plot generation."""
# Plot settings
histogram: dict = {}
correlation: dict = {}
missing: dict = {}
# Image settings
dpi: int = 800
image_format: str = "svg"
Usage Example:
config = Settings()
# Configure plot settings
config.plot.dpi = 300
config.plot.image_format = "png"
# Configure histogram settings
config.plot.histogram = {
"bins": 50,
"max_bins": 250
}
# Configure correlation plots
config.plot.correlation = {
"cmap": "RdYlBu_r",
"bad": "#000000"
}
report = ProfileReport(df, config=config)
Configuration for HTML report generation and styling.
class HtmlConfig:
"""Configuration for HTML report generation."""
# Report structure
minify_html: bool = True
use_local_assets: bool = True
inline: bool = True
# Navigation and layout
navbar_show: bool = True
full_width: bool = False
# Content sections
style: dict = {}
Usage Example:
config = Settings()
# Configure HTML output
config.html.minify_html = False # Keep HTML readable
config.html.full_width = True # Use full browser width
config.html.navbar_show = True # Show navigation bar
# Custom styling
config.html.style = {
"primary_color": "#337ab7",
"logo": "https://company.com/logo.png"
}
report = ProfileReport(df, config=config)
Configuration for Spark DataFrame processing.
class SparkSettings:
def __init__(self, **kwargs):
"""
Initialize Spark-specific configuration.
Parameters:
- **kwargs: Spark configuration parameters
"""
# Spark-specific settings
executor_memory: str = "2g"
executor_cores: int = 2
max_result_size: str = "1g"
Usage Example:
from ydata_profiling.config import SparkSettings
from ydata_profiling import ProfileReport
# Configure Spark settings
spark_config = SparkSettings()
spark_config.executor_memory = "4g"
spark_config.executor_cores = 4
# Use with Spark DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Profiling").getOrCreate()
spark_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
report = ProfileReport(spark_df, config=spark_config)
YAML configuration file format for persistent settings.
Example Configuration File (config.yaml):
title: "Production Data Report"
pool_size: 8
progress_bar: true
dataset:
description: "Customer transaction dataset"
creator: "Data Engineering Team"
variables:
num:
low_categorical_threshold: 10
skewness_threshold: 20
cat:
cardinality_threshold: 50
text:
redact: false
correlations:
pearson:
calculate: true
threshold: 0.9
spearman:
calculate: true
kendall:
calculate: false
plot:
dpi: 300
image_format: "png"
html:
minify_html: true
full_width: false
Usage with Configuration File:
from ydata_profiling import ProfileReport
# Load configuration from file
report = ProfileReport(df, config_file="config.yaml")
report.to_file("production_report.html")
Specialized configuration class optimized for Spark DataFrames with performance-focused defaults.
class SparkSettings(Settings):
"""
Specialized Settings class for Spark DataFrames with optimized configurations.
Inherits from Settings but with performance-focused defaults that disable
computationally expensive operations for large-scale Spark datasets.
"""
# Performance optimizations
infer_dtypes: bool = False
correlations: Dict[str, bool] = {
"spearman": True,
"pearson": True,
"auto": False, # Disabled for performance
"phi_k": False,
"cramers": False,
"kendall": False
}
# Disabled heavy computations
interactions_continuous: bool = False
missing_diagrams: Dict[str, bool] = {
"bar": False,
"matrix": False,
"dendrogram": False,
"heatmap": False
}
# Reduced sampling
samples_tail: int = 0
samples_random: int = 0
Usage Example:
from ydata_profiling import ProfileReport
from ydata_profiling.config import SparkSettings
from pyspark.sql import SparkSession
# Create Spark DataFrame
spark = SparkSession.builder.appName("Profiling").getOrCreate()
spark_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
# Use SparkSettings for optimal performance
config = SparkSettings()
config.title = "Large Dataset Analysis"
report = ProfileReport(spark_df, config=config)
report.to_file("spark_report.html")
Advanced methods for managing and updating configuration settings.
def update(self, updates: dict) -> 'Settings':
"""
Merge updates with existing configuration.
Parameters:
- updates: dictionary with configuration updates
Returns:
Updated Settings instance
"""
@staticmethod
def from_file(config_file: Union[Path, str]) -> 'Settings':
"""
Create Settings from YAML configuration file.
Parameters:
- config_file: path to YAML configuration file
Returns:
Settings instance with loaded configuration
"""
@property
def primary_color(self) -> str:
"""
Get primary color for backward compatibility.
Returns:
Primary color from style configuration
"""
Usage Example:
from ydata_profiling.config import Settings
from pathlib import Path
# Load from file
config = Settings.from_file("custom_config.yaml")
# Update specific settings
updates = {
"title": "Updated Report Title",
"plot": {
"dpi": 600,
"image_format": "png"
},
"vars": {
"cat": {
"redact": True
}
}
}
updated_config = config.update(updates)
# Use updated configuration
report = ProfileReport(df, config=updated_config)
Built-in configuration presets for common use cases.
Built-in Presets:
# Minimal mode - fast profiling with reduced computation
ProfileReport(df, minimal=True)
# Explorative mode - comprehensive analysis with all features
ProfileReport(df, explorative=True)
# Sensitive mode - privacy-aware profiling
ProfileReport(df, sensitive=True)
# Time-series mode - specialized for time-series data
ProfileReport(df, tsmode=True, sortby='timestamp')
Preset Details:
Install with Tessl CLI
npx tessl i tessl/pypi-ydata-profiling