Generate comprehensive profile reports for pandas DataFrame with automated exploratory data analysis
Comprehensive configuration system for customizing analysis depth, statistical computations, visualizations, and report output formats. The configuration system provides fine-grained control over every aspect of the profiling process.
Main configuration class providing comprehensive control over profiling behavior and report generation.
class Settings:
def __init__(self, **kwargs):
"""
Initialize Settings with configuration parameters.
Parameters:
- **kwargs: configuration parameters for various analysis components
"""
# Core configuration sections
dataset: DatasetConfig
variables: VariablesConfig
correlations: CorrelationsConfig
interactions: InteractionsConfig
plot: PlotConfig
html: HtmlConfig
style: StyleConfig
# Global settings
title: str = "Profiling Report"
pool_size: int = 0
progress_bar: bool = True
lazy: bool = True
Usage Example:
from ydata_profiling import ProfileReport
from ydata_profiling.config import Settings
# Create custom configuration
config = Settings()
config.title = "Custom Dataset Analysis"
config.pool_size = 4
config.progress_bar = True
# Apply configuration to report
report = ProfileReport(df, config=config)
report.to_file("custom_report.html")
Load configuration from files or preset configurations.
class Config:
@staticmethod
def get_config(config_file: Optional[Union[str, Path]] = None) -> Settings:
"""
Load configuration from file or return default configuration.
Parameters:
- config_file: path to YAML configuration file
Returns:
Settings object with loaded configuration
"""
Usage Example:
from ydata_profiling.config import Config
from ydata_profiling import ProfileReport
# Load from configuration file
config = Config.get_config("my_config.yaml")
report = ProfileReport(df, config=config)
# Use preset configurations
minimal_report = ProfileReport(df, minimal=True)
explorative_report = ProfileReport(df, explorative=True)
sensitive_report = ProfileReport(df, sensitive=True)
Configuration for dataset-level metadata and processing options.
class DatasetConfig:
"""Configuration for dataset-level settings."""
# Dataset metadata
description: str = ""
creator: str = ""
author: str = ""
copyright_holder: str = ""
copyright_year: str = ""
url: str = ""
# Processing options
sample: Optional[dict] = None
duplicates: Optional[dict] = None
Usage Example:
config = Settings()
config.dataset.description = "Customer transaction data for Q4 2023"
config.dataset.creator = "Data Science Team"
config.dataset.author = "John Doe"
report = ProfileReport(df, config=config)
Configuration for variable-specific analysis settings across different data types.
class VariablesConfig:
"""Configuration for variable-specific analysis."""
# Variable type configurations
descriptions: dict = {}
# Type-specific settings
num: NumVarsConfig
cat: CatVarsConfig
bool: BoolVarsConfig
text: TextVarsConfig
file: FileVarsConfig
path: PathVarsConfig
image: ImageVarsConfig
url: UrlVarsConfig
timeseries: TimeseriesVarsConfig
class NumVarsConfig:
"""Numeric variables configuration."""
low_categorical_threshold: int = 5
chi_squared_threshold: float = 0.999
skewness_threshold: int = 20
kurtosis_threshold: int = 20
class CatVarsConfig:
"""Categorical variables configuration."""
length: bool = True
characters: bool = True
words: bool = True
cardinality_threshold: int = 50
class TextVarsConfig:
"""Text variables configuration."""
length: bool = True
characters: bool = True
words: bool = True
redact: bool = False
Usage Example:
config = Settings()
# Configure numeric variables
config.variables.num.low_categorical_threshold = 10
config.variables.num.skewness_threshold = 15
# Configure categorical variables
config.variables.cat.cardinality_threshold = 100
config.variables.cat.length = True
# Configure text variables
config.variables.text.redact = True # Hide sensitive text
report = ProfileReport(df, config=config)
Configuration for correlation analysis and visualization.
class CorrelationsConfig:
"""Configuration for correlation analysis."""
pearson: CorrelationConfig
spearman: CorrelationConfig
kendall: CorrelationConfig
cramers: CorrelationConfig
phik: CorrelationConfig
auto: CorrelationConfig
class CorrelationConfig:
"""Individual correlation method configuration."""
calculate: bool = True
warn_high_cardinality: bool = True
threshold: float = 0.9
Usage Example:
config = Settings()
# Enable/disable specific correlation methods
config.correlations.pearson.calculate = True
config.correlations.spearman.calculate = True
config.correlations.kendall.calculate = False
# Set correlation thresholds
config.correlations.pearson.threshold = 0.8
config.correlations.auto.warn_high_cardinality = True
report = ProfileReport(df, config=config)
Configuration for visualizations and plotting options.
class PlotConfig:
"""Configuration for plot generation."""
# Plot settings
histogram: dict = {}
correlation: dict = {}
missing: dict = {}
# Image settings
dpi: int = 800
image_format: str = "svg"
Usage Example:
config = Settings()
# Configure plot settings
config.plot.dpi = 300
config.plot.image_format = "png"
# Configure histogram settings
config.plot.histogram = {
"bins": 50,
"max_bins": 250
}
# Configure correlation plots
config.plot.correlation = {
"cmap": "RdYlBu_r",
"bad": "#000000"
}
report = ProfileReport(df, config=config)
Configuration for HTML report generation and styling.
class HtmlConfig:
"""Configuration for HTML report generation."""
# Report structure
minify_html: bool = True
use_local_assets: bool = True
inline: bool = True
# Navigation and layout
navbar_show: bool = True
full_width: bool = False
# Content sections
style: dict = {}
Usage Example:
config = Settings()
# Configure HTML output
config.html.minify_html = False # Keep HTML readable
config.html.full_width = True # Use full browser width
config.html.navbar_show = True # Show navigation bar
# Custom styling
config.html.style = {
"primary_color": "#337ab7",
"logo": "https://company.com/logo.png"
}
report = ProfileReport(df, config=config)
Configuration for Spark DataFrame processing.
class SparkSettings:
def __init__(self, **kwargs):
"""
Initialize Spark-specific configuration.
Parameters:
- **kwargs: Spark configuration parameters
"""
# Spark-specific settings
executor_memory: str = "2g"
executor_cores: int = 2
max_result_size: str = "1g"
Usage Example:
from ydata_profiling.config import SparkSettings
from ydata_profiling import ProfileReport
# Configure Spark settings
spark_config = SparkSettings()
spark_config.executor_memory = "4g"
spark_config.executor_cores = 4
# Use with Spark DataFrame
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Profiling").getOrCreate()
spark_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
report = ProfileReport(spark_df, config=spark_config)
YAML configuration file format for persistent settings.
Example Configuration File (config.yaml):
title: "Production Data Report"
pool_size: 8
progress_bar: true
dataset:
description: "Customer transaction dataset"
creator: "Data Engineering Team"
variables:
num:
low_categorical_threshold: 10
skewness_threshold: 20
cat:
cardinality_threshold: 50
text:
redact: false
correlations:
pearson:
calculate: true
threshold: 0.9
spearman:
calculate: true
kendall:
calculate: false
plot:
dpi: 300
image_format: "png"
html:
minify_html: true
full_width: false
Usage with Configuration File:
from ydata_profiling import ProfileReport
# Load configuration from file
report = ProfileReport(df, config_file="config.yaml")
report.to_file("production_report.html")
Specialized configuration class optimized for Spark DataFrames with performance-focused defaults.
class SparkSettings(Settings):
"""
Specialized Settings class for Spark DataFrames with optimized configurations.
Inherits from Settings but with performance-focused defaults that disable
computationally expensive operations for large-scale Spark datasets.
"""
# Performance optimizations
infer_dtypes: bool = False
correlations: Dict[str, bool] = {
"spearman": True,
"pearson": True,
"auto": False, # Disabled for performance
"phi_k": False,
"cramers": False,
"kendall": False
}
# Disabled heavy computations
interactions_continuous: bool = False
missing_diagrams: Dict[str, bool] = {
"bar": False,
"matrix": False,
"dendrogram": False,
"heatmap": False
}
# Reduced sampling
samples_tail: int = 0
samples_random: int = 0
Usage Example:
from ydata_profiling import ProfileReport
from ydata_profiling.config import SparkSettings
from pyspark.sql import SparkSession
# Create Spark DataFrame
spark = SparkSession.builder.appName("Profiling").getOrCreate()
spark_df = spark.read.csv("large_dataset.csv", header=True, inferSchema=True)
# Use SparkSettings for optimal performance
config = SparkSettings()
config.title = "Large Dataset Analysis"
report = ProfileReport(spark_df, config=config)
report.to_file("spark_report.html")
Advanced methods for managing and updating configuration settings.
def update(self, updates: dict) -> 'Settings':
"""
Merge updates with existing configuration.
Parameters:
- updates: dictionary with configuration updates
Returns:
Updated Settings instance
"""
@staticmethod
def from_file(config_file: Union[Path, str]) -> 'Settings':
"""
Create Settings from YAML configuration file.
Parameters:
- config_file: path to YAML configuration file
Returns:
Settings instance with loaded configuration
"""
@property
def primary_color(self) -> str:
"""
Get primary color for backward compatibility.
Returns:
Primary color from style configuration
"""
Usage Example:
from ydata_profiling.config import Settings
from pathlib import Path
# Load from file
config = Settings.from_file("custom_config.yaml")
# Update specific settings
updates = {
"title": "Updated Report Title",
"plot": {
"dpi": 600,
"image_format": "png"
},
"vars": {
"cat": {
"redact": True
}
}
}
updated_config = config.update(updates)
# Use updated configuration
report = ProfileReport(df, config=updated_config)
Built-in configuration presets for common use cases.
Built-in Presets:
# Minimal mode - fast profiling with reduced computation
ProfileReport(df, minimal=True)
# Explorative mode - comprehensive analysis with all features
ProfileReport(df, explorative=True)
# Sensitive mode - privacy-aware profiling
ProfileReport(df, sensitive=True)
# Time-series mode - specialized for time-series data
ProfileReport(df, tsmode=True, sortby='timestamp')
Preset Details:
Install with Tessl CLI
npx tessl i tessl/pypi-ydata-profiling