A pandas-based library to visualize and compare datasets.
Primary functions for creating exploratory data analysis reports. These functions analyze pandas DataFrames and return DataframeReport objects containing comprehensive statistics, visualizations, and association matrices.
Analyzes a single DataFrame, generating comprehensive statistics, visualizations, and feature relationships. Optionally focuses analysis around a target feature to highlight correlations and associations.
def analyze(source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
target_feat: str = None,
feat_cfg: FeatureConfig = None,
pairwise_analysis: str = 'auto') -> DataframeReport:
"""
Analyze a single DataFrame and generate a report.
Parameters:
- source: DataFrame to analyze, or tuple of [DataFrame, "Display Name"]
- target_feat: Name of target feature for focused analysis (boolean/numerical only)
- feat_cfg: FeatureConfig object for controlling feature processing
- pairwise_analysis: Controls correlation analysis ('auto', 'on', 'off')
Returns:
DataframeReport object containing analysis results
"""

Usage examples:

import sweetviz as sv
import pandas as pd
# Basic analysis
df = pd.read_csv('data.csv')
report = sv.analyze(df)
# Analysis with named dataset
report = sv.analyze([df, "My Dataset"])
# Target-focused analysis
report = sv.analyze(df, target_feat='outcome')
# With feature configuration
config = sv.FeatureConfig(skip=['id'], force_cat=['category'])
report = sv.analyze(df, target_feat='price', feat_cfg=config)
# Control pairwise analysis for large datasets
report = sv.analyze(df, pairwise_analysis='off') # Skip correlation matrix

Compares two datasets side-by-side, highlighting differences in distributions, statistics, and feature relationships. Ideal for comparing training/test splits or different data versions.
def compare(source: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
compare: Union[pd.DataFrame, Tuple[pd.DataFrame, str]],
target_feat: str = None,
feat_cfg: FeatureConfig = None,
pairwise_analysis: str = 'auto') -> DataframeReport:
"""
Compare two DataFrames and generate a comparison report.
Parameters:
- source: Primary DataFrame or [DataFrame, "Display Name"]
- compare: Comparison DataFrame or [DataFrame, "Display Name"]
- target_feat: Name of target feature for focused analysis (boolean/numerical only)
- feat_cfg: FeatureConfig object for controlling feature processing
- pairwise_analysis: Controls correlation analysis ('auto', 'on', 'off')
Returns:
DataframeReport object containing comparison results
"""

Usage examples:

# Compare training and test sets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
report = sv.compare([train_df, "Training"], [test_df, "Test"])
# Compare with target analysis
report = sv.compare([train_df, "Training"], [test_df, "Test"], target_feat='label')
# Compare datasets with different names
old_data = pd.read_csv('old.csv')
new_data = pd.read_csv('new.csv')
report = sv.compare([old_data, "Previous Version"], [new_data, "Current Version"])

Compares subsets within the same DataFrame based on a boolean condition. Useful for analyzing differences between groups (e.g., male vs female, treatment vs control).
def compare_intra(source_df: pd.DataFrame,
condition_series: pd.Series,
names: Tuple[str, str],
target_feat: str = None,
feat_cfg: FeatureConfig = None,
pairwise_analysis: str = 'auto') -> DataframeReport:
"""
Compare subsets within the same DataFrame based on a boolean condition.
Parameters:
- source_df: DataFrame to analyze
- condition_series: Boolean Series for splitting data (same length as source_df)
- names: Tuple of names for (True subset, False subset)
- target_feat: Name of target feature for focused analysis (boolean/numerical only)
- feat_cfg: FeatureConfig object for controlling feature processing
- pairwise_analysis: Controls correlation analysis ('auto', 'on', 'off')
Returns:
DataframeReport object containing intra-dataset comparison
Raises:
ValueError: If condition_series length doesn't match source_df or isn't boolean type
ValueError: If either subset is empty after splitting
"""

Usage examples:

# Compare male vs female
df = pd.read_csv('data.csv')
report = sv.compare_intra(df, df["gender"] == "male", ["Male", "Female"])
# Compare with target feature
report = sv.compare_intra(df,
df["age"] > 30,
["Over 30", "30 and Under"],
target_feat="income")
# Compare treatment groups
report = sv.compare_intra(df,
df["treatment"] == "A",
["Treatment A", "Treatment B"],
target_feat="outcome")
# Complex boolean conditions
high_income = (df["income"] > df["income"].median())
report = sv.compare_intra(df, high_income, ["High Income", "Low Income"])

See Configuration for detailed FeatureConfig usage.
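All analysis functions may raise exceptions on invalid input. The cases shown in this document are:
- KeyError: target_feat does not name a column in the DataFrame (see the first example below)
- ValueError: target_feat refers to a categorical feature; the target must be boolean or numerical (see the second example below)
- ValueError (compare_intra only): condition_series has the wrong length or is not boolean, or either subset is empty after splitting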
Common errors and solutions:
# Handle missing target feature
try:
    report = sv.analyze(df, target_feat='nonexistent')
except KeyError:
    print("Target feature not found in DataFrame")
# Handle categorical target
try:
    report = sv.analyze(df, target_feat='category')
except ValueError as e:
    if "CATEGORICAL" in str(e):
        # Force to numerical if appropriate
        config = sv.FeatureConfig(force_num=['category'])
        report = sv.analyze(df, target_feat='category', feat_cfg=config)

Install with Tessl CLI
npx tessl i tessl/pypi-sweetviz