How to generate a metareport?#
Create a metareport comparing synthetic datasets with respect to a list of metrics. Warning: only the summary of the metrics is compared.#
We assume that the synthetic datasets to compare have already been generated.
Wisconsin Breast Cancer Dataset (WBCD)
[ ]:
# Standard library
import sys
import tempfile
from pathlib import Path

sys.path.append("..")

# 3rd party packages
import numpy as np
import pandas as pd

# Local packages
from clover.metrics.metareport import Metareport
Load the real and synthetic Wisconsin Breast Cancer Datasets#
[2]:
# Load the real train/test splits of the Wisconsin Breast Cancer Dataset.
df_real = {
    split: pd.read_csv(f"../data/WBCD_{split}.csv") for split in ("train", "test")
}
# Quick sanity check on the dimensions of the training split.
df_real["train"].shape
[2]:
(455, 10)
Choose the synthetic dataset#
[3]:
# Synthetic data generated by Synthpop: 1st-generation train/test splits plus
# a 2nd-generation dataset (synthetic data generated from synthetic data).
synth_paths = {
    "train": "../results/attack/data/1st_generation/2025-06-18_Synthpop_455samples.csv",
    "test": "../results/attack/data/1st_generation/2025-06-18_Synthpop_228samples.csv",
    "2nd_gen": "../results/attack/data/2nd_generation/2025-06-18_Synthpop_455samples.csv",
}
df_synth = {split: pd.read_csv(path) for split, path in synth_paths.items()}

# Random baseline to compare against the Synthpop datasets: bootstrap each
# column independently, which keeps the marginal distributions but breaks the
# correlations between variables.
# NOTE: a seeded generator makes the mock datasets reproducible across runs
# (the original code sampled without a seed). Passing the *same* Generator to
# every call keeps the per-column draws independent, as before.
rng = np.random.default_rng(42)
df_mock = {
    split: df.apply(
        lambda x: x.sample(frac=1, replace=True, random_state=rng).to_numpy()
    )
    for split, df in df_synth.items()
}

synth_datasets = {"synthpop": df_synth, "random": df_mock}
Configure the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict#
[4]:
# Schema of the WBCD table: nine continuous cytology features and one
# categorical target column, which is also the variable to predict.
feature_columns = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]
metadata = {
    "continuous": feature_columns,
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}
Generate the metareport#
[5]:
# Shared configuration for the utility and privacy reports
# (see the utility_report and privacy_report notebooks for more details).
parameters = dict(
    cross_learning=False,
    num_repeat=1,
    num_kfolds=2,
    num_optuna_trials=15,
    use_gpu=True,
    sampling_frac=0.5,
)
[6]:
# Gather the full metareport configuration in one place, then instantiate.
metareport_config = {
    "dataset_name": "Wisconsin Breast Cancer Dataset",
    "df_real": df_real,
    "synthetic_datasets": synth_datasets,
    "metadata": metadata,
    # Automatically adjusted for larger or longer figures.
    "figsize": (8, 6),
    # Fixed seed for reproducibility purposes.
    "random_state": 42,
    # Dictionary with the path of each already computed report to load and compare.
    "metareport_folderpath": None,
    # Metrics to compute (utility or privacy); None means compute all of them.
    "metrics": None,
    # Parameters shared by the utility and privacy reports.
    "params": parameters,
}
metareport = Metareport(**metareport_config)
[7]:
# Run all configured metrics; this is the expensive step — it fits attack
# models (GAN-Leaks, LOGAN, TableGAN, ...) and may emit XGBoost/sklearn
# warnings, as shown in the captured output below.
metareport.compute()
/data8/install/anaconda3/envs/synthetic_data_p3.10/lib/python3.10/site-packages/xgboost/core.py:158: UserWarning: [21:31:25] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
warnings.warn(smsg, UserWarning)
GAN-Leaks test set shape: (228, 10)
Monte Carlo Membership test set shape: (228, 10)
Detector test set shape: (228, 10)
LOGAN test set shape: (228, 10)
TableGan test set shape: (228, 10)
GAN-Leaks test set shape: (228, 10)
/data8/install/anaconda3/envs/synthetic_data_p3.10/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1531: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.
_warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Monte Carlo Membership test set shape: (228, 10)
Detector test set shape: (228, 10)
LOGAN test set shape: (228, 10)
TableGan test set shape: (228, 10)
Get the summary report as a pandas dataframe#
[8]:
# Collect the computed metrics into a single summary pandas DataFrame.
df_summary = metareport.summary()
[9]:
# Display the summary: one row per metric-submetric pair, one column per
# compared synthetic dataset ("random" and "synthpop").
df_summary
[9]:
| compared | random | synthpop |
|---|---|---|
| metric | ||
| cat_consis-within_ratio | 1.000000 | 1.000000 |
| cat_stats-frequency_coverage | 0.975824 | 0.956044 |
| cat_stats-support_coverage | 1.000000 | 1.000000 |
| classif-diff_real_synth | 0.065878 | 0.004307 |
| collision-avg_num_appearance_collision_real | 6.428571 | 3.403509 |
| collision-avg_num_appearance_collision_synth | 1.285714 | 3.263158 |
| collision-avg_num_appearance_realcontrol | 1.349112 | 1.349112 |
| collision-avg_num_appearance_realtrain | 1.463023 | 1.463023 |
| collision-avg_num_appearance_synth | 1.006637 | 1.413043 |
| collision-f1_score | 0.000000 | 0.300792 |
| collision-precision | 0.000000 | 0.177019 |
| collision-recall | 0.000000 | 1.000000 |
| collision-recovery_rate | 0.000000 | 0.183280 |
| cont_consis-within_ratio | 1.000000 | 1.000000 |
| cont_stats-iqr_l1_distance | 0.092593 | 0.080247 |
| cont_stats-iqr_l1_distance_train_test_ref | 0.098765 | 0.098765 |
| cont_stats-median_l1_distance | 0.012346 | 0.012346 |
| cont_stats-median_l1_distance_train_test_ref | 0.000000 | 0.000000 |
| dcr-dcr_5th_percent_synthreal_control | 0.033333 | 0.000000 |
| dcr-dcr_5th_percent_synthreal_train | 0.033333 | 0.000000 |
| dcr-dcr_5th_percent_train_test_ref | 0.000000 | 0.000000 |
| dcr-nndr_5th_percent_synthreal_control | 0.723947 | 0.000000 |
| dcr-nndr_5th_percent_synthreal_train | 0.554167 | 0.000000 |
| dcr-nndr_5th_percent_train_test_ref | 0.000000 | 0.000000 |
| dcr-ratio_match_synthreal_control | 0.000000 | 0.333333 |
| dcr-ratio_match_synthreal_train | 0.000000 | 0.315789 |
| dcr-ratio_match_train_test_ref | 0.333333 | 0.333333 |
| detector-precision | 0.575000 | 0.525773 |
| detector-precision_top1% | 1.000000 | 0.500000 |
| detector-precision_top50% | 0.508772 | 0.517544 |
| detector-tpr_at_0.001%_fpr | 0.017544 | 0.008772 |
| detector-tpr_at_0.1%_fpr | 0.017544 | 0.008772 |
| dist-prediction_auc_rescaled | 0.950081 | 0.000000 |
| dist-prediction_mse | 0.588812 | 0.000078 |
| dist-propensity_mse | 0.607270 | 0.000158 |
| feature_imp-diff_permutation_importance | 0.026678 | 0.003163 |
| fscore-diff_f_score | 1.245316 | 0.359862 |
| ganleaks-precision_top1% | 0.500000 | 0.500000 |
| ganleaks-precision_top50% | 0.807018 | 0.894737 |
| hell_cat_univ_dist-hellinger_distance | 0.017793 | 0.032174 |
| hell_cat_univ_dist-hellinger_distance_train_test_ref | 0.001057 | 0.001057 |
| hell_cont_univ_dist-hellinger_distance | 0.053408 | 0.041094 |
| hell_cont_univ_dist-hellinger_distance_train_test_ref | 0.057075 | 0.057075 |
| kl_div_cat_univ_dist-kl_divergence | 0.001273 | 0.004175 |
| kl_div_cat_univ_dist-kl_divergence_train_test_ref | 0.000004 | 0.000004 |
| kl_div_cont_univ_dist-kl_divergence | 0.012086 | 0.007177 |
| kl_div_cont_univ_dist-kl_divergence_train_test_ref | 0.013798 | 0.013798 |
| logan-precision | 0.475309 | 0.527778 |
| logan-precision_top1% | 0.500000 | 1.000000 |
| logan-precision_top50% | 0.473684 | 0.543860 |
| logan-tpr_at_0.001%_fpr | 0.008772 | 0.070175 |
| logan-tpr_at_0.1%_fpr | 0.008772 | 0.070175 |
| mcmebership-precision_top1% | 0.500000 | 0.500000 |
| mcmebership-precision_top50% | 0.526316 | 0.526316 |
| pcd-norm | 5.161654 | 0.285290 |
| tablegan-precision | 0.585366 | 0.485981 |
| tablegan-precision_top1% | 1.000000 | 0.500000 |
| tablegan-precision_top50% | 0.561404 | 0.508772 |
| tablegan-tpr_at_0.001%_fpr | 0.035088 | 0.000000 |
| tablegan-tpr_at_0.1%_fpr | 0.035088 | 0.000000 |
Style the result#
The best value (minimum or maximum, according to the submetric objective) is colored in green; the worst in yellow.
[10]:
# Apply the built-in pretty formatter: it highlights, for every metric, the
# best value in green and the worst in yellow.
metric_names = list(df_summary.index)
s = df_summary.style.pipe(Metareport.make_pretty, metrics=metric_names)
s
[10]:
| compared | random | synthpop |
|---|---|---|
| metric | ||
| cat_consis-within_ratio | 1.00 | 1.00 |
| cat_stats-frequency_coverage | 0.98 | 0.96 |
| cat_stats-support_coverage | 1.00 | 1.00 |
| classif-diff_real_synth | 0.07 | 0.00 |
| collision-avg_num_appearance_collision_real | 6.43 | 3.40 |
| collision-avg_num_appearance_collision_synth | 1.29 | 3.26 |
| collision-avg_num_appearance_realcontrol | 1.35 | 1.35 |
| collision-avg_num_appearance_realtrain | 1.46 | 1.46 |
| collision-avg_num_appearance_synth | 1.01 | 1.41 |
| collision-f1_score | 0.00 | 0.30 |
| collision-precision | 0.00 | 0.18 |
| collision-recall | 0.00 | 1.00 |
| collision-recovery_rate | 0.00 | 0.18 |
| cont_consis-within_ratio | 1.00 | 1.00 |
| cont_stats-iqr_l1_distance | 0.09 | 0.08 |
| cont_stats-iqr_l1_distance_train_test_ref | 0.10 | 0.10 |
| cont_stats-median_l1_distance | 0.01 | 0.01 |
| cont_stats-median_l1_distance_train_test_ref | 0.00 | 0.00 |
| dcr-dcr_5th_percent_synthreal_control | 0.03 | 0.00 |
| dcr-dcr_5th_percent_synthreal_train | 0.03 | 0.00 |
| dcr-dcr_5th_percent_train_test_ref | 0.00 | 0.00 |
| dcr-nndr_5th_percent_synthreal_control | 0.72 | 0.00 |
| dcr-nndr_5th_percent_synthreal_train | 0.55 | 0.00 |
| dcr-nndr_5th_percent_train_test_ref | 0.00 | 0.00 |
| dcr-ratio_match_synthreal_control | 0.00 | 0.33 |
| dcr-ratio_match_synthreal_train | 0.00 | 0.32 |
| dcr-ratio_match_train_test_ref | 0.33 | 0.33 |
| detector-precision | 0.57 | 0.53 |
| detector-precision_top1% | 1.00 | 0.50 |
| detector-precision_top50% | 0.51 | 0.52 |
| detector-tpr_at_0.001%_fpr | 0.02 | 0.01 |
| detector-tpr_at_0.1%_fpr | 0.02 | 0.01 |
| dist-prediction_auc_rescaled | 0.95 | 0.00 |
| dist-prediction_mse | 0.59 | 0.00 |
| dist-propensity_mse | 0.61 | 0.00 |
| feature_imp-diff_permutation_importance | 0.03 | 0.00 |
| fscore-diff_f_score | 1.25 | 0.36 |
| ganleaks-precision_top1% | 0.50 | 0.50 |
| ganleaks-precision_top50% | 0.81 | 0.89 |
| hell_cat_univ_dist-hellinger_distance | 0.02 | 0.03 |
| hell_cat_univ_dist-hellinger_distance_train_test_ref | 0.00 | 0.00 |
| hell_cont_univ_dist-hellinger_distance | 0.05 | 0.04 |
| hell_cont_univ_dist-hellinger_distance_train_test_ref | 0.06 | 0.06 |
| kl_div_cat_univ_dist-kl_divergence | 0.00 | 0.00 |
| kl_div_cat_univ_dist-kl_divergence_train_test_ref | 0.00 | 0.00 |
| kl_div_cont_univ_dist-kl_divergence | 0.01 | 0.01 |
| kl_div_cont_univ_dist-kl_divergence_train_test_ref | 0.01 | 0.01 |
| logan-precision | 0.48 | 0.53 |
| logan-precision_top1% | 0.50 | 1.00 |
| logan-precision_top50% | 0.47 | 0.54 |
| logan-tpr_at_0.001%_fpr | 0.01 | 0.07 |
| logan-tpr_at_0.1%_fpr | 0.01 | 0.07 |
| mcmebership-precision_top1% | 0.50 | 0.50 |
| mcmebership-precision_top50% | 0.53 | 0.53 |
| pcd-norm | 5.16 | 0.29 |
| tablegan-precision | 0.59 | 0.49 |
| tablegan-precision_top1% | 1.00 | 0.50 |
| tablegan-precision_top50% | 0.56 | 0.51 |
| tablegan-tpr_at_0.001%_fpr | 0.04 | 0.00 |
| tablegan-tpr_at_0.1%_fpr | 0.04 | 0.00 |
Save the styled result as html#
[11]:
# Export the styled summary as a standalone HTML file.
# Path.write_text with an explicit encoding replaces the previous
# print(..., file=f) round-trip, which relied on the locale-default encoding.
with tempfile.TemporaryDirectory() as temp_dir:
    (Path(temp_dir) / "df.html").write_text(s.to_html(), encoding="utf-8")
Save and load the metareport#
[12]:
# Round-trip the metareport: write it to a temporary folder, then rebuild a
# fresh Metareport instance from the saved files.
with tempfile.TemporaryDirectory() as temp_dir:
    metareport.save(savepath=temp_dir)  # persist the computed reports
    saved_reports = {"synthpop": temp_dir, "random": temp_dir}
    new_report = Metareport(metareport_folderpath=saved_reports)  # reload
[ ]: