How to generate a metareport?#

Create a metareport comparing synthetic datasets with respect to a list of metrics. Warning: only the summary is compared.#

We assume that the synthetic datasets to compare have already been generated from the Wisconsin Breast Cancer Dataset (WBCD).
[1]:
# Standard library
import sys
import tempfile
from pathlib import Path

sys.path.append("..")

# 3rd party packages
import numpy as np
import pandas as pd

# Local packages
import config
import utils.draw
from metrics.metareport import Metareport

Load the real and synthetic Wisconsin Breast Cancer Datasets#

[2]:
# Load the real train/test splits of the Wisconsin Breast Cancer Dataset.
df_real = {
    split: pd.read_csv(f"../data/WBCD_{split}.csv")
    for split in ("train", "test")
}
df_real["train"].shape
[2]:
(455, 10)

Choose the synthetic dataset#

[3]:
# Synthetic datasets generated by Synthpop (already computed, see the dates in the filenames).
df_synth = {
    "train": pd.read_csv("../results/data/2024-02-15_Synthpop_455samples.csv"),
    "test": pd.read_csv("../results/data/2024-02-15_Synthpop_228samples.csv"),
    "2nd_gen": pd.read_csv(
        "../results/data/2024-02-15_Synthpop_455samples_2nd_gen.csv"
    ),
}

# Random synthetic dataset to compare to the one generated by Synthpop.
# FIX: the original sampling was unseeded, so the "random" baseline changed on
# every run. A single seeded generator is shared across all columns: results
# are reproducible, and each column still gets an independent bootstrap draw
# (seeding each column identically would reproduce the real rows verbatim).
rng = np.random.default_rng(42)


def _bootstrap_columns(df):
    """Resample each column independently with replacement, breaking row-wise structure."""
    return df.apply(
        lambda col: col.sample(frac=1, replace=True, random_state=rng).to_numpy()
    )


df_mock = {
    "train": _bootstrap_columns(df_real["train"]),
    "test": _bootstrap_columns(df_real["test"]),
    "2nd_gen": _bootstrap_columns(df_synth["train"]),
}

synth_datasets = {"synthpop": df_synth, "random": df_mock}

Configure the metadata dictionary#

The continuous and categorical variables need to be specified, as well as the variable to predict#

[4]:
# The nine cytological measurements are all continuous; "Class" is the
# categorical diagnosis label and also the prediction target.
feature_names = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]

metadata = {
    "continuous": feature_names,
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}

Generate the metareport#

[5]:
# Parameters forwarded to the utility and privacy reports; see the
# utility_report and privacy_report notebooks for what each one controls.
parameters = dict(
    cross_learning=False,
    num_repeat=1,
    num_kfolds=2,
    num_optuna_trials=15,
    use_gpu=True,
    sampling_frac=0.5,
)
[6]:
# Gather the constructor arguments in one place, then build the metareport.
metareport_config = dict(
    dataset_name="Wisconsin Breast Cancer Dataset",
    df_real=df_real,
    synthetic_datasets=synth_datasets,
    metadata=metadata,
    figsize=(8, 6),  # automatically adjusted for larger or longer figures
    random_state=42,  # for reproducibility purposes
    # Dictionary of paths of already-computed reports to load and compare
    # (None here: everything is computed from scratch).
    metareport_folderpath=None,
    # Metrics to compute (utility or privacy); None means compute them all.
    metrics=None,
    # Parameters for both the utility and privacy reports.
    params=parameters,
)
metareport = Metareport(**metareport_config)
[7]:
metareport.compute()
TableGan test set shape: (228, 10)
LOGAN test set shape: (228, 10)
Detector test set shape: (228, 10)
TableGan test set shape: (228, 10)
LOGAN test set shape: (228, 10)
Detector test set shape: (228, 10)

Get the summary report as a pandas dataframe#

[8]:
df_summary = metareport.summary()
[9]:
df_summary
[9]:
compared random synthpop
metric
cat_consis-within_ratio 1.000000 1.000000
cat_stats-frequency_coverage 0.984615 0.975824
cat_stats-support_coverage 1.000000 1.000000
classif-diff_real_synth 0.339485 0.004054
collision-avg_num_appearance_collision_real 4.500000 3.250000
collision-avg_num_appearance_collision_synth 1.000000 3.566667
collision-avg_num_appearance_realcontrol 1.349112 1.349112
collision-avg_num_appearance_realtrain 1.463023 1.463023
collision-avg_num_appearance_synth 1.000000 1.552901
collision-f1_score 0.017429 0.339943
collision-precision 0.008791 0.204778
collision-recall 1.000000 1.000000
collision-recovery_rate 0.012862 0.192926
cont_consis-within_ratio 1.000000 1.000000
cont_stats-iqr_l1_distance 0.012346 0.049383
cont_stats-median_l1_distance 0.000000 0.000000
dcr-dcr_5th_percent_synthreal_control 0.022222 0.000000
dcr-dcr_5th_percent_synthreal_train 0.029444 0.000000
dcr-nndr_5th_percent_synthreal_control 0.602470 0.000000
dcr-nndr_5th_percent_synthreal_train 0.666667 0.000000
dcr-ratio_match_synthreal_control 0.017544 0.377193
dcr-ratio_match_synthreal_train 0.017544 0.359649
detector-precision 0.740741 0.514286
detector-precision_top1% 1.000000 1.000000
detector-precision_top50% 0.482456 0.500000
detector-tpr_at_0.001%_fpr 0.043860 0.078947
detector-tpr_at_0.1%_fpr 0.043860 0.078947
dist-prediction_auc_rescaled 0.950331 0.035838
dist-prediction_mse 0.596189 0.090924
dist-prediction_mse_real 0.613067 0.093513
dist-prediction_mse_synth 0.579310 0.088335
feature_imp-diff_permutation_importance 0.035203 0.005191
fscore-diff_f_score 1.249184 0.353215
ganleaks-precision_top1% 0.500000 0.000000
ganleaks-precision_top50% 0.833333 0.850877
hell_cat_univ_dist-hellinger_distance 0.011352 0.018080
hell_cont_univ_dist-hellinger_distance 0.041920 0.053864
kl_div_cat_univ_dist-kl_divergence 0.000517 0.001300
kl_div_cont_univ_dist-kl_divergence 0.007137 0.012048
logan-precision 0.533835 0.574074
logan-precision_top1% 1.000000 0.500000
logan-precision_top50% 0.543860 0.587719
logan-tpr_at_0.001%_fpr 0.035088 0.000000
logan-tpr_at_0.1%_fpr 0.035088 0.000000
mcmebership-precision_top1% 0.500000 0.500000
mcmebership-precision_top50% 0.526316 0.526316
pcd-norm 5.217832 0.222504
tablegan-precision 0.533333 0.509259
tablegan-precision_top1% 1.000000 0.000000
tablegan-precision_top50% 0.517544 0.500000
tablegan-tpr_at_0.001%_fpr 0.026316 0.000000
tablegan-tpr_at_0.1%_fpr 0.026316 0.000000

Style the result#

The best value (minimum or maximum, according to the submetric's objective) is colored in green; the worst in yellow.

[ ]:
# Color the best value of each metric in green and the worst in yellow.
s = df_summary.style.pipe(Metareport.make_pretty, metrics=list(df_summary.index))
s

Save the styled result as html#

[ ]:
# Export the styled summary as an HTML file. A throwaway directory is used
# here for demonstration; replace temp_dir with a real path to keep the file.
# The trailing newline matches what print(..., file=f) would have written.
with tempfile.TemporaryDirectory() as temp_dir:
    (Path(temp_dir) / "df.html").write_text(s.to_html() + "\n")

Save and load the metareport#

[ ]:
# Round-trip the metareport: persist it to disk, then rebuild a Metareport
# purely from the saved folder (no recomputation needed on load).
with tempfile.TemporaryDirectory() as temp_dir:
    metareport.save(savepath=temp_dir)  # save
    new_report = Metareport(
        metareport_folderpath={"synthpop": temp_dir, "random": temp_dir}
    )  # load