How to generate a combined utility/privacy report?#
Create a combined report of the metrics, whether they are utility or privacy metrics. Warning: this applies only to the summary.#
Assume that the synthetic data is already generated
Based on the Wisconsin Breast Cancer Dataset (WBCD)
[ ]:
# Standard library
import sys
import tempfile
sys.path.append("..")
# 3rd party packages
import pandas as pd
# Local packages
from clover.utils import draw
from clover.metrics.report import Report
Load the real and synthetic Wisconsin Breast Cancer Datasets#
[2]:
# Load the real WBCD train/test splits into a dict keyed by split name.
df_real = {
    split: pd.read_csv(f"../data/WBCD_{split}.csv")
    for split in ("train", "test")
}
df_real["train"].shape
[2]:
(455, 10)
Choose the synthetic dataset#
[4]:
# Synthetic WBCD datasets generated with Synthpop: two first-generation
# files matching the real split sizes, plus one second-generation file.
_synth_paths = {
    "train": "../results/attack/data/1st_generation/2025-06-18_Synthpop_455samples.csv",
    "test": "../results/attack/data/1st_generation/2025-06-18_Synthpop_228samples.csv",
    "2nd_gen": "../results/attack/data/2nd_generation/2025-06-18_Synthpop_455samples.csv",
}
df_synth = {name: pd.read_csv(path) for name, path in _synth_paths.items()}
df_synth["test"].shape
[4]:
(228, 10)
Configure the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict#
[5]:
# The nine continuous cell-measurement features of the WBCD; "Class" is
# the only categorical variable and also the prediction target.
continuous_features = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]
metadata = {
    "continuous": continuous_features,
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}
Generate the report#
[6]:
# Shared settings for the metric computations; see the utility_report and
# privacy_report notebooks for a description of each parameter.
parameters = dict(
    cross_learning=False,
    num_repeat=1,
    num_kfolds=3,
    num_optuna_trials=15,
    use_gpu=True,
    sampling_frac=0.5,
)
[7]:
# Assemble the combined utility/privacy report object.
report_kwargs = dict(
    dataset_name="Wisconsin Breast Cancer Dataset",
    df_real=df_real,
    df_synthetic=df_synth,
    metadata=metadata,
    figsize=(8, 6),          # automatically adjusted for larger or longer figures
    random_state=42,         # for reproducibility purposes
    report_folderpath=None,  # load computed utility and/or privacy reports if available
    report_filename=None,    # name of the computed report (no extension, no utility/privacy suffix)
    metrics=None,            # None -> compute every utility and privacy metric
    params=parameters,       # parameters shared by the utility and privacy reports
)
report = Report(**report_kwargs)
[8]:
# Compute the configured metrics (all of them here, since `metrics=None`
# was passed to Report); this is the long-running step of the notebook.
report.compute()
/data8/install/anaconda3/envs/synthetic_data_p3.10/lib/python3.10/site-packages/xgboost/core.py:158: UserWarning: [21:29:20] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
warnings.warn(smsg, UserWarning)
TableGan test set shape: (228, 10)
LOGAN test set shape: (228, 10)
Detector test set shape: (228, 10)
Monte Carlo Membership test set shape: (228, 10)
GAN-Leaks test set shape: (228, 10)
Get the summary report as a pandas dataframe#
[9]:
# Print a short description of the dataset: instance counts per split
# and the number of continuous/categorical variables.
report.specification()
----- Wisconsin Breast Cancer Dataset -----
Contains:
- 455 instances in the train set,
- 228 instances in the test set,
- 10 variables, 9 continuous and 1 categorical.
[10]:
# Collect the computed metric results into a single summary DataFrame.
df_summary = report.summary()
[11]:
# Group the summary by metric identity and objective/bounds, displaying the
# submetric rows per group.  Passing include_groups=False excludes the
# grouping columns from the frame handed to `apply`, which both replaces the
# manual `drop(by, axis=1)` and silences the pandas >= 2.2 DeprecationWarning
# ("DataFrameGroupBy.apply operated on the grouping columns") seen previously.
by = ["name", "objective", "min", "max"]
df_summary.groupby(by).apply(lambda g: g.reset_index(drop=True), include_groups=False)
/tmp/ipykernel_637219/717339571.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
df_summary.groupby(by).apply(lambda x: x.drop(by, axis=1).reset_index(drop=True))
[11]:
| alias | submetric | value | |||||
|---|---|---|---|---|---|---|---|
| name | objective | min | max | ||||
| Categorical Consistency | max | 0 | 1.0 | 0 | cat_consis | within_ratio | 1.000000 |
| Categorical Statistics | max | 0 | 1.0 | 0 | cat_stats | support_coverage | 1.000000 |
| 1 | cat_stats | frequency_coverage | 0.956044 | ||||
| Classification | min | 0 | 1.0 | 0 | classif | diff_real_synth | 0.006672 |
| Collision | - | 0 | inf | 0 | collision | avg_num_appearance_realtrain | 1.463023 |
| 1 | collision | avg_num_appearance_realcontrol | 1.349112 | ||||
| 2 | collision | avg_num_appearance_synth | 1.413043 | ||||
| 3 | collision | avg_num_appearance_collision_real | 3.403509 | ||||
| 4 | collision | avg_num_appearance_collision_synth | 3.263158 | ||||
| min | 0 | 1.0 | 0 | collision | precision | 0.341772 | |
| 1 | collision | recall | 0.947368 | ||||
| 2 | collision | f1_score | 0.502326 | ||||
| 3 | collision | recovery_rate | 0.173633 | ||||
| Continuous Consistency | max | 0 | 1.0 | 0 | cont_consis | within_ratio | 1.000000 |
| Continuous Statistics | min | 0 | inf | 0 | cont_stats | median_l1_distance | 0.012346 |
| 1 | cont_stats | median_l1_distance_train_test_ref | 0.000000 | ||||
| 2 | cont_stats | iqr_l1_distance | 0.080247 | ||||
| 3 | cont_stats | iqr_l1_distance_train_test_ref | 0.098765 | ||||
| DCR | max | 0 | 1.0 | 0 | dcr | dcr_5th_percent_synthreal_train | 0.000000 |
| 1 | dcr | dcr_5th_percent_synthreal_control | 0.000000 | ||||
| 2 | dcr | dcr_5th_percent_train_test_ref | 0.000000 | ||||
| 3 | dcr | nndr_5th_percent_synthreal_train | 0.000000 | ||||
| 4 | dcr | nndr_5th_percent_synthreal_control | 0.000000 | ||||
| 5 | dcr | nndr_5th_percent_train_test_ref | 0.000000 | ||||
| min | 0 | 1.0 | 0 | dcr | ratio_match_synthreal_train | 0.315789 | |
| 1 | dcr | ratio_match_synthreal_control | 0.333333 | ||||
| 2 | dcr | ratio_match_train_test_ref | 0.333333 | ||||
| Detector | min | 0 | 1.0 | 0 | detector | precision_top1% | 1.000000 |
| 1 | detector | precision_top50% | 0.552632 | ||||
| 2 | detector | precision | 0.549180 | ||||
| 3 | detector | tpr_at_0.001%_fpr | 0.035088 | ||||
| 4 | detector | tpr_at_0.1%_fpr | 0.035088 | ||||
| Distinguishability | min | 0 | 1.0 | 0 | dist | propensity_mse | 0.000129 |
| 1 | dist | prediction_mse | 0.000075 | ||||
| 2 | dist | prediction_auc_rescaled | 0.086334 | ||||
| FScore | min | 0 | inf | 0 | fscore | diff_f_score | 0.359862 |
| Feature Importance | min | 0 | inf | 0 | feature_imp | diff_permutation_importance | 0.002509 |
| GAN-Leaks | min | 0 | 1.0 | 0 | ganleaks | precision_top1% | 0.500000 |
| 1 | ganleaks | precision_top50% | 0.894737 | ||||
| Hellinger Categorical Univariate Distance | min | 0 | 1.0 | 0 | hell_cat_univ_dist | hellinger_distance | 0.032174 |
| 1 | hell_cat_univ_dist | hellinger_distance_train_test_ref | 0.001057 | ||||
| Hellinger Continuous Univariate Distance | min | 0 | 1.0 | 0 | hell_cont_univ_dist | hellinger_distance | 0.041094 |
| 1 | hell_cont_univ_dist | hellinger_distance_train_test_ref | 0.057075 | ||||
| KL Divergence Categorical Univariate Distance | min | 0 | inf | 0 | kl_div_cat_univ_dist | kl_divergence | 0.004175 |
| 1 | kl_div_cat_univ_dist | kl_divergence_train_test_ref | 0.000004 | ||||
| KL Divergence Continuous Univariate Distance | min | 0 | inf | 0 | kl_div_cont_univ_dist | kl_divergence | 0.007177 |
| 1 | kl_div_cont_univ_dist | kl_divergence_train_test_ref | 0.013798 | ||||
| LOGAN | min | 0 | 1.0 | 0 | logan | precision_top1% | 0.000000 |
| 1 | logan | precision_top50% | 0.701754 | ||||
| 2 | logan | precision | 0.486772 | ||||
| 3 | logan | tpr_at_0.001%_fpr | 0.000000 | ||||
| 4 | logan | tpr_at_0.1%_fpr | 0.000000 | ||||
| Monte Carlo Membership | min | 0 | 1.0 | 0 | mcmebership | precision_top1% | 0.500000 |
| 1 | mcmebership | precision_top50% | 0.526316 | ||||
| Pairwise Correlation Difference | min | 0 | inf | 0 | pcd | norm | 0.285290 |
| TableGan | min | 0 | 1.0 | 0 | tablegan | precision_top1% | 0.500000 |
| 1 | tablegan | precision_top50% | 0.482456 | ||||
| 2 | tablegan | precision | 0.482759 | ||||
| 3 | tablegan | tpr_at_0.001%_fpr | 0.000000 | ||||
| 4 | tablegan | tpr_at_0.1%_fpr | 0.000000 |
Save and load the report#
[12]:
# Round-trip the report through disk: persist it into a temporary folder,
# then rebuild a Report instance from the saved files.
with tempfile.TemporaryDirectory() as tmp_folder:
    report.save(savepath=tmp_folder, filename="report")  # save to disk
    new_report = Report(report_folderpath=tmp_folder, report_filename="report")  # load back
[ ]: