How to generate a combined utility/privacy report?#
Create a combined report of the metrics, whether they are utility or privacy metrics. Warning: this applies only to the summary.#
Assume that the synthetic data is already generated
Based on the Wisconsin Breast Cancer Dataset (WBCD)
[ ]:
# Standard library
import sys
import tempfile
sys.path.append("..")
# 3rd party packages
import pandas as pd
# Local packages
from clover.utils import draw
from clover.metrics.report import Report
Load the real and synthetic Wisconsin Breast Cancer Datasets#
[2]:
# Load the real WBCD train/test splits into a dict keyed by split name.
df_real = {
    split: pd.read_csv(f"../data/WBCD_{split}.csv")
    for split in ("train", "test")
}
df_real["train"].shape
[2]:
(455, 10)
Choose the synthetic dataset#
[4]:
# Synthetic WBCD datasets generated with Synthpop: two first-generation
# files matching the real split sizes, plus one second-generation file.
_synth_paths = {
    "train": "../results/attack/data/1st_generation/2025-06-18_Synthpop_455samples.csv",
    "test": "../results/attack/data/1st_generation/2025-06-18_Synthpop_228samples.csv",
    "2nd_gen": "../results/attack/data/2nd_generation/2025-06-18_Synthpop_455samples.csv",
}
df_synth = {name: pd.read_csv(path) for name, path in _synth_paths.items()}
df_synth["test"].shape
[4]:
(228, 10)
Configure the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict#
[5]:
# The nine continuous cell-measurement features of the WBCD; "Class" is
# the only categorical variable and also the prediction target.
continuous_features = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]
metadata = {
    "continuous": continuous_features,
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}
Generate the report#
[6]:
# Shared settings for the metric computations; see the utility_report and
# privacy_report notebooks for a description of each parameter.
parameters = dict(
    cross_learning=False,
    num_repeat=1,
    num_kfolds=3,
    num_optuna_trials=15,
    use_gpu=True,
    sampling_frac=0.5,
)
[7]:
# Assemble the combined utility/privacy report object.
report_kwargs = dict(
    dataset_name="Wisconsin Breast Cancer Dataset",
    df_real=df_real,
    df_synthetic=df_synth,
    metadata=metadata,
    figsize=(8, 6),          # automatically adjusted for larger or longer figures
    random_state=42,         # for reproducibility purposes
    report_folderpath=None,  # load computed utility and/or privacy reports if available
    report_filename=None,    # name of the computed report (no extension, no utility/privacy suffix)
    metrics=None,            # None -> compute every utility and privacy metric
    params=parameters,       # parameters shared by the utility and privacy reports
)
report = Report(**report_kwargs)
[8]:
# Compute the configured metrics (all of them here, since `metrics=None`
# was passed to Report); this is the long-running step of the notebook.
report.compute()
/data8/install/anaconda3/envs/synthetic_data_p3.10/lib/python3.10/site-packages/xgboost/core.py:158: UserWarning: [21:29:20] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.
This warning will only be shown once.
warnings.warn(smsg, UserWarning)
TableGan test set shape: (228, 10)
LOGAN test set shape: (228, 10)
Detector test set shape: (228, 10)
Monte Carlo Membership test set shape: (228, 10)
GAN-Leaks test set shape: (228, 10)
Get the summary report as a pandas dataframe#
[9]:
# Print a short description of the dataset: instance counts per split
# and the number of continuous/categorical variables.
report.specification()
----- Wisconsin Breast Cancer Dataset -----
Contains:
- 455 instances in the train set,
- 228 instances in the test set,
- 10 variables, 9 continuous and 1 categorical.
[10]:
# Collect the computed metric results into a single summary DataFrame.
df_summary = report.summary()
[11]:
# Group the summary by metric identity and objective/bounds, displaying the
# submetric rows per group.  Passing include_groups=False excludes the
# grouping columns from the frame handed to `apply`, which both replaces the
# manual `drop(by, axis=1)` and silences the pandas >= 2.2 DeprecationWarning
# ("DataFrameGroupBy.apply operated on the grouping columns") seen previously.
by = ["name", "objective", "min", "max"]
df_summary.groupby(by).apply(lambda g: g.reset_index(drop=True), include_groups=False)
/tmp/ipykernel_637219/717339571.py:2: DeprecationWarning: DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.
df_summary.groupby(by).apply(lambda x: x.drop(by, axis=1).reset_index(drop=True))
[11]:
| alias | submetric | value | |||||
|---|---|---|---|---|---|---|---|
| name | objective | min | max | ||||
| Categorical Consistency | max | 0 | 1.0 | 0 | cat_consis | within_ratio | 1.000000 |
| Categorical Statistics | max | 0 | 1.0 | 0 | cat_stats | support_coverage | 1.000000 |
| 1 | cat_stats | frequency_coverage | 0.956044 | ||||
| Classification | min | 0 | 1.0 | 0 | classif | diff_real_synth | 0.006672 |
| Collision | - | 0 | inf | 0 | collision | avg_num_appearance_realtrain | 1.463023 |
| 1 | collision | avg_num_appearance_realcontrol | 1.349112 | ||||
| 2 | collision | avg_num_appearance_synth | 1.413043 | ||||
| 3 | collision | avg_num_appearance_collision_real | 3.403509 | ||||
| 4 | collision | avg_num_appearance_collision_synth | 3.263158 | ||||
| min | 0 | 1.0 | 0 | collision | precision | 0.341772 | |
| 1 | collision | recall | 0.947368 | ||||
| 2 | collision | f1_score | 0.502326 | ||||
| 3 | collision | recovery_rate | 0.173633 | ||||
| Continuous Consistency | max | 0 | 1.0 | 0 | cont_consis | within_ratio | 1.000000 |
| Continuous Statistics | min | 0 | inf | 0 | cont_stats | median_l1_distance | 0.012346 |
| 1 | cont_stats | median_l1_distance_train_test_ref | 0.000000 | ||||
| 2 | cont_stats | iqr_l1_distance | 0.080247 | ||||
| 3 | cont_stats | iqr_l1_distance_train_test_ref | 0.098765 | ||||
| DCR | max | 0 | 1.0 | 0 | dcr | dcr_5th_percent_synthreal_train | 0.000000 |
| 1 | dcr | dcr_5th_percent_synthreal_control | 0.000000 | ||||
| 2 | dcr | dcr_5th_percent_train_test_ref | 0.000000 | ||||
| 3 | dcr | nndr_5th_percent_synthreal_train | 0.000000 | ||||
| 4 | dcr | nndr_5th_percent_synthreal_control | 0.000000 | ||||
| 5 | dcr | nndr_5th_percent_train_test_ref | 0.000000 | ||||
| min | 0 | 1.0 | 0 | dcr | ratio_match_synthreal_train | 0.315789 | |
| 1 | dcr | ratio_match_synthreal_control | 0.333333 | ||||
| 2 | dcr | ratio_match_train_test_ref | 0.333333 | ||||
| Detector | min | 0 | 1.0 | 0 | detector | precision_top1% | 1.000000 |
| 1 | detector | precision_top50% | 0.552632 | ||||
| 2 | detector | precision | 0.549180 | ||||
| 3 | detector | tpr_at_0.001%_fpr | 0.035088 | ||||
| 4 | detector | tpr_at_0.1%_fpr | 0.035088 | ||||
| Distinguishability | min | 0 | 1.0 | 0 | dist | propensity_mse | 0.000129 |
| 1 | dist | prediction_mse | 0.000075 | ||||
| 2 | dist | prediction_auc_rescaled | 0.086334 | ||||
| FScore | min | 0 | inf | 0 | fscore | diff_f_score | 0.359862 |
| Feature Importance | min | 0 | inf | 0 | feature_imp | diff_permutation_importance | 0.002509 |
| GAN-Leaks | min | 0 | 1.0 | 0 | ganleaks | precision_top1% | 0.500000 |
| 1 | ganleaks | precision_top50% | 0.894737 | ||||
| Hellinger Categorical Univariate Distance | min | 0 | 1.0 | 0 | hell_cat_univ_dist | hellinger_distance | 0.032174 |
| 1 | hell_cat_univ_dist | hellinger_distance_train_test_ref | 0.001057 | ||||
| Hellinger Continuous Univariate Distance | min | 0 | 1.0 | 0 | hell_cont_univ_dist | hellinger_distance | 0.041094 |
| 1 | hell_cont_univ_dist | hellinger_distance_train_test_ref | 0.057075 | ||||
| KL Divergence Categorical Univariate Distance | min | 0 | inf | 0 | kl_div_cat_univ_dist | kl_divergence | 0.004175 |
| 1 | kl_div_cat_univ_dist | kl_divergence_train_test_ref | 0.000004 | ||||
| KL Divergence Continuous Univariate Distance | min | 0 | inf | 0 | kl_div_cont_univ_dist | kl_divergence | 0.007177 |
| 1 | kl_div_cont_univ_dist | kl_divergence_train_test_ref | 0.013798 | ||||
| LOGAN | min | 0 | 1.0 | 0 | logan | precision_top1% | 0.000000 |
| 1 | logan | precision_top50% | 0.701754 | ||||
| 2 | logan | precision | 0.486772 | ||||
| 3 | logan | tpr_at_0.001%_fpr | 0.000000 | ||||
| 4 | logan | tpr_at_0.1%_fpr | 0.000000 | ||||
| Monte Carlo Membership | min | 0 | 1.0 | 0 | mcmebership | precision_top1% | 0.500000 |
| 1 | mcmebership | precision_top50% | 0.526316 | ||||
| Pairwise Correlation Difference | min | 0 | inf | 0 | pcd | norm | 0.285290 |
| TableGan | min | 0 | 1.0 | 0 | tablegan | precision_top1% | 0.500000 |
| 1 | tablegan | precision_top50% | 0.482456 | ||||
| 2 | tablegan | precision | 0.482759 | ||||
| 3 | tablegan | tpr_at_0.001%_fpr | 0.000000 | ||||
| 4 | tablegan | tpr_at_0.1%_fpr | 0.000000 |
Save and load the report#
[12]:
# Round-trip the report through disk: persist it into a temporary folder,
# then rebuild a Report instance from the saved files.
with tempfile.TemporaryDirectory() as tmp_folder:
    report.save(savepath=tmp_folder, filename="report")  # save to disk
    new_report = Report(report_folderpath=tmp_folder, report_filename="report")  # load back
[ ]: