How to generate a membership inference attack report?#

Check that the synthetic data protect the privacy of the real data#

Based on the Wisconsin Breast Cancer Dataset (WBCD)

[1]:

# Standard library
import sys

# Add the parent directory (relative to the working directory) to the module
# search path. NOTE(review): presumably this lets the local `clover` package
# imported below resolve when the notebook runs from its own folder - confirm
# against the repository layout.
sys.path.append("..")
import tempfile
from pathlib import Path
import time

# 3rd party packages
import pandas as pd
from sklearn.model_selection import train_test_split

# Local packages
from clover.generators import (
    Generator,
    DataSynthesizerGenerator,
    SynthpopGenerator,
    SmoteGenerator,
    TVAEGenerator,
    CTGANGenerator,
    FinDiffGenerator,
    MSTGenerator,
    CTABGANGenerator,
)
from clover.metrics.privacy.report import PrivacyReport
from clover.utils.standard import create_directory

Generate synthetic data#

Load the Wisconsin Breast Cancer Dataset#

[2]:

# Read the WBCD table and discard the "Sample_code_number" column
df_real = (
    pd.read_csv("../data/breast_cancer_wisconsin.csv")
    .drop(columns=["Sample_code_number"])
)
df_real.shape

[2]:

(683, 10)

[3]:

# Preview the first rows of the loaded dataframe
df_real.head()

[3]:

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	5	1	1	1	2	1	3	1	1	2
1	5	4	4	5	7	10	3	2	1	2
2	3	1	1	1	2	2	3	1	1	2
3	6	8	8	1	3	4	3	7	1	2
4	4	1	1	3	2	1	3	1	1	2

Split the real data into train and test sets#

[4]:

# Hold out one third of the rows as the test set; the split is stratified on
# the "Class" column and seeded so that it is reproducible.
df_real_train, df_real_test = train_test_split(
    df_real,
    test_size=1 / 3,
    stratify=df_real["Class"],
    random_state=42,
)

[5]:

# Persist both real splits as CSV files, without the index column
for csv_path, split_df in {
    "../data/WBCD_train.csv": df_real_train,
    "../data/WBCD_test.csv": df_real_test,
}.items():
    split_df.to_csv(csv_path, index=False)

Create the metadata dictionary#

The continuous and categorical variables need to be specified, as well as the variable to predict for the future learning task (used by SMOTE)

[6]:

# Describe the role of each column: which variables are declared continuous,
# which are categorical, and which one is the prediction target.
metadata = dict(
    continuous=[
        "Clump_Thickness",
        "Uniformity_of_Cell_Size",
        "Uniformity_of_Cell_Shape",
        "Marginal_Adhesion",
        "Single_Epithelial_Cell_Size",
        "Bland_Chromatin",
        "Normal_Nucleoli",
        "Mitoses",
        "Bare_Nuclei",
    ],
    categorical=["Class"],
    variable_to_predict="Class",
)

Choose a generator#

We choose Synthpop as the generator, with its default settings.

[7]:

# 1st generation generator, built on the real train set
gen = SynthpopGenerator(
    df=df_real_train,
    metadata=metadata,
    # no existing generator is loaded
    generator_filepath=None,
    # use the dataframe columns order by default
    variables_order=None,
    # default = 5
    min_samples_leaf=5,
    max_depth=None,
    # for reproducibility, can be set to None
    random_state=42,
)

Fit the generator to the real train data#

[8]:

# Folder that receives the fitted 1st generation generator
generator_dir_1st = "../results/attack/generator/1st_generation"
create_directory(path=generator_dir_1st)  # create path if doesn't exist

gen.preprocess()
gen.fit(save_path=generator_dir_1st)  # the path should exist

Generate synthetic data#

[9]:

# Folder that receives the 1st generation synthetic samples
data_dir_1st = "../results/attack/data/1st_generation"
create_directory(path=data_dir_1st)  # create path if doesn't exist

# Draw as many synthetic rows as there are rows in the real train set
df_synth_train = gen.sample(
    num_samples=len(df_real_train),
    save_path=data_dir_1st,  # the path should exist
)

[10]:

# Draw as many synthetic rows as there are rows in the real test set.
# NOTE(review): this reuses the save_path of the train sample - confirm that
# sample() does not overwrite the file saved by the previous call.
df_synth_test = gen.sample(
    num_samples=len(df_real_test),
    save_path="../results/attack/data/1st_generation",  # the path should exist
)

Generate 2nd generation synthetic data#

We use the same generator and settings to generate the 2nd generation synthetic data.

[11]:

# 2nd generation generator: same settings, but built on the 1st generation
# synthetic train set instead of the real data
gen_2nd = SynthpopGenerator(
    df=df_synth_train,
    metadata=metadata,
    # no existing generator is loaded
    generator_filepath=None,
    # use the dataframe columns order by default
    variables_order=None,
    # default = 5
    min_samples_leaf=5,
    max_depth=None,
    # for reproducibility, can be set to None
    random_state=42,
)

[12]:

# Folder that receives the fitted 2nd generation generator
generator_dir_2nd = "../results/attack/generator/2nd_generation"
create_directory(path=generator_dir_2nd)  # create path if doesn't exist

gen_2nd.preprocess()
gen_2nd.fit(save_path=generator_dir_2nd)  # the path should exist

[13]:

# Folder that receives the 2nd generation synthetic samples
data_dir_2nd = "../results/attack/data/2nd_generation"
create_directory(path=data_dir_2nd)  # create path if doesn't exist

# Draw as many 2nd generation rows as there are rows in the 1st generation train set
df_synth_2nd = gen_2nd.sample(
    num_samples=len(df_synth_train),
    save_path=data_dir_2nd,  # the path should exist
)

Generate privacy report#

Gather real and synthetic data into dictionaries#

[14]:

# Real data: train and test sets
df_real_dict = {"train": df_real_train, "test": df_real_test}

# 1st and 2nd generation synthetic data
df_synth_dict = {
    "train": df_synth_train,
    "test": df_synth_test,
    "2nd_gen": df_synth_2nd,
}

Generate the privacy report#

[15]:

report = PrivacyReport(
    # --- data ---
    dataset_name="Wisconsin Breast Cancer Dataset",
    df_real=df_real_dict,
    df_synthetic=df_synth_dict,
    metadata=metadata,
    # load a computed report if available
    report_filepath=None,
    # --- what to compute ---
    # list of the metrics to compute; if not specified, all the metrics are computed
    metrics=None,
    # fraction of data to sample from real and synthetic datasets for better computing performance
    sampling_frac=0.5,
    # for the metrics relying on predictions
    num_repeat=10,
    # number of folds to tune the hyperparameters for the metrics relying on predictors
    num_kfolds=3,
    # number of trials of the optimization process for tuning hyperparameters
    # for the metrics relying on predictors
    num_optuna_trials=15,
    # --- execution and display ---
    # for reproducibility purposes
    random_state=42,
    # run the learning tasks on the GPU
    use_gpu=True,
    # will be automatically adjusted for larger or longer figures
    figsize=(8, 6),
)

[16]:

# Time the full report computation. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration cannot be skewed by system
# clock adjustments the way a difference of time.time() values can.
start = time.perf_counter()

report.compute()

# Elapsed duration in seconds (not an end timestamp)
elapsed = time.perf_counter() - start

print(f"Time taken: {elapsed:.2f} seconds.")

GAN-Leaks test set shape: (228, 10)
Monte Carlo Membership test set shape: (228, 10)
LOGAN test set shape: (228, 10)

/data8/install/anaconda3/envs/synthetic_data_p3.10/lib/python3.10/site-packages/xgboost/core.py:158: UserWarning: [20:53:41] WARNING: /workspace/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. XGBoost is running on: cuda:0, while the input data is on: cpu.
Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.

This warning will only be shown once.

  warnings.warn(smsg, UserWarning)

TableGan test set shape: (228, 10)
Detector test set shape: (228, 10)
Time taken: 310.38 seconds.

Get the summary report as a pandas dataframe#

[17]:

# Print the dataset specification (set sizes and variable counts, see the output below)
report.specification()

----- Wisconsin Breast Cancer Dataset -----
Contains:
    - 455 instances in the train set,
    - 228 instances in the test set,
    - 10 variables, 9 continuous and 1 categorical.

[18]:

# Keep the summary in a variable so it can be regrouped for display in the next cell
df_summary = report.summary()

[31]:

# Show the summary grouped by metric name, objective and value bounds,
# renumbering the rows from 0 inside each group.
grouping_keys = ["name", "objective", "min", "max"]
df_summary.groupby(grouping_keys, group_keys=True).apply(
    lambda group: group.reset_index(drop=True),
    include_groups=False,
)

[31]:

					alias	submetric	value
name	objective	min	max
Collision	-	0	inf	0	collision	avg_num_appearance_realtrain	1.463023
				1	collision	avg_num_appearance_realcontrol	1.349112
				2	collision	avg_num_appearance_synth	1.413043
				3	collision	avg_num_appearance_collision_real	3.403509
				4	collision	avg_num_appearance_collision_synth	3.263158
	min	0	1.0	0	collision	precision	0.404410
				1	collision	recall	0.924561
				2	collision	f1_score	0.559463
				3	collision	recovery_rate	0.169453
DCR	max	0	1.0	0	dcr	dcr_5th_percent_synthreal_train	0.000000
				1	dcr	dcr_5th_percent_synthreal_control	0.000000
				2	dcr	dcr_5th_percent_train_test_ref	0.000000
				3	dcr	nndr_5th_percent_synthreal_train	0.000000
				4	dcr	nndr_5th_percent_synthreal_control	0.000000
				5	dcr	nndr_5th_percent_train_test_ref	0.000000
	min	0	1.0	0	dcr	ratio_match_synthreal_train	0.315789
				1	dcr	ratio_match_synthreal_control	0.333333
				2	dcr	ratio_match_train_test_ref	0.333333
Detector	min	0	1.0	0	detector	precision_top1%	0.800000
				1	detector	precision_top50%	0.509649
				2	detector	precision	0.507511
				3	detector	tpr_at_0.001%_fpr	0.014035
				4	detector	tpr_at_0.1%_fpr	0.014035
GAN-Leaks	min	0	1.0	0	ganleaks	precision_top1%	0.500000
GAN-Leaks	min	0	1.0	1	ganleaks	precision_top50%	0.894737
LOGAN	min	0	1.0	0	logan	precision_top1%	0.950000
				1	logan	precision_top50%	0.529825
				2	logan	precision	0.532749
				3	logan	tpr_at_0.001%_fpr	0.023684
				4	logan	tpr_at_0.1%_fpr	0.023684
Monte Carlo Membership	min	0	1.0	0	mcmebership	precision_top1%	0.500000
Monte Carlo Membership	min	0	1.0	1	mcmebership	precision_top50%	0.526316
TableGan	min	0	1.0	0	tablegan	precision_top1%	1.000000
				1	tablegan	precision_top50%	0.489474
				2	tablegan	precision	0.493370
				3	tablegan	tpr_at_0.001%_fpr	0.034211
				4	tablegan	tpr_at_0.1%_fpr	0.034211

Display the detailed report#

[20]:

# Display the detailed figures inline; save_folder=None, so no output folder is given
report.detailed(show=True, save_folder=None, figure_format="png")

../_images/tutorial_privacy_report_35_0.png

../_images/tutorial_privacy_report_35_1.png

../_images/tutorial_privacy_report_35_2.png

../_images/tutorial_privacy_report_35_3.png

../_images/tutorial_privacy_report_35_4.png

../_images/tutorial_privacy_report_35_5.png

../_images/tutorial_privacy_report_35_6.png

../_images/tutorial_privacy_report_35_7.png

../_images/tutorial_privacy_report_35_8.png

../_images/tutorial_privacy_report_35_9.png

../_images/tutorial_privacy_report_35_10.png

../_images/tutorial_privacy_report_35_11.png

Save and load the report#

[21]:

with tempfile.TemporaryDirectory() as temp_dir:
    # save
    report.save(savepath=temp_dir, filename="privacy_report")

    # load
    # NOTE(review): assumes save() writes "<filename>.pkl" inside savepath - confirm
    saved_report_path = Path(temp_dir) / "privacy_report.pkl"
    new_report = PrivacyReport(report_filepath=saved_report_path)

[ ]: