Synthetic Data Generation#

Generate synthetic data with the generators listed below, based on the Wisconsin Breast Cancer Dataset (WBCD).

[20]:

# Standard library
import sys
from pathlib import Path

sys.path.append("..")

# 3rd party packages
import matplotlib.pyplot as plt
import pandas as pd

# Local packages
from clover.generators import (
    Generator,
    DataSynthesizerGenerator,
    SynthpopGenerator,
    SmoteGenerator,
    TVAEGenerator,
    CTGANGenerator,
    FinDiffGenerator,
    MSTGenerator,
    CTABGANGenerator,
)
import clover.utils.draw as draw
from clover.utils.standard import create_directory

Load the real WBCD training and test datasets#

[21]:

# Root of the project, expressed relative to the current working directory.
PROJECT_PATH = Path(".")
# Folder named "output" located directly under the project root.
OUTPUT_PATH = PROJECT_PATH.joinpath("output")

[22]:

# Locations of the real train / test splits (relative to the working directory)
train_csv = "data/breast_cancer_wisconsin_train.csv"
test_csv = "data/breast_cancer_wisconsin_test.csv"

# Read the train split first, then the test split
df_real_train, df_real_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

# Show (rows, columns) for both splits as the cell output
df_real_train.shape, df_real_test.shape

[22]:

((359, 10), (90, 10))

[23]:

# Preview the first five rows of the real training data
df_real_train.head(n=5)

[23]:

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	10	10	10	3	10	10	9	10	1	1
1	5	1	2	1	2	1	1	1	1	0
2	6	10	2	8	10	2	7	8	10	1
3	4	8	6	4	3	4	10	6	1	1
4	7	4	7	4	3	7	7	6	1	1

Create the metadata dictionary#

The continuous and categorical variables need to be specified, as well as the variable to predict for the future learning task (used by SMOTE)#

[24]:

# Columns handled as continuous variables by the generators
continuous_columns = [
    "Clump_Thickness",
    "Uniformity_of_Cell_Size",
    "Uniformity_of_Cell_Shape",
    "Marginal_Adhesion",
    "Single_Epithelial_Cell_Size",
    "Bland_Chromatin",
    "Normal_Nucleoli",
    "Mitoses",
    "Bare_Nuclei",
]

# Columns handled as categorical variables by the generators
categorical_columns = ["Class"]

# Metadata passed to every generator: variable types plus the prediction target
metadata = dict(
    continuous=continuous_columns,
    categorical=categorical_columns,
    variable_to_predict="Class",
)

Choose the generator#

[25]:

# Names recognised by the generator cells of this notebook (one cell per name)
SUPPORTED_GENERATORS = [
    "synthpop",
    "smote",
    "datasynthesizer",
    "mst",
    "ctgan",
    "tvae",
    "ctabgan",
    "findiff",
]

generator = "synthpop"  # to choose among SUPPORTED_GENERATORS
dp = False  # True selects the differentially private configuration (MST is always DP)

# Fail fast on a typo: otherwise every `if generator == ...` cell is skipped
# silently and `gen` is either undefined or left over from a previous run.
if generator not in SUPPORTED_GENERATORS:
    raise ValueError(
        f"Unknown generator {generator!r}; choose one of {SUPPORTED_GENERATORS}"
    )

Synthpop#

[26]:

if generator == "synthpop":
    # Arguments common to the plain and the DP configuration
    synthpop_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        variables_order=None,  # use the dataframe columns order by default
    )
    if dp:
        # Extra arguments passed only when the DP configuration is requested
        synthpop_kwargs.update(epsilon=10, max_depth=5)

    gen = SynthpopGenerator(**synthpop_kwargs)

SMOTE#

[27]:

if generator == "smote":
    # Arguments of the non-DP configuration
    smote_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        k_neighbors=None,  # cannot be found by searching the best hyperparameters yet, set to 5 by default
    )
    if dp:
        # The DP configuration sets k_neighbors explicitly and adds epsilon
        smote_kwargs.update(k_neighbors=5, epsilon=10)

    gen = SmoteGenerator(**smote_kwargs)

DataSynthesizer#

[28]:

if generator == "datasynthesizer":
    # Both configurations share every argument except epsilon:
    # 10 when dp is set, 0 otherwise.
    gen = DataSynthesizerGenerator(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        candidate_keys=None,  # the identifiers
        epsilon=10 if dp else 0,  # for the differential privacy
        degree=2,  # the maximal number of parents for the bayesian network
    )

MST#

[29]:

# MST includes DP by design, hence non-dp mode is not available
if generator == "mst":
    mst_kwargs = {
        "df": df_real_train,
        "metadata": metadata,
        "random_state": 66,  # for reproducibility, can be set to None
        "generator_filepath": None,  # to load an existing generator
        "epsilon": 10,  # the privacy budget of the differential privacy
        "delta": 1e-9,  # the failure probability of the differential privacy
    }
    gen = MSTGenerator(**mst_kwargs)

CTGAN#

[30]:

if generator == "ctgan":
    # Arguments common to the plain and the DP configuration
    ctgan_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        discriminator_steps=4,  # the number of discriminator updates to do for each generator update
        epochs=300,  # the number of training epochs
        batch_size=100,  # the batch size for training
    )
    if dp:
        # Extra arguments passed only when the DP configuration is requested
        ctgan_kwargs.update(epsilon=10, delta=1e-5)

    gen = CTGANGenerator(**ctgan_kwargs)

TVAE#

[31]:

if generator == "tvae":
    # Arguments common to the plain and the DP configuration
    tvae_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        epochs=300,  # the number of training epochs
        batch_size=100,  # the batch size for training
        compress_dims=(249, 249),  # the size of the hidden layers in the encoder
        decompress_dims=(249, 249),  # the size of the hidden layers in the decoder
    )
    if dp:
        # Extra arguments passed only when the DP configuration is requested
        tvae_kwargs.update(epsilon=10, delta=1e-5)

    gen = TVAEGenerator(**tvae_kwargs)

CTAB-GAN+#

[32]:

if generator == "ctabgan":
    # Arguments that are identical in the plain and the DP configuration
    ctabgan_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        mixed_columns=None,  # dictionary of "mixed" column names with corresponding categorical modes
        log_columns=None,  # list of skewed exponential numerical columns
        integer_columns=metadata["continuous"],  # list of numeric columns without floating numbers
        l2scale=1e-5,  # rate of weight decay used in the optimizer of the generator, discriminator and auxiliary classifier
        batch_size=150,  # batch size for training
    )

    if dp:
        # DP configuration: smaller network sizes, fewer epochs, plus epsilon / delta
        ctabgan_kwargs.update(
            class_dim=(32,) * 4,  # size of each desired linear layer for the auxiliary classifier
            random_dim=10,  # dimension of the noise vector fed to the generator
            num_channels=8,  # number of channels in the convolutional layers of both the generator and the discriminator
            epochs=10,  # number of training epochs
            epsilon=10,
            delta=1e-5,
        )
    else:
        # Plain configuration
        ctabgan_kwargs.update(
            class_dim=(256,) * 4,  # size of each desired linear layer for the auxiliary classifier
            random_dim=100,  # dimension of the noise vector fed to the generator
            num_channels=64,  # number of channels in the convolutional layers of both the generator and the discriminator
            epochs=500,  # number of training epochs
        )

    gen = CTABGANGenerator(**ctabgan_kwargs)

FinDiff#

[33]:

if generator == "findiff":
    # Arguments common to the plain and the DP configuration
    findiff_kwargs = dict(
        df=df_real_train,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        learning_rate=1e-4,  # the learning rate for training
        batch_size=512,  # the batch size for training and sampling
        diffusion_steps=500,  # the diffusion timesteps for the forward diffusion process
        epochs=500,  # the training iterations
        mpl_layers=[1024, 1024, 1024, 1024],  # the width of the MLP layers
        activation="lrelu",  # the activation function
        dim_t=64,  # dimensionality of the intermediate layer for connecting the embeddings
        cat_emb_dim=2,  # dimension of categorical embeddings
        diff_beta_start_end=[1e-4, 0.02],  # diffusion start and end betas
        scheduler="linear",  # diffusion scheduler
    )
    if dp:
        # value for DP parameter (delta is at default value 1e-5)
        findiff_kwargs["epsilon"] = 1

    gen = FinDiffGenerator(**findiff_kwargs)

Fit the generator to the real data#

[34]:

# Folder passed to fit() as save_path
generators_dir = "../results/generators"
create_directory(path=generators_dir)  # create path if doesn't exist

# Preprocess first, then fit; the save path should exist at this point
gen.preprocess()
gen.fit(save_path=generators_dir)

Display the fitted generator#

[35]:

# Show a description of the fitted generator (for synthpop, the output lists
# each variable together with its parent variables).
gen.display()

Constructed sequential trees:
   Clump_Thickness has parents []
   Uniformity_of_Cell_Size has parents ['Clump_Thickness']
   Uniformity_of_Cell_Shape has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size']
   Marginal_Adhesion has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape']
   Single_Epithelial_Cell_Size has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion']
   Bare_Nuclei has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size']
   Bland_Chromatin has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei']
   Normal_Nucleoli has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin']
   Mitoses has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli']
   Class has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses']

Generate the synthetic data#

[36]:

# Folder passed to sample() as save_path
data_dir = "../results/data"
create_directory(path=data_dir)  # create path if doesn't exist

# The number of samples can be different from the real data, but for computing
# the utility metrics it should be the same, hence len() of the real frames.
# NOTE(review): both calls share the same save_path - confirm that the second
# call does not overwrite whatever the first one saved.
df_synth_train = gen.sample(save_path=data_dir, num_samples=len(df_real_train))
df_synth_test = gen.sample(save_path=data_dir, num_samples=len(df_real_test))

[37]:

# Preview the first five rows of the synthetic training data
df_synth_train.head(n=5)

[37]:

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	1	1	1	1	2	1	2	2	1	0
1	10	4	3	6	6	10	7	6	2	1
2	9	10	10	8	5	10	4	7	1	1
3	10	8	8	7	10	10	4	8	10	1
4	1	4	2	10	3	7	5	6	2	0

[38]:

# Grid size is set manually: 4 x 3 = 12 axes
n_rows, n_cols = 4, 3
fig, axes = plt.subplots(
    nrows=n_rows,
    ncols=n_cols,
    squeeze=False,  # always get a 2D array of axes
    figsize=(18, 16),
    layout="constrained",
)
axes = axes.ravel()  # flatten the 2D grid into a 1D sequence of axes

# Kernel density estimates of the real vs synthetic training data
draw.kde_plot_hue_plot_per_col(
    df=df_real_train,
    df_nested=df_synth_train,
    original_name="Real",
    nested_name="Synthetic",
    hue_name="Data",
    title="Kernel density estimate",
    axes=axes,
)

../_images/tutorial_synthetic_data_generation_35_0.png

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	10	10	10	3	10	10	9	10	1	1
1	5	1	2	1	2	1	1	1	1	0
2	6	10	2	8	10	2	7	8	10	1
3	4	8	6	4	3	4	10	6	1	1
4	7	4	7	4	3	7	7	6	1	1

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	1	1	1	1	2	1	2	2	1	0
1	10	4	3	6	6	10	7	6	2	1
2	9	10	10	8	5	10	4	7	1	1
3	10	8	8	7	10	10	4	8	10	1
4	1	4	2	10	3	7	5	6	2	0

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	10	10	10	3	10	10	9	10	1	1
1	5	1	2	1	2	1	1	1	1	0
2	6	10	2	8	10	2	7	8	10	1
3	4	8	6	4	3	4	10	6	1	1
4	7	4	7	4	3	7	7	6	1	1

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	1	1	1	1	2	1	2	2	1	0
1	10	4	3	6	6	10	7	6	2	1
2	9	10	10	8	5	10	4	7	1	1
3	10	8	8	7	10	10	4	8	10	1
4	1	4	2	10	3	7	5	6	2	0

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	10	10	10	3	10	10	9	10	1	1
1	5	1	2	1	2	1	1	1	1	0
2	6	10	2	8	10	2	7	8	10	1
3	4	8	6	4	3	4	10	6	1	1
4	7	4	7	4	3	7	7	6	1	1

	Clump_Thickness	Uniformity_of_Cell_Size	Uniformity_of_Cell_Shape	Marginal_Adhesion	Single_Epithelial_Cell_Size	Bare_Nuclei	Bland_Chromatin	Normal_Nucleoli	Mitoses	Class
0	1	1	1	1	2	1	2	2	1	0
1	10	4	3	6	6	10	7	6	2	1
2	9	10	10	8	5	10	4	7	1	1
3	10	8	8	7	10	10	4	8	10	1
4	1	4	2	10	3	7	5	6	2	0