Synthetic Data Generation#

Generate synthetic data with the generators listed below Based on the Wisconsin Breast Cancer Dataset (WBCD)

[1]:
# Standard library
import sys

sys.path.append("..")

# 3rd party packages
import pandas as pd

# Local packages
import config
from generators.ctabgan_generator import CTABGANGenerator
from generators.ctgan_generator import CTGANGenerator
from generators.dataSynthesizer import DataSynthesizerGenerator
from generators.mst_generator import MSTGenerator
from generators.smote import SmoteGenerator
from generators.synthpop_generator import SynthpopGenerator
from generators.findiff_generator import FindiffGenerator
from generators.tvae_generator import TVAEGenerator

Load the real WBCD training dataset#

[2]:
df_real = pd.read_csv("../data/" + config.WBCD_DATASET_TRAIN_FILEPATH.stem + ".csv")
df_real.shape
[2]:
(359, 10)
[3]:
df_real.head()
[3]:
Clump_Thickness Uniformity_of_Cell_Size Uniformity_of_Cell_Shape Marginal_Adhesion Single_Epithelial_Cell_Size Bare_Nuclei Bland_Chromatin Normal_Nucleoli Mitoses Class
0 10 10 10 3 10 10 9 10 1 1
1 5 1 2 1 2 1 1 1 1 0
2 6 10 2 8 10 2 7 8 10 1
3 4 8 6 4 3 4 10 6 1 1
4 7 4 7 4 3 7 7 6 1 1

Create the metadata dictionary#

The continuous and categorical variables need to be specified, as well as the variable to predict for the future learning task (used by SMOTE)#

[4]:
metadata = {
    "continuous": [
        "Clump_Thickness",
        "Uniformity_of_Cell_Size",
        "Uniformity_of_Cell_Shape",
        "Marginal_Adhesion",
        "Single_Epithelial_Cell_Size",
        "Bland_Chromatin",
        "Normal_Nucleoli",
        "Mitoses",
        "Bare_Nuclei",
    ],
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}

Choose the generator#

[5]:
generator = "synthpop"  # to choose among this list: ["synthpop", "smote", "datasynthesizer", "mst", "ctgan", "tvae", "ctabgan", "findiff"]

Synthpop#

[6]:
if generator == "synthpop":
    gen = SynthpopGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator,
        variables_order=None,  # use the dataframe columns order by default
    )

SMOTE#

[7]:
if generator == "smote":
    gen = SmoteGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator,
        k_neighbors=None,  # cannot be found by searching the best hyperparameters yet, set to 5 by default
    )

Datasynthesizer#

[8]:
if generator == "datasynthesizer":
    gen = DataSynthesizerGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        candidate_keys=None,  # the identifiers
        epsilon=0,  # for the differential privacy
        degree=2,  # the maximal number of parents for the bayesian network
    )

MST#

[9]:
if generator == "mst":
    gen = MSTGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        epsilon=1.0,  # the privacy budget of the differential privacy
        delta=1e-9,  # the failure probability of the differential privacy
    )

CTGAN#

[10]:
if generator == "ctgan":
    gen = CTGANGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        discriminator_steps=4,  # the number of discriminator updates to do for each generator update
        epochs=300,  # the number of training epochs
        batch_size=100,  # the batch size for training
    )

TVAE#

[11]:
if generator == "tvae":
    gen = TVAEGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        epochs=300,  # the number of training epochs
        batch_size=100,  # the batch size for training
        compress_dims=(249, 249),  # the size of the hidden layers in the encoder
        decompress_dims=(249, 249),  # the size of the hidden layers in the decoder
    )

CTAB-GAN+#

[12]:
if generator == "ctabgan":
    gen = CTABGANGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        mixed_columns=None,  # dictionary of "mixed" column names with corresponding categorical modes
        log_columns=None,  # list of skewed exponential numerical columns
        integer_columns=metadata[
            "continuous"
        ],  # list of numeric columns without floating numbers
        class_dim=(
            256,
            256,
            256,
            256,
        ),  # size of each desired linear layer for the auxiliary classifier
        random_dim=100,  # dimension of the noise vector fed to the generator
        num_channels=64,  # number of channels in the convolutional layers of both the generator and the discriminator
        l2scale=1e-5,  # rate of weight decay used in the optimizer of the generator, discriminator and auxiliary classifier
        batch_size=150,  # batch size for training
        epochs=500,  # number of training epochs
    )

FinDiff#

[6]:
if generator == "findiff":
    gen = FinDiffGenerator(
        df=df_real,
        metadata=metadata,
        random_state=66,  # for reproducibility, can be set to None
        generator_filepath=None,  # to load an existing generator
        learning_rate=1e-4,  # the learning rate for training
        batch_size=512,  # the batch size for training and sampling
        diffusion_steps=500,  # the diffusion timesteps for the forward diffusion process
        epochs=500,  # the training iterations
        mpl_layers=[1024, 1024, 1024, 1024],  # the width of the MLP layers
        activation= "lrelu",  # the activation fuction
        dim_t = 64,  # dimensionality of the intermediate layer for connecting the embeddings
        cat_emb_dim = 2, # dimension of categorical embeddings
        diff_beta_start_end = [1e-4, 0.02],  # diffusion start and end betas
        scheduler = "linear",  # diffusion scheduler
    )

Fit the generator to the real data#

[14]:
gen.preprocess()
gen.fit(save_path="../results/generators")  # the path should exist

Display the fitted generator#

[15]:
gen.display()
Constructed sequential trees:
   Clump_Thickness has parents []
   Uniformity_of_Cell_Size has parents ['Clump_Thickness']
   Uniformity_of_Cell_Shape has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size']
   Marginal_Adhesion has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape']
   Single_Epithelial_Cell_Size has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion']
   Bare_Nuclei has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size']
   Bland_Chromatin has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei']
   Normal_Nucleoli has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin']
   Mitoses has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli']
   Class has parents ['Clump_Thickness', 'Uniformity_of_Cell_Size', 'Uniformity_of_Cell_Shape', 'Marginal_Adhesion', 'Single_Epithelial_Cell_Size', 'Bare_Nuclei', 'Bland_Chromatin', 'Normal_Nucleoli', 'Mitoses']

Generate the synthetic data#

[16]:
df_synth = gen.sample(
    save_path="../results/data",  # the path should exist
    num_samples=len(
        df_real
    ),  # can be different from the real data, but for computing the utility metrics should be the same
)
[17]:
df_synth.head()
[17]:
Clump_Thickness Uniformity_of_Cell_Size Uniformity_of_Cell_Shape Marginal_Adhesion Single_Epithelial_Cell_Size Bare_Nuclei Bland_Chromatin Normal_Nucleoli Mitoses Class
0 8 3 4 3 6 10 3 1 1 1
1 3 1 1 3 2 1 1 1 1 0
2 5 1 1 1 2 1 3 1 1 0
3 10 7 5 6 3 5 3 3 1 1
4 2 7 10 10 7 9 7 10 3 1
[18]:
import matplotlib.pyplot as plt

import utils.draw

fig, axes = plt.subplots(  # manually set number of cols/rows
    nrows=4, ncols=3, squeeze=0, figsize=(18, 16), layout="constrained"
)
axes = axes.reshape(-1)
utils.draw.kde_plot_hue_plot_per_col(
    df=df_real,
    df_nested=df_synth,
    original_name="Real",
    nested_name="Synthetic",
    hue_name="Data",
    title="Kernel density estimate",
    axes=axes,
)
../_images/tutorial_synthetic_data_generation_34_0.png
[ ]: