Synthetic Data Generation – Find the best hyperparameters#
Tune hyperparameters of synthetic data generation algorithms, based on the Wisconsin Breast Cancer Dataset (WBCD)
# Standard library
import sys
import time
import warnings
warnings.simplefilter(action="ignore", category=FutureWarning)
# 3rd party packages
import pandas as pd
from optuna import samplers
from ray import tune
# Local packages
import config
from generators.ctabgan_generator import CTABGANGenerator
from generators.ctgan_generator import CTGANGenerator
from generators.dataSynthesizer import DataSynthesizerGenerator
from generators.mst_generator import MSTGenerator
from generators.smote import SmoteGenerator
from generators.synthpop_generator import SynthpopGenerator
from generators.findiff_generator import FindiffGenerator
from generators.tvae_generator import TVAEGenerator
from optimization.discrete_pso_search import DiscreteParticleSwarmOptimizationSearch
from optimization.objective_function import (
from optimization.optuna_search import OptunaSearch
from optimization.raytune_search import RayTuneSearch
Load the real WBCD training dataset#
df_real = pd.read_csv("../data/" + config.WBCD_DATASET_TRAIN_FILEPATH.stem + ".csv")
(359, 10)
Create the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict for the future learning task (used by SMOTE)#
metadata = {
"continuous": [
"categorical": ["Class"],
"variable_to_predict": "Class",
Choose the objective function#
Two options:
- Minimize the distinguishability metric (the ability of a classifier to distinguish the real set from the synthetic one).
- Minimize the absolute difference between the prediction scores, measured on the real validation set, of a predictor trained on the real train set versus one trained on the synthetic train set.
# Name of the objective to optimize.
# Can be "distinguishability" or "difference_prediction_score".
objective_function = "distinguishability"

# Map the chosen name to the corresponding loss callable. The branches are
# mutually exclusive, and an unknown name fails here rather than leaving
# `objective` undefined for the optimizer cells further down.
if objective_function == "distinguishability":
    objective = distinguishability_hinge_loss
elif objective_function == "difference_prediction_score":
    objective = absolute_difference_hinge_loss
else:
    raise ValueError(
        f"Unknown objective function: {objective_function!r}. Expected "
        '"distinguishability" or "difference_prediction_score".'
    )
Choose the optimizer#
optimizer = "Optuna" # "PSO", "Optuna" or "Ray Tune"
Discrete Particle Swarm Optimization (PSO) for variable order#
if optimizer == "PSO":
optim = DiscreteParticleSwarmOptimizationSearch(
"variables_order": list(df_real.columns)
}, # the variable to optimize with the default sequence
generator=SynthpopGenerator, # the generator
cv_num_folds=0, # the number of folds for cross-validation (0 or 1 to deactivate)
use_gpu=True, # flag to use the gpu if there are available
direction="min", # the direction of optimization ("min" or "max")
num_iter=2, # the number of iterations to repeat the search
population_size=2, # the size of the swarm
Configure the hyperparameters search space#
See Optuna for more details:
def params_to_explore_optuna(trial):
    """Return the hyperparameter values suggested for one trial.

    Each hyperparameter is drawn from a fixed list of candidate values
    through ``trial.suggest_categorical``.

    Args:
        trial: Object exposing ``suggest_categorical(name, choices)``
            (presumably an Optuna trial -- confirm against the caller).

    Returns:
        A dict with the keys ``"batch_size"`` and ``"epochs"`` mapped to
        the values suggested by ``trial``.
    """
    params = {
        "batch_size": trial.suggest_categorical("batch_size", [50, 100]),
        "epochs": trial.suggest_categorical("epochs", [10, 20]),
    }
    return params
Choose the sampler algorithm#
Can be random search, grid search, Bayesian search (TPESampler), etc. See the Optuna samplers documentation for more samplers.
sampler = samplers.RandomSampler() # random search
Configure the search#
if optimizer == "Optuna":
optim = OptunaSearch(
generator=TVAEGenerator, # the generator
cv_num_folds=1, # the number of folds for cross-validation (0 or 1 to deactivate)
use_gpu=True, # flag to use the gpu if there are available
# pruner # the algorithm early stopping the unsuccessful trials. No pruning by default.
direction="minimize", # the direction of optimization ("minimize" or "maximize")
num_iter=2, # the number of iterations to repeat the search
verbose=1, # whether to print the INFO logs (1) or not (0)
[I 2023-08-10 18:30:32,753] A new study created in memory with name: no-name-cab0c499-09ce-4a09-a6a9-50d8eb8d2d3e
Ray Tune with Optuna for parallelization#
For a single run, slower than standalone Optuna.
Configure the hyperparameters search space#
See Ray Tune for more details:
# Search space for the Ray Tune optimizer: each hyperparameter is chosen
# from a fixed list of candidate values via `tune.choice`.
params_to_explore_raytune = {
    "batch_size": tune.choice([50, 100]),
    "epochs": tune.choice([10, 20]),
}
Configure the search#
if optimizer == "Ray Tune":
optim = RayTuneSearch(
cv_num_folds=1, # the number of folds for cross-validation (0 or 1 to deactivate)
use_gpu=True, # flag to use the gpu if there are available
"gpu": 1
}, # a dictionary used to request GPU and CPU resources for each trial
sampler=sampler, # same as Optuna
direction="min", # the direction of optimization ("min" or "max")
num_iter=3, # the number of iterations to repeat the search
verbose=1, # verbose: 0 (silent), 1 (default), 2 (verbose)
Fit the optimizer#
start = time.time()
end = time.time() - start
print(f"Time taken: {end:.2f} seconds.")
[I 2023-08-10 18:30:42,101] Trial 0 finished with value: 0.6039116649574554 and parameters: {'batch_size': 100, 'epochs': 20}. Best is trial 0 with value: 0.6039116649574554.
[I 2023-08-10 18:30:48,913] Trial 1 finished with value: 0.5484475393481238 and parameters: {'batch_size': 50, 'epochs': 20}. Best is trial 1 with value: 0.5484475393481238.
Time taken: 16.13 seconds.
Get the results#
print("Best parameters:")
print("\nAssociated loss:")
Best parameters:
{'batch_size': 50, 'epochs': 20}
Associated loss:
[ ]: