Synthetic Data Generation – Find the best hyperparameters#
Tune the hyperparameters of synthetic data generation algorithms on the Wisconsin Breast Cancer Dataset (WBCD)
[1]:
# Standard library
import sys
import time
import warnings
sys.path.append("..")
warnings.simplefilter(action="ignore", category=FutureWarning)
# 3rd party packages
import pandas as pd
from optuna import samplers
from ray import tune
# Local packages
import config
from generators.ctabgan_generator import CTABGANGenerator
from generators.ctgan_generator import CTGANGenerator
from generators.dataSynthesizer import DataSynthesizerGenerator
from generators.mst_generator import MSTGenerator
from generators.smote import SmoteGenerator
from generators.synthpop_generator import SynthpopGenerator
from generators.findiff_generator import FindiffGenerator
from generators.tvae_generator import TVAEGenerator
from optimization.discrete_pso_search import DiscreteParticleSwarmOptimizationSearch
from optimization.objective_function import (
    absolute_difference_hinge_loss,
    distinguishability_hinge_loss,
)
from optimization.optuna_search import OptunaSearch
from optimization.raytune_search import RayTuneSearch
Load the real WBCD training dataset#
[2]:
df_real = pd.read_csv("../data/" + config.WBCD_DATASET_TRAIN_FILEPATH.stem + ".csv")
df_real.shape
[2]:
(359, 10)
Create the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict for the downstream learning task (used by SMOTE)#
[3]:
metadata = {
    "continuous": [
        "Clump_Thickness",
        "Uniformity_of_Cell_Size",
        "Uniformity_of_Cell_Shape",
        "Marginal_Adhesion",
        "Single_Epithelial_Cell_Size",
        "Bland_Chromatin",
        "Normal_Nucleoli",
        "Mitoses",
        "Bare_Nuclei",
    ],
    "categorical": ["Class"],
    "variable_to_predict": "Class",
}
Choose the objective function#
Two options:

- Minimize the distinguishability metric, i.e. the ability of a classifier to tell the real set apart from the synthetic one (an illustrative sketch of this idea follows the selection cells below).
- Minimize the absolute difference between the prediction scores obtained on the real validation set by a predictor trained on the real training set versus one trained on the synthetic training set.
[4]:
objective_function = (
    "distinguishability"  # can be "distinguishability" or "difference_prediction_score"
)
[5]:
if objective_function == "distinguishability":
    objective = distinguishability_hinge_loss
[6]:
if objective_function == "difference_prediction_score":
    objective = absolute_difference_hinge_loss
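For intuition, the distinguishability objective can be sketched as a real-versus-synthetic classification problem. The helper below is purely illustrative and is not how the local distinguishability_hinge_loss is implemented: it labels real rows 0 and synthetic rows 1, trains a classifier, and reports the AUC (around 0.5 means the two sets are indistinguishable, close to 1.0 means they are easy to tell apart).
[ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split


def distinguishability_auc(df_real, df_synthetic):
    # Illustrative only -- NOT the implementation of distinguishability_hinge_loss.
    X = pd.concat([df_real, df_synthetic], ignore_index=True)
    y = [0] * len(df_real) + [1] * len(df_synthetic)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0, stratify=y
    )
    clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    return roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])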
Choose the optimizer#
[7]:
optimizer = "Optuna" # "PSO", "Optuna" or "Ray Tune"
Discrete Particle Swarm Optimization (PSO) for variable order#
[8]:
if optimizer == "PSO":
    optim = DiscreteParticleSwarmOptimizationSearch(
        df=df_real,
        metadata=metadata,
        hyperparams={
            "variables_order": list(df_real.columns)
        },  # the variable to optimize, initialized with the default column order
        generator=SynthpopGenerator,  # the generator
        objective_function=objective,
        cv_num_folds=0,  # the number of folds for cross-validation (0 or 1 to deactivate)
        use_gpu=True,  # flag to use the GPU if one is available
        direction="min",  # the direction of optimization ("min" or "max")
        num_iter=2,  # the number of iterations to repeat the search
        population_size=2,  # the size of the swarm
    )
Optuna#
Configure the hyperparameter search space#
See Optuna for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html
[9]:
def params_to_explore_optuna(trial):
    params = {
        "batch_size": trial.suggest_categorical("batch_size", [50, 100]),
        "epochs": trial.suggest_categorical("epochs", [10, 20]),
    }
    return params
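The same pattern extends to the other trial.suggest_* methods if a broader search is wanted. The cell below is a sketch only and is not used later in this notebook; the learning_rate entry is hypothetical and would have to be a hyperparameter the chosen generator actually accepts.
[ ]:
def wider_params_to_explore_optuna(trial):
    # Illustrative sketch mixing categorical, integer and (commented-out) float suggestions.
    return {
        "batch_size": trial.suggest_categorical("batch_size", [50, 100, 200]),
        "epochs": trial.suggest_int("epochs", 10, 50, step=10),
        # "learning_rate": trial.suggest_float("learning_rate", 1e-4, 1e-2, log=True),  # hypothetical
    }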
Choose the sampler algorithm#
Can be random search, grid search, Bayesian search (TPESampler), etc. See https://optuna.readthedocs.io/en/stable/reference/samplers/index.html for more samplers.
[10]:
sampler = samplers.RandomSampler() # random search
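Other Optuna samplers can be swapped in without changing the rest of the configuration, for example (shown commented out, not used in this run):
[ ]:
# Bayesian optimization with the Tree-structured Parzen Estimator:
# sampler = samplers.TPESampler(seed=42)
# Exhaustive grid search over the values declared in the search space:
# sampler = samplers.GridSampler({"batch_size": [50, 100], "epochs": [10, 20]})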
Configure the search#
[11]:
if optimizer == "Optuna":
    optim = OptunaSearch(
        df=df_real,
        metadata=metadata,
        hyperparams=params_to_explore_optuna,
        generator=TVAEGenerator,  # the generator
        objective_function=objective,
        cv_num_folds=1,  # the number of folds for cross-validation (0 or 1 to deactivate)
        use_gpu=True,  # flag to use the GPU if one is available
        sampler=sampler,
        # pruner,  # the algorithm that stops unpromising trials early. No pruning by default.
        direction="minimize",  # the direction of optimization ("minimize" or "maximize")
        num_iter=2,  # the number of iterations to repeat the search
        verbose=1,  # whether to print the INFO logs (1) or not (0)
    )
[I 2023-08-10 18:30:32,753] A new study created in memory with name: no-name-cab0c499-09ce-4a09-a6a9-50d8eb8d2d3e
Ray Tune with Optuna for parallelization#
For a single run, Ray Tune is slower than standalone Optuna; its benefit is the ability to run trials in parallel.
Configure the hyperparameter search space#
See Ray Tune for more details: https://docs.ray.io/en/latest/tune/api/search_space.html
[12]:
params_to_explore_raytune = {
    "batch_size": tune.choice([50, 100]),
    "epochs": tune.choice([10, 20]),
}
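Other Ray Tune sampling primitives can be mixed into the same dictionary. The dictionary below is a sketch only and is not used later; the learning_rate key is hypothetical and would need to be a hyperparameter the generator actually accepts.
[ ]:
wider_params_to_explore_raytune = {
    "batch_size": tune.choice([50, 100, 200]),
    "epochs": tune.randint(10, 51),  # integer sampled uniformly from 10 to 50
    # "learning_rate": tune.loguniform(1e-4, 1e-2),  # hypothetical, sampled on a log scale
}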
Configure the search#
[13]:
if optimizer == "Ray Tune":
    optim = RayTuneSearch(
        df=df_real,
        metadata=metadata,
        hyperparams=params_to_explore_raytune,
        generator=TVAEGenerator,
        objective_function=objective,
        cv_num_folds=1,  # the number of folds for cross-validation (0 or 1 to deactivate)
        use_gpu=True,  # flag to use the GPU if one is available
        resources={
            "gpu": 1
        },  # a dictionary used to request GPU and CPU resources for each trial
        sampler=sampler,  # same as Optuna
        direction="min",  # the direction of optimization ("min" or "max")
        num_iter=3,  # the number of iterations to repeat the search
        verbose=1,  # verbose: 0 (silent), 1 (default), 2 (verbose)
    )
Fit the optimizer#
[14]:
start = time.time()
optim.fit()
end = time.time() - start
print(f"Time taken: {end:.2f} seconds.")
[I 2023-08-10 18:30:42,101] Trial 0 finished with value: 0.6039116649574554 and parameters: {'batch_size': 100, 'epochs': 20}. Best is trial 0 with value: 0.6039116649574554.
[I 2023-08-10 18:30:48,913] Trial 1 finished with value: 0.5484475393481238 and parameters: {'batch_size': 50, 'epochs': 20}. Best is trial 1 with value: 0.5484475393481238.
Time taken: 16.13 seconds.
Get the results#
[15]:
print("Best parameters:")
print(optim.best_params)
print("\nAssociated loss:")
print(optim.best_cost)
Best parameters:
{'batch_size': 50, 'epochs': 20}
Associated loss:
0.5484475393481238
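A natural follow-up (not part of the recorded run) is to retrain the generator with the winning configuration and sample a synthetic dataset. The constructor and method calls below are assumptions about the local generators API, shown only as a sketch to be adapted to the actual interface.
[ ]:
# Sketch only -- the TVAEGenerator constructor, fit() and sample() signatures
# are assumed here, not taken from the local package; adapt to the real API.
best_generator = TVAEGenerator(df_real, metadata, **optim.best_params)
best_generator.fit()
df_synthetic = best_generator.sample(len(df_real))
df_synthetic.head()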