Synthetic Data Generation – Find the best hyperparameters#
Tune hyperparameters of synthetic data generation algorithms, based on the Wisconsin Breast Cancer Dataset (WBCD)
[25]:
# Standard library
import sys
import time
import warnings
from pathlib import Path
sys.path.append("..")
warnings.simplefilter(action="ignore", category=FutureWarning)
# 3rd party packages
import pandas as pd
from optuna import samplers
from ray import tune
# Local packages
from clover.generators import (
Generator,
DataSynthesizerGenerator,
SynthpopGenerator,
SmoteGenerator,
TVAEGenerator,
CTGANGenerator,
FinDiffGenerator,
MSTGenerator,
CTABGANGenerator,
)
from clover.optimization import (
DiscreteParticleSwarmOptimizationSearch,
absolute_difference_hinge_loss,
distinguishability_hinge_loss,
OptunaSearch,
RayTuneSearch,
)
[27]:
# Root of the project (the notebook's working directory) and the folder
# where optimization artifacts (e.g. Ray Tune results) are written.
PROJECT_PATH = Path(".")
OUTPUT_PATH = PROJECT_PATH.joinpath("output")
Load the real WBCD training dataset#
[ ]:
# Load the real WBCD training split; the generators below are tuned to
# imitate this table.
df_real = pd.read_csv("data/breast_cancer_wisconsin_train.csv")
# Last expression of the cell — the notebook echoes (n_rows, n_columns).
df_real.shape
Create the metadata dictionary#
The continuous and categorical variables need to be specified, as well as the variable to predict for the future learning task (used by SMOTE)#
[ ]:
# Dataset description consumed by the generators and search objects.
metadata = {
    # Columns treated as continuous features.
    "continuous": [
        "Clump_Thickness",
        "Uniformity_of_Cell_Size",
        "Uniformity_of_Cell_Shape",
        "Marginal_Adhesion",
        "Single_Epithelial_Cell_Size",
        "Bland_Chromatin",
        "Normal_Nucleoli",
        "Mitoses",
        "Bare_Nuclei",
    ],
    # The diagnosis label is the only categorical column.
    "categorical": ["Class"],
    # Target of the downstream learning task (used by SMOTE).
    "variable_to_predict": "Class",
}
Choose the objective function#
Two options:
- Minimize the distinguishability metric (the ability of a classifier to distinguish the real set from the synthetic one).
- Minimize the absolute difference between the prediction scores on the real validation set for a predictor trained on the real training set versus one trained on the synthetic training set.
[ ]:
# Select which of the two objectives described above the search minimizes.
objective_function = (
    "distinguishability"  # can be "distinguishability" or "difference_prediction_score"
)
[ ]:
# Hinge loss on a classifier's ability to tell real rows from synthetic ones.
# NOTE(review): `objective` is only bound by this cell or the next one — an
# unexpected `objective_function` value leaves it undefined and the search
# construction below would raise NameError.
if objective_function == "distinguishability":
    objective = distinguishability_hinge_loss
[ ]:
# Hinge loss on the gap between prediction scores of a model trained on the
# real training set versus one trained on the synthetic training set.
if objective_function == "difference_prediction_score":
    objective = absolute_difference_hinge_loss
Choose the optimizer#
[ ]:
optimizer = "Optuna" # "PSO", "Optuna" or "Ray Tune"
Discrete Particle Swarm Optimization (PSO) for variable order#
[ ]:
# Discrete PSO searches over permutations of the column order, the single
# hyperparameter tuned for Synthpop here.
if optimizer == "PSO":
    optim = DiscreteParticleSwarmOptimizationSearch(
        df=df_real,
        metadata=metadata,
        hyperparams={
            "variables_order": list(df_real.columns)
        },  # the variable to optimize, seeded with the default column sequence
        generator=SynthpopGenerator,  # the generator class to tune
        objective_function=objective,
        cv_num_folds=0,  # number of cross-validation folds (0 or 1 deactivates it)
        use_gpu=True,  # use a GPU when one is available
        direction="min",  # direction of optimization ("min" or "max")
        num_iter=2,  # number of iterations of the search
        population_size=2,  # size of the swarm
    )
Optuna#
Configure the hyperparameters search space#
See Optuna for more details: https://optuna.readthedocs.io/en/stable/reference/generated/optuna.trial.Trial.html
[ ]:
def params_to_explore_optuna(trial):
    """Define the Optuna search space for the generator's hyperparameters.

    ``trial`` is an ``optuna.trial.Trial``; each ``suggest_categorical``
    call registers one hyperparameter and returns the sampled value.
    """
    return {
        "batch_size": trial.suggest_categorical("batch_size", [50, 100]),
        "epochs": trial.suggest_categorical("epochs", [10, 20]),
    }
Choose the sampler algorithm#
Can be random search, grid search, Bayesian search (TPESampler), etc. See https://optuna.readthedocs.io/en/stable/reference/samplers/index.html for more samplers.
[ ]:
sampler = samplers.RandomSampler() # random search
Configure the search#
[ ]:
# Standalone Optuna search over the space defined by params_to_explore_optuna.
if optimizer == "Optuna":
    optim = OptunaSearch(
        df=df_real,
        metadata=metadata,
        hyperparams=params_to_explore_optuna,  # callable building the search space per trial
        generator=TVAEGenerator,  # the generator class to tune
        objective_function=objective,
        cv_num_folds=1,  # number of cross-validation folds (0 or 1 deactivates it)
        use_gpu=True,  # use a GPU when one is available
        sampler=sampler,
        # pruner  # the algorithm early-stopping unsuccessful trials. No pruning by default.
        direction="minimize",  # direction of optimization ("minimize" or "maximize")
        num_iter=2,  # number of iterations of the search
        verbose=1,  # whether to print the INFO logs (1) or not (0)
    )
Ray Tune with Optuna for parallelization#
For a single run, slower than standalone Optuna.
Configure the hyperparameters search space#
See Ray Tune for more details: https://docs.ray.io/en/latest/tune/api/search_space.html
[ ]:
# Ray Tune search space: same hyperparameters and candidate values as the
# Optuna space above, expressed with tune.choice.
params_to_explore_raytune = {
    "batch_size": tune.choice([50, 100]),
    "epochs": tune.choice([10, 20]),
}
Configure the search#
[ ]:
# Ray Tune wrapping Optuna, to parallelize trials across resources.
if optimizer == "Ray Tune":
    optim = RayTuneSearch(
        df=df_real,
        metadata=metadata,
        hyperparams=params_to_explore_raytune,
        generator=TVAEGenerator,  # the generator class to tune
        objective_function=objective,
        cv_num_folds=1,  # number of cross-validation folds (0 or 1 deactivates it)
        use_gpu=True,  # use a GPU when one is available
        resources={
            "gpu": 1
        },  # a dictionary used to request GPU and CPU resources for each trial
        sampler=sampler,  # same as Optuna
        direction="min",  # direction of optimization ("min" or "max")
        num_iter=1,  # number of iterations of the search
        verbose=1,  # verbose: 0 (silent), 1 (default), 2 (verbose)
        output_path=OUTPUT_PATH,  # where Ray Tune writes its results
    )
Fit the optimizer#
[ ]:
# Run the configured search and report the elapsed wall-clock time.
# time.perf_counter() is the documented clock for measuring durations:
# unlike time.time(), it is monotonic and immune to system clock updates.
start = time.perf_counter()
optim.fit()
end = time.perf_counter() - start
print(f"Time taken: {end:.2f} seconds.")
Get the results#
[ ]:
# Report the best hyperparameter configuration found and its loss value.
for label, value in (
    ("Best parameters:", optim.best_params),
    ("\nAssociated loss:", optim.best_cost),
):
    print(label)
    print(value)
[ ]: