Source code for tests.generators.test_generators

import pytest  # standard library
from typing import Type
import tempfile
from pathlib import Path
from inspect import getfullargspec

# 3rd party packages
import pandas as pd

# Local packages
from clover.generators import (
    Generator,
    DataSynthesizerGenerator,
    SynthpopGenerator,
    SmoteGenerator,
    TVAEGenerator,
    CTGANGenerator,
    FinDiffGenerator,
    MSTGenerator,
    CTABGANGenerator,
)


[docs]@pytest.mark.parametrize( "generator_non_dp", [ DataSynthesizerGenerator, SynthpopGenerator, SmoteGenerator, CTGANGenerator, TVAEGenerator, CTABGANGenerator, FinDiffGenerator, ], ) def test_generation_non_dp( generator_non_dp: Type[Generator], df_wbcd: dict[str, pd.DataFrame], metadata_wbcd: dict, ) -> None: """ Check the generation process for non differentially private generators. :param generator_non_dp: the class of the generator to test :param df_wbcd: the real Wisconsin Breast Cancer Dataset fixture, split into **train** and **test** sets :param metadata_wbcd: the wbcd metadata fixture :return: *None* """ # Instance parameters with tempfile.TemporaryDirectory() as temp_dir: # no need to keep the generated files temp_dir = Path(temp_dir) datapath = temp_dir / "real_data.csv" df_wbcd["train"].to_csv(datapath, index=False) d = { "df": df_wbcd["train"], "metadata": metadata_wbcd, "random_state": 0, "generator_filepath": None, "epsilon": None, "candidate_keys": None, # datasynthesizer "degree": 2, # datasynthesizer "variables_order": None, # synthpop "min_samples_leaf": 5, # synthpop "max_depth": None, # synthpop "k_neighbors": 5, # smote "discriminator_steps": 2, # ctgan "epochs": 2, # ctgan / tvae / ctabganplus / findiff "batch_size": 100, # ctgan / tvae / ctabganplus / findiff "verbose": 0, # ctgan "max_physical_batch_size": 126, # tvae "compress_dims": (249, 249), # tvae "decompress_dims": (249, 249), # tvae "mixed_columns": None, # ctabganplus "log_columns": None, # ctabganplus "integer_columns": None, # ctabganplus "class_dim": (256, 256, 256, 256), # ctabganplus "random_dim": 100, # ctabganplus "num_channels": 64, # ctabganplus "l2scale": 1e-5, # ctabganplus "learning_rate": 1e-4, # findiff "diffusion_steps": 50, # findiff "mpl_layers": [1024, 1024, 1024, 1024], # findiff "activation": "lrelu", # findiff "dim_t": 64, # findiff "cat_emb_dim": 2, # findiff "diff_beta_start_end": [1e-4, 0.02], # findiff "scheduler": "linear", # findiff # Parameters not applicable to non-dp generators "preprocess_metadata": None, "n_bins": None, # synthpop "methods": None, # synthpop "prediction_matrix": None, # synthpop "n_parents": None, # synthpop "nu": None, # smote "r": None, # smote "delta": None, # ctgan / tvae / ctabganplus / findiff "max_grad_norm": None, # ctgan / tvae / ctabganplus / findiff "nbins": 10, # mst } # Select only the expected instance parameters args = getfullargspec(generator_non_dp).args[1:] # remove self gen = generator_non_dp(*[d[arg] for arg in args]) # Preprocess and fit the generator gen.preprocess() gen.fit(save_path=temp_dir) # Check that the generator is saved num_files = len(list(temp_dir.glob("*"))) assert ( num_files >= 2 ), "The generator should have been saved" # with the datafile # Generate the samples df_synth = gen.sample(save_path=temp_dir, num_samples=len(df_wbcd["train"])) # Check that the generated samples are consistent num_files_plusone = len(list(Path(temp_dir).glob("*"))) assert num_files_plusone > num_files, "The samples should have been saved" assert ( df_wbcd["train"].shape == df_synth.shape ), "Datasets must have the same shape" assert set(df_wbcd["train"].columns) == set( df_synth.columns ), "Datasets must have the same columns"
[docs]@pytest.mark.parametrize( "generator_dp", [ DataSynthesizerGenerator, SynthpopGenerator, SmoteGenerator, MSTGenerator, CTGANGenerator, TVAEGenerator, CTABGANGenerator, FinDiffGenerator, ], ) def test_generation_dp( generator_dp: Type[Generator], df_wbcd: dict[str, pd.DataFrame], metadata_wbcd: dict, preprocess_metadata_wbcd: dict, ) -> None: """ Check the generation process for differentially private generators. :param generator_dp: the class of the generator to test :param df_wbcd: the real Wisconsin Breast Cancer Dataset fixture, split into **train** and **test** sets :param metadata_wbcd: the wbcd metadata fixture :param preprocess_metadata_wbcd: the wbcd preprocessing metadata fixture :return: *None* """ # Instance parameters with tempfile.TemporaryDirectory() as temp_dir: # no need to keep the generated files temp_dir = Path(temp_dir) datapath = temp_dir / "real_data.csv" df_wbcd["train"].to_csv(datapath, index=False) d = { "df": df_wbcd["train"], "metadata": metadata_wbcd, "random_state": 0, "generator_filepath": None, "epsilon": 1, "preprocess_metadata": preprocess_metadata_wbcd, "candidate_keys": None, # datasynthesizer "degree": 2, # datasynthesizer "variables_order": None, # synthpop "max_depth": 3, # synthpop "n_bins": 10, # synthpop "methods": None, # synthpop "prediction_matrix": None, # synthpop "n_parents": 2, # synthpop "k_neighbors": 5, # smote "nu": 0.5, # smote "cat_emb_dim": 2, # smote / findiff "r": 1, # smote "delta": 1e-9, # MST / ctgan / tvae / ctabganplus / findiff "discriminator_steps": 2, # ctgan "epochs": 2, # ctgan / tvae / ctabganplus / findiff "batch_size": 100, # ctgan / tvae / ctabganplus / findiff "max_grad_norm": 1, # ctgan / tvae / ctabganplus / findiff "verbose": 0, # ctgan "max_physical_batch_size": 126, # tvae "compress_dims": (249, 249), # tvae "decompress_dims": (249, 249), # tvae "mixed_columns": None, # ctabganplus "log_columns": None, # ctabganplus "integer_columns": None, # ctabganplus "class_dim": (256, 256, 256, 256), # ctabganplus "random_dim": 100, # ctabganplus "num_channels": 64, # ctabganplus "l2scale": 1e-5, # ctabganplus "learning_rate": 1e-4, # findiff "diffusion_steps": 50, # findiff "mpl_layers": [1024, 1024, 1024, 1024], # findiff "activation": "lrelu", # findiff "dim_t": 64, # findiff "diff_beta_start_end": [1e-4, 0.02], # findiff "scheduler": "linear", # findiff # Parameters not applicable to dp generators "min_samples_leaf": None, # synthpop "nbins": 10, # mst } # Select only the expected instance parameters args = getfullargspec(generator_dp).args[1:] # remove self gen = generator_dp(*[d[arg] for arg in args]) # Preprocess and fit the generator gen.preprocess() gen.fit(save_path=temp_dir) # Check that the generator is saved num_files = len(list(temp_dir.glob("*"))) assert ( num_files >= 2 ), "The generator should have been saved" # with the datafile # Generate the samples df_synth = gen.sample(save_path=temp_dir, num_samples=len(df_wbcd["train"])) # Check that the generated samples are consistent num_files_plusone = len(list(Path(temp_dir).glob("*"))) assert num_files_plusone > num_files, "The samples should have been saved" assert ( df_wbcd["train"].shape == df_synth.shape ), "Datasets must have the same shape" assert set(df_wbcd["train"].columns) == set( df_synth.columns ), "Datasets must have the same columns"