Source code for tests.metrics.conftest

import pytest  # 3rd party packages
import pandas as pd
import numpy as np


[docs]@pytest.fixture(scope="package")
def df_mock_wbcd(df_wbcd: dict[str, pd.DataFrame]) -> dict[str, pd.DataFrame]:
    """
    Generate the continuous mock Wisconsin Breast Cancer Dataset wbcd without ids.

    :param df_wbcd: the wbcd dataset fixture, split into **train** and **test** sets
    :return: the dataframe containing the mock wbcd dataset, split into **train**, **test** and **2nd_gen** sets
    """
    # Shuffle each column with replacement

    df = {}

    for set in ["train", "test"]:
        df[set] = df_wbcd[set].apply(
            lambda x: np.random.choice(x.unique(), size=len(x), replace=True)
        )

    df["2nd_gen"] = df["train"].apply(
        lambda x: np.random.choice(x.unique(), size=len(x), replace=True)
    )

    for set in ["train", "test", "2nd_gen"]:
        # Ensure the support coverage is different
        df[set] = df[set].replace(
            {
                "Clump_Thickness": 3,
                "Uniformity_of_Cell_Shape": 1,
            },
            8,
        )
        # Ensure the consistency is different
        df[set] = df[set].replace({"Bland_Chromatin": 2}, 11)
        df[set] = df[set].replace({"Normal_Nucleoli": "2"}, "11")
        df[set] = df[set].replace({"Uniformity_of_Cell_Shape": 2}, 11)

    return df