From dda5dc85ce924c91ca16e8306b89ebadb91de1c7 Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Tue, 26 May 2026 18:06:28 -0400 Subject: [PATCH 1/9] diabetes adapeted TF attack --- .../attacks/tartan_federer/classification.py | 2 +- .../attacks/tartan_federer/data_utils.py | 2 +- .../tartan_federer/tartan_federer_attack.py | 49 ++++++++++++------- 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/midst_toolkit/attacks/tartan_federer/classification.py b/src/midst_toolkit/attacks/tartan_federer/classification.py index 53f7a948..d47dc5c0 100644 --- a/src/midst_toolkit/attacks/tartan_federer/classification.py +++ b/src/midst_toolkit/attacks/tartan_federer/classification.py @@ -217,4 +217,4 @@ def fit_model( else: log(INFO, "Training complete (no validation set provided).") - return regression_model + return regression_model \ No newline at end of file diff --git a/src/midst_toolkit/attacks/tartan_federer/data_utils.py b/src/midst_toolkit/attacks/tartan_federer/data_utils.py index 6029ba3d..27a51220 100644 --- a/src/midst_toolkit/attacks/tartan_federer/data_utils.py +++ b/src/midst_toolkit/attacks/tartan_federer/data_utils.py @@ -206,4 +206,4 @@ def evaluate_attack_performance( roc_auc = roc_auc_score(solutions_arr, predictions_arr) predictions_arr = np.concatenate(predictions) - return {"max_tpr": tpr_at_fpr, "roc_auc": roc_auc} + return {"max_tpr": tpr_at_fpr, "roc_auc": roc_auc} \ No newline at end of file diff --git a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py index 7e2def75..31057ec8 100644 --- a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py +++ b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py @@ -107,28 +107,13 @@ def make_dataset_from_df_with_loaded( numerical_transform: StandardScaler | None = None, noise_scale: float = 0, ) -> Dataset: - """ - Create a dataset using artifacts. - - Args: - data: Raw data to be used for creating the dataset. - transformation: Transformations that one might apply to the dataset, including NaN policies etc. - is_target_conditioned: Enum indicating how, if at all, the model uses a target for generation conditioning. - table_metadata: Meta data about the table or tables. - label_encoders: Encoders that were used to encode the categorical data. - numerical_transform: Transformations that should be applied to the numerical data. Defaults to None. - noise_scale: he scale of the noise to add to the categorical features. Noise is drawn from a normal - distribution with standard deviation of ``noise_scale``. Defaults to 0. - Returns: - A full dataset constructed of the various pieces. - """ categorical_column_names, numerical_column_names = get_categorical_and_numerical_column_names( table_metadata, is_target_conditioned, ) numerical_features = {DataSplit.TRAIN.value: data[numerical_column_names].values.astype(np.float32)} - categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy(dtype=np.str_)} + categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy()} targets = {DataSplit.TRAIN.value: data[[table_metadata.target_column_name]].values.astype(np.float32)} if len(categorical_column_names) > 0: @@ -153,6 +138,14 @@ def make_dataset_from_df_with_loaded( numerical_features = categorical_features target_info = TargetInfo(policy=None, mean=None, std=None) + + + # Apply the model's pre-fitted numerical transform directly instead of re-fitting a new one. + # Calling transform_dataset() would fit a brand new QuantileTransformer on the MIA data, + # which produces a different normalization than the model saw during training, destroying signal. + if numerical_transform is not None: + numerical_features = {k: numerical_transform.transform(v) for k, v in numerical_features.items()} + dataset = Dataset( numerical_features=numerical_features, categorical_features=None, @@ -163,7 +156,10 @@ def make_dataset_from_df_with_loaded( categorical_transform=None, numerical_transform=numerical_transform, ) - return transform_dataset(dataset, transformation, None) + # Use a no-normalization transformation since we've already applied the model's scaler above. + from dataclasses import replace as dc_replace + transformation_no_norm = dc_replace(transformation, normalization=None) + return transform_dataset(dataset, transformation_no_norm, None) def get_dataset( @@ -448,7 +444,22 @@ def train_tartan_federer_attack_classifier( population_df_for_validation = pd.read_csv(population_data_dir / "population_dataset_for_validating_attack.csv") log(INFO, "Population datasets for validating loaded.") - noise_dimension = len([col for col in population_df_for_training.columns if "_id" not in col]) + # Fix 1: derive noise dimension from the actual diffusion model's num_numerical_features rather + # than from the population dataframe column count. The mixed_loss function slices + # x[:, :diffusion.num_numerical_features], so the noise vectors must have exactly that length. + # We load the first available model to read this value, then discard it. + first_model_number = (train_indices + (val_indices or []))[0] + first_model_dir = model_data_dir / f"{model_type}_{first_model_number}" + first_model_path = first_model_dir / target_model_subdir + _relation_order = [("None", "trans")] if model_type == "tabddpm" else [] + for _parent, _child in _relation_order: + _ckpt_path = first_model_path / f"{_parent}_{_child}_ckpt.pkl" + with open(_ckpt_path, "rb") as _f: + _probe_model = CustomUnpickler(_f).load() + noise_dimension = _probe_model.diffusion.num_numerical_features + log(INFO, f"Noise dimension read from diffusion model: {noise_dimension}") + break + input_noise = [np.random.normal(size=noise_dimension).tolist() for _ in range(num_noise_per_time_step)] input_dimension = len(input_noise) * len(timesteps) * len(additional_timesteps) @@ -718,4 +729,4 @@ def tartan_federer_attack( f.write(str(mia_performance_test) + "\n") log(INFO, f"MIA performance results saved to {results_path / 'mia_performance.txt'}") - return mia_performance_train, mia_performance_val, mia_performance_test + return mia_performance_train, mia_performance_val, mia_performance_test \ No newline at end of file From a5b00ed965a335768e8d036ecd56c31eac09b00b Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Tue, 26 May 2026 18:11:47 -0400 Subject: [PATCH 2/9] Adapt Tartan Federer attack for diabetes --- .../attacks/tartan_federer/classification.py | 2 +- .../attacks/tartan_federer/data_utils.py | 2 +- .../tartan_federer/tartan_federer_attack.py | 22 +++++++++++++---- src/midst_toolkit/models/clavaddpm/dataset.py | 13 ++++++++++ .../models/clavaddpm/dataset_utils.py | 24 +++++++++++++++++-- 5 files changed, 55 insertions(+), 8 deletions(-) diff --git a/src/midst_toolkit/attacks/tartan_federer/classification.py b/src/midst_toolkit/attacks/tartan_federer/classification.py index d47dc5c0..53f7a948 100644 --- a/src/midst_toolkit/attacks/tartan_federer/classification.py +++ b/src/midst_toolkit/attacks/tartan_federer/classification.py @@ -217,4 +217,4 @@ def fit_model( else: log(INFO, "Training complete (no validation set provided).") - return regression_model \ No newline at end of file + return regression_model diff --git a/src/midst_toolkit/attacks/tartan_federer/data_utils.py b/src/midst_toolkit/attacks/tartan_federer/data_utils.py index 27a51220..6029ba3d 100644 --- a/src/midst_toolkit/attacks/tartan_federer/data_utils.py +++ b/src/midst_toolkit/attacks/tartan_federer/data_utils.py @@ -206,4 +206,4 @@ def evaluate_attack_performance( roc_auc = roc_auc_score(solutions_arr, predictions_arr) predictions_arr = np.concatenate(predictions) - return {"max_tpr": tpr_at_fpr, "roc_auc": roc_auc} \ No newline at end of file + return {"max_tpr": tpr_at_fpr, "roc_auc": roc_auc} diff --git a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py index 31057ec8..20d83f33 100644 --- a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py +++ b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py @@ -3,6 +3,7 @@ import csv import os from collections.abc import Generator +from dataclasses import replace as dc_replace from logging import INFO from pathlib import Path from typing import Any @@ -98,6 +99,7 @@ def mixed_loss( # TODO: Unify this with the Dataset.from_df function. +# TODO: Noise scale is always called with a value of 0 for the attack. def make_dataset_from_df_with_loaded( data: pd.DataFrame, transformation: Transformations, @@ -107,7 +109,21 @@ def make_dataset_from_df_with_loaded( numerical_transform: StandardScaler | None = None, noise_scale: float = 0, ) -> Dataset: + """ + Makes a dataset from a dataframe with loaded transformations. + + Args: + data: The dataframe to make the dataset from. + transformation: The transformations to apply to the data. + is_target_conditioned: Whether the target is conditioned on the data. + table_metadata: The metadata for the table. + label_encoders: The label encoders for the categorical columns. + numerical_transform: The numerical transform to apply to the data. + noise_scale: The scale of the noise to add to the data. + Returns: + A dataset object. + """ categorical_column_names, numerical_column_names = get_categorical_and_numerical_column_names( table_metadata, is_target_conditioned, @@ -139,7 +155,6 @@ def make_dataset_from_df_with_loaded( target_info = TargetInfo(policy=None, mean=None, std=None) - # Apply the model's pre-fitted numerical transform directly instead of re-fitting a new one. # Calling transform_dataset() would fit a brand new QuantileTransformer on the MIA data, # which produces a different normalization than the model saw during training, destroying signal. @@ -157,7 +172,6 @@ def make_dataset_from_df_with_loaded( numerical_transform=numerical_transform, ) # Use a no-normalization transformation since we've already applied the model's scaler above. - from dataclasses import replace as dc_replace transformation_no_norm = dc_replace(transformation, normalization=None) return transform_dataset(dataset, transformation_no_norm, None) @@ -390,7 +404,7 @@ def prepare_dataframe( return filter_dataframe(merged_data, df_data, columns_for_deduplication) -def train_tartan_federer_attack_classifier( +def train_tartan_federer_attack_classifier( # noqa: PLR0915 train_indices: list[int], val_indices: list[int] | None, timesteps: list[int], @@ -729,4 +743,4 @@ def tartan_federer_attack( f.write(str(mia_performance_test) + "\n") log(INFO, f"MIA performance results saved to {results_path / 'mia_performance.txt'}") - return mia_performance_train, mia_performance_val, mia_performance_test \ No newline at end of file + return mia_performance_train, mia_performance_val, mia_performance_test diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index e3fee9e0..ee8a7d0a 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -377,10 +377,23 @@ def from_df( column_orders = numerical_column_names + categorical_column_names # Encode the categorical features and merge them with the numerical features + # Look for pre-fitted label encoders in the parent directories of the data + import os as _os + + _le_path = None + for _parent in [ + _os.path.join("whitebox_single_table_DI", "label_encoders.pkl"), + _os.path.join("whitebox_single_table_70", "label_encoders.pkl"), + ]: + if _os.path.exists(_parent): + _le_path = _parent + break features, label_encoders = encode_and_merge_features( categorical_features, numerical_features, noise_scale, + categorical_column_names=categorical_column_names if len(categorical_column_names) > 0 else None, + label_encoders_path=_le_path, ) assert isinstance(table_metadata.n_classes, int) diff --git a/src/midst_toolkit/models/clavaddpm/dataset_utils.py b/src/midst_toolkit/models/clavaddpm/dataset_utils.py index 8837af80..28b6d640 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset_utils.py +++ b/src/midst_toolkit/models/clavaddpm/dataset_utils.py @@ -57,6 +57,8 @@ def encode_and_merge_features( categorical_features: ArrayDict | None, numerical_features: ArrayDict | None, noise_scale: float, + categorical_column_names: list[str] | None = None, + label_encoders_path: str | None = None, ) -> tuple[ArrayDict, dict[int, LabelEncoder]]: """ Merge the categorical with the numerical features for train, validation, and test datasets. Numerical features @@ -75,6 +77,9 @@ def encode_and_merge_features( keys are "train", "val", "test" from the DataSplit enumeration noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal distribution with standard deviation of ``noise_scale``. + categorical_column_names: The names of the categorical columns. + label_encoders_path: The path to the label encoders pkl file. If provided, already fitted label encoder + will be loaded from the pkl file, otherwise they will be fitted on the current data. Returns: The merged features for train, validation, and test datasets and the label encoders used to do so. The label @@ -95,11 +100,26 @@ def encode_and_merge_features( ) ) + # Load pre-fitted label encoders from pkl if provided, otherwise fit on current data + preloaded_encoders: dict[str, LabelEncoder] | None = None + if label_encoders_path is not None: + _pkl_path = Path(label_encoders_path) + if _pkl_path.exists(): + with open(_pkl_path, "rb") as _f: + preloaded_encoders = pickle.load(_f) + categorical_data_encoded = [] label_encoders = {} for column in range(all_categorical_data.shape[1]): - label_encoder = LabelEncoder() - encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float) + col_name = categorical_column_names[column] if categorical_column_names is not None else None + if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders: + # Use pre-fitted encoder from full dataset (e.g. 101K rows) + label_encoder = preloaded_encoders[col_name] + encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float) + else: + # Fallback: fit on current data + label_encoder = LabelEncoder() + encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float) if noise_scale > 0: # add noise encoded_labels += np.random.normal(0, noise_scale, encoded_labels.shape) From 8b9415a619e8921f964f65f44524ee7b8472496e Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 12:19:29 -0400 Subject: [PATCH 3/9] Updated test --- .../test_tartan_federer_attack.py | 39 ++++++++++--------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py index c94f4549..5fea0f57 100644 --- a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py +++ b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py @@ -25,12 +25,12 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit(): "model_data_dir": base_path, "target_model_subdir": Path("."), "model_type": "tabddpm", - "classifier_hidden_dim": 20, - "classifier_num_epochs": 200, + "classifier_hidden_dim": 100, + "classifier_num_epochs": 20, "samples_per_train_model": 3000, "samples_per_val_model": 10, "num_noise_per_time_step": 30, - "timesteps": [5, 10, 15], + "timesteps": [5, 7, 9], "additional_timesteps": [0], "predictions_file_name": "challenge_label_predictions", # TODO: Make results path a temp directory @@ -52,14 +52,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit(): roc_auc_test = mia_performance_test["roc_auc"] tpr_at_fpr_test = mia_performance_test["max_tpr"] - assert roc_auc_train == pytest.approx(0.4469875, abs=1e-8) - assert tpr_at_fpr_train == pytest.approx(0.08, abs=1e-8) + assert roc_auc_train == pytest.approx(0.63159999999999999, abs=1e-8) + assert tpr_at_fpr_train == pytest.approx(0.165, abs=1e-8) - assert roc_auc_val == pytest.approx(0.5054624999999999, abs=1e-8) - assert tpr_at_fpr_val == pytest.approx(0.125, abs=1e-8) + assert roc_auc_val == pytest.approx(0.6732, abs=1e-8) + assert tpr_at_fpr_val == pytest.approx(0.28, abs=1e-8) - assert roc_auc_test == pytest.approx(0.4937875, abs=1e-8) - assert tpr_at_fpr_test == pytest.approx(0.115, abs=1e-8) + assert roc_auc_test == pytest.approx(0.6607, abs=1e-8) + assert tpr_at_fpr_test == pytest.approx(0.19, abs=1e-8) unset_all_random_seeds() os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) @@ -107,14 +107,15 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_single_model(): roc_auc_test = mia_performance_test["roc_auc"] tpr_at_fpr_test = mia_performance_test["max_tpr"] - assert roc_auc_train == pytest.approx(0.5046999999999999, abs=1e-8) - assert tpr_at_fpr_train == pytest.approx(0.09, abs=1e-8) - assert roc_auc_val == pytest.approx(0.47159999999999996, abs=1e-8) - assert tpr_at_fpr_val == pytest.approx(0.12, abs=1e-8) + assert roc_auc_train == pytest.approx(0.6985000000000001, abs=1e-8) + assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8) + + assert roc_auc_val == pytest.approx(0.7075, abs=1e-8) + assert tpr_at_fpr_val == pytest.approx(0.32, abs=1e-8) - assert roc_auc_test == pytest.approx(0.46390000000000003, abs=1e-8) - assert tpr_at_fpr_test == pytest.approx(0.16, abs=1e-8) + assert roc_auc_test == pytest.approx(0.8042, abs=1e-8) + assert tpr_at_fpr_test == pytest.approx(0.56, abs=1e-8) unset_all_random_seeds() os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) @@ -162,11 +163,11 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_no_validation(): assert mia_performance_val is None - assert roc_auc_train == pytest.approx(0.4996999999999999, abs=1e-8) - assert tpr_at_fpr_train == pytest.approx(0.07, abs=1e-8) + assert roc_auc_train == pytest.approx(0.6980999999999999, abs=1e-8) + assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8) - assert roc_auc_test == pytest.approx(0.5174, abs=1e-8) - assert tpr_at_fpr_test == pytest.approx(0.13, abs=1e-8) + assert roc_auc_test == pytest.approx(0.7075000000000001, abs=1e-8) + assert tpr_at_fpr_test == pytest.approx(0.32, abs=1e-8) unset_all_random_seeds() os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None) From 616c96dd0417be26c21838e6797502d2a1e6ea1f Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 12:30:37 -0400 Subject: [PATCH 4/9] cleaning up and ruff check comment --- src/midst_toolkit/models/clavaddpm/dataset.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset.py b/src/midst_toolkit/models/clavaddpm/dataset.py index ee8a7d0a..dd6739c1 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset.py +++ b/src/midst_toolkit/models/clavaddpm/dataset.py @@ -280,6 +280,7 @@ def from_df( table_metadata: TableMetadata, data_split_percentages: list[float] | None = None, noise_scale: float = 0, + label_encoders_path: str | None = None, # TODO: Find places in code that have this kind of hardcoded random default and remove (with TESTING) data_split_random_state: int = 42, ) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]: @@ -314,6 +315,8 @@ def from_df( data_split_percentages: The percentages of the dataset to go into train, val, and test splits. The sum of the percentages must amount to 1 (within a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1]. noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0. + label_encoders_path: The path to the label encoders pkl file. If provided, already fitted label encoder + will be loaded from the pkl file, otherwise they will be fitted on the current data. data_split_random_state: The random state to use for the data split. Will be passed down to the ``train_test_split`` function from sklearn. Optional, default is 42. @@ -378,22 +381,13 @@ def from_df( # Encode the categorical features and merge them with the numerical features # Look for pre-fitted label encoders in the parent directories of the data - import os as _os - - _le_path = None - for _parent in [ - _os.path.join("whitebox_single_table_DI", "label_encoders.pkl"), - _os.path.join("whitebox_single_table_70", "label_encoders.pkl"), - ]: - if _os.path.exists(_parent): - _le_path = _parent - break + features, label_encoders = encode_and_merge_features( categorical_features, numerical_features, noise_scale, categorical_column_names=categorical_column_names if len(categorical_column_names) > 0 else None, - label_encoders_path=_le_path, + label_encoders_path=label_encoders_path, ) assert isinstance(table_metadata.n_classes, int) From 0b698b16079364082e25e8563803693c8eefd02b Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 12:46:39 -0400 Subject: [PATCH 5/9] ruff --- .../attacks/tartan_federer/test_tartan_federer_attack.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py index 5fea0f57..f44f3877 100644 --- a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py +++ b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py @@ -107,7 +107,6 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_single_model(): roc_auc_test = mia_performance_test["roc_auc"] tpr_at_fpr_test = mia_performance_test["max_tpr"] - assert roc_auc_train == pytest.approx(0.6985000000000001, abs=1e-8) assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8) From 200eb888aac4c34b2183eeeed2efc4075f59afe1 Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 13:14:35 -0400 Subject: [PATCH 6/9] changed atack numbers with a cpu run to make more stable --- .../attacks/tartan_federer/test_tartan_federer_attack.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py index f44f3877..84d313d6 100644 --- a/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py +++ b/tests/integration/attacks/tartan_federer/test_tartan_federer_attack.py @@ -52,7 +52,7 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit(): roc_auc_test = mia_performance_test["roc_auc"] tpr_at_fpr_test = mia_performance_test["max_tpr"] - assert roc_auc_train == pytest.approx(0.63159999999999999, abs=1e-8) + assert roc_auc_train == pytest.approx(0.6315875, abs=1e-8) assert tpr_at_fpr_train == pytest.approx(0.165, abs=1e-8) assert roc_auc_val == pytest.approx(0.6732, abs=1e-8) From c01f3a956ad3b80bcdf0f150898dc29ca2901485 Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 14:30:44 -0400 Subject: [PATCH 7/9] addressed coderabbit comments --- .../tartan_federer/tartan_federer_attack.py | 10 ++++-- .../models/clavaddpm/dataset_utils.py | 32 +++++++++++++++---- 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py index 20d83f33..2218fdbe 100644 --- a/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py +++ b/src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py @@ -404,7 +404,7 @@ def prepare_dataframe( return filter_dataframe(merged_data, df_data, columns_for_deduplication) -def train_tartan_federer_attack_classifier( # noqa: PLR0915 +def train_tartan_federer_attack_classifier( # noqa: PLR0915, PLR0912 train_indices: list[int], val_indices: list[int] | None, timesteps: list[int], @@ -465,7 +465,13 @@ def train_tartan_federer_attack_classifier( # noqa: PLR0915 first_model_number = (train_indices + (val_indices or []))[0] first_model_dir = model_data_dir / f"{model_type}_{first_model_number}" first_model_path = first_model_dir / target_model_subdir - _relation_order = [("None", "trans")] if model_type == "tabddpm" else [] + + if model_type != "tabddpm": + raise ValueError( + f"Unsupported model_type {model_type}. Tartan Federer Attack is only supported for ClavaDDPM-single-table models." + ) + # TODO: We should read this from the metadata instead. + _relation_order = [("None", "trans")] for _parent, _child in _relation_order: _ckpt_path = first_model_path / f"{_parent}_{_child}_ckpt.pkl" with open(_ckpt_path, "rb") as _f: diff --git a/src/midst_toolkit/models/clavaddpm/dataset_utils.py b/src/midst_toolkit/models/clavaddpm/dataset_utils.py index 28b6d640..e1d591fa 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset_utils.py +++ b/src/midst_toolkit/models/clavaddpm/dataset_utils.py @@ -104,25 +104,43 @@ def encode_and_merge_features( preloaded_encoders: dict[str, LabelEncoder] | None = None if label_encoders_path is not None: _pkl_path = Path(label_encoders_path) - if _pkl_path.exists(): - with open(_pkl_path, "rb") as _f: - preloaded_encoders = pickle.load(_f) + + if not _pkl_path.exists(): + raise FileNotFoundError(f"label_encoders_path does not exist: {_pkl_path}") + with open(_pkl_path, "rb") as _f: + preloaded_encoders = pickle.load(_f) + + if preloaded_encoders is not None: + if categorical_column_names is None: + raise ValueError("categorical_column_names must be provided when using label_encoders_path.") + + expected_cols = set(categorical_column_names) + available_cols = set(preloaded_encoders.keys()) + + missing_cols = expected_cols - available_cols + + if missing_cols: + raise ValueError( + "label_encoders_path is missing encoders for categorical columns: " + f"{sorted(missing_cols)}. " + "Refusing to mix preloaded encoders with freshly fit encoders." + ) categorical_data_encoded = [] label_encoders = {} for column in range(all_categorical_data.shape[1]): col_name = categorical_column_names[column] if categorical_column_names is not None else None - if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders: - # Use pre-fitted encoder from full dataset (e.g. 101K rows) + + if preloaded_encoders is not None: label_encoder = preloaded_encoders[col_name] encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float) else: - # Fallback: fit on current data label_encoder = LabelEncoder() encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float) + if noise_scale > 0: - # add noise encoded_labels += np.random.normal(0, noise_scale, encoded_labels.shape) + categorical_data_encoded.append(encoded_labels) label_encoders[column] = label_encoder From a66d5e951a5247fe9e26858e2ef4784726548379 Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Wed, 27 May 2026 15:39:15 -0400 Subject: [PATCH 8/9] fix mypy issues --- src/midst_toolkit/models/clavaddpm/dataset_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset_utils.py b/src/midst_toolkit/models/clavaddpm/dataset_utils.py index e1d591fa..a5f455de 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset_utils.py +++ b/src/midst_toolkit/models/clavaddpm/dataset_utils.py @@ -111,8 +111,7 @@ def encode_and_merge_features( preloaded_encoders = pickle.load(_f) if preloaded_encoders is not None: - if categorical_column_names is None: - raise ValueError("categorical_column_names must be provided when using label_encoders_path.") + assert categorical_column_names is not None, "categorical_column_names must be provided when using label_encoders_path." expected_cols = set(categorical_column_names) available_cols = set(preloaded_encoders.keys()) @@ -126,13 +125,14 @@ def encode_and_merge_features( "Refusing to mix preloaded encoders with freshly fit encoders." ) + + categorical_data_encoded = [] label_encoders = {} for column in range(all_categorical_data.shape[1]): - col_name = categorical_column_names[column] if categorical_column_names is not None else None - if preloaded_encoders is not None: - label_encoder = preloaded_encoders[col_name] + assert categorical_column_names is not None + label_encoder = preloaded_encoders[categorical_column_names[column]] encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float) else: label_encoder = LabelEncoder() From fee5134548cbff3dd62f4bef0a1dbfdf1c4902a4 Mon Sep 17 00:00:00 2001 From: bzamanlooy Date: Thu, 28 May 2026 11:45:22 -0400 Subject: [PATCH 9/9] fix mypy error --- src/midst_toolkit/models/clavaddpm/dataset_utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/midst_toolkit/models/clavaddpm/dataset_utils.py b/src/midst_toolkit/models/clavaddpm/dataset_utils.py index a5f455de..31c2af88 100644 --- a/src/midst_toolkit/models/clavaddpm/dataset_utils.py +++ b/src/midst_toolkit/models/clavaddpm/dataset_utils.py @@ -111,7 +111,9 @@ def encode_and_merge_features( preloaded_encoders = pickle.load(_f) if preloaded_encoders is not None: - assert categorical_column_names is not None, "categorical_column_names must be provided when using label_encoders_path." + assert categorical_column_names is not None, ( + "categorical_column_names must be provided when using label_encoders_path." + ) expected_cols = set(categorical_column_names) available_cols = set(preloaded_encoders.keys()) @@ -125,14 +127,12 @@ def encode_and_merge_features( "Refusing to mix preloaded encoders with freshly fit encoders." ) - - categorical_data_encoded = [] label_encoders = {} for column in range(all_categorical_data.shape[1]): if preloaded_encoders is not None: assert categorical_column_names is not None - label_encoder = preloaded_encoders[categorical_column_names[column]] + label_encoder = preloaded_encoders[categorical_column_names[column]] encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float) else: label_encoder = LabelEncoder()