Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 45 additions & 14 deletions src/midst_toolkit/attacks/tartan_federer/tartan_federer_attack.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import csv
import os
from collections.abc import Generator
from dataclasses import replace as dc_replace
from logging import INFO
from pathlib import Path
from typing import Any
Expand Down Expand Up @@ -98,6 +99,7 @@ def mixed_loss(


# TODO: Unify this with the Dataset.from_df function.
# TODO: Noise scale is always called with a value of 0 for the attack.
def make_dataset_from_df_with_loaded(
data: pd.DataFrame,
transformation: Transformations,
Expand All @@ -108,27 +110,26 @@ def make_dataset_from_df_with_loaded(
noise_scale: float = 0,
) -> Dataset:
"""
Create a dataset using artifacts.
Makes a dataset from a dataframe with loaded transformations.

Args:
data: Raw data to be used for creating the dataset.
transformation: Transformations that one might apply to the dataset, including NaN policies etc.
is_target_conditioned: Enum indicating how, if at all, the model uses a target for generation conditioning.
table_metadata: Meta data about the table or tables.
label_encoders: Encoders that were used to encode the categorical data.
numerical_transform: Transformations that should be applied to the numerical data. Defaults to None.
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``. Defaults to 0.
data: The dataframe to make the dataset from.
transformation: The transformations to apply to the data.
is_target_conditioned: Enum indicating how, if at all, the model uses a target for generation conditioning.
table_metadata: The metadata for the table.
label_encoders: The label encoders for the categorical columns.
numerical_transform: The numerical transform to apply to the data.
noise_scale: The scale of the noise to add to the data.

Returns:
A full dataset constructed of the various pieces.
A dataset object.
"""
categorical_column_names, numerical_column_names = get_categorical_and_numerical_column_names(
table_metadata,
is_target_conditioned,
)
numerical_features = {DataSplit.TRAIN.value: data[numerical_column_names].values.astype(np.float32)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy(dtype=np.str_)}
categorical_features = {DataSplit.TRAIN.value: data[categorical_column_names].to_numpy()}
targets = {DataSplit.TRAIN.value: data[[table_metadata.target_column_name]].values.astype(np.float32)}

if len(categorical_column_names) > 0:
Expand All @@ -153,6 +154,13 @@ def make_dataset_from_df_with_loaded(
numerical_features = categorical_features

target_info = TargetInfo(policy=None, mean=None, std=None)

# Apply the model's pre-fitted numerical transform directly instead of re-fitting a new one.
# Calling transform_dataset() would fit a brand new QuantileTransformer on the MIA data,
# which produces a different normalization than the model saw during training, destroying signal.
if numerical_transform is not None:
numerical_features = {k: numerical_transform.transform(v) for k, v in numerical_features.items()}

dataset = Dataset(
numerical_features=numerical_features,
categorical_features=None,
Expand All @@ -163,7 +171,9 @@ def make_dataset_from_df_with_loaded(
categorical_transform=None,
numerical_transform=numerical_transform,
)
return transform_dataset(dataset, transformation, None)
# Use a no-normalization transformation since we've already applied the model's scaler above.
transformation_no_norm = dc_replace(transformation, normalization=None)
return transform_dataset(dataset, transformation_no_norm, None)


def get_dataset(
Expand Down Expand Up @@ -394,7 +404,7 @@ def prepare_dataframe(
return filter_dataframe(merged_data, df_data, columns_for_deduplication)


def train_tartan_federer_attack_classifier(
def train_tartan_federer_attack_classifier( # noqa: PLR0915, PLR0912
train_indices: list[int],
val_indices: list[int] | None,
timesteps: list[int],
Expand Down Expand Up @@ -448,7 +458,28 @@ def train_tartan_federer_attack_classifier(
population_df_for_validation = pd.read_csv(population_data_dir / "population_dataset_for_validating_attack.csv")
log(INFO, "Population datasets for validating loaded.")

noise_dimension = len([col for col in population_df_for_training.columns if "_id" not in col])
# Fix 1: derive noise dimension from the actual diffusion model's num_numerical_features rather
# than from the population dataframe column count. The mixed_loss function slices
# x[:, :diffusion.num_numerical_features], so the noise vectors must have exactly that length.
# We load the first available model to read this value, then discard it.
first_model_number = (train_indices + (val_indices or []))[0]
first_model_dir = model_data_dir / f"{model_type}_{first_model_number}"
first_model_path = first_model_dir / target_model_subdir

if model_type != "tabddpm":
raise ValueError(
f"Unsupported model_type {model_type}. Tartan Federer Attack is only supported for ClavaDDPM-single-table models."
)
# TODO: We should read this from the metadata instead.
_relation_order = [("None", "trans")]
for _parent, _child in _relation_order:
_ckpt_path = first_model_path / f"{_parent}_{_child}_ckpt.pkl"
with open(_ckpt_path, "rb") as _f:
_probe_model = CustomUnpickler(_f).load()
noise_dimension = _probe_model.diffusion.num_numerical_features
log(INFO, f"Noise dimension read from diffusion model: {noise_dimension}")
break

input_noise = [np.random.normal(size=noise_dimension).tolist() for _ in range(num_noise_per_time_step)]
input_dimension = len(input_noise) * len(timesteps) * len(additional_timesteps)

Expand Down
7 changes: 7 additions & 0 deletions src/midst_toolkit/models/clavaddpm/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,6 +280,7 @@ def from_df(
table_metadata: TableMetadata,
data_split_percentages: list[float] | None = None,
noise_scale: float = 0,
label_encoders_path: str | None = None,
# TODO: Find places in code that have this kind of hardcoded random default and remove (with TESTING)
data_split_random_state: int = 42,
) -> tuple[Dataset, dict[int, LabelEncoder], list[str]]:
Expand Down Expand Up @@ -314,6 +315,8 @@ def from_df(
data_split_percentages: The percentages of the dataset to go into train, val, and test splits. The sum of
the percentages must amount to 1 (within a tolerance of 0.01). Optional, default is [0.7, 0.2, 0.1].
noise_scale: The scale of the noise to add to the categorical features. Optional, default is 0.
label_encoders_path: The path to the label encoders pkl file. If provided, the already-fitted label
encoders are loaded from the pkl file; otherwise they are fitted on the current data.
data_split_random_state: The random state to use for the data split. Will be passed down to the
``train_test_split`` function from sklearn. Optional, default is 42.

Expand Down Expand Up @@ -377,10 +380,14 @@ def from_df(
column_orders = numerical_column_names + categorical_column_names

# Encode the categorical features and merge them with the numerical features
# Reuse pre-fitted label encoders from ``label_encoders_path`` when it is given; otherwise fit them on the current data

features, label_encoders = encode_and_merge_features(
categorical_features,
numerical_features,
noise_scale,
categorical_column_names=categorical_column_names if len(categorical_column_names) > 0 else None,
label_encoders_path=label_encoders_path,
)

assert isinstance(table_metadata.n_classes, int)
Expand Down
44 changes: 41 additions & 3 deletions src/midst_toolkit/models/clavaddpm/dataset_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ def encode_and_merge_features(
categorical_features: ArrayDict | None,
numerical_features: ArrayDict | None,
noise_scale: float,
categorical_column_names: list[str] | None = None,
label_encoders_path: str | None = None,
) -> tuple[ArrayDict, dict[int, LabelEncoder]]:
"""
Merge the categorical with the numerical features for train, validation, and test datasets. Numerical features
Expand All @@ -75,6 +77,9 @@ def encode_and_merge_features(
keys are "train", "val", "test" from the DataSplit enumeration
noise_scale: The scale of the noise to add to the categorical features. Noise is drawn from a normal
distribution with standard deviation of ``noise_scale``.
categorical_column_names: The names of the categorical columns.
label_encoders_path: The path to the label encoders pkl file. If provided, the already-fitted label
encoders are loaded from the pkl file; otherwise they are fitted on the current data.

Returns:
The merged features for train, validation, and test datasets and the label encoders used to do so. The label
Expand All @@ -95,14 +100,47 @@ def encode_and_merge_features(
)
)

# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)

if not _pkl_path.exists():
raise FileNotFoundError(f"label_encoders_path does not exist: {_pkl_path}")
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)

if preloaded_encoders is not None:
if categorical_column_names is None:
raise ValueError("categorical_column_names must be provided when using label_encoders_path.")

expected_cols = set(categorical_column_names)
available_cols = set(preloaded_encoders.keys())

missing_cols = expected_cols - available_cols

if missing_cols:
raise ValueError(
"label_encoders_path is missing encoders for categorical columns: "
f"{sorted(missing_cols)}. "
"Refusing to mix preloaded encoders with freshly fit encoders."
)

categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
col_name = categorical_column_names[column] if categorical_column_names is not None else None

if preloaded_encoders is not None:
label_encoder = preloaded_encoders[col_name]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
else:
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
Comment on lines +103 to +139
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Don't silently mix cached and freshly fit encoders.

If label_encoders_path is stale or points at the wrong file, missing columns fall through to fit_transform() and you end up with a mixed encoder set that no longer matches the checkpoint you meant to reuse. Validate the full categorical_column_names set up front and fail fast instead of partially re-fitting.

🔧 Suggested guard
     if label_encoders_path is not None:
         _pkl_path = Path(label_encoders_path)
         if _pkl_path.exists():
             with open(_pkl_path, "rb") as _f:
                 preloaded_encoders = pickle.load(_f)
+            if categorical_column_names is not None:
+                missing = set(categorical_column_names) - set(preloaded_encoders)
+                if missing:
+                    raise ValueError(
+                        f"Missing label encoders for categorical columns: {sorted(missing)}"
+                    )
@@
-        else:
+        elif preloaded_encoders is None:
             # Fallback: fit on current data
             label_encoder = LabelEncoder()
             encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
+        else:
+            raise KeyError(f"No cached encoder found for categorical column: {col_name}")
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
if _pkl_path.exists():
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)
categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
col_name = categorical_column_names[column] if categorical_column_names is not None else None
if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders:
# Use pre-fitted encoder from full dataset (e.g. 101K rows)
label_encoder = preloaded_encoders[col_name]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
else:
# Fallback: fit on current data
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
# Load pre-fitted label encoders from pkl if provided, otherwise fit on current data
preloaded_encoders: dict[str, LabelEncoder] | None = None
if label_encoders_path is not None:
_pkl_path = Path(label_encoders_path)
if _pkl_path.exists():
with open(_pkl_path, "rb") as _f:
preloaded_encoders = pickle.load(_f)
if categorical_column_names is not None:
missing = set(categorical_column_names) - set(preloaded_encoders)
if missing:
raise ValueError(
f"Missing label encoders for categorical columns: {sorted(missing)}"
)
categorical_data_encoded = []
label_encoders = {}
for column in range(all_categorical_data.shape[1]):
col_name = categorical_column_names[column] if categorical_column_names is not None else None
if preloaded_encoders is not None and col_name is not None and col_name in preloaded_encoders:
# Use pre-fitted encoder from full dataset (e.g. 101K rows)
label_encoder = preloaded_encoders[col_name]
encoded_labels = label_encoder.transform(all_categorical_data[:, column]).astype(float)
elif preloaded_encoders is None:
# Fallback: fit on current data
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(all_categorical_data[:, column]).astype(float)
else:
raise KeyError(f"No cached encoder found for categorical column: {col_name}")
🧰 Tools
🪛 OpenGrep (1.22.0)

[ERROR] 109-109: pickle.load/loads deserializes arbitrary Python objects and can execute arbitrary code. Use a safe format like JSON instead.

(coderabbit.deserialization.python-pickle)

🪛 Ruff (0.15.14)

[error] 109-109: pickle and modules that wrap it can be unsafe when used to deserialize untrusted data, possible security issue

(S301)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@src/midst_toolkit/models/clavaddpm/dataset_utils.py` around lines 103 - 122,
When a label_encoders_path is supplied you must fail fast instead of mixing
preloaded and newly-fitted encoders: after loading preloaded_encoders (from
label_encoders_path) validate that categorical_column_names is not None and that
every name in categorical_column_names exists as a key in preloaded_encoders; if
any are missing, raise a clear error (or return/raise ValueError) rather than
falling back to fitting per-column. Update the loop that currently checks
preloaded_encoders and conditionally fits (the block using preloaded_encoders,
label_encoder, encoded_labels and the fallback LabelEncoder()) to assume
encoders are present when label_encoders_path was provided and only fit new
encoders when no path was provided; include the check up front so you never mix
cached and freshly-fit encoders.


if noise_scale > 0:
# add noise
encoded_labels += np.random.normal(0, noise_scale, encoded_labels.shape)

categorical_data_encoded.append(encoded_labels)
label_encoders[column] = label_encoder

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,12 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
"model_data_dir": base_path,
"target_model_subdir": Path("."),
"model_type": "tabddpm",
"classifier_hidden_dim": 20,
"classifier_num_epochs": 200,
"classifier_hidden_dim": 100,
"classifier_num_epochs": 20,
"samples_per_train_model": 3000,
"samples_per_val_model": 10,
"num_noise_per_time_step": 30,
"timesteps": [5, 10, 15],
"timesteps": [5, 7, 9],
"additional_timesteps": [0],
"predictions_file_name": "challenge_label_predictions",
# TODO: Make results path a temp directory
Expand All @@ -52,14 +52,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.4469875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.08, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6315875, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.165, abs=1e-8)

assert roc_auc_val == pytest.approx(0.5054624999999999, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.125, abs=1e-8)
assert roc_auc_val == pytest.approx(0.6732, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.28, abs=1e-8)

assert roc_auc_test == pytest.approx(0.4937875, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.115, abs=1e-8)
assert roc_auc_test == pytest.approx(0.6607, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.19, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -107,14 +107,14 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_single_model():
roc_auc_test = mia_performance_test["roc_auc"]
tpr_at_fpr_test = mia_performance_test["max_tpr"]

assert roc_auc_train == pytest.approx(0.5046999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.09, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6985000000000001, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_val == pytest.approx(0.47159999999999996, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.12, abs=1e-8)
assert roc_auc_val == pytest.approx(0.7075, abs=1e-8)
assert tpr_at_fpr_val == pytest.approx(0.32, abs=1e-8)

assert roc_auc_test == pytest.approx(0.46390000000000003, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.16, abs=1e-8)
assert roc_auc_test == pytest.approx(0.8042, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.56, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Expand Down Expand Up @@ -162,11 +162,11 @@ def test_tf_attack_whitebox_tiny_config_midst_toolkit_no_validation():

assert mia_performance_val is None

assert roc_auc_train == pytest.approx(0.4996999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.07, abs=1e-8)
assert roc_auc_train == pytest.approx(0.6980999999999999, abs=1e-8)
assert tpr_at_fpr_train == pytest.approx(0.33, abs=1e-8)

assert roc_auc_test == pytest.approx(0.5174, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.13, abs=1e-8)
assert roc_auc_test == pytest.approx(0.7075000000000001, abs=1e-8)
assert tpr_at_fpr_test == pytest.approx(0.32, abs=1e-8)

unset_all_random_seeds()
os.environ.pop("CUBLAS_WORKSPACE_CONFIG", None)
Loading