diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py
index 1212d8e1ec..a502e17d0c 100644
--- a/pufferlib/sweep.py
+++ b/pufferlib/sweep.py
@@ -6,7 +6,6 @@
 from contextlib import contextmanager
 
 import numpy as np
-import pufferlib
 import torch
 import gpytorch
@@ -15,11 +14,12 @@
 from gpytorch.kernels import MaternKernel, PolynomialKernel, ScaleKernel, AdditiveKernel
 from gpytorch.means import ConstantMean
 from gpytorch.mlls import ExactMarginalLogLikelihood
-from gpytorch.priors import LogNormalPrior
+from gpytorch.priors import LogNormalPrior, GammaPrior
 from scipy.optimize import minimize
 from scipy.stats.qmc import Sobol
 from scipy.spatial import KDTree
 from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import PowerTransformer
 
 EPSILON = 1e-6
@@ -50,7 +50,6 @@ def __init__(self, min, max, scale, is_integer=False):
         self.scale = scale
         self.norm_min = self.normalize(min)
         self.norm_max = self.normalize(max)
-        # Since min/max are normalized from -1 to 1, just use 0 as a mean
         self.norm_mean = 0
         self.is_integer = is_integer
@@ -62,7 +61,6 @@ def __init__(self, min, max, scale, is_integer=False):
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
         zero_one = (value - self.min)/(self.max - self.min)
         return 2*zero_one - 1
@@ -77,13 +75,10 @@ class Pow2(Space):
     def __init__(self, min, max, scale, is_integer=False):
         if scale == 'auto':
             scale = 0.5
-            #scale = 2 / (np.log2(max) - np.log2(min))
 
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
-        #assert value != 0.0
         zero_one = (math.log(value, 2) - math.log(self.min, 2))/(math.log(self.max, 2) - math.log(self.min, 2))
         return 2*zero_one - 1
@@ -98,7 +93,6 @@ class Log(Space):
     def __init__(self, min, max, scale, is_integer=False):
         if scale == 'time':
-            # TODO: Set scaling param intuitively based on number of jumps from min to max
             scale = 1 / (np.log2(max) - np.log2(min))
         elif scale == 'auto':
             scale = 0.5
@@ -106,8 +100,6 @@ def __init__(self, min, max, scale, is_integer=False):
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
-        #assert value != 0.0
         zero_one = (math.log(value, self.base) - math.log(self.min, self.base))/(math.log(self.max, self.base) - math.log(self.min, self.base))
         return 2*zero_one - 1
@@ -153,7 +145,7 @@ def _params_from_puffer_sweep(sweep_config, only_include=None):
         if any(isinstance(param[k], dict) for k in param):
             param_spaces[name] = _params_from_puffer_sweep(param, only_include)
             continue
-        
+
         if only_include and not any(k in name for k in only_include):
             continue
@@ -260,9 +252,8 @@ def pareto_points(observations):
     scores = np.array([e['output'] for e in observations])
     costs = np.array([e['cost'] for e in observations])
 
-    # Sort by cost to find non-dominated points efficiently
     sorted_indices = np.argsort(costs)
-    
+
     pareto = []
     pareto_idxs = []
     max_score_so_far = -np.inf
@@ -276,8 +267,6 @@
     return pareto, pareto_idxs
 
 def prune_pareto_front(pareto, efficiency_threshold=0.5, pruning_stop_score_fraction=0.98):
-    # Prune the high-cost long tail of a pareto front
-    # like (score 0.99, cost 100), (score 0.991, cost 200)
     if not pareto or len(pareto) < 2:
         return pareto
@@ -300,7 +289,6 @@ def prune_pareto_front(pareto, efficiency_threshold=0.5, pruning_stop_score_frac
         if efficiency < efficiency_threshold:
             sorted_pareto.pop(i)
         else:
-            # Stop pruning if we find an efficient point
             break
 
     return sorted_pareto
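For reference, the cost-sorted sweep in `pareto_points` keeps a point iff its score beats every cheaper point seen so far. A minimal standalone sketch of that filter (`pareto_sketch` is a hypothetical name; observations are dicts shaped like the sweep's):

```python
import numpy as np

def pareto_sketch(observations):
    # Sort by cost; a point is non-dominated iff its score strictly
    # beats every cheaper point seen so far. O(n log n) overall.
    scores = np.array([e['output'] for e in observations])
    costs = np.array([e['cost'] for e in observations])
    pareto, best = [], -np.inf
    for i in np.argsort(costs):
        if scores[i] > best:
            best = scores[i]
            pareto.append(observations[i])
    return pareto

obs = [dict(output=s, cost=c) for s, c in [(0.2, 1), (0.5, 3), (0.4, 5), (0.9, 8)]]
assert [e['cost'] for e in pareto_sketch(obs)] == [1, 3, 8]
```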
@@ -369,7 +357,7 @@ def suggest(self, fill=None, fixed_total_timesteps=None):
             else:
                 cost_dists = np.abs(pareto_costs[:, None] - pareto_costs[None, :])
-                cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs)) # mask self-distance
+                cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs))
                 idx = np.argmax(np.min(cost_dists, axis=1))
                 search_centers = candidates[idx]['input']
         else:
@@ -397,19 +385,30 @@ def early_stop(self, logs, target_key):
 
 class ExactGPModel(ExactGP):
-    def __init__(self, train_x, train_y, likelihood, x_dim):
-        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+    def __init__(self, train_x, train_y, likelihood, x_dim, use_polynomial=True):
+        super().__init__(train_x, train_y, likelihood)
         self.mean_module = ConstantMean()
 
-        # Matern 3/2 kernel (equivalent to Pyro's Matern32)
-        matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim)
-
-        # NOTE: setting this constraint changes GP behavior, including the lengthscale
-        # even though the lengthscale is well within the range ... Commenting out for now.
-        # lengthscale_constraint = gpytorch.constraints.Interval(0.01, 10.0)
-        # matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim, lengthscale_constraint=lengthscale_constraint)
-
-        linear_kernel = PolynomialKernel(power=1)
-        self.covar_module = ScaleKernel(AdditiveKernel(linear_kernel, matern_kernel))
+        # Matern 5/2 (HEBO/BoTorch default) plus a GammaPrior on the
+        # lengthscale to keep ARD from collapsing at low data:dim ratios.
+        matern_kernel = MaternKernel(
+            nu=2.5, ard_num_dims=x_dim,
+            lengthscale_prior=GammaPrior(3.0, 6.0),
+        )
+        # Direct ref so lengthscale_range works regardless of additive ordering.
+        self._matern_kernel = matern_kernel
+        outputscale_prior = GammaPrior(2.0, 0.15)
+        if use_polynomial:
+            # Cost GP only: log_cost ≈ linear in log(lr)/log(steps), so the
+            # additive linear term is appropriate. Score GP drops it — the
+            # unbounded polynomial extrapolation pins lr to the search-space
+            # bounds whenever failures cluster at one end of a dim.
+            linear_kernel = PolynomialKernel(power=1)
+            self.covar_module = ScaleKernel(
+                AdditiveKernel(linear_kernel, matern_kernel),
+                outputscale_prior=outputscale_prior,
+            )
+        else:
+            self.covar_module = ScaleKernel(matern_kernel, outputscale_prior=outputscale_prior)
 
     def forward(self, x):
         mean_x = self.mean_module(x)
@@ -418,8 +417,7 @@ def forward(self, x):
 
     @property
     def lengthscale_range(self):
-        # Get lengthscale from MaternKernel
-        lengthscale = self.covar_module.base_kernel.kernels[1].lengthscale.tolist()[0]
+        lengthscale = self._matern_kernel.lengthscale.tolist()[0]
         return min(lengthscale), max(lengthscale)
 
 def train_gp_model(model, likelihood, mll, optimizer, train_x, train_y, training_iter=50):
@@ -438,7 +436,6 @@ def train_gp_model(model, likelihood, mll, optimizer, train_x, train_y, training
             loss = loss.detach()
 
         except gpytorch.utils.errors.NotPSDError:
-            # It's rare but it does happen. Hope it's a transient issue.
             break
 
     model.eval()
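The `GammaPrior(3.0, 6.0)` lengthscale prior is easy to sanity-check numerically. Assuming gpytorch's (concentration, rate) parameterization, it concentrates mass near 0.5 on the [-1, 1]-normalized inputs; a quick check with `torch.distributions` (values follow from the Gamma formulas, not from this diff):

```python
import torch
from torch.distributions import Gamma

# gpytorch's GammaPrior(3.0, 6.0) wraps Gamma(concentration=3, rate=6).
prior = Gamma(concentration=3.0, rate=6.0)
print(prior.mean)        # alpha/beta = 0.5
print((3.0 - 1.0) / 6.0) # mode = (alpha-1)/beta ~ 0.333
print(prior.variance)    # alpha/beta^2 ~ 0.0833
# Mass sits well away from zero, which is what keeps ARD lengthscales
# from collapsing when observations are scarce relative to dimension.
```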
- """ - def __init__(self, quantile=0.3, min_num_samples=30): - self.quantile = quantile # 0.5 = Median regression + """Fits Score ~ A + B * log(Cost) via Quantile Regression (Median).""" + def __init__(self, quantile=0.3, min_num_samples=3): + self.quantile = quantile self.min_num_samples = min_num_samples self.is_fitted = False self.A = None self.B = None self.max_score = None + self.min_score = None self.max_cost = None self.upper_cost_threshold = None def _quantile_loss(self, params, x, y, q): - # Pinball loss function for quantile regression a, b = params y_pred = a + b * x residuals = y - y_pred @@ -473,6 +467,7 @@ def fit(self, observations, upper_cost_threshold=None): scores = np.array([e['output'] for e in observations]) costs = np.array([e['cost'] for e in observations]) self.max_score = scores.max() + self.min_score = scores.min() self.upper_cost_threshold = upper_cost_threshold or costs.max() valid_indices = (costs > EPSILON) & np.isfinite(scores) @@ -483,62 +478,55 @@ def fit(self, observations, upper_cost_threshold=None): c = costs[valid_indices] x_log_c = np.log(c) - # Initial guess using standard Polyfit (OLS) just to get in the ballpark try: b_init, a_init = np.polyfit(x_log_c, y, 1) except np.linalg.LinAlgError: - # Fallback guess b_init, a_init = 0.0, np.mean(y) - # Minimize the Quantile Loss (L1 for median) res = minimize( - self._quantile_loss, - x0=[a_init, b_init], + self._quantile_loss, + x0=[a_init, b_init], args=(x_log_c, y, self.quantile), - method='Nelder-Mead', # Robust solver for non-differentiable functions - bounds=[(None, None), (0, None)] # B should be positive + method='Nelder-Mead', + bounds=[(None, None), (0, None)] ) - + self.A, self.B = res.x self.is_fitted = True - def get_threshold(self, cost, min_cost_fraction=0.3, abs_min_cost=10): + def get_threshold(self, cost, min_cost_fraction=0.03, abs_min_cost=10): if not self.is_fitted or self.upper_cost_threshold is None: return -np.inf - # NOTE: min_allowed_cost seems vary a lot from env to env, so dynamically set here min_allowed_cost = self.upper_cost_threshold * min_cost_fraction + abs_min_cost if cost < min_allowed_cost: return -np.inf - # Stop long long train runs that don't do very well enough - if cost > 1.2 * self.upper_cost_threshold: - return 0.9 * self.max_score - + # The original "stop trials past 1.2x budget" branch returned + # 0.9 * max_score, which inverts for negated losses (max_score < 0 + # ⇒ threshold > max_score ⇒ every trial gets killed). With fixed + # train_steps it's also unreachable in normal operation. Dropped. return self.A + self.B * np.log(cost) -# TODO: Eval defaults class Protein: def __init__(self, sweep_config, max_suggestion_cost = 3600, resample_frequency = 0, num_random_samples = 10, - num_keep_top_obs = 5, global_search_scale = 1, suggestions_per_pareto = 256, expansion_rate = 0.1, - gp_training_iter = 50, - gp_learning_rate = 0.001, - gp_max_obs = 750, # gp train time jumps after 800 - infer_batch_size = 4096, + gp_training_iter = 100, + gp_learning_rate = 0.01, + gp_max_obs = 750, + infer_batch_size = 4096, optimizer_reset_frequency = 50, use_gpu = True, cost_param = "train/total_timesteps", prune_pareto = True, ): - # Process sweep config. NOTE: sweep_config takes precedence. It's not good. 
 
-# TODO: Eval defaults
 class Protein:
     def __init__(self,
         sweep_config,
         max_suggestion_cost = 3600,
         resample_frequency = 0,
         num_random_samples = 10,
-        num_keep_top_obs = 5,
         global_search_scale = 1,
         suggestions_per_pareto = 256,
         expansion_rate = 0.1,
-        gp_training_iter = 50,
-        gp_learning_rate = 0.001,
-        gp_max_obs = 750, # gp train time jumps after 800
-        infer_batch_size = 4096, 
+        gp_training_iter = 100,
+        gp_learning_rate = 0.01,
+        gp_max_obs = 750,
+        infer_batch_size = 4096,
         optimizer_reset_frequency = 50,
         use_gpu = True,
         cost_param = "train/total_timesteps",
         prune_pareto = True,
     ):
-        # Process sweep config. NOTE: sweep_config takes precedence. It's not good.
         _use_gpu = sweep_config['use_gpu'] if 'use_gpu' in sweep_config else use_gpu
         _prune_pareto = sweep_config['prune_pareto'] if 'prune_pareto' in sweep_config else prune_pareto
         _max_suggestion_cost = sweep_config['max_suggestion_cost'] if 'max_suggestion_cost' in sweep_config else max_suggestion_cost
@@ -558,61 +546,76 @@ def __init__(self,
         self.success_observations = []
         self.failure_observations = []
-        self.num_keep_top_obs = num_keep_top_obs
-        self.top_observations = []
         self.suggestion_idx = 0
         self.min_score, self.max_score = math.inf, -math.inf
         self.log_c_min, self.log_c_max = math.inf, -math.inf
 
-        # Use Sobel seq for structured random exploration
+        # Failure sentinel: frozen once on first merge into the score GP.
+        # Previously `e['output'] = self.min_score` ratcheted upward as new
+        # successes lowered min_score, retilting the kernel toward unfailed
+        # corners forever.
+        self._failure_sentinel = None
+        # HEBO recipe: Yeo-Johnson output warping + per-fit input rescaling.
+        # Warping linearizes the response surface; per-fit rescaling restores
+        # σ-driven exploration on dims where observations have clustered.
+        self._y_warper_yj = PowerTransformer(method='yeo-johnson', standardize=True)
+        self._y_best_warped = None
+        self._x_min_obs = None
+        self._x_max_obs = None
+
         self.sobol = Sobol(d=self.hyperparameters.num, scramble=True)
         self.num_random_samples = num_random_samples
-        # NOTE: test if sobol sampling really helps
-        # points_per_run = sweep_config['downsample']
-        # self.num_random_samples = 3 * points_per_run * self.hyperparameters.num
 
         self.cost_param_idx = self.hyperparameters.get_flat_idx(cost_param)
         self.cost_space = None
         self.cost_random_suggestion = None
         if self.cost_param_idx is not None:
             self.cost_space = list(self.hyperparameters.flat_spaces.values())[self.cost_param_idx]
-            self.cost_random_suggestion = -0.8  # In norm cost space. Make arg if necessary
+            self.cost_random_suggestion = -0.8
 
         self.target_cost_ratio = []
         self._running_target_buffer = deque(maxlen=30)
 
-        self.gp_max_obs = gp_max_obs # train time bumps after 800?
+        self.gp_max_obs = gp_max_obs
         self.infer_batch_size = infer_batch_size
 
-        # Probably useful only when downsample=1 and each run is expensive.
         self.use_success_prob = sweep_config['downsample'] == 1
         self.success_classifier = LogisticRegression(class_weight='balanced')
 
-        # This model is conservative. Aggressive early stopping interferes with and hampers GP model learning.
         self.stop_threshold_model = RobustLogCostModel(quantile=sweep_config['early_stop_quantile'])
         self.upper_cost_threshold = -np.inf
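The `PowerTransformer` set up above is the HEBO-style output warp applied later in `_train_gp_models`. A round-trip sketch of what it does to a skewed score column (toy data, not from a sweep):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(1)
y = rng.lognormal(mean=0.0, sigma=1.0, size=256)   # heavily right-skewed

warper = PowerTransformer(method='yeo-johnson', standardize=True)
y_w = warper.fit_transform(y.reshape(-1, 1)).ravel()

# Warped targets are roughly zero-mean/unit-variance and far less skewed,
# which is the "linearize the response surface" effect the comment cites;
# inverse_transform recovers the original scale.
print(y_w.mean(), y_w.std())
print(warper.inverse_transform(y_w.reshape(-1, 1)).ravel()[:3], y[:3])
```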
 
-        # Use 64 bit for GP regression
         with default_tensor_dtype(torch.float64):
-            # Params taken from HEBO: https://arxiv.org/abs/2012.03826
             noise_prior = LogNormalPrior(math.log(1e-2), 0.5)
+            noise_constraint = gpytorch.constraints.GreaterThan(8e-4)
 
-            # Create dummy data for model initialization on the selected device
             dummy_x = torch.ones((1, self.hyperparameters.num), device=self.device)
             dummy_y = torch.zeros(1, device=self.device)
 
-            # Score GP
-            self.likelihood_score = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device)
-            self.gp_score = ExactGPModel(dummy_x, dummy_y, self.likelihood_score, self.hyperparameters.num).to(self.device)
+            # Score GP: drops the polynomial term.
+            self.likelihood_score = GaussianLikelihood(
+                noise_prior=deepcopy(noise_prior),
+                noise_constraint=deepcopy(noise_constraint),
+            ).to(self.device)
+            self.gp_score = ExactGPModel(
+                dummy_x, dummy_y, self.likelihood_score,
+                self.hyperparameters.num, use_polynomial=False,
+            ).to(self.device)
             self.mll_score = ExactMarginalLogLikelihood(self.likelihood_score, self.gp_score).to(self.device)
             self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True)
 
-            # Cost GP
-            self.likelihood_cost = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device)
-            self.gp_cost = ExactGPModel(dummy_x, dummy_y, self.likelihood_cost, self.hyperparameters.num).to(self.device)
+            # Cost GP: keeps the polynomial (log_cost approximately linear in
+            # log(lr)/log(steps), so the linear extrapolation is appropriate).
+            self.likelihood_cost = GaussianLikelihood(
+                noise_prior=deepcopy(noise_prior),
+                noise_constraint=deepcopy(noise_constraint),
+            ).to(self.device)
+            self.gp_cost = ExactGPModel(
+                dummy_x, dummy_y, self.likelihood_cost,
+                self.hyperparameters.num, use_polynomial=True,
+            ).to(self.device)
             self.mll_cost = ExactMarginalLogLikelihood(self.likelihood_cost, self.gp_cost).to(self.device)
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)
 
-            # Buffers for GP training and inference
             self.gp_params_buffer = torch.empty(self.gp_max_obs, self.hyperparameters.num, device=self.device)
             self.gp_score_buffer = torch.empty(self.gp_max_obs, device=self.device)
             self.gp_cost_buffer = torch.empty(self.gp_max_obs, device=self.device)
@@ -624,11 +627,9 @@ def _filter_near_duplicates(self, inputs, duplicate_threshold=EPSILON):
         tree = KDTree(inputs)
         to_keep = np.ones(len(inputs), dtype=bool)
 
-        # Iterate from most recent to oldest
         for i in range(len(inputs) - 1, -1, -1):
             if to_keep[i]:
                 nearby_indices = tree.query_ball_point(inputs[i], r=duplicate_threshold)
-                # Exclude the point itself from being marked for removal
                 nearby_indices.remove(i)
                 if nearby_indices:
                     to_keep[nearby_indices] = False
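`_filter_near_duplicates` walks newest-to-oldest so that when two points collide, the more recent one survives. A compact standalone sketch of that logic with scipy's KDTree (`filter_near_duplicates` here is a hypothetical name, threshold exaggerated for the demo):

```python
import numpy as np
from scipy.spatial import KDTree

def filter_near_duplicates(inputs, r=1e-6):
    tree = KDTree(inputs)
    keep = np.ones(len(inputs), dtype=bool)
    for i in range(len(inputs) - 1, -1, -1):    # newest first
        if keep[i]:
            near = tree.query_ball_point(inputs[i], r=r)
            near.remove(i)                      # don't evict the point itself
            if near:
                keep[near] = False              # evict older near-duplicates
    return np.flatnonzero(keep)

pts = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 0.0]])  # first and last collide
print(filter_near_duplicates(pts, r=1e-3))            # -> [1 2]: newest copy wins
```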
@@ -641,23 +642,24 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
 
         observations = self.success_observations.copy()
 
-        # Update the stats using the full data
         y = np.array([e['output'] for e in observations])
         self.min_score, self.max_score = y.min(), y.max()
 
         c = np.array([e['cost'] for e in observations])
         log_c = np.log(np.maximum(c, EPSILON))
         self.log_c_min = log_c.min()
-        self.log_c_max = np.quantile(log_c, 0.97) # Make it less sensitive to outlier points
+        self.log_c_max = np.quantile(log_c, 0.97)
 
-        # When the data is scare, also use failed observations
+        # Failures pinned to a FROZEN sentinel below the worst observed
+        # success. Without this freeze the failure floor ratchets with
+        # min_score as new successes arrive, retilting the kernel slope.
         if len(observations) < 100 and self.failure_observations:
-            # Give the min score for the failed obs, so this value will keep changing.
+            if self._failure_sentinel is None:
+                spread = max(abs(self.max_score - self.min_score), 1.0)
+                self._failure_sentinel = float(self.min_score) - 0.5 * spread
             for e in self.failure_observations:
-                e['output'] = self.min_score
-
-            # NOTE: the order of obs matters since recent obs are always fed into gp training
-            # So, putting the failure obs first.
+                e['output'] = self._failure_sentinel
+
             observations = self.failure_observations + observations
 
         params = np.array([np.append(e['input'], [e['output'], e['cost']]) for e in observations])
@@ -669,7 +671,7 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
         if len(observations) <= max_size:
             return observations
-        
+
         recent_size = int(recent_ratio*max_size)
         recent_obs = observations[-recent_size:]
         older_obs = observations[:-recent_size]
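The effect of freezing the sentinel is easiest to see side by side: under the old `min_score` rule the failure label moves every time a worse success arrives, while the frozen sentinel stays put. A toy trace (all values invented):

```python
# Successive min_score values as worse-but-successful runs arrive.
min_score_trace = [0.50, 0.30, 0.10]

# Old rule: failures relabeled to the *current* min_score on each fit,
# so the failure floor drifts and the GP's slope keeps retilting.
old_labels = list(min_score_trace)               # 0.50 -> 0.30 -> 0.10

# New rule: sentinel frozen on first merge, half a spread below min.
max_score, min_score = 0.9, 0.50
sentinel = min_score - 0.5 * max(abs(max_score - min_score), 1.0)
new_labels = [sentinel] * len(min_score_trace)   # 0.0 on every fit
print(old_labels, new_labels)
```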
@@ -681,54 +683,53 @@ def _train_gp_models(self):
         if not self.success_observations:
             return 0, 0
-        
+
         sampled_observations = self._sample_observations(max_size=self.gp_max_obs)
         num_sampled = len(sampled_observations)
 
-        # Prepare tensors using pre-allocated buffers
-        params = np.array([e['input'] for e in sampled_observations])
+        # Per-fit input rescaling to the observed box. On dims where obs
+        # cluster, x_span shrinks → σ saturates at unobserved values → UCB
+        # exploration dominates on that dim.
+        params = np.array([e['input'] for e in sampled_observations], dtype=np.float64)
+        self._x_min_obs = params.min(axis=0)
+        self._x_max_obs = params.max(axis=0)
+        x_span = np.maximum(self._x_max_obs - self._x_min_obs, 1e-8)
+        params_scaled = 2.0 * (params - self._x_min_obs) / x_span - 1.0
         params_tensor = self.gp_params_buffer[:num_sampled]
-        params_tensor.copy_(torch.from_numpy(params))
-
-        # Normalized scores and costs
-        y = np.array([e['output'] for e in sampled_observations])
-        y_norm = (y - self.min_score) / (np.abs(self.max_score - self.min_score) + EPSILON)
-        y_norm_tensor = self.gp_score_buffer[:num_sampled]
-        y_norm_tensor.copy_(torch.from_numpy(y_norm))
+        params_tensor.copy_(torch.from_numpy(params_scaled))
+
+        # Output warping: signed (so GP always sees "higher = better"), then
+        # Yeo-Johnson + standardize. Fall back to plain z-score on degeneracy.
+        y = np.array([e['output'] for e in sampled_observations], dtype=np.float64)
+        signed_y = self.hyperparameters.optimize_direction * y
+        y_warped = None
+        if num_sampled >= 4 and float(np.std(signed_y)) > 1e-9:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                try:
+                    y_warped = self._y_warper_yj.fit_transform(signed_y.reshape(-1, 1)).ravel().astype(np.float64)
+                except (ValueError, RuntimeError):
+                    y_warped = None
+        if y_warped is None:
+            mu, sd = signed_y.mean(), max(float(np.std(signed_y)), EPSILON)
+            y_warped = (signed_y - mu) / sd
+        self._y_best_warped = float(y_warped.max())
+        y_warped_tensor = self.gp_score_buffer[:num_sampled]
+        y_warped_tensor.copy_(torch.from_numpy(y_warped))
 
         c = np.array([e['cost'] for e in sampled_observations])
-        log_c = np.log(np.maximum(c, EPSILON)) # Ensure log is not taken on zero
+        log_c = np.log(np.maximum(c, EPSILON))
         log_c_norm = (log_c - self.log_c_min) / (self.log_c_max - self.log_c_min + EPSILON)
         log_c_norm_tensor = self.gp_cost_buffer[:num_sampled]
         log_c_norm_tensor.copy_(torch.from_numpy(log_c_norm))
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning)
-            score_loss = train_gp_model(self.gp_score, self.likelihood_score, self.mll_score, self.score_opt, params_tensor, y_norm_tensor, training_iter=self.gp_training_iter)
+            score_loss = train_gp_model(self.gp_score, self.likelihood_score, self.mll_score, self.score_opt, params_tensor, y_warped_tensor, training_iter=self.gp_training_iter)
             cost_loss = train_gp_model(self.gp_cost, self.likelihood_cost, self.mll_cost, self.cost_opt, params_tensor, log_c_norm_tensor, training_iter=self.gp_training_iter)
 
         return score_loss, cost_loss
 
-    def _get_top_obs_params(self):
-        if not self.top_observations:
-            return np.array([])
-
-        params = np.array([e['input'] for e in self.top_observations])
-        if self.cost_param_idx is None:
-            return params
-
-        # Add the same params with less cost to the search center, and not the original
-        original_costs_norm = params[:, self.cost_param_idx]
-
-        params_1 = np.copy(params)
-        cost_norm_1 = original_costs_norm - (original_costs_norm - (-1)) / 2
-        params_1[:, self.cost_param_idx] = cost_norm_1
-        params_2 = np.copy(params)
-        cost_norm_2 = original_costs_norm - (original_costs_norm - (-1)) / 3
-        params_2[:, self.cost_param_idx] = cost_norm_2
-
-        return np.vstack([params_1, params_2])
-
     def _sample_target_cost_ratio(self, expansion_rate, target_ratios=(0.16, 0.32, 0.48, 0.64, 0.8, 1.0)):
         if not self.target_cost_ratio:
             self.target_cost_ratio = list(target_ratios)
@@ -736,6 +737,15 @@ def _sample_target_cost_ratio(self, expansion_rate, target_ratios=(0.16, 0.32, 0
         target_ratio = np.clip(self.target_cost_ratio.pop() + 0.1 * np.random.randn(), 0, 1)
         return (1 + expansion_rate) * target_ratio
 
+    def _sobol_suggestion(self, fill, fixed_cost_norm):
+        suggestion = 2 * self.sobol.random(1)[0] - 1
+        if fixed_cost_norm is not None:
+            suggestion[self.cost_param_idx] = fixed_cost_norm
+        elif self.cost_param_idx is not None:
+            cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
+            suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1)
+        return self.hyperparameters.to_dict(suggestion, fill), {}
+
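Per-fit rescaling in `_train_gp_models` maps the observed box, not the search box, onto [-1, 1]; candidates are later pushed through the same affine map. A sketch of the round trip (helper names hypothetical):

```python
import numpy as np

def fit_box(params, eps=1e-8):
    lo, hi = params.min(axis=0), params.max(axis=0)
    span = np.maximum(hi - lo, eps)
    return lo, span

def to_box(x, lo, span):
    return 2.0 * (x - lo) / span - 1.0

obs = np.array([[0.10, -0.9], [0.12, 0.4], [0.11, 0.9]])  # dim 0 clustered
lo, span = fit_box(obs)
print(to_box(obs, lo, span))   # each dim stretched to fill [-1, 1]
# A candidate outside the observed box lands far outside [-1, 1] after
# the map, away from all training points, so the GP's sigma saturates
# there and UCB exploration takes over on the clustered dimension.
print(to_box(np.array([[0.50, 0.0]]), lo, span))
```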
     def suggest(self, fill, fixed_total_timesteps=None):
         info = {}
         self.suggestion_idx += 1
 
         if fixed_total_timesteps is not None and self.cost_space is not None:
             fixed_cost_norm = self.cost_space.normalize(fixed_total_timesteps)
 
-        # NOTE: Changed pufferl to use the train args, NOT the sweep hyperparam search center
-        # if len(self.success_observations) == 0 and self.seed_with_search_center:
-        #     suggestion = self.hyperparameters.search_centers
-        #     return self.hyperparameters.to_dict(suggestion, fill), info
-
         if self.suggestion_idx <= self.num_random_samples:
-            # Suggest the next point in the Sobol sequence
-            zero_one = self.sobol.random(1)[0]
-            suggestion = 2*zero_one - 1 # Scale from [0, 1) to [-1, 1)
-            if fixed_cost_norm is not None:
-                suggestion[self.cost_param_idx] = fixed_cost_norm
-            elif self.cost_param_idx is not None:
-                cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
-                suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1) # limit the cost
-            return self.hyperparameters.to_dict(suggestion, fill), info
-
-        elif self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
+            return self._sobol_suggestion(fill, fixed_cost_norm)
+
+        if self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
             candidates, _ = pareto_points(self.success_observations)
-            suggestions = np.stack([e['input'] for e in candidates])
-            best_idx = np.random.randint(0, len(candidates))
-            best = suggestions[best_idx]
-            return self.hyperparameters.to_dict(best, fill), info
+            if candidates:
+                suggestions = np.stack([e['input'] for e in candidates])
+                best_idx = np.random.randint(0, len(candidates))
+                best = suggestions[best_idx]
+                return self.hyperparameters.to_dict(best, fill), info
 
         score_loss, cost_loss = self._train_gp_models()
@@ -772,27 +770,39 @@
             print(f'Resetting GP optimizers at suggestion {self.suggestion_idx}')
             self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True)
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)
-        
+
         pareto_front, pareto_idxs = pareto_points(self.success_observations)
         pruned_front = prune_pareto_front(pareto_front)
+        # An all-failure Sobol phase leaves success_observations empty here.
+        if not pruned_front:
+            return self._sobol_suggestion(fill, fixed_cost_norm)
         pareto_observations = pruned_front if self.prune_pareto else pareto_front
 
-        # Use the max cost from the pruned pareto to avoid inefficiently long runs
         if self.upper_cost_threshold < 0:
             self.upper_cost_threshold = pruned_front[-1]['cost']
-        # Try to change the threshold slowly
         elif self.upper_cost_threshold < pruned_front[-1]['cost']:
             self.upper_cost_threshold *= 1.01
 
         self.stop_threshold_model.fit(self.success_observations, self.upper_cost_threshold)
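The 1.01 multiplier bounds how fast the cost ceiling can chase the pareto front: geometric growth at 1% per suggestion needs roughly 70 suggestions to double. A one-line check (pure arithmetic, not from the diff):

```python
import math
# Suggestions needed for upper_cost_threshold to double at *= 1.01 per step.
print(math.log(2) / math.log(1.01))   # ~ 69.7
```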
 
-        ### Sample suggestions
-        search_centers = np.stack([e['input'] for e in pareto_observations])
-        if self.top_observations:
-            # Add top observations by score to search centers for diversity
-            search_centers = np.vstack([search_centers, self._get_top_obs_params()])
-
-        suggestions = self.hyperparameters.sample(
-            len(search_centers)*self.suggestions_per_pareto, mu=search_centers)
+        # Candidates: global Sobol (UCB σ-exploration) + tight local
+        # perturbations around top-K observations (µ-injection refinement).
+        # Pure Sobol breaks down at our dim: P(any sample within ~ℓ of an
+        # obs) ≈ (ℓ/2)^d ≈ 10⁻¹² at d=16, so argmax-µ over Sobol never
+        # lands near best-obs.
+        d = self.hyperparameters.num
+        n_cand = 1 << max(11, (max(2000, 200 * d) - 1).bit_length())
+        cand_sobol = Sobol(d=d, scramble=True, seed=self.suggestion_idx)
+        suggestions_global = 2.0 * cand_sobol.random(n_cand) - 1.0
+
+        k_top = min(5, len(self.success_observations))
+        top_centers = np.stack([e['input'] for e in
+            sorted(self.success_observations,
+                   key=lambda e: e['output'],
+                   reverse=True)[:k_top]])
+        n_local = max(256 * k_top, 512)
+        suggestions_local = self.hyperparameters.sample(n_local, mu=top_centers, scale=0.4)
+
+        suggestions = np.vstack([suggestions_global, suggestions_local])
 
         if fixed_cost_norm is not None:
             suggestions[:, self.cost_param_idx] = fixed_cost_norm
@@ -801,59 +811,64 @@
         suggestions = suggestions[dedup_indices]
 
         if len(suggestions) == 0:
-            return self.suggest(fill) # Fallback to random if all suggestions are filtered
+            return self.suggest(fill)
 
-        ### Predict scores and costs
-        # Batch predictions to avoid GPU OOM for large number of suggestions
-        gp_y_norm_list, gp_log_c_norm_list = [], []
+        # Apply the same per-fit rescaling to candidates before prediction.
+        x_span = np.maximum(self._x_max_obs - self._x_min_obs, 1e-8)
+        suggestions_scaled = 2.0 * (suggestions - self._x_min_obs) / x_span - 1.0
+
+        gp_y_mean_list, gp_y_var_list, gp_log_c_norm_list = [], [], []
         with torch.no_grad(), gpytorch.settings.fast_pred_var(), warnings.catch_warnings():
             warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning)
 
-            # Create a reusable buffer on the device to avoid allocating a huge tensor
-            for i in range(0, len(suggestions), self.infer_batch_size):
-                batch_numpy = suggestions[i:i+self.infer_batch_size]
+            for i in range(0, len(suggestions_scaled), self.infer_batch_size):
+                batch_numpy = suggestions_scaled[i:i+self.infer_batch_size]
                 current_batch_size = len(batch_numpy)
-
-                # Use a slice of the buffer if the current batch is smaller
                 batch_tensor = self.infer_batch_buffer[:current_batch_size]
                 batch_tensor.copy_(torch.from_numpy(batch_numpy))
 
                 try:
-                    # Score and cost prediction
-                    pred_y_mean = self.likelihood_score(self.gp_score(batch_tensor)).mean.cpu()
+                    posterior = self.likelihood_score(self.gp_score(batch_tensor))
+                    pred_y_mean = posterior.mean.cpu()
+                    pred_y_var = posterior.variance.cpu()
                     pred_c_mean = self.likelihood_cost(self.gp_cost(batch_tensor)).mean.cpu()
-
                 except RuntimeError:
-                    # Handle numerical errors during GP prediction
-                    pred_y_mean, pred_c_mean = torch.zeros(current_batch_size)
+                    pred_y_mean = torch.zeros(current_batch_size)
+                    pred_y_var = torch.ones(current_batch_size)
+                    pred_c_mean = torch.zeros(current_batch_size)
 
-                gp_y_norm_list.append(pred_y_mean.cpu())
-                gp_log_c_norm_list.append(pred_c_mean.cpu())
+                gp_y_mean_list.append(pred_y_mean)
+                gp_y_var_list.append(pred_y_var)
+                gp_log_c_norm_list.append(pred_c_mean)
 
-                del pred_y_mean, pred_c_mean
-
-        # Concatenate results from all batches
-        gp_y_norm = torch.cat(gp_y_norm_list).numpy()
+        gp_y_mean = torch.cat(gp_y_mean_list).numpy()
+        gp_y_var = torch.cat(gp_y_var_list).numpy()
+        gp_y_std = np.sqrt(np.maximum(gp_y_var, EPSILON * EPSILON))
         gp_log_c_norm = torch.cat(gp_log_c_norm_list).numpy()
-
-        # Unlinearize
-        gp_y = gp_y_norm*(self.max_score - self.min_score) + self.min_score
         gp_log_c = gp_log_c_norm*(self.log_c_max - self.log_c_min) + self.log_c_min
         gp_c = np.exp(gp_log_c)
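The "Sobol breaks down at our dim" comment above is plain volume arithmetic: assuming "near an observation" means within ±ℓ/2 per dimension on the [-1, 1]-normalized inputs (a window of width ℓ out of 2), one uniform sample lands near it with probability (ℓ/2)^d. A quick check under that assumption:

```python
d, ell, n_cand = 16, 0.35, 4096
p_one = (ell / 2) ** d              # one sample near one observation
p_any = 1 - (1 - p_one) ** n_cand   # any of n_cand samples
print(p_one)   # ~7.7e-13, matching the ~1e-12 in the comment
print(p_any)   # ~3e-9: argmax-mu over global Sobol never lands nearby
```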
 
-        # Maximize score. (Tried upper confidence bounds, but it did more harm because gp was noisy)
-        suggestion_scores = self.hyperparameters.optimize_direction * gp_y_norm
-
-        # When cost is fixed (pareto mode), skip cost-based weighting
-        if fixed_cost_norm is None:
-            max_c_mask = gp_c < self.max_suggestion_cost
+        # HEBO stochastic perturbation: noise ~ N(0, 2·σ²_obs) added to µ
+        # robustifies acquisition against GP miscalibration.
+        noise_std = float(self.gp_score.likelihood.noise.detach().sqrt().cpu().item())
+        gp_y_mu_perturbed = gp_y_mean + math.sqrt(2.0) * noise_std * np.random.randn(len(gp_y_mean))
+
+        # UCB; κ = sqrt(2·log(n·d)) is HEBO's heuristic, grows slowly with n.
+        n_obs = max(1, len(self.success_observations))
+        kappa = math.sqrt(2.0 * math.log(max(n_obs * d, 2)))
+        ucb_scores = gp_y_mu_perturbed + kappa * gp_y_std
+
+        # Cost and success-prob factors kept in log-space so they compose
+        # additively with the acquisition scores.
+        cost_logweight = np.zeros_like(ucb_scores)
+        cost_mask = np.ones_like(ucb_scores, dtype=bool)
+        if fixed_cost_norm is None and self.cost_param_idx is not None:
+            cost_mask = gp_c < self.max_suggestion_cost
             target_cost = self._sample_target_cost_ratio(self.expansion_rate)
-            weight = 1 - abs(target_cost - gp_log_c_norm)
-            suggestion_scores *= max_c_mask * weight
+            cost_logweight = np.log(np.maximum(1 - np.abs(target_cost - gp_log_c_norm), 1e-9))
 
-        # Then, consider the prob of training success, only when downsample = 1
-        # NOTE: Useful only in limited scenarios, where each data point is expensive. So turn it off by default.
+        success_logweight = np.zeros_like(ucb_scores)
         if self.use_success_prob and len(self.success_observations) > 9 and len(self.failure_observations) > 9:
             success_params = np.array([e['input'] for e in self.success_observations])
             failure_params = np.array([e['input'] for e in self.failure_observations])
@@ -867,22 +882,46 @@
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", UserWarning)
                 p_success = self.success_classifier.predict_proba(suggestions)[:, 1]
-            suggestion_scores *= p_success
+            success_logweight = np.log(np.maximum(p_success, 1e-9))
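Stripped of the cost and success terms, the acquisition above is plain UCB on the warped posterior with HEBO's slowly-growing κ plus the N(0, 2σ²_obs) mean perturbation. A standalone sketch on toy posterior arrays (all numbers invented):

```python
import math
import numpy as np

rng = np.random.default_rng(2)
mu = rng.normal(size=1000)           # warped posterior means over candidates
sigma = rng.uniform(0.1, 1.0, 1000)  # posterior stds
noise_std, n_obs, d = 0.1, 200, 16

kappa = math.sqrt(2.0 * math.log(max(n_obs * d, 2)))   # ~4.0 at n*d=3200
mu_pert = mu + math.sqrt(2.0) * noise_std * rng.normal(size=mu.size)
ucb = mu_pert + kappa * sigma
best = int(np.argmax(ucb))           # favors high mean *or* high uncertainty
print(kappa, best)
```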
+
+        # MACE mean injection: every 3rd GP-phase trial commits to argmax(µ)
+        # to refine the currently-best mode. At κ≈3.4 pure UCB won't shrink
+        # σ fast enough on a single observation. Drop the target-cost soft
+        # weight here (refinement shouldn't be cost-biased), keep the hard
+        # over-budget mask and the success-prob weight.
+        if self.suggestion_idx % 3 == 0:
+            mu_scores = gp_y_mu_perturbed + success_logweight
+            mu_scores[~cost_mask] = -np.inf
+            best_idx = int(np.argmax(mu_scores))
+            acq_mode = 'mu_inject'
+            picked_score = float(mu_scores[best_idx])
+        else:
+            ucb_scores = ucb_scores + cost_logweight + success_logweight
+            ucb_scores[~cost_mask] = -np.inf
+            best_idx = int(np.argmax(ucb_scores))
+            acq_mode = 'ucb'
+            picked_score = float(ucb_scores[best_idx])
 
-        best_idx = np.argmax(suggestion_scores)
         info = dict(
+            acq_mode = acq_mode,
             cost = gp_c[best_idx].item(),
-            score = gp_y[best_idx].item(),
-            rating = suggestion_scores[best_idx].item(),
+            score_mean_warped = float(gp_y_mean[best_idx]),
+            score_std_warped = float(gp_y_std[best_idx]),
+            picked_score = picked_score,
+            kappa = kappa,
+            f_best_warped = self._y_best_warped,
             score_loss = score_loss,
             cost_loss = cost_loss,
             score_lengthscale = self.gp_score.lengthscale_range,
             cost_lengthscale = self.gp_cost.lengthscale_range,
         )
         print('Predicted -- ',
-            f'Score: {info["score"]:.3f}',
-            f'Cost: {info["cost"]:.3f}',
-            f'Rating: {info["rating"]:.3f}',
+            f'{acq_mode}: {picked_score:+.3f}',
+            f'µ_w: {info["score_mean_warped"]:+.3f}',
+            f'σ_w: {info["score_std_warped"]:.3f}',
+            f'κ: {info["kappa"]:.2f}',
+            f'f*_w: {info["f_best_warped"]:+.3f}',
+            f'Cost: {info["cost"]:.0f}',
         )
 
         best = suggestions[best_idx]
@@ -919,21 +958,11 @@ def observe(self, hypers, score, cost, is_failure=False):
                 self.success_observations[same[0]] = new_observation
                 return
 
-        # Ignore obs that are below the minimum cost
         if self.cost_param_idx is not None and params[self.cost_param_idx] <= -1:
             return
 
         self.success_observations.append(new_observation)
 
-        # Update top_observations without sorting the full list every time
-        if len(self.top_observations) < self.num_keep_top_obs:
-            self.top_observations.append(new_observation)
-            self.top_observations.sort(key=lambda x: x['output'], reverse=True)
-        elif score > self.top_observations[-1]['output']:
-            self.top_observations.pop()
-            self.top_observations.append(new_observation)
-            self.top_observations.sort(key=lambda x: x['output'], reverse=True)
-
     def get_early_stop_threshold(self, cost):
         return self.stop_threshold_model.get_threshold(cost)
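Downstream, a trainer would poll `get_early_stop_threshold` with the run's current cost and kill the run when its score falls below A + B·log(cost). A hedged usage sketch (hypothetical trainer-side helper; `protein`, `run_cost`, and `run_score` stand in for real objects):

```python
def should_stop(protein, run_cost, run_score):
    # `protein` is a fitted Protein instance; run_cost/run_score come
    # from the live run. The threshold is -inf below the minimum-cost
    # gate, so young runs are never killed.
    threshold = protein.get_early_stop_threshold(run_cost)
    return run_score < threshold
```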