diff --git a/pufferlib/sweep.py b/pufferlib/sweep.py
index 1212d8e1ec..a502e17d0c 100644
--- a/pufferlib/sweep.py
+++ b/pufferlib/sweep.py
@@ -6,7 +6,6 @@
 from contextlib import contextmanager
 
 import numpy as np
-import pufferlib
 import torch
 import gpytorch
@@ -15,11 +14,12 @@
 from gpytorch.kernels import MaternKernel, PolynomialKernel, ScaleKernel, AdditiveKernel
 from gpytorch.means import ConstantMean
 from gpytorch.mlls import ExactMarginalLogLikelihood
-from gpytorch.priors import LogNormalPrior
+from gpytorch.priors import LogNormalPrior, GammaPrior
 from scipy.optimize import minimize
 from scipy.stats.qmc import Sobol
 from scipy.spatial import KDTree
 from sklearn.linear_model import LogisticRegression
+from sklearn.preprocessing import PowerTransformer
 
 EPSILON = 1e-6
@@ -50,7 +50,6 @@ def __init__(self, min, max, scale, is_integer=False):
         self.scale = scale
         self.norm_min = self.normalize(min)
         self.norm_max = self.normalize(max)
-        # Since min/max are normalized from -1 to 1, just use 0 as a mean
         self.norm_mean = 0
         self.is_integer = is_integer
@@ -62,7 +61,6 @@ def __init__(self, min, max, scale, is_integer=False):
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
         zero_one = (value - self.min)/(self.max - self.min)
         return 2*zero_one - 1
@@ -77,13 +75,10 @@ class Pow2(Space):
     def __init__(self, min, max, scale, is_integer=False):
         if scale == 'auto':
             scale = 0.5
-            #scale = 2 / (np.log2(max) - np.log2(min))
 
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
-        #assert value != 0.0
         zero_one = (math.log(value, 2) - math.log(self.min, 2))/(math.log(self.max, 2) - math.log(self.min, 2))
         return 2*zero_one - 1
@@ -98,7 +93,6 @@ class Log(Space):
     def __init__(self, min, max, scale, is_integer=False):
         if scale == 'time':
-            # TODO: Set scaling param intuitively based on number of jumps from min to max
             scale = 1 / (np.log2(max) - np.log2(min))
         elif scale == 'auto':
             scale = 0.5
@@ -106,8 +100,6 @@ def __init__(self, min, max, scale, is_integer=False):
         super().__init__(min, max, scale, is_integer)
 
     def normalize(self, value):
-        #assert isinstance(value, (int, float))
-        #assert value != 0.0
         zero_one = (math.log(value, self.base) - math.log(self.min, self.base))/(math.log(self.max, self.base) - math.log(self.min, self.base))
         return 2*zero_one - 1
@@ -153,7 +145,7 @@ def _params_from_puffer_sweep(sweep_config, only_include=None):
         if any(isinstance(param[k], dict) for k in param):
             param_spaces[name] = _params_from_puffer_sweep(param, only_include)
             continue
-        
+
         if only_include and not any(k in name for k in only_include):
             continue
@@ -260,9 +252,8 @@ def pareto_points(observations):
     scores = np.array([e['output'] for e in observations])
     costs = np.array([e['cost'] for e in observations])
 
-    # Sort by cost to find non-dominated points efficiently
     sorted_indices = np.argsort(costs)
-    
+
     pareto = []
     pareto_idxs = []
     max_score_so_far = -np.inf
@@ -276,8 +267,6 @@
     return pareto, pareto_idxs
 
 def prune_pareto_front(pareto, efficiency_threshold=0.5, pruning_stop_score_fraction=0.98):
-    # Prune the high-cost long tail of a pareto front
-    # like (score 0.99, cost 100), (score 0.991, cost 200)
     if not pareto or len(pareto) < 2:
         return pareto
@@ -300,7 +289,6 @@ def prune_pareto_front(pareto, efficiency_threshold=0.5, pruning_stop_score_frac
         if efficiency < efficiency_threshold:
             sorted_pareto.pop(i)
         else:
-            # Stop pruning if we find an efficient point
             break
 
     return sorted_pareto
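For reference, the cost-sorted sweep in `pareto_points` keeps a point iff its score beats every cheaper point seen so far. A minimal standalone sketch of that filter (`pareto_sketch` is a hypothetical name; observations are dicts shaped like the sweep's):

```python
import numpy as np

def pareto_sketch(observations):
    # Sort by cost; a point is non-dominated iff its score strictly
    # beats every cheaper point seen so far. O(n log n) overall.
    scores = np.array([e['output'] for e in observations])
    costs = np.array([e['cost'] for e in observations])
    pareto, best = [], -np.inf
    for i in np.argsort(costs):
        if scores[i] > best:
            best = scores[i]
            pareto.append(observations[i])
    return pareto

obs = [dict(output=s, cost=c) for s, c in [(0.2, 1), (0.5, 3), (0.4, 5), (0.9, 8)]]
assert [e['cost'] for e in pareto_sketch(obs)] == [1, 3, 8]
```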
@@ -369,7 +357,7 @@ def suggest(self, fill=None, fixed_total_timesteps=None):
             else:
                 cost_dists = np.abs(pareto_costs[:, None] - pareto_costs[None, :])
-                cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs)) # mask self-distance
+                cost_dists += (np.max(pareto_costs) + 1)*np.eye(len(pareto_costs))
                 idx = np.argmax(np.min(cost_dists, axis=1))
                 search_centers = candidates[idx]['input']
         else:
@@ -397,19 +385,30 @@ def early_stop(self, logs, target_key):
 
 class ExactGPModel(ExactGP):
-    def __init__(self, train_x, train_y, likelihood, x_dim):
-        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
+    def __init__(self, train_x, train_y, likelihood, x_dim, use_polynomial=True):
+        super().__init__(train_x, train_y, likelihood)
         self.mean_module = ConstantMean()
 
-        # Matern 3/2 kernel (equivalent to Pyro's Matern32)
-        matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim)
-
-        # NOTE: setting this constraint changes GP behavior, including the lengthscale
-        # even though the lengthscale is well within the range ... Commenting out for now.
-        # lengthscale_constraint = gpytorch.constraints.Interval(0.01, 10.0)
-        # matern_kernel = MaternKernel(nu=1.5, ard_num_dims=x_dim, lengthscale_constraint=lengthscale_constraint)
-
-        linear_kernel = PolynomialKernel(power=1)
-        self.covar_module = ScaleKernel(AdditiveKernel(linear_kernel, matern_kernel))
+        # Matern 5/2 (HEBO/BoTorch default) plus a GammaPrior on the
+        # lengthscale to keep ARD from collapsing at low data:dim ratios.
+        matern_kernel = MaternKernel(
+            nu=2.5, ard_num_dims=x_dim,
+            lengthscale_prior=GammaPrior(3.0, 6.0),
+        )
+        # Direct ref so lengthscale_range works regardless of additive ordering.
+        self._matern_kernel = matern_kernel
+        outputscale_prior = GammaPrior(2.0, 0.15)
+        if use_polynomial:
+            # Cost GP only: log_cost ≈ linear in log(lr)/log(steps), so the
+            # additive linear term is appropriate. Score GP drops it — the
+            # unbounded polynomial extrapolation pins lr to the search-space
+            # bounds whenever failures cluster at one end of a dim.
+            linear_kernel = PolynomialKernel(power=1)
+            self.covar_module = ScaleKernel(
+                AdditiveKernel(linear_kernel, matern_kernel),
+                outputscale_prior=outputscale_prior,
+            )
+        else:
+            self.covar_module = ScaleKernel(matern_kernel, outputscale_prior=outputscale_prior)
 
     def forward(self, x):
         mean_x = self.mean_module(x)
@@ -418,8 +417,7 @@ def forward(self, x):
 
     @property
     def lengthscale_range(self):
-        # Get lengthscale from MaternKernel
-        lengthscale = self.covar_module.base_kernel.kernels[1].lengthscale.tolist()[0]
+        lengthscale = self._matern_kernel.lengthscale.tolist()[0]
         return min(lengthscale), max(lengthscale)
 
 def train_gp_model(model, likelihood, mll, optimizer, train_x, train_y, training_iter=50):
@@ -438,7 +436,6 @@ def train_gp_model(model, likelihood, mll, optimizer, train_x, train_y, training
             loss = loss.detach()
 
         except gpytorch.utils.errors.NotPSDError:
-            # It's rare but it does happen. Hope it's a transient issue.
             break
 
     model.eval()
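The `GammaPrior(3.0, 6.0)` lengthscale prior is easy to sanity-check numerically. Assuming gpytorch's (concentration, rate) parameterization, it concentrates mass near 0.5 on the [-1, 1]-normalized inputs; a quick check with `torch.distributions` (values follow from the Gamma formulas, not from this diff):

```python
import torch
from torch.distributions import Gamma

# gpytorch's GammaPrior(3.0, 6.0) wraps Gamma(concentration=3, rate=6).
prior = Gamma(concentration=3.0, rate=6.0)
print(prior.mean)        # alpha/beta = 0.5
print((3.0 - 1.0) / 6.0) # mode = (alpha-1)/beta ~ 0.333
print(prior.variance)    # alpha/beta^2 ~ 0.0833
# Mass sits well away from zero, which is what keeps ARD lengthscales
# from collapsing when observations are scarce relative to dimension.
```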
- """ - def __init__(self, quantile=0.3, min_num_samples=30): - self.quantile = quantile # 0.5 = Median regression + """Fits Score ~ A + B * log(Cost) via Quantile Regression (Median).""" + def __init__(self, quantile=0.3, min_num_samples=3): + self.quantile = quantile self.min_num_samples = min_num_samples self.is_fitted = False self.A = None self.B = None self.max_score = None + self.min_score = None self.max_cost = None self.upper_cost_threshold = None def _quantile_loss(self, params, x, y, q): - # Pinball loss function for quantile regression a, b = params y_pred = a + b * x residuals = y - y_pred @@ -473,6 +467,7 @@ def fit(self, observations, upper_cost_threshold=None): scores = np.array([e['output'] for e in observations]) costs = np.array([e['cost'] for e in observations]) self.max_score = scores.max() + self.min_score = scores.min() self.upper_cost_threshold = upper_cost_threshold or costs.max() valid_indices = (costs > EPSILON) & np.isfinite(scores) @@ -483,62 +478,55 @@ def fit(self, observations, upper_cost_threshold=None): c = costs[valid_indices] x_log_c = np.log(c) - # Initial guess using standard Polyfit (OLS) just to get in the ballpark try: b_init, a_init = np.polyfit(x_log_c, y, 1) except np.linalg.LinAlgError: - # Fallback guess b_init, a_init = 0.0, np.mean(y) - # Minimize the Quantile Loss (L1 for median) res = minimize( - self._quantile_loss, - x0=[a_init, b_init], + self._quantile_loss, + x0=[a_init, b_init], args=(x_log_c, y, self.quantile), - method='Nelder-Mead', # Robust solver for non-differentiable functions - bounds=[(None, None), (0, None)] # B should be positive + method='Nelder-Mead', + bounds=[(None, None), (0, None)] ) - + self.A, self.B = res.x self.is_fitted = True - def get_threshold(self, cost, min_cost_fraction=0.3, abs_min_cost=10): + def get_threshold(self, cost, min_cost_fraction=0.03, abs_min_cost=10): if not self.is_fitted or self.upper_cost_threshold is None: return -np.inf - # NOTE: min_allowed_cost seems vary a lot from env to env, so dynamically set here min_allowed_cost = self.upper_cost_threshold * min_cost_fraction + abs_min_cost if cost < min_allowed_cost: return -np.inf - # Stop long long train runs that don't do very well enough - if cost > 1.2 * self.upper_cost_threshold: - return 0.9 * self.max_score - + # The original "stop trials past 1.2x budget" branch returned + # 0.9 * max_score, which inverts for negated losses (max_score < 0 + # ⇒ threshold > max_score ⇒ every trial gets killed). With fixed + # train_steps it's also unreachable in normal operation. Dropped. return self.A + self.B * np.log(cost) -# TODO: Eval defaults class Protein: def __init__(self, sweep_config, max_suggestion_cost = 3600, resample_frequency = 0, num_random_samples = 10, - num_keep_top_obs = 5, global_search_scale = 1, suggestions_per_pareto = 256, expansion_rate = 0.1, - gp_training_iter = 50, - gp_learning_rate = 0.001, - gp_max_obs = 750, # gp train time jumps after 800 - infer_batch_size = 4096, + gp_training_iter = 100, + gp_learning_rate = 0.01, + gp_max_obs = 750, + infer_batch_size = 4096, optimizer_reset_frequency = 50, use_gpu = True, cost_param = "train/total_timesteps", prune_pareto = True, ): - # Process sweep config. NOTE: sweep_config takes precedence. It's not good. 
 
-# TODO: Eval defaults
 class Protein:
     def __init__(self,
         sweep_config,
         max_suggestion_cost = 3600,
         resample_frequency = 0,
         num_random_samples = 10,
-        num_keep_top_obs = 5,
         global_search_scale = 1,
         suggestions_per_pareto = 256,
         expansion_rate = 0.1,
-        gp_training_iter = 50,
-        gp_learning_rate = 0.001,
-        gp_max_obs = 750, # gp train time jumps after 800
-        infer_batch_size = 4096, 
+        gp_training_iter = 100,
+        gp_learning_rate = 0.01,
+        gp_max_obs = 750,
+        infer_batch_size = 4096,
         optimizer_reset_frequency = 50,
         use_gpu = True,
         cost_param = "train/total_timesteps",
         prune_pareto = True,
     ):
-        # Process sweep config. NOTE: sweep_config takes precedence. It's not good.
         _use_gpu = sweep_config['use_gpu'] if 'use_gpu' in sweep_config else use_gpu
         _prune_pareto = sweep_config['prune_pareto'] if 'prune_pareto' in sweep_config else prune_pareto
         _max_suggestion_cost = sweep_config['max_suggestion_cost'] if 'max_suggestion_cost' in sweep_config else max_suggestion_cost
@@ -558,61 +546,76 @@ def __init__(self,
         self.success_observations = []
         self.failure_observations = []
-        self.num_keep_top_obs = num_keep_top_obs
-        self.top_observations = []
         self.suggestion_idx = 0
         self.min_score, self.max_score = math.inf, -math.inf
         self.log_c_min, self.log_c_max = math.inf, -math.inf
 
-        # Use Sobel seq for structured random exploration
+        # Failure sentinel: frozen once on first merge into the score GP.
+        # Previously `e['output'] = self.min_score` ratcheted upward as new
+        # successes lowered min_score, retilting the kernel toward unfailed
+        # corners forever.
+        self._failure_sentinel = None
+        # HEBO recipe: Yeo-Johnson output warping + per-fit input rescaling.
+        # Warping linearizes the response surface; per-fit rescaling restores
+        # σ-driven exploration on dims where observations have clustered.
+        self._y_warper_yj = PowerTransformer(method='yeo-johnson', standardize=True)
+        self._y_best_warped = None
+        self._x_min_obs = None
+        self._x_max_obs = None
+
         self.sobol = Sobol(d=self.hyperparameters.num, scramble=True)
         self.num_random_samples = num_random_samples
-        # NOTE: test if sobol sampling really helps
-        # points_per_run = sweep_config['downsample']
-        # self.num_random_samples = 3 * points_per_run * self.hyperparameters.num
 
         self.cost_param_idx = self.hyperparameters.get_flat_idx(cost_param)
         self.cost_space = None
         self.cost_random_suggestion = None
         if self.cost_param_idx is not None:
             self.cost_space = list(self.hyperparameters.flat_spaces.values())[self.cost_param_idx]
-            self.cost_random_suggestion = -0.8  # In norm cost space. Make arg if necessary
+            self.cost_random_suggestion = -0.8
 
         self.target_cost_ratio = []
         self._running_target_buffer = deque(maxlen=30)
 
-        self.gp_max_obs = gp_max_obs # train time bumps after 800?
+        self.gp_max_obs = gp_max_obs
         self.infer_batch_size = infer_batch_size
 
-        # Probably useful only when downsample=1 and each run is expensive.
         self.use_success_prob = sweep_config['downsample'] == 1
         self.success_classifier = LogisticRegression(class_weight='balanced')
 
-        # This model is conservative. Aggressive early stopping interferes with and hampers GP model learning.
         self.stop_threshold_model = RobustLogCostModel(quantile=sweep_config['early_stop_quantile'])
         self.upper_cost_threshold = -np.inf
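The `PowerTransformer` set up above is the HEBO-style output warp applied later in `_train_gp_models`. A round-trip sketch of what it does to a skewed score column (toy data, not from a sweep):

```python
import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.default_rng(1)
y = rng.lognormal(mean=0.0, sigma=1.0, size=256)   # heavily right-skewed

warper = PowerTransformer(method='yeo-johnson', standardize=True)
y_w = warper.fit_transform(y.reshape(-1, 1)).ravel()

# Warped targets are roughly zero-mean/unit-variance and far less skewed,
# which is the "linearize the response surface" effect the comment cites;
# inverse_transform recovers the original scale.
print(y_w.mean(), y_w.std())
print(warper.inverse_transform(y_w.reshape(-1, 1)).ravel()[:3], y[:3])
```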
 
-        # Use 64 bit for GP regression
         with default_tensor_dtype(torch.float64):
-            # Params taken from HEBO: https://arxiv.org/abs/2012.03826
             noise_prior = LogNormalPrior(math.log(1e-2), 0.5)
+            noise_constraint = gpytorch.constraints.GreaterThan(8e-4)
 
-            # Create dummy data for model initialization on the selected device
             dummy_x = torch.ones((1, self.hyperparameters.num), device=self.device)
             dummy_y = torch.zeros(1, device=self.device)
 
-            # Score GP
-            self.likelihood_score = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device)
-            self.gp_score = ExactGPModel(dummy_x, dummy_y, self.likelihood_score, self.hyperparameters.num).to(self.device)
+            # Score GP: drops the polynomial term.
+            self.likelihood_score = GaussianLikelihood(
+                noise_prior=deepcopy(noise_prior),
+                noise_constraint=deepcopy(noise_constraint),
+            ).to(self.device)
+            self.gp_score = ExactGPModel(
+                dummy_x, dummy_y, self.likelihood_score,
+                self.hyperparameters.num, use_polynomial=False,
+            ).to(self.device)
             self.mll_score = ExactMarginalLogLikelihood(self.likelihood_score, self.gp_score).to(self.device)
             self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True)
 
-            # Cost GP
-            self.likelihood_cost = GaussianLikelihood(noise_prior=deepcopy(noise_prior)).to(self.device)
-            self.gp_cost = ExactGPModel(dummy_x, dummy_y, self.likelihood_cost, self.hyperparameters.num).to(self.device)
+            # Cost GP: keeps the polynomial (log_cost approximately linear in
+            # log(lr)/log(steps), so the linear extrapolation is appropriate).
+            self.likelihood_cost = GaussianLikelihood(
+                noise_prior=deepcopy(noise_prior),
+                noise_constraint=deepcopy(noise_constraint),
+            ).to(self.device)
+            self.gp_cost = ExactGPModel(
+                dummy_x, dummy_y, self.likelihood_cost,
+                self.hyperparameters.num, use_polynomial=True,
+            ).to(self.device)
             self.mll_cost = ExactMarginalLogLikelihood(self.likelihood_cost, self.gp_cost).to(self.device)
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)
 
-            # Buffers for GP training and inference
             self.gp_params_buffer = torch.empty(self.gp_max_obs, self.hyperparameters.num, device=self.device)
             self.gp_score_buffer = torch.empty(self.gp_max_obs, device=self.device)
             self.gp_cost_buffer = torch.empty(self.gp_max_obs, device=self.device)
@@ -624,11 +627,9 @@ def _filter_near_duplicates(self, inputs, duplicate_threshold=EPSILON):
         tree = KDTree(inputs)
         to_keep = np.ones(len(inputs), dtype=bool)
 
-        # Iterate from most recent to oldest
         for i in range(len(inputs) - 1, -1, -1):
             if to_keep[i]:
                 nearby_indices = tree.query_ball_point(inputs[i], r=duplicate_threshold)
-                # Exclude the point itself from being marked for removal
                 nearby_indices.remove(i)
                 if nearby_indices:
                     to_keep[nearby_indices] = False
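`_filter_near_duplicates` walks newest-to-oldest so that when two points collide, the more recent one survives. A compact standalone sketch of that logic with scipy's KDTree (`filter_near_duplicates` here is a hypothetical name, threshold exaggerated for the demo):

```python
import numpy as np
from scipy.spatial import KDTree

def filter_near_duplicates(inputs, r=1e-6):
    tree = KDTree(inputs)
    keep = np.ones(len(inputs), dtype=bool)
    for i in range(len(inputs) - 1, -1, -1):    # newest first
        if keep[i]:
            near = tree.query_ball_point(inputs[i], r=r)
            near.remove(i)                      # don't evict the point itself
            if near:
                keep[near] = False              # evict older near-duplicates
    return np.flatnonzero(keep)

pts = np.array([[0.0, 0.0], [1.0, 1.0], [0.0, 0.0]])  # first and last collide
print(filter_near_duplicates(pts, r=1e-3))            # -> [1 2]: newest copy wins
```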
@@ -641,23 +642,24 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
 
         observations = self.success_observations.copy()
 
-        # Update the stats using the full data
         y = np.array([e['output'] for e in observations])
         self.min_score, self.max_score = y.min(), y.max()
 
         c = np.array([e['cost'] for e in observations])
         log_c = np.log(np.maximum(c, EPSILON))
         self.log_c_min = log_c.min()
-        self.log_c_max = np.quantile(log_c, 0.97) # Make it less sensitive to outlier points
+        self.log_c_max = np.quantile(log_c, 0.97)
 
-        # When the data is scare, also use failed observations
+        # Failures pinned to a FROZEN sentinel below the worst observed
+        # success. Without this freeze the failure floor ratchets with
+        # min_score as new successes arrive, retilting the kernel slope.
         if len(observations) < 100 and self.failure_observations:
-            # Give the min score for the failed obs, so this value will keep changing.
+            if self._failure_sentinel is None:
+                spread = max(abs(self.max_score - self.min_score), 1.0)
+                self._failure_sentinel = float(self.min_score) - 0.5 * spread
             for e in self.failure_observations:
-                e['output'] = self.min_score
-
-            # NOTE: the order of obs matters since recent obs are always fed into gp training
-            # So, putting the failure obs first.
+                e['output'] = self._failure_sentinel
+
             observations = self.failure_observations + observations
 
         params = np.array([np.append(e['input'], [e['output'], e['cost']]) for e in observations])
@@ -669,7 +671,7 @@ def _sample_observations(self, max_size=None, recent_ratio=0.5):
         if len(observations) <= max_size:
             return observations
-        
+
         recent_size = int(recent_ratio*max_size)
         recent_obs = observations[-recent_size:]
         older_obs = observations[:-recent_size]
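The effect of freezing the sentinel is easiest to see side by side: under the old `min_score` rule the failure label moves every time a worse success arrives, while the frozen sentinel stays put. A toy trace (all values invented):

```python
# Successive min_score values as worse-but-successful runs arrive.
min_score_trace = [0.50, 0.30, 0.10]

# Old rule: failures relabeled to the *current* min_score on each fit,
# so the failure floor drifts and the GP's slope keeps retilting.
old_labels = list(min_score_trace)               # 0.50 -> 0.30 -> 0.10

# New rule: sentinel frozen on first merge, half a spread below min.
max_score, min_score = 0.9, 0.50
sentinel = min_score - 0.5 * max(abs(max_score - min_score), 1.0)
new_labels = [sentinel] * len(min_score_trace)   # 0.0 on every fit
print(old_labels, new_labels)
```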
@@ -681,54 +683,53 @@ def _train_gp_models(self):
         if not self.success_observations:
             return 0, 0
-        
+
         sampled_observations = self._sample_observations(max_size=self.gp_max_obs)
         num_sampled = len(sampled_observations)
 
-        # Prepare tensors using pre-allocated buffers
-        params = np.array([e['input'] for e in sampled_observations])
+        # Per-fit input rescaling to the observed box. On dims where obs
+        # cluster, x_span shrinks → σ saturates at unobserved values → UCB
+        # exploration dominates on that dim.
+        params = np.array([e['input'] for e in sampled_observations], dtype=np.float64)
+        self._x_min_obs = params.min(axis=0)
+        self._x_max_obs = params.max(axis=0)
+        x_span = np.maximum(self._x_max_obs - self._x_min_obs, 1e-8)
+        params_scaled = 2.0 * (params - self._x_min_obs) / x_span - 1.0
         params_tensor = self.gp_params_buffer[:num_sampled]
-        params_tensor.copy_(torch.from_numpy(params))
-
-        # Normalized scores and costs
-        y = np.array([e['output'] for e in sampled_observations])
-        y_norm = (y - self.min_score) / (np.abs(self.max_score - self.min_score) + EPSILON)
-        y_norm_tensor = self.gp_score_buffer[:num_sampled]
-        y_norm_tensor.copy_(torch.from_numpy(y_norm))
+        params_tensor.copy_(torch.from_numpy(params_scaled))
+
+        # Output warping: signed (so GP always sees "higher = better"), then
+        # Yeo-Johnson + standardize. Fall back to plain z-score on degeneracy.
+        y = np.array([e['output'] for e in sampled_observations], dtype=np.float64)
+        signed_y = self.hyperparameters.optimize_direction * y
+        y_warped = None
+        if num_sampled >= 4 and float(np.std(signed_y)) > 1e-9:
+            with warnings.catch_warnings():
+                warnings.simplefilter("ignore")
+                try:
+                    y_warped = self._y_warper_yj.fit_transform(signed_y.reshape(-1, 1)).ravel().astype(np.float64)
+                except (ValueError, RuntimeError):
+                    y_warped = None
+        if y_warped is None:
+            mu, sd = signed_y.mean(), max(float(np.std(signed_y)), EPSILON)
+            y_warped = (signed_y - mu) / sd
+        self._y_best_warped = float(y_warped.max())
+        y_warped_tensor = self.gp_score_buffer[:num_sampled]
+        y_warped_tensor.copy_(torch.from_numpy(y_warped))
 
         c = np.array([e['cost'] for e in sampled_observations])
-        log_c = np.log(np.maximum(c, EPSILON)) # Ensure log is not taken on zero
+        log_c = np.log(np.maximum(c, EPSILON))
         log_c_norm = (log_c - self.log_c_min) / (self.log_c_max - self.log_c_min + EPSILON)
         log_c_norm_tensor = self.gp_cost_buffer[:num_sampled]
         log_c_norm_tensor.copy_(torch.from_numpy(log_c_norm))
 
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning)
-            score_loss = train_gp_model(self.gp_score, self.likelihood_score, self.mll_score, self.score_opt, params_tensor, y_norm_tensor, training_iter=self.gp_training_iter)
+            score_loss = train_gp_model(self.gp_score, self.likelihood_score, self.mll_score, self.score_opt, params_tensor, y_warped_tensor, training_iter=self.gp_training_iter)
             cost_loss = train_gp_model(self.gp_cost, self.likelihood_cost, self.mll_cost, self.cost_opt, params_tensor, log_c_norm_tensor, training_iter=self.gp_training_iter)
 
         return score_loss, cost_loss
 
-    def _get_top_obs_params(self):
-        if not self.top_observations:
-            return np.array([])
-
-        params = np.array([e['input'] for e in self.top_observations])
-        if self.cost_param_idx is None:
-            return params
-
-        # Add the same params with less cost to the search center, and not the original
-        original_costs_norm = params[:, self.cost_param_idx]
-
-        params_1 = np.copy(params)
-        cost_norm_1 = original_costs_norm - (original_costs_norm - (-1)) / 2
-        params_1[:, self.cost_param_idx] = cost_norm_1
-        params_2 = np.copy(params)
-        cost_norm_2 = original_costs_norm - (original_costs_norm - (-1)) / 3
-        params_2[:, self.cost_param_idx] = cost_norm_2
-
-        return np.vstack([params_1, params_2])
-
     def _sample_target_cost_ratio(self, expansion_rate, target_ratios=(0.16, 0.32, 0.48, 0.64, 0.8, 1.0)):
         if not self.target_cost_ratio:
             self.target_cost_ratio = list(target_ratios)
@@ -736,6 +737,15 @@ def _sample_target_cost_ratio(self, expansion_rate, target_ratios=(0.16, 0.32, 0
         target_ratio = np.clip(self.target_cost_ratio.pop() + 0.1 * np.random.randn(), 0, 1)
         return (1 + expansion_rate) * target_ratio
 
+    def _sobol_suggestion(self, fill, fixed_cost_norm):
+        suggestion = 2 * self.sobol.random(1)[0] - 1
+        if fixed_cost_norm is not None:
+            suggestion[self.cost_param_idx] = fixed_cost_norm
+        elif self.cost_param_idx is not None:
+            cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
+            suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1)
+        return self.hyperparameters.to_dict(suggestion, fill), {}
+
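Per-fit rescaling in `_train_gp_models` maps the observed box, not the search box, onto [-1, 1]; candidates are later pushed through the same affine map. A sketch of the round trip (helper names hypothetical):

```python
import numpy as np

def fit_box(params, eps=1e-8):
    lo, hi = params.min(axis=0), params.max(axis=0)
    span = np.maximum(hi - lo, eps)
    return lo, span

def to_box(x, lo, span):
    return 2.0 * (x - lo) / span - 1.0

obs = np.array([[0.10, -0.9], [0.12, 0.4], [0.11, 0.9]])  # dim 0 clustered
lo, span = fit_box(obs)
print(to_box(obs, lo, span))   # each dim stretched to fill [-1, 1]
# A candidate outside the observed box lands far outside [-1, 1] after
# the map, away from all training points, so the GP's sigma saturates
# there and UCB exploration takes over on the clustered dimension.
print(to_box(np.array([[0.50, 0.0]]), lo, span))
```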
     def suggest(self, fill, fixed_total_timesteps=None):
         info = {}
         self.suggestion_idx += 1
 
         if fixed_total_timesteps is not None and self.cost_space is not None:
             fixed_cost_norm = self.cost_space.normalize(fixed_total_timesteps)
 
-        # NOTE: Changed pufferl to use the train args, NOT the sweep hyperparam search center
-        # if len(self.success_observations) == 0 and self.seed_with_search_center:
-        #     suggestion = self.hyperparameters.search_centers
-        #     return self.hyperparameters.to_dict(suggestion, fill), info
-
         if self.suggestion_idx <= self.num_random_samples:
-            # Suggest the next point in the Sobol sequence
-            zero_one = self.sobol.random(1)[0]
-            suggestion = 2*zero_one - 1 # Scale from [0, 1) to [-1, 1)
-            if fixed_cost_norm is not None:
-                suggestion[self.cost_param_idx] = fixed_cost_norm
-            elif self.cost_param_idx is not None:
-                cost_suggestion = self.cost_random_suggestion + 0.1 * np.random.randn()
-                suggestion[self.cost_param_idx] = np.clip(cost_suggestion, -1, 1) # limit the cost
-            return self.hyperparameters.to_dict(suggestion, fill), info
-
-        elif self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
+            return self._sobol_suggestion(fill, fixed_cost_norm)
+
+        if self.resample_frequency and self.suggestion_idx % self.resample_frequency == 0:
             candidates, _ = pareto_points(self.success_observations)
-            suggestions = np.stack([e['input'] for e in candidates])
-            best_idx = np.random.randint(0, len(candidates))
-            best = suggestions[best_idx]
-            return self.hyperparameters.to_dict(best, fill), info
+            if candidates:
+                suggestions = np.stack([e['input'] for e in candidates])
+                best_idx = np.random.randint(0, len(candidates))
+                best = suggestions[best_idx]
+                return self.hyperparameters.to_dict(best, fill), info
 
         score_loss, cost_loss = self._train_gp_models()
@@ -772,27 +770,39 @@
             print(f'Resetting GP optimizers at suggestion {self.suggestion_idx}')
             self.score_opt = torch.optim.Adam(self.gp_score.parameters(), lr=self.gp_learning_rate, amsgrad=True)
             self.cost_opt = torch.optim.Adam(self.gp_cost.parameters(), lr=self.gp_learning_rate, amsgrad=True)
-        
+
         pareto_front, pareto_idxs = pareto_points(self.success_observations)
         pruned_front = prune_pareto_front(pareto_front)
+        # An all-failure Sobol phase leaves success_observations empty here.
+        if not pruned_front:
+            return self._sobol_suggestion(fill, fixed_cost_norm)
         pareto_observations = pruned_front if self.prune_pareto else pareto_front
 
-        # Use the max cost from the pruned pareto to avoid inefficiently long runs
         if self.upper_cost_threshold < 0:
             self.upper_cost_threshold = pruned_front[-1]['cost']
-        # Try to change the threshold slowly
         elif self.upper_cost_threshold < pruned_front[-1]['cost']:
             self.upper_cost_threshold *= 1.01
 
         self.stop_threshold_model.fit(self.success_observations, self.upper_cost_threshold)
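The 1.01 multiplier bounds how fast the cost ceiling can chase the pareto front: geometric growth at 1% per suggestion needs roughly 70 suggestions to double. A one-line check (pure arithmetic, not from the diff):

```python
import math
# Suggestions needed for upper_cost_threshold to double at *= 1.01 per step.
print(math.log(2) / math.log(1.01))   # ~ 69.7
```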
 
-        ### Sample suggestions
-        search_centers = np.stack([e['input'] for e in pareto_observations])
-        if self.top_observations:
-            # Add top observations by score to search centers for diversity
-            search_centers = np.vstack([search_centers, self._get_top_obs_params()])
-
-        suggestions = self.hyperparameters.sample(
-            len(search_centers)*self.suggestions_per_pareto, mu=search_centers)
+        # Candidates: global Sobol (UCB σ-exploration) + tight local
+        # perturbations around top-K observations (µ-injection refinement).
+        # Pure Sobol breaks down at our dim: P(any sample within ~ℓ of an
+        # obs) ≈ (ℓ/2)^d ≈ 10⁻¹² at d=16, so argmax-µ over Sobol never
+        # lands near best-obs.
+        d = self.hyperparameters.num
+        n_cand = 1 << max(11, (max(2000, 200 * d) - 1).bit_length())
+        cand_sobol = Sobol(d=d, scramble=True, seed=self.suggestion_idx)
+        suggestions_global = 2.0 * cand_sobol.random(n_cand) - 1.0
+
+        k_top = min(5, len(self.success_observations))
+        top_centers = np.stack([e['input'] for e in
+            sorted(self.success_observations,
+                   key=lambda e: e['output'],
+                   reverse=True)[:k_top]])
+        n_local = max(256 * k_top, 512)
+        suggestions_local = self.hyperparameters.sample(n_local, mu=top_centers, scale=0.4)
+
+        suggestions = np.vstack([suggestions_global, suggestions_local])
 
         if fixed_cost_norm is not None:
             suggestions[:, self.cost_param_idx] = fixed_cost_norm
@@ -801,59 +811,64 @@
         suggestions = suggestions[dedup_indices]
 
         if len(suggestions) == 0:
-            return self.suggest(fill) # Fallback to random if all suggestions are filtered
+            return self.suggest(fill)
 
-        ### Predict scores and costs
-        # Batch predictions to avoid GPU OOM for large number of suggestions
-        gp_y_norm_list, gp_log_c_norm_list = [], []
+        # Apply the same per-fit rescaling to candidates before prediction.
+        x_span = np.maximum(self._x_max_obs - self._x_min_obs, 1e-8)
+        suggestions_scaled = 2.0 * (suggestions - self._x_min_obs) / x_span - 1.0
+
+        gp_y_mean_list, gp_y_var_list, gp_log_c_norm_list = [], [], []
         with torch.no_grad(), gpytorch.settings.fast_pred_var(), warnings.catch_warnings():
             warnings.simplefilter("ignore", gpytorch.utils.warnings.NumericalWarning)
 
-            # Create a reusable buffer on the device to avoid allocating a huge tensor
-            for i in range(0, len(suggestions), self.infer_batch_size):
-                batch_numpy = suggestions[i:i+self.infer_batch_size]
+            for i in range(0, len(suggestions_scaled), self.infer_batch_size):
+                batch_numpy = suggestions_scaled[i:i+self.infer_batch_size]
                 current_batch_size = len(batch_numpy)
-
-                # Use a slice of the buffer if the current batch is smaller
                 batch_tensor = self.infer_batch_buffer[:current_batch_size]
                 batch_tensor.copy_(torch.from_numpy(batch_numpy))
 
                 try:
-                    # Score and cost prediction
-                    pred_y_mean = self.likelihood_score(self.gp_score(batch_tensor)).mean.cpu()
+                    posterior = self.likelihood_score(self.gp_score(batch_tensor))
+                    pred_y_mean = posterior.mean.cpu()
+                    pred_y_var = posterior.variance.cpu()
                     pred_c_mean = self.likelihood_cost(self.gp_cost(batch_tensor)).mean.cpu()
-
                 except RuntimeError:
-                    # Handle numerical errors during GP prediction
-                    pred_y_mean, pred_c_mean = torch.zeros(current_batch_size)
+                    pred_y_mean = torch.zeros(current_batch_size)
+                    pred_y_var = torch.ones(current_batch_size)
+                    pred_c_mean = torch.zeros(current_batch_size)
 
-                gp_y_norm_list.append(pred_y_mean.cpu())
-                gp_log_c_norm_list.append(pred_c_mean.cpu())
+                gp_y_mean_list.append(pred_y_mean)
+                gp_y_var_list.append(pred_y_var)
+                gp_log_c_norm_list.append(pred_c_mean)
 
-                del pred_y_mean, pred_c_mean
-
-        # Concatenate results from all batches
-        gp_y_norm = torch.cat(gp_y_norm_list).numpy()
+        gp_y_mean = torch.cat(gp_y_mean_list).numpy()
+        gp_y_var = torch.cat(gp_y_var_list).numpy()
+        gp_y_std = np.sqrt(np.maximum(gp_y_var, EPSILON * EPSILON))
         gp_log_c_norm = torch.cat(gp_log_c_norm_list).numpy()
-
-        # Unlinearize
-        gp_y = gp_y_norm*(self.max_score - self.min_score) + self.min_score
         gp_log_c = gp_log_c_norm*(self.log_c_max - self.log_c_min) + self.log_c_min
         gp_c = np.exp(gp_log_c)
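The "Sobol breaks down at our dim" comment above is plain volume arithmetic: assuming "near an observation" means within ±ℓ/2 per dimension on the [-1, 1]-normalized inputs (a window of width ℓ out of 2), one uniform sample lands near it with probability (ℓ/2)^d. A quick check under that assumption:

```python
d, ell, n_cand = 16, 0.35, 4096
p_one = (ell / 2) ** d              # one sample near one observation
p_any = 1 - (1 - p_one) ** n_cand   # any of n_cand samples
print(p_one)   # ~7.7e-13, matching the ~1e-12 in the comment
print(p_any)   # ~3e-9: argmax-mu over global Sobol never lands nearby
```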
 
-        # Maximize score. (Tried upper confidence bounds, but it did more harm because gp was noisy)
-        suggestion_scores = self.hyperparameters.optimize_direction * gp_y_norm
-
-        # When cost is fixed (pareto mode), skip cost-based weighting
-        if fixed_cost_norm is None:
-            max_c_mask = gp_c < self.max_suggestion_cost
+        # HEBO stochastic perturbation: noise ~ N(0, 2·σ²_obs) added to µ
+        # robustifies acquisition against GP miscalibration.
+        noise_std = float(self.gp_score.likelihood.noise.detach().sqrt().cpu().item())
+        gp_y_mu_perturbed = gp_y_mean + math.sqrt(2.0) * noise_std * np.random.randn(len(gp_y_mean))
+
+        # UCB; κ = sqrt(2·log(n·d)) is HEBO's heuristic, grows slowly with n.
+        n_obs = max(1, len(self.success_observations))
+        kappa = math.sqrt(2.0 * math.log(max(n_obs * d, 2)))
+        ucb_scores = gp_y_mu_perturbed + kappa * gp_y_std
+
+        # Cost and success-prob factors kept in log-space so they compose
+        # additively with the acquisition scores.
+        cost_logweight = np.zeros_like(ucb_scores)
+        cost_mask = np.ones_like(ucb_scores, dtype=bool)
+        if fixed_cost_norm is None and self.cost_param_idx is not None:
+            cost_mask = gp_c < self.max_suggestion_cost
             target_cost = self._sample_target_cost_ratio(self.expansion_rate)
-            weight = 1 - abs(target_cost - gp_log_c_norm)
-            suggestion_scores *= max_c_mask * weight
+            cost_logweight = np.log(np.maximum(1 - np.abs(target_cost - gp_log_c_norm), 1e-9))
 
-        # Then, consider the prob of training success, only when downsample = 1
-        # NOTE: Useful only in limited scenarios, where each data point is expensive. So turn it off by default.
+        success_logweight = np.zeros_like(ucb_scores)
         if self.use_success_prob and len(self.success_observations) > 9 and len(self.failure_observations) > 9:
             success_params = np.array([e['input'] for e in self.success_observations])
             failure_params = np.array([e['input'] for e in self.failure_observations])
@@ -867,22 +882,46 @@
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", UserWarning)
                 p_success = self.success_classifier.predict_proba(suggestions)[:, 1]
-            suggestion_scores *= p_success
+            success_logweight = np.log(np.maximum(p_success, 1e-9))
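Stripped of the cost and success terms, the acquisition above is plain UCB on the warped posterior with HEBO's slowly-growing κ plus the N(0, 2σ²_obs) mean perturbation. A standalone sketch on toy posterior arrays (all numbers invented):

```python
import math
import numpy as np

rng = np.random.default_rng(2)
mu = rng.normal(size=1000)           # warped posterior means over candidates
sigma = rng.uniform(0.1, 1.0, 1000)  # posterior stds
noise_std, n_obs, d = 0.1, 200, 16

kappa = math.sqrt(2.0 * math.log(max(n_obs * d, 2)))   # ~4.0 at n*d=3200
mu_pert = mu + math.sqrt(2.0) * noise_std * rng.normal(size=mu.size)
ucb = mu_pert + kappa * sigma
best = int(np.argmax(ucb))           # favors high mean *or* high uncertainty
print(kappa, best)
```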
+
+        # MACE mean injection: every 3rd GP-phase trial commits to argmax(µ)
+        # to refine the currently-best mode. At κ≈3.4 pure UCB won't shrink
+        # σ fast enough on a single observation. Drop the target-cost soft
+        # weight here (refinement shouldn't be cost-biased), keep the hard
+        # over-budget mask and the success-prob weight.
+        if self.suggestion_idx % 3 == 0:
+            mu_scores = gp_y_mu_perturbed + success_logweight
+            mu_scores[~cost_mask] = -np.inf
+            best_idx = int(np.argmax(mu_scores))
+            acq_mode = 'mu_inject'
+            picked_score = float(mu_scores[best_idx])
+        else:
+            ucb_scores = ucb_scores + cost_logweight + success_logweight
+            ucb_scores[~cost_mask] = -np.inf
+            best_idx = int(np.argmax(ucb_scores))
+            acq_mode = 'ucb'
+            picked_score = float(ucb_scores[best_idx])
 
-        best_idx = np.argmax(suggestion_scores)
         info = dict(
+            acq_mode = acq_mode,
             cost = gp_c[best_idx].item(),
-            score = gp_y[best_idx].item(),
-            rating = suggestion_scores[best_idx].item(),
+            score_mean_warped = float(gp_y_mean[best_idx]),
+            score_std_warped = float(gp_y_std[best_idx]),
+            picked_score = picked_score,
+            kappa = kappa,
+            f_best_warped = self._y_best_warped,
             score_loss = score_loss,
             cost_loss = cost_loss,
             score_lengthscale = self.gp_score.lengthscale_range,
             cost_lengthscale = self.gp_cost.lengthscale_range,
         )
         print('Predicted -- ',
-            f'Score: {info["score"]:.3f}',
-            f'Cost: {info["cost"]:.3f}',
-            f'Rating: {info["rating"]:.3f}',
+            f'{acq_mode}: {picked_score:+.3f}',
+            f'µ_w: {info["score_mean_warped"]:+.3f}',
+            f'σ_w: {info["score_std_warped"]:.3f}',
+            f'κ: {info["kappa"]:.2f}',
+            f'f*_w: {info["f_best_warped"]:+.3f}',
+            f'Cost: {info["cost"]:.0f}',
         )
 
         best = suggestions[best_idx]
@@ -919,21 +958,11 @@ def observe(self, hypers, score, cost, is_failure=False):
                 self.success_observations[same[0]] = new_observation
                 return
 
-        # Ignore obs that are below the minimum cost
         if self.cost_param_idx is not None and params[self.cost_param_idx] <= -1:
             return
 
         self.success_observations.append(new_observation)
 
-        # Update top_observations without sorting the full list every time
-        if len(self.top_observations) < self.num_keep_top_obs:
-            self.top_observations.append(new_observation)
-            self.top_observations.sort(key=lambda x: x['output'], reverse=True)
-        elif score > self.top_observations[-1]['output']:
-            self.top_observations.pop()
-            self.top_observations.append(new_observation)
-            self.top_observations.sort(key=lambda x: x['output'], reverse=True)
-
     def get_early_stop_threshold(self, cost):
         return self.stop_threshold_model.get_threshold(cost)
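Downstream, a trainer would poll `get_early_stop_threshold` with the run's current cost and kill the run when its score falls below A + B·log(cost). A hedged usage sketch (hypothetical trainer-side helper; `protein`, `run_cost`, and `run_score` stand in for real objects):

```python
def should_stop(protein, run_cost, run_score):
    # `protein` is a fitted Protein instance; run_cost/run_score come
    # from the live run. The threshold is -inf below the minimum-cost
    # gate, so young runs are never killed.
    threshold = protein.get_early_stop_threshold(run_cost)
    return run_score < threshold
```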