From 6cea1bfe63be551837760984a4cf8d00f83d64fe Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Sat, 6 Jun 2026 09:58:30 +0100 Subject: [PATCH] Require holdout and MSRE eCPS wins --- .../pipelines/mp300k_artifact_gates.py | 61 +++++++++++++ tests/pipelines/test_mp300k_artifact_gates.py | 86 ++++++++++++++++++- tests/pipelines/test_mp300k_gate_inputs.py | 14 ++- 3 files changed, 153 insertions(+), 8 deletions(-) diff --git a/src/microplex_us/pipelines/mp300k_artifact_gates.py b/src/microplex_us/pipelines/mp300k_artifact_gates.py index 625c47d..38e4e33 100644 --- a/src/microplex_us/pipelines/mp300k_artifact_gates.py +++ b/src/microplex_us/pipelines/mp300k_artifact_gates.py @@ -1088,6 +1088,10 @@ def _ecps_comparison_gate( "n_targets_kept": summary.get("n_targets_kept"), "matched_household_count": contract["matched_household_count"], "holdout_target_fraction": contract["holdout_target_fraction"], + "candidate_holdout_loss": contract["candidate_holdout_loss"], + "baseline_holdout_loss": contract["baseline_holdout_loss"], + "candidate_unweighted_msre": contract["candidate_unweighted_msre"], + "baseline_unweighted_msre": contract["baseline_unweighted_msre"], }, details=details, ) @@ -1244,6 +1248,38 @@ def _ecps_comparison_contract_summary( elif holdout_targets is not None: has_holdout_targets = int(holdout_targets) > 0 + candidate_holdout_loss = _first_nested_present( + payload, + summary, + "candidate_holdout_loss", + ) + baseline_holdout_loss = _first_nested_present( + payload, + summary, + "baseline_holdout_loss", + ) + holdout_loss_beats_baseline = _loss_strictly_beats( + candidate_holdout_loss, baseline_holdout_loss + ) + + candidate_unweighted_msre = _first_nested_present( + payload, + summary, + "candidate_unweighted_msre", + "candidate_msre", + "candidate_mean_unweighted_msre", + ) + baseline_unweighted_msre = _first_nested_present( + payload, + summary, + "baseline_unweighted_msre", + "baseline_msre", + "baseline_mean_unweighted_msre", + ) + unweighted_msre_beats_baseline = _loss_strictly_beats( + candidate_unweighted_msre, baseline_unweighted_msre + ) + protected_summary = _protected_family_floor_summary(payload, summary) core_benchmark_summary = _core_benchmark_family_floor_summary(payload, summary) frozen_baseline_summary = _frozen_baseline_certificate_summary( @@ -1260,12 +1296,18 @@ def _ecps_comparison_contract_summary( "ecps_refit_effective": ecps_refit_effective is True, "frozen_ecps_baseline_certificate": frozen_baseline_summary["passed"] is True, "holdout_target_split": has_holdout_targets, + "holdout_loss_beats_baseline": holdout_loss_beats_baseline is True, + "unweighted_msre_beats_baseline": (unweighted_msre_beats_baseline is True), "protected_family_floors": protected_summary["passed"] is True, "core_benchmark_family_floors": core_benchmark_summary["passed"] is True, } return { "matched_household_count": matched_household_count, "holdout_target_fraction": holdout_target_fraction, + "candidate_holdout_loss": candidate_holdout_loss, + "baseline_holdout_loss": baseline_holdout_loss, + "candidate_unweighted_msre": candidate_unweighted_msre, + "baseline_unweighted_msre": baseline_unweighted_msre, "missing_requirements": [ key for key, passed in requirements.items() if not passed ], @@ -1279,6 +1321,12 @@ def _ecps_comparison_contract_summary( "ecps_refit_effective_passed": ecps_refit_effective, "frozen_ecps_baseline_certificate": frozen_baseline_summary, "holdout_targets": holdout_targets, + "candidate_holdout_loss": candidate_holdout_loss, + "baseline_holdout_loss": baseline_holdout_loss, + "holdout_loss_beats_baseline": holdout_loss_beats_baseline, + "candidate_unweighted_msre": candidate_unweighted_msre, + "baseline_unweighted_msre": baseline_unweighted_msre, + "unweighted_msre_beats_baseline": unweighted_msre_beats_baseline, "protected_family_floor": protected_summary, "core_benchmark_family_floor": core_benchmark_summary, }, @@ -1551,6 +1599,19 @@ def _float_equal(left: Any, right: Any, *, tolerance: float = 1e-12) -> bool: return abs(float(left) - float(right)) <= tolerance +def _loss_strictly_beats(candidate: Any, baseline: Any) -> bool | None: + if candidate is None or baseline is None: + return None + try: + candidate_value = float(candidate) + baseline_value = float(baseline) + except (TypeError, ValueError): + return None + if not np.isfinite(candidate_value) or not np.isfinite(baseline_value): + return None + return candidate_value < baseline_value + + def _first_nested_present( payload: Any, summary: dict[str, Any], diff --git a/tests/pipelines/test_mp300k_artifact_gates.py b/tests/pipelines/test_mp300k_artifact_gates.py index a06a956..f2b6035 100644 --- a/tests/pipelines/test_mp300k_artifact_gates.py +++ b/tests/pipelines/test_mp300k_artifact_gates.py @@ -195,6 +195,10 @@ def _sound_ecps_comparison_payload( *, candidate_loss: float = 0.12, baseline_loss: float = 0.20, + candidate_holdout_loss: float = 0.03, + baseline_holdout_loss: float = 0.04, + candidate_unweighted_msre: float = 0.10, + baseline_unweighted_msre: float = 0.17, ) -> dict[str, object]: fit_config = { "lambda_l0": 0.0, @@ -258,8 +262,8 @@ def _sound_ecps_comparison_payload( "scoring_config": {"sha256": "e" * 64}, "baseline_metrics": { "baseline_enhanced_cps_native_loss": baseline_loss, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, + "baseline_holdout_loss": baseline_holdout_loss, + "baseline_unweighted_msre": baseline_unweighted_msre, }, }, "summary": { @@ -274,8 +278,10 @@ def _sound_ecps_comparison_payload( "baseline_refit_config": fit_config, "refit_objective_matches_scoring": True, "ecps_refit_effective_passed": True, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, + "candidate_holdout_loss": candidate_holdout_loss, + "baseline_holdout_loss": baseline_holdout_loss, + "candidate_unweighted_msre": candidate_unweighted_msre, + "baseline_unweighted_msre": baseline_unweighted_msre, "holdout_target_fraction": 0.2, "protected_family_losses": protected_family_losses, }, @@ -1528,6 +1534,78 @@ def test_ecps_comparison_requires_measured_refit_objective_identity(tmp_path): assert ecps_gate["details"]["refit_objective_matches_scoring"] is None +def test_ecps_comparison_rejects_adverse_holdout_loss(tmp_path): + artifact_dir = tmp_path / "artifact" + artifact_dir.mkdir() + _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") + baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") + benchmark_manifest = tmp_path / "benchmark_manifest.json" + _write_benchmark_manifest(benchmark_manifest) + _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) + payload = _sound_ecps_comparison_payload( + candidate_loss=0.10, + baseline_loss=0.20, + candidate_holdout_loss=0.050, + baseline_holdout_loss=0.040, + ) + + report_path = write_mp300k_artifact_gate_report( + artifact_dir, + ecps_comparison_payload=payload, + arch_coverage_payload=_arch_coverage_payload(), + runtime_smoke_payload={"runtime_ratio": 1.0}, + benchmark_manifest_path=benchmark_manifest, + compute_native_scores=False, + update_manifest=False, + ) + + record = json.loads(report_path.read_text()) + ecps_gate = record["gates"]["ecps_comparison"] + + assert record["summary"]["status"] == "failed" + assert ecps_gate["status"] == "fail" + assert "holdout_loss_beats_baseline" in ecps_gate["summary"] + assert ecps_gate["metrics"]["candidate_holdout_loss"] == pytest.approx(0.050) + assert ecps_gate["metrics"]["baseline_holdout_loss"] == pytest.approx(0.040) + assert ecps_gate["details"]["holdout_loss_beats_baseline"] is False + + +def test_ecps_comparison_rejects_adverse_unweighted_msre(tmp_path): + artifact_dir = tmp_path / "artifact" + artifact_dir.mkdir() + _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") + baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") + benchmark_manifest = tmp_path / "benchmark_manifest.json" + _write_benchmark_manifest(benchmark_manifest) + _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) + payload = _sound_ecps_comparison_payload( + candidate_loss=0.10, + baseline_loss=0.20, + candidate_unweighted_msre=0.30, + baseline_unweighted_msre=0.17, + ) + + report_path = write_mp300k_artifact_gate_report( + artifact_dir, + ecps_comparison_payload=payload, + arch_coverage_payload=_arch_coverage_payload(), + runtime_smoke_payload={"runtime_ratio": 1.0}, + benchmark_manifest_path=benchmark_manifest, + compute_native_scores=False, + update_manifest=False, + ) + + record = json.loads(report_path.read_text()) + ecps_gate = record["gates"]["ecps_comparison"] + + assert record["summary"]["status"] == "failed" + assert ecps_gate["status"] == "fail" + assert "unweighted_msre_beats_baseline" in ecps_gate["summary"] + assert ecps_gate["metrics"]["candidate_unweighted_msre"] == pytest.approx(0.30) + assert ecps_gate["metrics"]["baseline_unweighted_msre"] == pytest.approx(0.17) + assert ecps_gate["details"]["unweighted_msre_beats_baseline"] is False + + def test_runtime_gate_ignores_contradictory_producer_verdict(tmp_path): artifact_dir = tmp_path / "artifact" artifact_dir.mkdir() diff --git a/tests/pipelines/test_mp300k_gate_inputs.py b/tests/pipelines/test_mp300k_gate_inputs.py index 8ae3329..c4e072f 100644 --- a/tests/pipelines/test_mp300k_gate_inputs.py +++ b/tests/pipelines/test_mp300k_gate_inputs.py @@ -191,6 +191,10 @@ def _sound_ecps_comparison_payload() -> dict[str, object]: ] candidate_loss = 0.1 baseline_loss = 0.2 + candidate_holdout_loss = 0.03 + baseline_holdout_loss = 0.04 + candidate_unweighted_msre = 0.10 + baseline_unweighted_msre = 0.17 return { "frozen_ecps_baseline_certificate": { "schema_version": 1, @@ -218,8 +222,8 @@ def _sound_ecps_comparison_payload() -> dict[str, object]: "scoring_config": {"sha256": "e" * 64}, "baseline_metrics": { "baseline_enhanced_cps_native_loss": baseline_loss, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, + "baseline_holdout_loss": baseline_holdout_loss, + "baseline_unweighted_msre": baseline_unweighted_msre, }, }, "summary": { @@ -234,8 +238,10 @@ def _sound_ecps_comparison_payload() -> dict[str, object]: "baseline_refit_config": fit_config, "refit_objective_matches_scoring": True, "ecps_refit_effective_passed": True, - "baseline_holdout_loss": 0.04, - "baseline_unweighted_msre": 0.17, + "candidate_holdout_loss": candidate_holdout_loss, + "baseline_holdout_loss": baseline_holdout_loss, + "candidate_unweighted_msre": candidate_unweighted_msre, + "baseline_unweighted_msre": baseline_unweighted_msre, "holdout_target_fraction": 0.2, "protected_family_losses": protected_family_losses, },