Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 61 additions & 0 deletions src/microplex_us/pipelines/mp300k_artifact_gates.py
Original file line number Diff line number Diff line change
Expand Up @@ -1088,6 +1088,10 @@ def _ecps_comparison_gate(
"n_targets_kept": summary.get("n_targets_kept"),
"matched_household_count": contract["matched_household_count"],
"holdout_target_fraction": contract["holdout_target_fraction"],
"candidate_holdout_loss": contract["candidate_holdout_loss"],
"baseline_holdout_loss": contract["baseline_holdout_loss"],
"candidate_unweighted_msre": contract["candidate_unweighted_msre"],
"baseline_unweighted_msre": contract["baseline_unweighted_msre"],
},
details=details,
)
Expand Down Expand Up @@ -1244,6 +1248,38 @@ def _ecps_comparison_contract_summary(
elif holdout_targets is not None:
has_holdout_targets = int(holdout_targets) > 0

candidate_holdout_loss = _first_nested_present(
payload,
summary,
"candidate_holdout_loss",
)
baseline_holdout_loss = _first_nested_present(
payload,
summary,
"baseline_holdout_loss",
)
holdout_loss_beats_baseline = _loss_strictly_beats(
candidate_holdout_loss, baseline_holdout_loss
)

candidate_unweighted_msre = _first_nested_present(
payload,
summary,
"candidate_unweighted_msre",
"candidate_msre",
"candidate_mean_unweighted_msre",
)
baseline_unweighted_msre = _first_nested_present(
payload,
summary,
"baseline_unweighted_msre",
"baseline_msre",
"baseline_mean_unweighted_msre",
)
unweighted_msre_beats_baseline = _loss_strictly_beats(
candidate_unweighted_msre, baseline_unweighted_msre
)

protected_summary = _protected_family_floor_summary(payload, summary)
core_benchmark_summary = _core_benchmark_family_floor_summary(payload, summary)
frozen_baseline_summary = _frozen_baseline_certificate_summary(
Expand All @@ -1260,12 +1296,18 @@ def _ecps_comparison_contract_summary(
"ecps_refit_effective": ecps_refit_effective is True,
"frozen_ecps_baseline_certificate": frozen_baseline_summary["passed"] is True,
"holdout_target_split": has_holdout_targets,
"holdout_loss_beats_baseline": holdout_loss_beats_baseline is True,
"unweighted_msre_beats_baseline": (unweighted_msre_beats_baseline is True),
"protected_family_floors": protected_summary["passed"] is True,
"core_benchmark_family_floors": core_benchmark_summary["passed"] is True,
}
return {
"matched_household_count": matched_household_count,
"holdout_target_fraction": holdout_target_fraction,
"candidate_holdout_loss": candidate_holdout_loss,
"baseline_holdout_loss": baseline_holdout_loss,
"candidate_unweighted_msre": candidate_unweighted_msre,
"baseline_unweighted_msre": baseline_unweighted_msre,
"missing_requirements": [
key for key, passed in requirements.items() if not passed
],
Expand All @@ -1279,6 +1321,12 @@ def _ecps_comparison_contract_summary(
"ecps_refit_effective_passed": ecps_refit_effective,
"frozen_ecps_baseline_certificate": frozen_baseline_summary,
"holdout_targets": holdout_targets,
"candidate_holdout_loss": candidate_holdout_loss,
"baseline_holdout_loss": baseline_holdout_loss,
"holdout_loss_beats_baseline": holdout_loss_beats_baseline,
"candidate_unweighted_msre": candidate_unweighted_msre,
"baseline_unweighted_msre": baseline_unweighted_msre,
"unweighted_msre_beats_baseline": unweighted_msre_beats_baseline,
"protected_family_floor": protected_summary,
"core_benchmark_family_floor": core_benchmark_summary,
},
Expand Down Expand Up @@ -1551,6 +1599,19 @@ def _float_equal(left: Any, right: Any, *, tolerance: float = 1e-12) -> bool:
return abs(float(left) - float(right)) <= tolerance


def _loss_strictly_beats(candidate: Any, baseline: Any) -> bool | None:
if candidate is None or baseline is None:
return None
try:
candidate_value = float(candidate)
baseline_value = float(baseline)
except (TypeError, ValueError):
return None
if not np.isfinite(candidate_value) or not np.isfinite(baseline_value):
return None
return candidate_value < baseline_value


def _first_nested_present(
payload: Any,
summary: dict[str, Any],
Expand Down
86 changes: 82 additions & 4 deletions tests/pipelines/test_mp300k_artifact_gates.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,10 @@ def _sound_ecps_comparison_payload(
*,
candidate_loss: float = 0.12,
baseline_loss: float = 0.20,
candidate_holdout_loss: float = 0.03,
baseline_holdout_loss: float = 0.04,
candidate_unweighted_msre: float = 0.10,
baseline_unweighted_msre: float = 0.17,
) -> dict[str, object]:
fit_config = {
"lambda_l0": 0.0,
Expand Down Expand Up @@ -258,8 +262,8 @@ def _sound_ecps_comparison_payload(
"scoring_config": {"sha256": "e" * 64},
"baseline_metrics": {
"baseline_enhanced_cps_native_loss": baseline_loss,
"baseline_holdout_loss": 0.04,
"baseline_unweighted_msre": 0.17,
"baseline_holdout_loss": baseline_holdout_loss,
"baseline_unweighted_msre": baseline_unweighted_msre,
},
},
"summary": {
Expand All @@ -274,8 +278,10 @@ def _sound_ecps_comparison_payload(
"baseline_refit_config": fit_config,
"refit_objective_matches_scoring": True,
"ecps_refit_effective_passed": True,
"baseline_holdout_loss": 0.04,
"baseline_unweighted_msre": 0.17,
"candidate_holdout_loss": candidate_holdout_loss,
"baseline_holdout_loss": baseline_holdout_loss,
"candidate_unweighted_msre": candidate_unweighted_msre,
"baseline_unweighted_msre": baseline_unweighted_msre,
"holdout_target_fraction": 0.2,
"protected_family_losses": protected_family_losses,
},
Expand Down Expand Up @@ -1528,6 +1534,78 @@ def test_ecps_comparison_requires_measured_refit_objective_identity(tmp_path):
assert ecps_gate["details"]["refit_objective_matches_scoring"] is None


def test_ecps_comparison_rejects_adverse_holdout_loss(tmp_path):
    """The ECPS comparison gate fails when candidate holdout loss exceeds baseline."""
    artifact_root = tmp_path / "artifact"
    artifact_root.mkdir()
    _write_contract_policyengine_dataset(artifact_root / "candidate.h5")
    frozen_baseline = _write_contract_policyengine_dataset(tmp_path / "baseline.h5")
    manifest_path = tmp_path / "benchmark_manifest.json"
    _write_benchmark_manifest(manifest_path)
    _write_artifact_manifest(artifact_root, baseline_dataset=frozen_baseline)

    # The headline loss still favours the candidate; only the holdout loss
    # is worse than the baseline.
    comparison = _sound_ecps_comparison_payload(
        candidate_loss=0.10,
        baseline_loss=0.20,
        candidate_holdout_loss=0.050,
        baseline_holdout_loss=0.040,
    )

    written_report = write_mp300k_artifact_gate_report(
        artifact_root,
        ecps_comparison_payload=comparison,
        arch_coverage_payload=_arch_coverage_payload(),
        runtime_smoke_payload={"runtime_ratio": 1.0},
        benchmark_manifest_path=manifest_path,
        compute_native_scores=False,
        update_manifest=False,
    )

    report = json.loads(written_report.read_text())
    gate = report["gates"]["ecps_comparison"]

    assert report["summary"]["status"] == "failed"
    assert gate["status"] == "fail"
    assert "holdout_loss_beats_baseline" in gate["summary"]
    metrics = gate["metrics"]
    assert metrics["candidate_holdout_loss"] == pytest.approx(0.050)
    assert metrics["baseline_holdout_loss"] == pytest.approx(0.040)
    assert gate["details"]["holdout_loss_beats_baseline"] is False


def test_ecps_comparison_rejects_adverse_unweighted_msre(tmp_path):
    """The ECPS comparison gate fails when candidate unweighted MSRE exceeds baseline."""
    artifact_root = tmp_path / "artifact"
    artifact_root.mkdir()
    _write_contract_policyengine_dataset(artifact_root / "candidate.h5")
    frozen_baseline = _write_contract_policyengine_dataset(tmp_path / "baseline.h5")
    manifest_path = tmp_path / "benchmark_manifest.json"
    _write_benchmark_manifest(manifest_path)
    _write_artifact_manifest(artifact_root, baseline_dataset=frozen_baseline)

    # The headline loss still favours the candidate; only the unweighted MSRE
    # is worse than the baseline.
    comparison = _sound_ecps_comparison_payload(
        candidate_loss=0.10,
        baseline_loss=0.20,
        candidate_unweighted_msre=0.30,
        baseline_unweighted_msre=0.17,
    )

    written_report = write_mp300k_artifact_gate_report(
        artifact_root,
        ecps_comparison_payload=comparison,
        arch_coverage_payload=_arch_coverage_payload(),
        runtime_smoke_payload={"runtime_ratio": 1.0},
        benchmark_manifest_path=manifest_path,
        compute_native_scores=False,
        update_manifest=False,
    )

    report = json.loads(written_report.read_text())
    gate = report["gates"]["ecps_comparison"]

    assert report["summary"]["status"] == "failed"
    assert gate["status"] == "fail"
    assert "unweighted_msre_beats_baseline" in gate["summary"]
    metrics = gate["metrics"]
    assert metrics["candidate_unweighted_msre"] == pytest.approx(0.30)
    assert metrics["baseline_unweighted_msre"] == pytest.approx(0.17)
    assert gate["details"]["unweighted_msre_beats_baseline"] is False


def test_runtime_gate_ignores_contradictory_producer_verdict(tmp_path):
artifact_dir = tmp_path / "artifact"
artifact_dir.mkdir()
Expand Down
14 changes: 10 additions & 4 deletions tests/pipelines/test_mp300k_gate_inputs.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,10 @@ def _sound_ecps_comparison_payload() -> dict[str, object]:
]
candidate_loss = 0.1
baseline_loss = 0.2
candidate_holdout_loss = 0.03
baseline_holdout_loss = 0.04
candidate_unweighted_msre = 0.10
baseline_unweighted_msre = 0.17
return {
"frozen_ecps_baseline_certificate": {
"schema_version": 1,
Expand Down Expand Up @@ -218,8 +222,8 @@ def _sound_ecps_comparison_payload() -> dict[str, object]:
"scoring_config": {"sha256": "e" * 64},
"baseline_metrics": {
"baseline_enhanced_cps_native_loss": baseline_loss,
"baseline_holdout_loss": 0.04,
"baseline_unweighted_msre": 0.17,
"baseline_holdout_loss": baseline_holdout_loss,
"baseline_unweighted_msre": baseline_unweighted_msre,
},
},
"summary": {
Expand All @@ -234,8 +238,10 @@ def _sound_ecps_comparison_payload() -> dict[str, object]:
"baseline_refit_config": fit_config,
"refit_objective_matches_scoring": True,
"ecps_refit_effective_passed": True,
"baseline_holdout_loss": 0.04,
"baseline_unweighted_msre": 0.17,
"candidate_holdout_loss": candidate_holdout_loss,
"baseline_holdout_loss": baseline_holdout_loss,
"candidate_unweighted_msre": candidate_unweighted_msre,
"baseline_unweighted_msre": baseline_unweighted_msre,
"holdout_target_fraction": 0.2,
"protected_family_losses": protected_family_losses,
},
Expand Down
Loading