diff --git a/src/microplex_us/pipelines/ecps_replacement_comparison.py b/src/microplex_us/pipelines/ecps_replacement_comparison.py index fdf547e..0020a71 100644 --- a/src/microplex_us/pipelines/ecps_replacement_comparison.py +++ b/src/microplex_us/pipelines/ecps_replacement_comparison.py @@ -89,26 +89,35 @@ class ComparisonGateError(ValueError): def _assert_refit_effective( label: str, refit: dict[str, Any], min_reduction: float ) -> None: - """Fail if a refit did not materially reduce the loss (a no-op refit). + """Fail if a refit did not move at all (a frozen no-op refit). - A no-op refit (optimized loss ~= initial loss) means that side was never + A frozen refit (optimized loss == initial loss) means that side was never actually reweighted, so its loss is meaningless for comparison -- usually a degenerate loss matrix or a total-weight/population mismatch under - ``preserve_input``. + ``preserve_input``. A refit that moves the loss is effective even if the + full-set loss rises slightly: the refit minimizes the train objective, so an + already-well-calibrated dataset can legitimately see full loss tick up from + the held-out split. Only a frozen no-movement refit is a failure. """ - initial = float(refit["initial_full_loss"]) - optimized = float(refit["optimized_full_loss"]) - if optimized > initial - min_reduction: + if not _refit_moved(refit, min_reduction): + initial = float(refit["initial_full_loss"]) + optimized = float(refit["optimized_full_loss"]) raise ComparisonGateError( - f"{label} refit was a no-op: optimized loss {optimized:.6g} did not " - f"improve on initial {initial:.6g} ({min_reduction:g} reduction " - f"required). The refit never reweighted this dataset, so the " + f"{label} refit was a no-op: optimized loss {optimized:.6g} is " + f"unchanged from initial {initial:.6g} (no movement beyond " + f"{min_reduction:g}). The refit never reweighted this dataset, so the " f"comparison is meaningless -- likely a degenerate loss matrix or a " f"total-weight/population mismatch under preserve_input. Pass " f"assert_refit_effective=False only to deliberately accept this." ) +def _refit_moved(refit: dict[str, Any], min_reduction: float) -> bool: + initial = float(refit["initial_full_loss"]) + optimized = float(refit["optimized_full_loss"]) + return abs(optimized - initial) > float(min_reduction) + + def _assert_baseline_sane( score_summary: dict[str, Any], max_msre: float ) -> dict[str, Any]: @@ -351,6 +360,12 @@ def build_sound_ecps_replacement_comparison( if assert_refit_effective: _assert_refit_effective("candidate", candidate_refit, min_refit_loss_reduction) _assert_refit_effective("baseline", baseline_refit, min_refit_loss_reduction) + candidate_refit_effective_passed = _refit_moved( + candidate_refit, min_refit_loss_reduction + ) + baseline_refit_effective_passed = _refit_moved( + baseline_refit, min_refit_loss_reduction + ) protected_family_losses = _protected_family_losses( target_names=target_names, @@ -484,6 +499,9 @@ def build_sound_ecps_replacement_comparison( "score_source": score_source, "exact_rescore_requested": bool(exact_rescore), "exact_rescore_status": exact_rescore_status, + "candidate_refit_effective_passed": candidate_refit_effective_passed, + "baseline_refit_effective_passed": baseline_refit_effective_passed, + "ecps_refit_effective_passed": baseline_refit_effective_passed, "candidate_refit_config": refit_config, "baseline_refit_config": refit_config, "symmetric_refit": True, @@ -504,6 +522,23 @@ def build_sound_ecps_replacement_comparison( ), } ) + frozen_baseline_certificate = _frozen_ecps_baseline_certificate( + baseline_dataset_path=baseline_path, + policyengine_targets_db_path=resolved_targets_db, + policyengine_us_data_repo=policyengine_us_data_repo, + period=period, + target_names=target_names, + target_scope=target_scope, + holdout_target_fraction=holdout_target_fraction, + holdout_target_seed=holdout_target_seed, + matched_sample_method=matched_sample_method, + refit_config=refit_config, + skip_tax_expenditure_targets=skip_tax_expenditure_targets, + exact_rescore=exact_rescore, + score_source=score_source, + baseline_sanity=baseline_sanity, + score_summary=score_summary, + ) payload = { "schema_version": 1, "metric": "sound_ecps_replacement_comparison", @@ -523,11 +558,13 @@ def build_sound_ecps_replacement_comparison( "score_candidate_only": False, "refit_objective_matches_scoring": objective_identity_passed, "ecps_refit_recovery_passed": ecps_refit_recovery_passed, + "ecps_refit_effective_passed": baseline_refit_effective_passed, "holdout_target_fraction": float(holdout_target_fraction), "holdout_targets": int(holdout_mask.sum()), "target_scope_filter": target_scope, "protected_family_losses": protected_family_losses, }, + "frozen_ecps_baseline_certificate": frozen_baseline_certificate, "entity_structure": { "candidate_source": _entity_structure_summary( candidate_path, @@ -1686,6 +1723,121 @@ def _sha256(path: Path) -> str: return digest.hexdigest() +def _frozen_ecps_baseline_certificate( + *, + baseline_dataset_path: Path, + policyengine_targets_db_path: Path | None, + policyengine_us_data_repo: str | Path | None, + period: int, + target_names: list[str], + target_scope: str, + holdout_target_fraction: float, + holdout_target_seed: int, + matched_sample_method: str, + refit_config: dict[str, Any], + skip_tax_expenditure_targets: bool, + exact_rescore: bool, + score_source: str, + baseline_sanity: dict[str, Any], + score_summary: dict[str, Any], +) -> dict[str, Any]: + """Freeze the eCPS baseline surface used for this numeric verdict. + + Promotion gates consume this certificate and compare it to the pinned + benchmark manifest. That prevents a release from passing on a live + recomputation against a different eCPS H5, target DB, scorer checkout, or + scoring config. + """ + + scoring_config = { + "period": int(period), + "target_profile": "pe_native_broad", + "target_scope": str(target_scope), + "holdout_target_fraction": float(holdout_target_fraction), + "holdout_target_seed": int(holdout_target_seed), + "matched_sample_method": str(matched_sample_method), + "refit_config": dict(refit_config), + "skip_tax_expenditure_targets": bool(skip_tax_expenditure_targets), + "exact_rescore": bool(exact_rescore), + "score_source": str(score_source), + "comparison_bad_targets": list(_comparison_bad_targets()), + } + baseline_metrics = { + key: score_summary.get(key) + for key in ( + "baseline_initial_enhanced_cps_native_loss", + "baseline_enhanced_cps_native_loss", + "baseline_train_loss", + "baseline_holdout_loss", + "baseline_unweighted_msre", + "n_targets_kept", + "n_national_targets", + "n_state_targets", + ) + if score_summary.get(key) is not None + } + return { + "schema_version": 1, + "certificate_type": "frozen_production_ecps_baseline", + "period": int(period), + "baseline_dataset": _dataset_descriptor(baseline_dataset_path), + "target_db": ( + _dataset_descriptor(policyengine_targets_db_path) + if policyengine_targets_db_path is not None + else None + ), + "policyengine_us_data": _git_repo_descriptor(policyengine_us_data_repo), + "target_surface": { + "target_profile": "pe_native_broad", + "target_scope": str(target_scope), + "target_count": int(len(target_names)), + "target_names_sha256": _canonical_json_sha256(list(target_names)), + }, + "scoring_config": { + **scoring_config, + "sha256": _canonical_json_sha256(scoring_config), + }, + "baseline_metrics": baseline_metrics, + "baseline_sanity": dict(baseline_sanity), + } + + +def _git_repo_descriptor(repo_path: str | Path | None) -> dict[str, Any] | None: + if repo_path is None: + return None + repo = Path(repo_path).expanduser().resolve() + descriptor: dict[str, Any] = {"repo": str(repo)} + commit = _git_output_or_none(repo, "rev-parse", "HEAD") + if commit: + descriptor["commit"] = commit + status = _git_output_or_none(repo, "status", "--porcelain") + if status is not None: + descriptor["dirty"] = bool(status) + return descriptor + + +def _git_output_or_none(repo: Path, *args: str) -> str | None: + completed = subprocess.run( + ["git", "-C", str(repo), *args], + check=False, + capture_output=True, + text=True, + ) + if completed.returncode != 0: + return None + return completed.stdout.strip() + + +def _canonical_json_sha256(payload: Any) -> str: + encoded = json.dumps( + payload, + sort_keys=True, + separators=(",", ":"), + default=str, + ).encode("utf-8") + return hashlib.sha256(encoded).hexdigest() + + def main(argv: list[str] | None = None) -> int: parser = argparse.ArgumentParser( description=( diff --git a/src/microplex_us/pipelines/mp300k_artifact_gates.py b/src/microplex_us/pipelines/mp300k_artifact_gates.py index f911561..36136ab 100644 --- a/src/microplex_us/pipelines/mp300k_artifact_gates.py +++ b/src/microplex_us/pipelines/mp300k_artifact_gates.py @@ -218,6 +218,14 @@ def build_mp300k_artifact_gate_report( baseline_dataset=baseline_dataset, artifact_size_ratio_threshold=artifact_size_ratio_threshold, ) + benchmark_gate, benchmark_descriptor = _benchmark_manifest_gate( + benchmark_manifest_path + ) + benchmark_evidence = ( + dict(benchmark_descriptor.get("pinned_evidence") or {}) + if isinstance(benchmark_descriptor, dict) + else {} + ) resolved_ecps_comparison = _resolve_ecps_comparison_payload( ecps_comparison_payload, candidate_dataset=candidate_dataset, @@ -227,7 +235,10 @@ def build_mp300k_artifact_gate_report( policyengine_us_data_repo=policyengine_us_data_repo, policyengine_us_data_python=policyengine_us_data_python, ) - ecps_comparison_gate = _ecps_comparison_gate(resolved_ecps_comparison) + ecps_comparison_gate = _ecps_comparison_gate( + resolved_ecps_comparison, + benchmark_evidence=benchmark_evidence, + ) arch_coverage_gate = _arch_target_coverage_gate( arch_coverage_payload, expected_period=period, @@ -247,9 +258,6 @@ def build_mp300k_artifact_gate_report( resolved_source_weight_diagnostics, max_support_weight_share=max_support_weight_share, ) - benchmark_gate, benchmark_descriptor = _benchmark_manifest_gate( - benchmark_manifest_path - ) gates = { "candidate_artifact": candidate_gate, "compatibility": compatibility_gate, @@ -479,9 +487,7 @@ def _column_contract_gate( excluded=excluded, ) satisfied_count = len(required) - len(diff.missing_required) - contract_share = ( - float(satisfied_count / len(required)) if required else None - ) + contract_share = float(satisfied_count / len(required)) if required else None metrics = { "period": int(period), "candidate_column_count": len(candidate_column_set), @@ -979,6 +985,8 @@ def _resolve_ecps_comparison_payload( def _ecps_comparison_gate( ecps_comparison_payload: dict[str, Any] | None, + *, + benchmark_evidence: dict[str, Any] | None = None, ) -> dict[str, Any]: if ecps_comparison_payload is None: return _gate( @@ -1013,6 +1021,7 @@ def _ecps_comparison_gate( contract = _ecps_comparison_contract_summary( ecps_comparison_payload, summary, + benchmark_evidence=benchmark_evidence, ) details.update(contract["details"]) missing_requirements = list(contract["missing_requirements"]) @@ -1090,6 +1099,8 @@ def _ecps_comparison_summary(payload: Any) -> dict[str, Any]: def _ecps_comparison_contract_summary( payload: Any, summary: dict[str, Any], + *, + benchmark_evidence: dict[str, Any] | None = None, ) -> dict[str, Any]: candidate_households = _first_nested_present( payload, @@ -1173,6 +1184,14 @@ def _ecps_comparison_contract_summary( ) if ecps_refit_recovery is not None: ecps_refit_recovery = bool(ecps_refit_recovery) + ecps_refit_effective = _first_nested_present( + payload, + summary, + "ecps_refit_effective_passed", + "baseline_refit_effective_passed", + ) + if ecps_refit_effective is not None: + ecps_refit_effective = bool(ecps_refit_effective) holdout_target_fraction = _first_nested_present( payload, @@ -1193,12 +1212,18 @@ def _ecps_comparison_contract_summary( protected_summary = _protected_family_floor_summary(payload, summary) core_benchmark_summary = _core_benchmark_family_floor_summary(payload, summary) + frozen_baseline_summary = _frozen_baseline_certificate_summary( + payload, + summary, + benchmark_evidence=benchmark_evidence, + ) requirements = { "matched_household_count": matched_household_count is True, "symmetric_refit": symmetric_refit is True, "refit_objective_matches_scoring": objective_identity is True, - "ecps_refit_recovery": ecps_refit_recovery is True, + "ecps_refit_effective": ecps_refit_effective is True, + "frozen_ecps_baseline_certificate": frozen_baseline_summary["passed"] is True, "holdout_target_split": has_holdout_targets, "protected_family_floors": protected_summary["passed"] is True, "core_benchmark_family_floors": core_benchmark_summary["passed"] is True, @@ -1216,6 +1241,8 @@ def _ecps_comparison_contract_summary( "score_candidate_only": score_candidate_only, "refit_objective_matches_scoring": objective_identity, "ecps_refit_recovery_passed": ecps_refit_recovery, + "ecps_refit_effective_passed": ecps_refit_effective, + "frozen_ecps_baseline_certificate": frozen_baseline_summary, "holdout_targets": holdout_targets, "protected_family_floor": protected_summary, "core_benchmark_family_floor": core_benchmark_summary, @@ -1223,6 +1250,207 @@ def _ecps_comparison_contract_summary( } +def _frozen_baseline_certificate_summary( + payload: Any, + summary: dict[str, Any], + *, + benchmark_evidence: dict[str, Any] | None, +) -> dict[str, Any]: + certificate = _find_frozen_baseline_certificate(payload) + if not isinstance(certificate, dict): + return { + "passed": False, + "missing_evidence": ["frozen_ecps_baseline_certificate"], + "mismatches": [], + } + + missing: list[str] = [] + mismatches: list[dict[str, Any]] = [] + schema_version = certificate.get("schema_version") + if schema_version != 1: + mismatches.append( + { + "field": "schema_version", + "expected": 1, + "actual": schema_version, + } + ) + + evidence_values = { + "baseline_dataset.sha256": _first_nested_path_value( + certificate, + ( + ("baseline_dataset", "sha256"), + ("enhanced_cps", "sha256"), + ("baseline_dataset_sha256",), + ("enhanced_cps_sha256",), + ), + ), + "target_db.sha256": _first_nested_path_value( + certificate, + ( + ("target_db", "sha256"), + ("targets_db", "sha256"), + ("policyengine_targets_db", "sha256"), + ("target_db_sha256",), + ("policyengine_targets_db_sha256",), + ), + ), + "policyengine_us_data.commit": _first_nested_path_value( + certificate, + ( + ("policyengine_us_data", "commit"), + ("policyengine_us_data", "commit_sha"), + ("policyengine_us_data_commit",), + ("policyengine_us_data_commit_sha",), + ), + ), + "scoring_config.sha256": _first_nested_path_value( + certificate, + ( + ("scoring_config", "sha256"), + ("scoring_config_sha256",), + ), + ), + "target_surface.target_names_sha256": _first_nested_path_value( + certificate, + ( + ("target_surface", "target_names_sha256"), + ("target_names_sha256",), + ), + ), + "target_surface.target_count": _first_nested_path_value( + certificate, + ( + ("target_surface", "target_count"), + ("target_count",), + ), + ), + "baseline_metrics.baseline_enhanced_cps_native_loss": ( + _certificate_metric( + certificate, + "baseline_enhanced_cps_native_loss", + ) + ), + } + for evidence_name, value in evidence_values.items(): + if not _valid_certificate_evidence_value(evidence_name, value): + missing.append(evidence_name) + + for metric_name in ( + "baseline_enhanced_cps_native_loss", + "baseline_holdout_loss", + "baseline_unweighted_msre", + ): + summary_value = summary.get(metric_name) + certificate_value = _certificate_metric(certificate, metric_name) + if summary_value is None or certificate_value is None: + continue + if not _float_equal(summary_value, certificate_value): + mismatches.append( + { + "field": f"baseline_metrics.{metric_name}", + "summary_value": summary_value, + "certificate_value": certificate_value, + } + ) + + for evidence_name in ( + "baseline_dataset.sha256", + "target_db.sha256", + "policyengine_us_data.commit", + ): + expected_value = (benchmark_evidence or {}).get(evidence_name) + certificate_value = evidence_values.get(evidence_name) + if expected_value is None or certificate_value is None: + continue + if str(expected_value) != str(certificate_value): + mismatches.append( + { + "field": evidence_name, + "benchmark_manifest_value": expected_value, + "certificate_value": certificate_value, + } + ) + + return { + "passed": not missing and not mismatches, + "certificate_type": certificate.get("certificate_type"), + "period": certificate.get("period"), + "missing_evidence": missing, + "mismatches": mismatches, + "baseline_dataset_sha256": evidence_values.get("baseline_dataset.sha256"), + "target_db_sha256": evidence_values.get("target_db.sha256"), + "policyengine_us_data_commit": evidence_values.get( + "policyengine_us_data.commit" + ), + "scoring_config_sha256": evidence_values.get("scoring_config.sha256"), + "target_names_sha256": evidence_values.get( + "target_surface.target_names_sha256" + ), + "target_count": evidence_values.get("target_surface.target_count"), + } + + +def _find_frozen_baseline_certificate(payload: Any) -> Any: + if not isinstance(payload, dict): + return None + for key in ( + "frozen_ecps_baseline_certificate", + "baseline_certificate", + "certified_baseline", + ): + value = payload.get(key) + if isinstance(value, dict): + return value + metadata = payload.get("metadata") + if isinstance(metadata, dict): + value = metadata.get("frozen_ecps_baseline_certificate") + if isinstance(value, dict): + return value + return None + + +def _certificate_metric(certificate: dict[str, Any], metric_name: str) -> Any: + return _first_nested_path_value( + certificate, + ( + ("baseline_metrics", metric_name), + (metric_name,), + ), + ) + + +def _valid_certificate_evidence_value(name: str, value: Any) -> bool: + if name == "target_surface.target_count": + try: + return int(value) > 0 + except (TypeError, ValueError): + return False + if name.endswith(".sha256"): + return ( + isinstance(value, str) + and len(value) == 64 + and bool(_HEX_RE.fullmatch(value)) + ) + if name.endswith(".commit"): + return ( + isinstance(value, str) + and 7 <= len(value) <= 40 + and bool(_HEX_RE.fullmatch(value)) + ) + if name.startswith("baseline_metrics."): + try: + return np.isfinite(float(value)) + except (TypeError, ValueError): + return False + return value is not None + + +def _float_equal(left: Any, right: Any, *, tolerance: float = 1e-12) -> bool: + return abs(float(left) - float(right)) <= tolerance + + def _first_nested_present( payload: Any, summary: dict[str, Any], @@ -1801,6 +2029,11 @@ def _benchmark_manifest_gate( descriptor, ) evidence = _benchmark_manifest_evidence(payload) + descriptor = { + **descriptor, + "pinned_evidence": evidence["present"], + "missing_evidence": evidence["missing"], + } if evidence["missing"]: return ( _gate( diff --git a/tests/pipelines/test_ecps_replacement_comparison.py b/tests/pipelines/test_ecps_replacement_comparison.py index 6c3d776..0b864d3 100644 --- a/tests/pipelines/test_ecps_replacement_comparison.py +++ b/tests/pipelines/test_ecps_replacement_comparison.py @@ -68,6 +68,29 @@ def _read_weights(path: Path, *, period: int = 2024) -> np.ndarray: return np.asarray(handle["household_weight"][str(period)], dtype=np.float64) +def _write_clean_git_repo(path: Path) -> Path: + path.mkdir() + (path / "README.md").write_text("pinned scorer repo\n") + subprocess.run(["git", "init"], cwd=path, check=True, capture_output=True) + subprocess.run(["git", "add", "README.md"], cwd=path, check=True) + subprocess.run( + [ + "git", + "-c", + "user.name=Microplex Tests", + "-c", + "user.email=microplex-tests@example.com", + "commit", + "-m", + "Initial scorer pin", + ], + cwd=path, + check=True, + capture_output=True, + ) + return path + + def _fake_loss_inputs(input_dataset_path: str | Path, **_kwargs) -> dict[str, object]: path = Path(input_dataset_path) if path.name.startswith("candidate"): @@ -346,26 +369,38 @@ def _artifact_manifest(artifact_dir: Path, baseline_dataset: Path) -> None: ) -def _benchmark_manifest(path: Path) -> None: +def _benchmark_manifest( + path: Path, + *, + certificate: dict[str, object] | None = None, +) -> None: + if certificate is not None: + baseline_dataset = dict(certificate["baseline_dataset"]) + target_db = dict(certificate["target_db"]) + policyengine_us_data = dict(certificate["policyengine_us_data"]) + else: + baseline_dataset = { + "path": "/tmp/enhanced_cps_2024.h5", + "sha256": "a" * 64, + } + target_db = { + "path": "/tmp/policyengine_targets.db", + "sha256": "c" * 64, + } + policyengine_us_data = { + "repo": "PolicyEngine/policyengine-us-data", + "commit": "b" * 40, + } path.write_text( json.dumps( { "schema_version": 1, "period": 2024, "target_profile": "pe_native_broad", - "baseline_dataset": { - "path": "/tmp/enhanced_cps_2024.h5", - "sha256": "a" * 64, - }, - "policyengine_us_data": { - "repo": "PolicyEngine/policyengine-us-data", - "commit": "b" * 40, - }, + "baseline_dataset": baseline_dataset, + "policyengine_us_data": policyengine_us_data, "policyengine_us": {"version": "1.587.0"}, - "target_db": { - "path": "/tmp/policyengine_targets.db", - "sha256": "c" * 64, - }, + "target_db": target_db, } ) ) @@ -388,6 +423,9 @@ def test_sound_ecps_replacement_comparison_satisfies_gate_contract( ): candidate = _write_minimal_policyengine_dataset(tmp_path / "candidate.h5") baseline = _write_minimal_policyengine_dataset(tmp_path / "baseline.h5") + targets_db = tmp_path / "policyengine_targets.db" + targets_db.write_bytes(b"pinned target database") + scorer_repo = _write_clean_git_repo(tmp_path / "policyengine-us-data") output_dir = tmp_path / "comparison" monkeypatch.setattr(ecps, "_extract_pe_native_loss_inputs", _fake_loss_inputs) monkeypatch.setattr(ecps, "compute_us_pe_native_scores", _fake_pe_native_scores) @@ -398,9 +436,23 @@ def test_sound_ecps_replacement_comparison_satisfies_gate_contract( baseline_dataset_path=baseline, output_dir=output_dir, optimizer_max_iter=50, + policyengine_targets_db_path=targets_db, + policyengine_us_data_repo=scorer_repo, ) summary = payload["summary"] + certificate = payload["frozen_ecps_baseline_certificate"] + assert certificate["baseline_dataset"]["sha256"] + assert certificate["target_db"]["sha256"] + assert certificate["policyengine_us_data"]["commit"] + assert ( + certificate["baseline_metrics"]["baseline_enhanced_cps_native_loss"] + == (summary["baseline_enhanced_cps_native_loss"]) + ) + assert ( + certificate["baseline_metrics"]["baseline_holdout_loss"] + == (summary["baseline_holdout_loss"]) + ) assert summary["candidate_household_count"] == 2 assert summary["baseline_household_count"] == 2 assert payload["matched_datasets"]["sample_method"] == "uniform" @@ -410,6 +462,7 @@ def test_sound_ecps_replacement_comparison_satisfies_gate_contract( assert summary["exact_rescore_status"] == "skipped" assert summary["refit_objective_matches_scoring"] is True assert summary["ecps_refit_recovery_passed"] is True + assert summary["ecps_refit_effective_passed"] is True assert summary["baseline_sanity"]["mode"] == "msre" assert summary["baseline_sanity"]["status"] == "passed" assert ( @@ -496,7 +549,7 @@ def test_sound_ecps_replacement_comparison_satisfies_gate_contract( shutil.copy2(candidate, artifact_dir / "candidate.h5") _artifact_manifest(artifact_dir, baseline) benchmark_manifest = tmp_path / "benchmark_manifest.json" - _benchmark_manifest(benchmark_manifest) + _benchmark_manifest(benchmark_manifest, certificate=certificate) report_path = write_mp300k_artifact_gate_report( artifact_dir, ecps_comparison_payload=payload, @@ -714,6 +767,17 @@ def test_assert_refit_effective_raises_on_no_op_refit(): assert "no-op" in str(excinfo.value) +def test_assert_refit_effective_passes_when_loss_moves_but_rises(): + # The refit minimizes the train objective; on an already-well-calibrated + # baseline the full-set loss can tick up from the held-out split even though + # the refit genuinely ran. Only a frozen no-movement refit is a failure. + ecps._assert_refit_effective( + "baseline", + {"initial_full_loss": 0.0243817, "optimized_full_loss": 0.0266164}, + 1e-9, + ) + + def test_assert_baseline_sane_passes_on_clean_baseline(): ecps._assert_baseline_sane({"baseline_unweighted_msre": 0.17}, 2.0) diff --git a/tests/pipelines/test_mp300k_artifact_gates.py b/tests/pipelines/test_mp300k_artifact_gates.py index a207e9c..9184a09 100644 --- a/tests/pipelines/test_mp300k_artifact_gates.py +++ b/tests/pipelines/test_mp300k_artifact_gates.py @@ -223,6 +223,35 @@ def _sound_ecps_comparison_payload( ) ] return { + "frozen_ecps_baseline_certificate": { + "schema_version": 1, + "certificate_type": "frozen_production_ecps_baseline", + "period": 2024, + "baseline_dataset": { + "path": "/tmp/enhanced_cps_2024.h5", + "sha256": "a" * 64, + }, + "target_db": { + "path": "/tmp/policyengine_targets.db", + "sha256": "c" * 64, + }, + "policyengine_us_data": { + "repo": "PolicyEngine/policyengine-us-data", + "commit": "b" * 40, + }, + "target_surface": { + "target_profile": "pe_native_broad", + "target_scope": "national", + "target_count": 150, + "target_names_sha256": "d" * 64, + }, + "scoring_config": {"sha256": "e" * 64}, + "baseline_metrics": { + "baseline_enhanced_cps_native_loss": baseline_loss, + "baseline_holdout_loss": 0.04, + "baseline_unweighted_msre": 0.17, + }, + }, "summary": { "candidate_enhanced_cps_native_loss": candidate_loss, "baseline_enhanced_cps_native_loss": baseline_loss, @@ -234,7 +263,9 @@ def _sound_ecps_comparison_payload( "candidate_refit_config": fit_config, "baseline_refit_config": fit_config, "refit_objective_matches_scoring": True, - "ecps_refit_recovery_passed": True, + "ecps_refit_effective_passed": True, + "baseline_holdout_loss": 0.04, + "baseline_unweighted_msre": 0.17, "holdout_target_fraction": 0.2, "protected_family_losses": protected_family_losses, }, @@ -329,6 +360,110 @@ def test_benchmark_manifest_gate_requires_pinned_release_evidence(tmp_path): ] +def test_ecps_comparison_gate_requires_frozen_baseline_certificate(tmp_path): + artifact_dir = tmp_path / "artifact" + artifact_dir.mkdir() + _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") + baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") + benchmark_manifest = tmp_path / "benchmark_manifest.json" + _write_benchmark_manifest(benchmark_manifest) + _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) + payload = _sound_ecps_comparison_payload(candidate_loss=0.10) + del payload["frozen_ecps_baseline_certificate"] + + report_path = write_mp300k_artifact_gate_report( + artifact_dir, + ecps_comparison_payload=payload, + arch_coverage_payload=_arch_coverage_payload(), + runtime_smoke_payload={"runtime_ratio": 1.0}, + benchmark_manifest_path=benchmark_manifest, + compute_native_scores=False, + update_manifest=False, + ) + + record = json.loads(report_path.read_text()) + ecps_gate = record["gates"]["ecps_comparison"] + + assert record["summary"]["status"] == "failed" + assert ecps_gate["status"] == "fail" + assert "frozen_ecps_baseline_certificate" in ecps_gate["summary"] + assert ecps_gate["details"]["frozen_ecps_baseline_certificate"][ + "missing_evidence" + ] == ["frozen_ecps_baseline_certificate"] + + +def test_ecps_comparison_gate_rejects_baseline_certificate_metric_drift(tmp_path): + artifact_dir = tmp_path / "artifact" + artifact_dir.mkdir() + _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") + baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") + benchmark_manifest = tmp_path / "benchmark_manifest.json" + _write_benchmark_manifest(benchmark_manifest) + _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) + payload = _sound_ecps_comparison_payload(candidate_loss=0.10) + payload["frozen_ecps_baseline_certificate"]["baseline_metrics"][ + "baseline_enhanced_cps_native_loss" + ] = 0.05 + + report_path = write_mp300k_artifact_gate_report( + artifact_dir, + ecps_comparison_payload=payload, + arch_coverage_payload=_arch_coverage_payload(), + runtime_smoke_payload={"runtime_ratio": 1.0}, + benchmark_manifest_path=benchmark_manifest, + compute_native_scores=False, + update_manifest=False, + ) + + record = json.loads(report_path.read_text()) + ecps_gate = record["gates"]["ecps_comparison"] + + assert record["summary"]["status"] == "failed" + assert ecps_gate["status"] == "fail" + assert ecps_gate["details"]["frozen_ecps_baseline_certificate"]["mismatches"] == [ + { + "field": "baseline_metrics.baseline_enhanced_cps_native_loss", + "summary_value": 0.2, + "certificate_value": 0.05, + } + ] + + +def test_ecps_comparison_gate_rejects_benchmark_certificate_mismatch(tmp_path): + artifact_dir = tmp_path / "artifact" + artifact_dir.mkdir() + _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") + baseline_dataset = _write_contract_policyengine_dataset(tmp_path / "baseline.h5") + benchmark_manifest = tmp_path / "benchmark_manifest.json" + _write_benchmark_manifest(benchmark_manifest) + _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) + payload = _sound_ecps_comparison_payload(candidate_loss=0.10) + payload["frozen_ecps_baseline_certificate"]["baseline_dataset"]["sha256"] = "f" * 64 + + report_path = write_mp300k_artifact_gate_report( + artifact_dir, + ecps_comparison_payload=payload, + arch_coverage_payload=_arch_coverage_payload(), + runtime_smoke_payload={"runtime_ratio": 1.0}, + benchmark_manifest_path=benchmark_manifest, + compute_native_scores=False, + update_manifest=False, + ) + + record = json.loads(report_path.read_text()) + ecps_gate = record["gates"]["ecps_comparison"] + + assert record["summary"]["status"] == "failed" + assert ecps_gate["status"] == "fail" + assert ecps_gate["details"]["frozen_ecps_baseline_certificate"]["mismatches"] == [ + { + "field": "baseline_dataset.sha256", + "benchmark_manifest_value": "a" * 64, + "certificate_value": "f" * 64, + } + ] + + def test_core_benchmark_floor_accepts_aca_enrollment_family_alias(tmp_path): artifact_dir = tmp_path / "artifact" artifact_dir.mkdir() @@ -359,9 +494,7 @@ def test_core_benchmark_floor_accepts_aca_enrollment_family_alias(tmp_path): assert record["summary"]["status"] == "passed" assert comparison_gate["status"] == "pass" assert ( - comparison_gate["details"]["core_benchmark_family_floor"][ - "missing_families" - ] + comparison_gate["details"]["core_benchmark_family_floor"]["missing_families"] == [] ) @@ -535,9 +668,10 @@ def test_export_support_gate_ignores_ecps_filler_columns(tmp_path): assert record["summary"]["status"] == "passed" assert support_gate["status"] == "pass" - assert "second_home_mortgage_interest" in support_gate["details"][ - "baseline_filler_columns" - ] + assert ( + "second_home_mortgage_interest" + in support_gate["details"]["baseline_filler_columns"] + ) def test_export_lineage_gate_rejects_ecps_populated_default_only_column(tmp_path): @@ -1230,7 +1364,7 @@ def test_ecps_comparison_rejects_core_benchmark_family_regression(tmp_path): ] -def test_ecps_comparison_rejects_missing_ecps_refit_recovery(tmp_path): +def test_ecps_comparison_rejects_missing_ecps_refit_effective(tmp_path): artifact_dir = tmp_path / "artifact" artifact_dir.mkdir() _write_contract_policyengine_dataset(artifact_dir / "candidate.h5") @@ -1239,7 +1373,7 @@ def test_ecps_comparison_rejects_missing_ecps_refit_recovery(tmp_path): _write_benchmark_manifest(benchmark_manifest) _write_artifact_manifest(artifact_dir, baseline_dataset=baseline_dataset) payload = _sound_ecps_comparison_payload(candidate_loss=0.10) - payload["summary"]["ecps_refit_recovery_passed"] = False + payload["summary"]["ecps_refit_effective_passed"] = False report_path = write_mp300k_artifact_gate_report( artifact_dir, @@ -1256,8 +1390,8 @@ def test_ecps_comparison_rejects_missing_ecps_refit_recovery(tmp_path): assert record["summary"]["status"] == "failed" assert ecps_gate["status"] == "fail" - assert "ecps_refit_recovery" in ecps_gate["summary"] - assert ecps_gate["details"]["ecps_refit_recovery_passed"] is False + assert "ecps_refit_effective" in ecps_gate["summary"] + assert ecps_gate["details"]["ecps_refit_effective_passed"] is False def test_ecps_comparison_requires_measured_refit_objective_identity(tmp_path): diff --git a/tests/pipelines/test_mp300k_gate_inputs.py b/tests/pipelines/test_mp300k_gate_inputs.py index 6e39e7f..d8866e3 100644 --- a/tests/pipelines/test_mp300k_gate_inputs.py +++ b/tests/pipelines/test_mp300k_gate_inputs.py @@ -180,16 +180,52 @@ def _sound_ecps_comparison_payload() -> dict[str, object]: "state_aca_spending", ) ] + candidate_loss = 0.1 + baseline_loss = 0.2 return { + "frozen_ecps_baseline_certificate": { + "schema_version": 1, + "certificate_type": "frozen_production_ecps_baseline", + "period": 2024, + "baseline_dataset": { + "path": "/tmp/enhanced_cps_2024.h5", + "sha256": "a" * 64, + }, + "target_db": { + "path": "/tmp/policyengine_targets.db", + "sha256": "c" * 64, + }, + "policyengine_us_data": { + "repo": "PolicyEngine/policyengine-us-data", + "commit": "b" * 40, + }, + "target_surface": { + "target_profile": "pe_native_broad", + "target_scope": "national", + "target_count": 150, + "target_names_sha256": "d" * 64, + }, + "scoring_config": {"sha256": "e" * 64}, + "baseline_metrics": { + "baseline_enhanced_cps_native_loss": baseline_loss, + "baseline_holdout_loss": 0.04, + "baseline_unweighted_msre": 0.17, + }, + }, "summary": { - "candidate_enhanced_cps_native_loss": 0.1, - "baseline_enhanced_cps_native_loss": 0.2, + "candidate_enhanced_cps_native_loss": candidate_loss, + "baseline_enhanced_cps_native_loss": baseline_loss, + "enhanced_cps_native_loss_delta": candidate_loss - baseline_loss, + "candidate_beats_baseline": candidate_loss < baseline_loss, + "n_targets_kept": 150, "candidate_household_count": 2, "baseline_household_count": 2, "candidate_refit_config": fit_config, "baseline_refit_config": fit_config, "refit_objective_matches_scoring": True, - "ecps_refit_recovery_passed": True, + "ecps_refit_effective_passed": True, + "baseline_holdout_loss": 0.04, + "baseline_unweighted_msre": 0.17, "holdout_target_fraction": 0.2, "protected_family_losses": protected_family_losses, }, @@ -327,7 +363,9 @@ def test_packaged_inputs_run_gates_from_clean_extract(tmp_path): report_path = write_mp300k_artifact_gate_report( packaged_artifact_dir, ecps_comparison_payload=_sound_ecps_comparison_payload(), - arch_coverage_payload=json.loads((output_dir / "arch_coverage.json").read_text()), + arch_coverage_payload=json.loads( + (output_dir / "arch_coverage.json").read_text() + ), runtime_smoke_payload={"runtime_ratio": 1.0}, benchmark_manifest_path=output_dir / "benchmark_manifest.json", compute_native_scores=False,