|
| 1 | +#!/usr/bin/env python3 |
| 2 | +"""Heuristic criterion-level scoring for example quality hill-climbing. |
| 3 | +
|
| 4 | +This is not the editorial source of truth. It is a search aid: it breaks the |
| 5 | +rubric into observable criteria so the next rewrite can target the weakest |
| 6 | +axis instead of arguing about one opaque number. |
| 7 | +""" |
| 8 | +from __future__ import annotations |
| 9 | + |
| 10 | +import argparse |
| 11 | +import json |
| 12 | +import re |
| 13 | +import sys |
| 14 | +import tomllib |
| 15 | +from pathlib import Path |
| 16 | +from statistics import mean |
| 17 | + |
# Repository root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]
# Put the root first on sys.path so the `src` package imports below resolve
# when this file is executed directly as a script.
sys.path.insert(0, str(ROOT))

from src.example_loader import load_examples  # noqa: E402
from src.marginalia import EXAMPLE_QUALITY_SCORES  # noqa: E402

# TOML registry; main() reads its "score_model" table as the criterion weights.
REGISTRY = ROOT / "docs" / "quality-registries.toml"
# Boilerplate sentences. criterion_scores() subtracts 0.2 per phrase found in
# an example's lower-cased summary/explanation/notes text.
GENERIC_PHRASES = [
    "it exists to make a common boundary explicit",
    "the example is small, deterministic",
    "prefer simpler neighboring tools",
]

# Case-insensitive whole-word vocabularies searched by criterion_scores().
# Words hinting at limits, failure modes, or alternatives.
BOUNDARY_WORDS = re.compile(
    r"\b(prefer|instead|boundary|when|unless|except|error|raises?|static|runtime|unsupported|footgun|warning)\b",
    flags=re.IGNORECASE,
)
# Words hinting that the text explains when or why to use the feature.
RATIONALE_WORDS = re.compile(
    r"\b(use|prefer|reach for|when|because|useful|right tool|fit|shape)\b",
    flags=re.IGNORECASE,
)
# Placeholder identifiers that mark example code as a toy.
TOY_WORDS = re.compile(
    r"\b(foo|bar|baz|spam|eggs)\b",
    flags=re.IGNORECASE,
)
| 33 | + |
| 34 | + |
def clamp(value: float) -> float:
    """Limit *value* to the closed interval [0.0, 1.0]."""
    capped = min(1.0, value)
    return max(0.0, capped)
| 37 | + |
| 38 | + |
def tokenise(text: str) -> set[str]:
    """Return the distinct lower-cased words found in *text*.

    A word is a run of at least three ASCII letters or underscores; digits
    and punctuation act as separators.
    """
    words = re.findall(r"[a-zA-Z_]{3,}", text)
    return set(map(str.lower, words))
| 41 | + |
| 42 | + |
def criterion_scores(example: dict) -> dict[str, float]:
    """Score one example against each rubric criterion using surface heuristics.

    Reads the ``summary``, ``explanation``, ``notes``, ``code``, ``cells``,
    ``see_also``, ``expected_output``, ``version_sensitive`` and ``doc_url``
    keys of *example*; every key is optional. Cells whose ``kind`` is
    ``"cell"`` are treated as normal cells, while cells whose ``kind`` is
    ``"unsupported"`` only contribute to ``alternatives_and_boundaries``.

    Returns:
        A mapping from criterion name to a score clamped to ``[0.0, 1.0]``.
    """
    explanation = example.get("explanation", [])
    note_items = example.get("notes", [])
    see_also = example.get("see_also", [])
    prose = "\n".join(explanation)
    notes = "\n".join(note_items)
    code = example.get("code", "")

    cells = example.get("cells", [])
    normal_cells = [cell for cell in cells if cell.get("kind") == "cell"]
    unsupported_cells = [cell for cell in cells if cell.get("kind") == "unsupported"]
    cell_count = len(normal_cells)
    outputs = [cell.get("output", "") for cell in normal_cells]
    output_lines = sum(len(output.splitlines()) for output in outputs if output)

    # all() is vacuously True for an empty sequence; without the cell_count
    # guard an example with no cells would earn the "every cell has prose" and
    # "every cell has output" bonuses.
    every_cell_has_prose = cell_count > 0 and all(cell.get("prose") for cell in normal_cells)
    every_cell_has_output = cell_count > 0 and all(outputs)

    # Cells whose prose opens with the same 60 characters count as one start.
    cell_prose = [" ".join(cell.get("prose", [])) for cell in normal_cells]
    distinct_cell_starts = len({text[:60] for text in cell_prose})

    all_text = "\n".join([example.get("summary", ""), prose, notes])
    lowered_text = all_text.lower()
    # Number of identifiers-like words shared between the code and the prose.
    overlap = len(tokenise(code) & tokenise(all_text))
    generic_penalty = 0.2 * sum(phrase in lowered_text for phrase in GENERIC_PHRASES)
    has_toy_names = bool(TOY_WORDS.search(code))

    return {
        "conceptual_payoff": clamp(
            0.45 + min(len(prose) / 900, 0.35) + min(overlap / 30, 0.2) - generic_penalty
        ),
        "rationale": clamp(
            0.35
            + 0.35 * bool(RATIONALE_WORDS.search(all_text))
            + min(len(note_items) / 6, 0.3)
            - generic_penalty
        ),
        "alternatives_and_boundaries": clamp(
            0.25
            + 0.25 * bool(BOUNDARY_WORDS.search(all_text))
            + 0.2 * bool(unsupported_cells)
            + min(notes.lower().count("prefer") / 2, 0.2)
        ),
        "executable_determinism": clamp(
            0.75
            + 0.25 * bool(example.get("expected_output"))
            - 0.25 * bool(example.get("version_sensitive"))
        ),
        "python_idiom_and_accuracy": clamp(
            0.75
            + 0.15 * bool(example.get("doc_url"))
            + 0.1 * ("print(" in code)
            - 0.25 * has_toy_names
        ),
        "literate_fit": clamp(
            0.35 + min(cell_count / 4, 0.35) + 0.3 * every_cell_has_prose
        ),
        "source_result_pairing": clamp(
            0.35 + 0.45 * every_cell_has_output + min(output_lines / 8, 0.2)
        ),
        "concept_decomposition": clamp(
            0.25 + min(cell_count / 3, 0.55) + 0.2 * (cell_count >= 3)
        ),
        "progressive_walkthrough": clamp(
            0.35
            + min(distinct_cell_starts / max(cell_count, 1), 0.45)
            + 0.2 * (cell_count >= 2)
        ),
        "representative_coverage": clamp(
            0.3
            + min(output_lines / 10, 0.25)
            + min(len(see_also) / 4, 0.25)
            + 0.2 * (cell_count >= 3)
        ),
        "practical_usefulness": clamp(
            0.55
            + 0.25 * (not has_toy_names)
            + 0.2 * bool(re.search(r"Ada|Grace|project|config|score|price|request|path|file|service|team", code))
        ),
        "editorial_progression": clamp(
            0.35
            + min(len(explanation) / 3, 0.25)
            + min(len(note_items) / 4, 0.25)
            + 0.15 * bool(see_also)
        ),
    }
| 74 | + |
| 75 | + |
def weighted_score(scores: dict[str, float], weights: dict[str, float]) -> float:
    """Combine per-criterion *scores* into one number using *weights*.

    Every key in *weights* must be present in *scores*; each weight is
    coerced with ``float()`` before multiplying. The total is rounded to one
    decimal place.
    """
    contributions = (
        scores[criterion] * float(weights[criterion]) for criterion in weights
    )
    return round(sum(contributions), 1)
| 78 | + |
| 79 | + |
def main() -> int:
    """Score every loaded example and report the weakest criteria.

    Loads criterion weights from the ``score_model`` table of ``REGISTRY``,
    computes heuristic criterion scores for each example, and pairs them with
    the curated score and comment from ``EXAMPLE_QUALITY_SCORES``.

    With ``--json`` all rows are dumped as JSON. Otherwise examples whose
    curated score is below ``--below`` are listed (lowest first, at most
    ``--limit`` rows), followed by a one-line summary of the averages.

    Returns:
        ``0`` as the process exit status.

    Raises:
        KeyError: if an example's slug has no entry in
            ``EXAMPLE_QUALITY_SCORES`` or a weight names an unknown criterion.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--json", action="store_true",
        help="dump every scored row as JSON instead of the text report",
    )
    parser.add_argument(
        "--below", type=float, default=9.0,
        help="only list examples whose curated score is below this value",
    )
    parser.add_argument(
        "--limit", type=int, default=20,
        help="maximum number of examples to list",
    )
    args = parser.parse_args()

    # TOML files are UTF-8; tomllib.load() takes a binary stream and decodes it
    # itself, so the result does not depend on the locale's default encoding.
    with REGISTRY.open("rb") as registry_file:
        weights = tomllib.load(registry_file)["score_model"]

    _, examples = load_examples()
    rows = []
    for example in examples:
        criteria = criterion_scores(example)
        heuristic = weighted_score(criteria, weights)
        curated, comment = EXAMPLE_QUALITY_SCORES[example["slug"]]
        # The three lowest-scoring criteria are the suggested rewrite targets.
        weakest = sorted(criteria.items(), key=lambda item: item[1])[:3]
        rows.append({
            "slug": example["slug"],
            "curated": curated,
            "heuristic": heuristic,
            "delta": round(curated - heuristic, 1),
            "comment": comment,
            "weakest": weakest,
            "criteria": criteria,
        })

    if args.json:
        print(json.dumps(rows, indent=2, sort_keys=True))
        return 0

    if not rows:
        # statistics.mean() raises StatisticsError on empty data.
        print("criterion heuristic: examples=0")
        return 0

    selected = [row for row in rows if row["curated"] < args.below]
    selected.sort(key=lambda row: (row["curated"], row["heuristic"]))
    for row in selected[: args.limit]:
        weak = ", ".join(f"{name}={score:.2f}" for name, score in row["weakest"])
        print(f"{row['curated']:>3.1f} h={row['heuristic']:>3.1f} {row['slug']:<30} {weak}")
    print(
        f"criterion heuristic: examples={len(rows)} "
        f"curated_avg={mean(row['curated'] for row in rows):.2f} "
        f"heuristic_avg={mean(row['heuristic'] for row in rows):.2f}"
    )
    return 0
| 120 | + |
| 121 | + |
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
0 commit comments