adewale
diff --git a/Makefile b/Makefile
Lines changed: 5 additions & 2 deletions
diff --git a/docs/lessons-learned.md b/docs/lessons-learned.md
Lines changed: 1 addition & 0 deletions
diff --git a/docs/quality-registries.toml b/docs/quality-registries.toml
Lines changed: 0 additions & 44 deletions
diff --git a/docs/quality-search.md b/docs/quality-search.md
Lines changed: 52 additions & 0 deletions
diff --git a/scripts/score_example_criteria.py b/scripts/score_example_criteria.py
Lines changed: 123 additions & 0 deletions
diff --git a/src/asset_manifest.py b/src/asset_manifest.py
Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-.PHONY: test embed-examples build check-generated fingerprint browser-layout-test seo-cache-lint verify-examples check-registry-integrity check-confusable-pairs check-broad-surface-tours check-footgun-coverage check-notes-supported check-quality-scores check-no-figure-rationales check-journey-outcomes quality-checks format-examples verify-python-version verify smoke-deployment dev deploy lint
+.PHONY: test embed-examples build check-generated fingerprint browser-layout-test seo-cache-lint verify-examples check-registry-integrity check-confusable-pairs check-broad-surface-tours check-footgun-coverage check-notes-supported score-example-criteria check-quality-scores check-no-figure-rationales check-journey-outcomes quality-checks format-examples verify-python-version verify smoke-deployment dev deploy lint
 
 test:
 	python3 -m unittest discover -s tests -v
@@ -38,6 +38,9 @@ check-footgun-coverage:
 check-notes-supported:
 	scripts/check_notes_supported.py
 
+score-example-criteria:
+	scripts/score_example_criteria.py --limit 12
+
 check-quality-scores:
 	scripts/check_quality_scores.py
 
@@ -47,7 +50,7 @@ check-no-figure-rationales:
 check-journey-outcomes:
 	scripts/check_journey_outcomes.py
 
-quality-checks: check-registry-integrity check-confusable-pairs check-broad-surface-tours check-footgun-coverage check-notes-supported check-quality-scores check-no-figure-rationales check-journey-outcomes
+quality-checks: check-registry-integrity check-confusable-pairs check-broad-surface-tours check-footgun-coverage check-notes-supported score-example-criteria check-quality-scores check-no-figure-rationales check-journey-outcomes
 
 format-examples:
 	scripts/format_examples.py
 
@@ -115,4 +115,5 @@ git diff --check
 - **Quality debt must be tracked, not normalized away.** `docs/example-quality-rubric.md` sets a 9.0 target and `scripts/check_quality_scores.py` enforces the score registry: pages below the hard minimum need a concrete improvement backlog entry, stale backlog entries fail once a page clears the gate, and Hello World is the only standing waiver because first examples are traditionally tiny. A score below target is allowed only when the remaining work is named.
 - **No-figure decisions need a registry.** Some examples should not have figures, but that cannot be an invisible omission. `scripts/check_no_figure_rationales.py` validates `no_figure_rationales` so future constraint-shaped pages can opt out explicitly instead of shipping weak diagrams.
 - **Journey sections need outcome contracts.** `scripts/check_journey_outcomes.py` ties each journey section to learner outcomes and support examples so journey pages stay mental maps rather than catalog slices.
+- **Opaque scores hide the next move.** `scripts/score_example_criteria.py` breaks each page into rubric criteria so quality work can target decomposition, boundaries, source/result pairing, graph support, or practical payoff directly. `docs/quality-search.md` records the hill-climbing and simulated-annealing loop for escaping locally tidy but globally weak page shapes.
 - **Deployment smoke belongs beside CI.** `scripts/smoke_deployment.py` checks rendered Worker pages, runtime-boundary pages, journey pages, prototype review pages, and representative Dynamic Worker POST runs for HTTP failures, exception markers, and stale edited-code output. Build success is not enough; the deployed Worker must render and execute edited examples.
@@ -196,30 +196,6 @@ expires = "never"
 # figure would distort the lesson. Current production attaches figures
 # to every example, so this registry is intentionally empty.
 
-[quality_improvement_backlog.constants]
-cause = "convention page needs stronger boundary against Final and ordinary variables"
-next_action = "add a cell contrasting naming convention with typing.Final and runtime rebinding"
-
-[quality_improvement_backlog.truthiness]
-cause = "truth-value protocol is under-linked to booleans and special methods"
-next_action = "add a boundary cell that predicts bool() for empty containers, None, and custom __bool__"
-
-[quality_improvement_backlog.virtual-environments]
-cause = "runtime-boundary page is constrained by Dynamic Workers and needs stronger standard-Python path"
-next_action = "teach venv creation/activation as unsupported Standard Python, then show local dependency evidence"
-
-[quality_improvement_backlog.literal-and-final]
-cause = "advanced type page compresses Literal and Final into one cell"
-next_action = "split value restriction from rebinding restriction and show runtime annotations boundary"
-
-[quality_improvement_backlog.paramspec]
-cause = "single-cell advanced typing page hides why ordinary Callable loses parameter shape"
-next_action = "add a decorator typed with Callable[..., T] before the ParamSpec-preserving version"
-
-[quality_improvement_backlog.number-parsing]
-cause = "parsing page lacks enough failure/recovery shape"
-next_action = "add cells for int base handling, ValueError recovery, and validation boundary"
-
 [quality_improvement_backlog.values]
 cause = "foundational page is graph-linked now but still needs a sharper object/type mental model"
 next_action = "add or revise a cell that connects value, type, and operation with a nearby See also path"
@@ -260,10 +236,6 @@ next_action = "add a cell that chooses a branch from a non-bool value and points
 cause = "match syntax is shown but shape-dispatch vs if/elif boundary could be clearer"
 next_action = "add a comparable if/elif or data-shape cell that makes match's payoff visible"
 
-[quality_improvement_backlog.while-loops]
-cause = "loop shape is shown but for-vs-while decision boundary is thin"
-next_action = "add a state-changing while cell beside an iterable for-loop alternative"
-
 [quality_improvement_backlog.lists]
 cause = "list operations are shown but sequence vs set/dict and mutation boundaries need sharpening"
 next_action = "add a cell contrasting append/index order with set membership or tuple immutability"
@@ -280,10 +252,6 @@ next_action = "add or sharpen cells for get/default, key membership, and safe de
 cause = "set uniqueness is shown but list-vs-set tradeoff and ordering boundary need emphasis"
 next_action = "add a cell comparing membership/duplicates with a list"
 
-[quality_improvement_backlog.slices]
-cause = "slice syntax is shown but off-by-one and copy-vs-view boundaries need stronger evidence"
-next_action = "add a cell showing adjacent slices meeting at the same boundary index"
-
 [quality_improvement_backlog.comprehensions]
 cause = "map/filter shape is shown but eager vs lazy and loop equivalence need stronger progression"
 next_action = "add a generator-expression contrast or explicit loop-equivalence cell"
@@ -304,18 +272,10 @@ next_action = "add a call-site contrast where unnamed booleans are ambiguous"
 cause = "collection of extra arguments is shown but forwarding boundary is underdeveloped"
 next_action = "add a wrapper cell that forwards *args and **kwargs to another callable"
 
-[quality_improvement_backlog.multiple-return-values]
-cause = "tuple return is shown but tuple/unpacking relationship needs a stronger explicit link"
-next_action = "add a cell showing the returned value is a tuple before unpacking"
-
 [quality_improvement_backlog.closures]
 cause = "closure memory is shown but late-binding footgun deserves more adjacent evidence"
 next_action = "add or sharpen loop-closure broken/fixed cells"
 
-[quality_improvement_backlog.recursion]
-cause = "recursive shape is shown but base-case and failure boundaries need more evidence"
-next_action = "add a base-case-first cell and a note on RecursionError/iteration alternative"
-
 [quality_improvement_backlog.lambdas]
 cause = "lambda syntax is shown but def-vs-lambda boundary is too light"
 next_action = "add a cell where lambda is useful as an argument and def is clearer for reuse"
@@ -336,10 +296,6 @@ next_action = "add a before/after cell changing a public attribute into a proper
 cause = "try/except structure is shown but bare-except and cleanup boundaries need sharper evidence"
 next_action = "add a cell contrasting specific exception handling with overbroad catching"
 
-[quality_improvement_backlog.enums]
-cause = "enum values are shown but raw constants/string alternatives need stronger contrast"
-next_action = "add a cell comparing Enum identity/name/value with plain strings"
-
 [quality_improvement_backlog.custom-exceptions]
 cause = "custom exception class is shown but when not to create one is underdeveloped"
 next_action = "add a boundary cell contrasting domain error with built-in ValueError"
 
@@ -0,0 +1,52 @@
+# Rubric-driven quality search
+
+Python By Example now has two complementary scoring loops:
+
+1. `scripts/check_quality_scores.py` is the editorial gate. It enforces the curated score registry, hard-minimum waivers, stale backlog cleanup, weak journey-section tracking, and the 10-point rubric weight model.
+2. `scripts/score_example_criteria.py` is the search aid. It breaks each page into rubric criteria so rewrite work can target the weakest axis instead of treating the score as one opaque number.
+
+The criterion report is deliberately heuristic. It should suggest candidates, not replace editorial review.
+
+## Hill-climbing move types
+
+Use these moves when a page is already close and the weakest criterion is clear:
+
+- **Decompose one compressed cell** into setup, boundary, and payoff cells.
+- **Add a before/after contrast** when the feature exists to remove boilerplate or clarify a shape.
+- **Add a runtime/static boundary cell** for typing pages where runtime behavior differs from type-checker behavior.
+- **Add a failure/recovery cell** for parsing, exceptions, warnings, and validation examples.
+- **Add a standard-Python/Worker-boundary unsupported cell** for runtime features constrained by Dynamic Workers.
+- **Strengthen graph edges** with prerequisite, neighboring, and next-depth `see_also` links.
+- **Replace generic prose** with a concrete domain pressure: user input, package setup, protocol bytes, record shape, service logging, or state transition.
+
+## Escaping local maxima with simulated annealing
+
+Greedy hill-climbing tends to overfit the current page shape: it adds one more note or one more small cell even when the page needs a different structure. For pages stuck around 8.2-8.8, use a simulated-annealing review loop:
+
+1. **State**: the page markdown plus metadata, figure rationale, and graph edges.
+2. **Energy**: `10 - curated_score`, with penalties for weak criterion scores, unsupported runtime ambiguity, graph isolation, empty output evidence, and overlong code runs.
+3. **Neighbor moves**:
+   - split a cell;
+   - merge two repetitive cells;
+   - swap the first example domain;
+   - introduce a contrasting failure case;
+   - move from toy data to realistic data;
+   - convert a figure requirement into a no-figure rationale when the page is constraint-shaped;
+   - add/remove a `see_also` edge;
+   - rewrite the intro around “when to use this”.
+4. **Temperature**: start high enough to accept occasional worse rewrites, especially when they introduce a new structure. Cool after tests, verification, and rubric review pass.
+5. **Acceptance rule**: always accept improvements. Accept a worse intermediate only if executable correctness and docs links remain valid, and then only with a probability that falls as the score loss grows and as the temperature cools.
+6. **Refinement**: after cooling, run `make verify`, the criterion report, and a manual rubric pass before updating the curated score.
+
+This gives the project permission to try non-local changes — different domains, different cell order, or a no-figure rationale — without normalizing failed experiments into production.
+
+## Wider-system unlocks
+
+Future improvements that create new quality headroom:
+
+- Store criterion-level editorial subscores in TOML once the heuristic report stabilizes.
+- Add an authoring command that proposes the top three rewrite moves for a slug from the criterion deficits.
+- Add browser snapshots for representative low-score shapes, not only layout smoke.
+- Track page archetypes (`foundational`, `protocol-boundary`, `static-typing`, `aggregator`, `runtime-constrained`) so rubrics can apply the right expectations.
+- Add a no-figure review path to avoid weak diagrams for constraint-shaped pages.
+- Let CI post a quality delta comment for PRs: scores changed, graph edges changed, weak criteria changed.
@@ -0,0 +1,123 @@
+#!/usr/bin/env python3
+"""Heuristic criterion-level scoring for example quality hill-climbing.
+
+This is not the editorial source of truth. It is a search aid: it breaks the
+rubric into observable criteria so the next rewrite can target the weakest
+axis instead of arguing about one opaque number.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import re
+import sys
+import tomllib
+from pathlib import Path
+from statistics import mean
+
+ROOT = Path(__file__).resolve().parents[1]
+sys.path.insert(0, str(ROOT))
+
+from src.example_loader import load_examples  # noqa: E402
+from src.marginalia import EXAMPLE_QUALITY_SCORES  # noqa: E402
+
+REGISTRY = ROOT / "docs" / "quality-registries.toml"
+GENERIC_PHRASES = [
+    "it exists to make a common boundary explicit",
+    "the example is small, deterministic",
+    "prefer simpler neighboring tools",
+]
+BOUNDARY_WORDS = re.compile(r"\b(prefer|instead|boundary|when|unless|except|error|raises?|static|runtime|unsupported|footgun|warning)\b", re.I)
+RATIONALE_WORDS = re.compile(r"\b(use|prefer|reach for|when|because|useful|right tool|fit|shape)\b", re.I)
+TOY_WORDS = re.compile(r"\b(foo|bar|baz|spam|eggs)\b", re.I)
+
+
+def clamp(value: float) -> float:
+    """Limit *value* to the closed interval from 0.0 to 1.0."""
+    capped = min(1.0, value)
+    return max(0.0, capped)
+
+
+def tokenise(text: str) -> set[str]:
+    """Return the distinct lower-cased runs of three or more ASCII letters/underscores in *text*."""
+    words = re.findall(r"[a-zA-Z_]{3,}", text)
+    return set(map(str.lower, words))
+
+
+def criterion_scores(example: dict) -> dict[str, float]:
+    """Score one example against each rubric criterion using surface heuristics.
+
+    Reads the example's ``summary``, ``explanation``, ``notes``, ``code``,
+    ``cells``, ``see_also``, ``doc_url``, ``expected_output`` and
+    ``version_sensitive`` entries (each may be absent) and returns a mapping
+    from criterion name to a score clamped to the range 0.0-1.0.
+    """
+    explanation = example.get("explanation", [])
+    note_items = example.get("notes", [])
+    see_also = example.get("see_also", [])
+    prose = "\n".join(explanation)
+    notes = "\n".join(note_items)
+    code = example.get("code", "")
+    cells = example.get("cells", [])
+    normal_cells = [cell for cell in cells if cell.get("kind") == "cell"]
+    unsupported_cells = [cell for cell in cells if cell.get("kind") == "unsupported"]
+    cell_count = len(normal_cells)
+    outputs = [cell.get("output", "") for cell in normal_cells]
+    all_text = "\n".join([example.get("summary", ""), prose, notes])
+    cell_prose = [" ".join(cell.get("prose", [])) for cell in normal_cells]
+    # Cells whose prose opens identically (first 60 characters) count once.
+    distinct_cell_starts = len({text[:60] for text in cell_prose})
+    output_lines = sum(len(output.splitlines()) for output in outputs if output)
+    overlap = len(tokenise(code) & tokenise(all_text))
+    lowered_text = all_text.lower()
+    generic_penalty = 0.2 * sum(phrase in lowered_text for phrase in GENERIC_PHRASES)
+    uses_toy_names = bool(TOY_WORDS.search(code))
+    # all() is vacuously true for an empty sequence, so require at least one
+    # runnable cell before granting the "every cell has ..." bonuses; otherwise
+    # a page with no cells would score as if it were fully paired and narrated.
+    every_cell_has_prose = cell_count > 0 and all(cell.get("prose") for cell in normal_cells)
+    every_cell_has_output = cell_count > 0 and all(outputs)
+
+    return {
+        "conceptual_payoff": clamp(0.45 + min(len(prose) / 900, 0.35) + min(overlap / 30, 0.2) - generic_penalty),
+        "rationale": clamp(0.35 + 0.35 * bool(RATIONALE_WORDS.search(all_text)) + min(len(note_items) / 6, 0.3) - generic_penalty),
+        "alternatives_and_boundaries": clamp(0.25 + 0.25 * bool(BOUNDARY_WORDS.search(all_text)) + 0.2 * bool(unsupported_cells) + min(notes.lower().count("prefer") / 2, 0.2)),
+        "executable_determinism": clamp(0.75 + 0.25 * bool(example.get("expected_output")) - 0.25 * bool(example.get("version_sensitive"))),
+        "python_idiom_and_accuracy": clamp(0.75 + 0.15 * bool(example.get("doc_url")) + 0.1 * ("print(" in code) - 0.25 * uses_toy_names),
+        "literate_fit": clamp(0.35 + min(cell_count / 4, 0.35) + 0.3 * every_cell_has_prose),
+        "source_result_pairing": clamp(0.35 + 0.45 * every_cell_has_output + min(output_lines / 8, 0.2)),
+        "concept_decomposition": clamp(0.25 + min(cell_count / 3, 0.55) + 0.2 * (cell_count >= 3)),
+        "progressive_walkthrough": clamp(0.35 + min(distinct_cell_starts / max(cell_count, 1), 0.45) + 0.2 * (cell_count >= 2)),
+        "representative_coverage": clamp(0.3 + min(output_lines / 10, 0.25) + min(len(see_also) / 4, 0.25) + 0.2 * (cell_count >= 3)),
+        "practical_usefulness": clamp(0.55 + 0.25 * (not uses_toy_names) + 0.2 * bool(re.search(r"Ada|Grace|project|config|score|price|request|path|file|service|team", code))),
+        "editorial_progression": clamp(0.35 + min(len(explanation) / 3, 0.25) + min(len(note_items) / 4, 0.25) + 0.15 * bool(see_also)),
+    }
+
+
+def weighted_score(scores: dict[str, float], weights: dict[str, float]) -> float:
+    """Combine criterion scores into a single weighted total.
+
+    Every key of *weights* is looked up in *scores*, each weight is coerced
+    with ``float()``, and the weighted sum is rounded to one decimal place.
+    """
+    contributions = [scores[criterion] * float(factor) for criterion, factor in weights.items()]
+    return round(sum(contributions), 1)
+
+
+def main() -> int:
+    """Print the criterion report and return the process exit status.
+
+    ``--json`` dumps every scored row as JSON. Otherwise rows whose curated
+    score is below ``--below`` are listed, lowest curated score first and
+    capped at ``--limit`` rows, followed by a one-line summary of averages.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--json", action="store_true")
+    parser.add_argument("--below", type=float, default=9.0)
+    parser.add_argument("--limit", type=int, default=20)
+    args = parser.parse_args()
+
+    # TOML documents are UTF-8 by specification; do not depend on the
+    # locale's default encoding when reading the registry.
+    weights = tomllib.loads(REGISTRY.read_text(encoding="utf-8"))["score_model"]
+    _, examples = load_examples()
+    rows = []
+    for example in examples:
+        criteria = criterion_scores(example)
+        heuristic = weighted_score(criteria, weights)
+        curated, comment = EXAMPLE_QUALITY_SCORES[example["slug"]]
+        # The three lowest-scoring criteria are the suggested rewrite targets.
+        weakest = sorted(criteria.items(), key=lambda item: item[1])[:3]
+        rows.append({
+            "slug": example["slug"],
+            "curated": curated,
+            "heuristic": heuristic,
+            "delta": round(curated - heuristic, 1),
+            "comment": comment,
+            "weakest": weakest,
+            "criteria": criteria,
+        })
+
+    if args.json:
+        print(json.dumps(rows, indent=2, sort_keys=True))
+        return 0
+
+    if not rows:
+        # statistics.mean() raises StatisticsError on empty data, so report
+        # the empty run explicitly instead of crashing the quality checks.
+        print("criterion heuristic: examples=0")
+        return 0
+
+    selected = [row for row in rows if row["curated"] < args.below]
+    selected.sort(key=lambda row: (row["curated"], row["heuristic"]))
+    for row in selected[: args.limit]:
+        weak = ", ".join(f"{name}={score:.2f}" for name, score in row["weakest"])
+        print(f"{row['curated']:>3.1f} h={row['heuristic']:>3.1f} {row['slug']:<30} {weak}")
+    print(
+        f"criterion heuristic: examples={len(rows)} "
+        f"curated_avg={mean(row['curated'] for row in rows):.2f} "
+        f"heuristic_avg={mean(row['heuristic'] for row in rows):.2f}"
+    )
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
@@ -1,3 +1,3 @@
 # Generated by scripts/fingerprint_assets.py. Do not edit by hand.
 ASSET_PATHS = {'SITE_CSS': '/site.57a55415849b.css', 'SYNTAX_JS': '/syntax-highlight.3b6c7f730d46.js', 'EDITOR_JS': '/editor.a4a7766e1b9b.js'}
-HTML_CACHE_VERSION = 'd56bf0e86233'
+HTML_CACHE_VERSION = 'b5738224e50a'