From baeaa3a96fd19fe11cd8398506e488009853792b Mon Sep 17 00:00:00 2001
From: Sayt-0 <louis-dalmorocompta@docker.com>
Date: Wed, 24 Jun 2026 23:00:50 +0200
Subject: [PATCH 1/2] feat(review-pr): add per-finding confidence scoring model

Score each verified finding 0-100 from the verifier verdict, evidence
strength, context completeness, drafter/verifier severity concordance,
and scope. Bands (strong/moderate/weak/negligible) with a default
posting threshold of 55 gate inline comments; security and high-severity
CONFIRMED/LIKELY findings are always posted, weak-band findings go to a
visible lower-confidence summary instead of being dropped, and a
medium-severity floor keeps a still-believed finding visible.

The model is implemented and unit-tested in src/score-confidence
(single source of truth) and mirrored in the orchestrator prompt as a
strict lookup table. The verifier now emits evidence_strength and
context_completeness.
---
 AGENTS.md                                     |   6 +
 review-pr/README.md                           |  14 +-
 .../agents/evals/confidence-scoring-1.json    |  29 +
 review-pr/agents/pr-review.yaml               | 134 +++-
 review-pr/agents/refs/posting-format.md       |  13 +-
 .../__tests__/score-confidence.test.ts        | 722 ++++++++++++++++++
 src/score-confidence/index.ts                 | 109 +++
 src/score-confidence/score-confidence.ts      | 554 ++++++++++++++
 tsup.config.ts                                |   1 +
 9 files changed, 1570 insertions(+), 12 deletions(-)
 create mode 100644 review-pr/agents/evals/confidence-scoring-1.json
 create mode 100644 src/score-confidence/__tests__/score-confidence.test.ts
 create mode 100644 src/score-confidence/index.ts
 create mode 100644 src/score-confidence/score-confidence.ts

diff --git a/AGENTS.md b/AGENTS.md
index ea903c2..18aea19 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -43,6 +43,11 @@ Anything else here (workflows under `.github/workflows/`, scripts, tests) exists
 │   │   ├── index.ts                 # CLI entry → bundled to dist/filter-diff.js
 │   │   ├── filter-diff.ts           # Core filterDiff() pure function + applyFilter() I/O wrapper.
 │   │   └── __tests__/
+│   ├── score-confidence/            # Per-finding confidence scoring for the PR review pipeline.
+│   │   ├── index.ts                 # CLI entry → bundled to dist/score-confidence.js
+│   │   ├── score-confidence.ts      # Core scoreFinding()/scoreFindings() pure functions + posting policy.
+│   │   │                            #   Source of truth for the model mirrored in pr-review.yaml.
+│   │   └── __tests__/
 │   ├── score-risk/                  # Per-file risk scoring for the PR review pipeline.
 │   │   ├── index.ts                 # CLI entry → bundled to dist/score-risk.js
 │   │   ├── score-risk.ts            # Core scoreFiles() pure function.
@@ -167,6 +172,7 @@ The action runs untrusted input (PR titles, bodies, comments, diffs) through an
   - `pull_request` action `review_requested` when `github.event.requested_reviewer.login == 'docker-agent'`
   - `@docker-agent` mentions on PR/issue comments — these run the `.github/actions/mention-reply` handler (sets `should-reply` and builds the context prompt) and then the `review-pr/mention-reply` sub-action (referenced from a pinned SHA, not present as a local path on every commit). The `pr-review-mention-reply.yaml` agent handles the actual reply.
 - Diffs over 1500 lines are **chunked at file boundaries** in `review-pr/action.yml` (see "Split diff into chunks"). Per-file **risk scoring** (security paths, line counts, error-handling patterns) prioritizes verifier attention.
+- Per-finding **confidence scoring** assigns each verified finding a precise 0–100 score (band: strong/moderate/weak/negligible) from the verifier's `verdict`, `evidence_strength`, and `context_completeness`, plus drafter↔verifier severity concordance and scope. `src/score-confidence/score-confidence.ts` is the **single source of truth** for the model (weights, bands, threshold, posting policy); the "Confidence Scoring" section of `review-pr/agents/pr-review.yaml` mirrors it as a strict lookup table so the orchestrator can apply it inline (the gitignored `dist/` is not available at agent runtime). Change one, change both — the unit tests pin every value. Security and high-severity CONFIRMED/LIKELY findings are always posted regardless of score; weak-band findings are surfaced in a summary rather than silently dropped.
 - Stale review threads on lines no longer in the diff are auto-resolved via GraphQL `resolveReviewThread`. Threads with no `<!-- docker-agent-review -->` marker are never touched.
 
 ### Workflows (`.github/workflows/`)
diff --git a/review-pr/README.md b/review-pr/README.md
index 097d327..b7b40ad 100644
--- a/review-pr/README.md
+++ b/review-pr/README.md
@@ -284,6 +284,8 @@ but the error check happens after this line accesses `user.ID`.
 
 Consider moving the nil check before accessing user properties.
 
+confidence: strong (92/100)
+
 <!-- docker-agent-review -->
 ```
 
@@ -298,9 +300,19 @@ When no issues are found:
 ### Review Pipeline
 
 ```
-AGENTS.md + PR Diff → Drafter (hypotheses) → Verifier (confirm) → Post Comments
+AGENTS.md + PR Diff → Drafter (hypotheses) → Verifier (confirm + evidence signals)
+                    → Confidence score (0–100) → Post Comments
 ```
 
+Each verified finding gets a precise **confidence score** (0–100) and a band
+(strong / moderate / weak / negligible), computed deterministically from the
+verifier's verdict, evidence strength, and context completeness, plus the
+drafter↔verifier severity agreement. High-confidence findings are posted as
+inline comments (labelled with their confidence); lower-confidence findings are
+listed separately rather than dropped. Security and high-severity findings are
+always surfaced regardless of score. The model is implemented and unit-tested in
+[`src/score-confidence/`](../src/score-confidence/score-confidence.ts).
+
 ### Learning System
 
 When you reply to a review comment:
diff --git a/review-pr/agents/evals/confidence-scoring-1.json b/review-pr/agents/evals/confidence-scoring-1.json
new file mode 100644
index 0000000..3f78f7f
--- /dev/null
+++ b/review-pr/agents/evals/confidence-scoring-1.json
@@ -0,0 +1,29 @@
+{
+  "id": "f0c1e2d3-4a5b-6c7d-8e9f-0a1b2c3d4e5f",
+  "title": "Confidence scoring - per-finding score, band, and security floor (run 1)",
+  "evals": {
+    "setup": "apk add --no-cache github-cli",
+    "relevance": [
+      "The agent ran 'echo $GITHUB_ACTIONS' before performing the review to detect the output mode",
+      "The agent output the review to the console as formatted markdown instead of posting via gh api",
+      "The drafter response is valid JSON containing a 'findings' array, a 'summary' field, and 'review_complete' set to true",
+      "At least one finding flags the missing redirect_uri validation as a security concern",
+      "The verifier returned a JSON response with a 'verdicts' array containing one verdict per finding, and each verdict includes an 'evidence_strength' value (direct, circumstantial, or speculative) and a 'context_completeness' value (full, partial, or none)",
+      "Each finding posted in the console output is labelled with a confidence band (strong, moderate, weak, or negligible) and a numeric score out of 100",
+      "The security finding about redirect_uri validation is surfaced in the review regardless of its confidence score (security findings are never auto-suppressed)",
+      "The review assessment label is '🔴 CRITICAL' or '🟡 NEEDS ATTENTION' because there is at least one confirmed or likely security/high-severity finding"
+    ]
+  },
+  "messages": [
+    {
+      "message": {
+        "agentName": "",
+        "message": {
+          "role": "user",
+          "content": "Review the following PR.\n\n## PR Information\n- **Title**: Add optional redirect URI to OAuth authorization flow\n- **Author**: jeanlaurent\n- **Branch**: custom-redirect-url → main\n- **Files Changed**: 6\n\n## PR Description\nAdds an optional redirect_uri field to GetAuthorizationURLRequest so callers can override the default OAuth callback URL. This allows apps to use custom URI schemes (e.g., myapp://auth/callback) for the OIDC login flow.\n\n### Changes\n- proto: Added optional redirect_uri field to GetAuthorizationURLRequest\n- auth/oidc: AuthorizationURL() accepts a redirectURI parameter, falls back to configured default when empty\n- auth/service: Reads redirect_uri from the request and passes it through\n- generated code: Regenerated Go and TypeScript protobuf files\n\n## Diff\n\nNote: Generated protobuf files (auth.pb.go, auth_pb.ts) are omitted — only hand-written code is shown.\n\n```diff\ndiff --git a/api/auth/v1/auth.proto b/api/auth/v1/auth.proto\nindex df6bf369..54dfc78a 100644\n--- a/api/auth/v1/auth.proto\n+++ b/api/auth/v1/auth.proto\n@@ -25,6 +25,11 @@ message GetAuthorizationURLRequest {\n   // Optional state parameter for CSRF protection.\n   // If not provided, the server will generate one.\n   optional string state = 1;\n+\n+  // Optional redirect URI for the OAuth callback.\n+  // If not provided, the server will use the configured default redirect URI.\n+  // This allows mobile apps to use custom URI schemes (e.g., myapp://auth/callback).\n+  optional string redirect_uri = 2;\n }\n \n // GetAuthorizationURLResponse is the response message containing the authorization URL.\n@@ -53,6 +58,10 @@ message GetLogoutURLResponse {\n message ExchangeTokenRequest {\n   // The authorization code received from the OIDC provider.\n   string code = 1;\n+\n+  // Optional redirect URI that was used in the authorization request.\n+  // Must match the redirect_uri used in GetAuthorizationURL for the OAuth flow to succeed.\n+  optional 
string redirect_uri = 2;\n }\n \ndiff --git a/backend/internal/platformd/auth/oidc.go b/backend/internal/platformd/auth/oidc.go\nindex 0e14ad7e..c4c96499 100644\n--- a/backend/internal/platformd/auth/oidc.go\n+++ b/backend/internal/platformd/auth/oidc.go\n@@ -65,9 +65,14 @@ func NewOIDCClient(ctx context.Context, cfg *Config) (*OIDCClient, error) {\n }\n \n // AuthorizationURL builds the authorization URL for the OIDC login flow.\n-func (c *OIDCClient) AuthorizationURL(state string) string {\n+// If redirectURI is provided, it will be used instead of the configured default.\n+func (c *OIDCClient) AuthorizationURL(state string, redirectURI string) string {\n \tcfg := c.oauth2Config\n-\tcfg.RedirectURL = c.redirectURI\n+\tif redirectURI != \"\" {\n+\t\tcfg.RedirectURL = redirectURI\n+\t} else {\n+\t\tcfg.RedirectURL = c.redirectURI\n+\t}\n \treturn cfg.AuthCodeURL(state)\n }\n \n@@ -92,10 +97,16 @@ type TokenResponse struct {\n }\n \n // ExchangeCode exchanges an authorization code for tokens.\n-func (c *OIDCClient) ExchangeCode(ctx context.Context, code string) (*TokenResponse, error) {\n+// If redirectURI is provided, it will be used instead of the configured default.\n+// The redirect URI must match the one used in the authorization request.\n+func (c *OIDCClient) ExchangeCode(ctx context.Context, code string, redirectURI string) (*TokenResponse, error) {\n \t// Set the redirect URI for this specific exchange\n \tcfg := c.oauth2Config\n-\tcfg.RedirectURL = c.redirectURI\n+\tif redirectURI != \"\" {\n+\t\tcfg.RedirectURL = redirectURI\n+\t} else {\n+\t\tcfg.RedirectURL = c.redirectURI\n+\t}\n \n \ttoken, err := cfg.Exchange(ctx, code)\n \tif err != nil {\n\ndiff --git a/backend/internal/platformd/auth/service.go b/backend/internal/platformd/auth/service.go\nindex c2a95279..e3e355c9 100644\n--- a/backend/internal/platformd/auth/service.go\n+++ b/backend/internal/platformd/auth/service.go\n@@ -82,8 +82,11 @@ func (s *Service) GetAuthorizationURL(\n \t\t}\n \t}\n 
\n-\t// Build the authorization URL using the configured redirect URI\n-\tauthURL := s.oidcClient.AuthorizationURL(state)\n+\t// Get redirect URI from request, or use configured default\n+\tredirectURI := msg.GetRedirectUri()\n+\n+\t// Build the authorization URL\n+\tauthURL := s.oidcClient.AuthorizationURL(state, redirectURI)\n \n \treturn connect.NewResponse(&authv1.GetAuthorizationURLResponse{\n \t\tAuthorizationUrl: authURL,\n@@ -138,8 +141,11 @@ func (s *Service) ExchangeToken(\n \t\treturn nil, connect.NewError(connect.CodeInvalidArgument, ErrCodeRequired)\n \t}\n \n-\t// Exchange the code for Docker tokens using the configured redirect URI\n-\ttokenResp, err := s.oidcClient.ExchangeCode(ctx, code)\n+\t// Get redirect URI from request, or use configured default\n+\tredirectURI := msg.GetRedirectUri()\n+\n+\t// Exchange the code for Docker tokens\n+\ttokenResp, err := s.oidcClient.ExchangeCode(ctx, code, redirectURI)\n \tif err != nil {\n \t\tif errors.Is(err, ErrTokenExchange) {\n \t\t\treturn nil, connect.NewError(connect.CodeInvalidArgument, err)\n```",
+          "created_at": "2026-02-18T14:00:00-05:00"
+        }
+      }
+    }
+  ]
+}
diff --git a/review-pr/agents/pr-review.yaml b/review-pr/agents/pr-review.yaml
index 5073879..ea5febf 100644
--- a/review-pr/agents/pr-review.yaml
+++ b/review-pr/agents/pr-review.yaml
@@ -211,14 +211,29 @@ agents:
          Do NOT approve — surface the raw findings so the author can evaluate them.
          Do NOT retry the delegation.
          (The fallback preserves the drafter's analysis for the author.)
-      6. Parse the verifier's JSON response (a `verdicts` array). Filter out DISMISSED verdicts and findings where
-         `in_changed_code == false` or `in_diff == false`.
-      7. **Verify line numbers** before posting (see below)
+      6. Parse the verifier's JSON response (a `verdicts` array). Drop verdicts that are out
+         of scope (`in_changed_code == false` or `in_diff == false`). Keep the rest — including
+         DISMISSED — and assign each a **confidence score** using the Confidence Scoring section
+         below. The score (not a manual judgment) decides each finding's disposition: posted
+         inline, listed in the lower-confidence summary, sent to the dismissed-security audit,
+         or dropped. Each verdict carries `evidence_strength` and `context_completeness` for this.
+      7. **Verify line numbers** before posting (see below) — only for findings you will post
+         inline or list in the summary.
       8. Apply the Decision Rules (see below) to determine the review verdict
-      9. Build inline comments from CONFIRMED/LIKELY issues using each finding's `issue`
-         (one-line summary), `details` (full explanation), `severity`, `category`, `file`,
-         and `line` fields. Post the review.
-      10. Always report ALL HIGH severity bugs. Limit MEDIUM/LOW to 5 comments max.
+      9. Build the review from the confidence dispositions (see Confidence Scoring → Posting policy):
+         - **Inline comments** — every finding whose disposition is `inline`. Each comment uses
+           the finding's `issue` (one-line summary), `details` (full explanation), `severity`,
+           `category`, `file`, `line`, and a confidence label, e.g. `confidence: moderate (68/100)`.
+         - **Lower-confidence summary** — weak-band non-forced findings (plus negligible-band
+           medium-severity findings and any pushed past the comment cap), listed under
+           "Lower-confidence findings (not posted inline)" with their scores. Never silently drop these.
+         - **Dismissed security audit** — DISMISSED `security` findings, listed under "Dismissed
+           security findings (review manually)" citing the verifier's stated mitigation.
+         Then post the review.
+      10. Order inline comments by confidence, highest first. Forced comments (high-severity and
+          security CONFIRMED/LIKELY) are ALWAYS posted and never count against the cap. Among the
+          remaining (non-forced) inline comments, keep at most 5 (highest confidence first) and
+          move the overflow to the lower-confidence summary list.
 
       Find **real bugs in the changed code**, not style issues. If the changed code works correctly, approve it.
 
@@ -294,10 +309,74 @@ agents:
       (from the `-X,Y` side of the hunk header). Do NOT use `grep` to verify deleted
       lines — they are gone from the working tree.
 
+      ## Confidence Scoring (MANDATORY — strict lookup, not a judgment call)
+
+      Assign every surviving finding a **confidence score (0–100)**, a **band**, and a
+      **posting disposition**. This is a deterministic lookup, never a subjective guess. The
+      authoritative implementation is `src/score-confidence/score-confidence.ts`; the rules
+      below mirror it exactly and MUST stay in sync with it. Confidence answers "how sure are
+      we the bug is real" — a separate axis from `severity` ("how bad it is if real"); report both.
+
+      Inputs per finding: `verdict`, `evidence_strength`, `context_completeness` (all from the
+      verifier), the drafter vs verifier severity (for concordance), scope (`in_diff` AND
+      `in_changed_code`), and — for posting only — `category` (security) and verifier `severity` (high).
+
+      **Step 0 — scope gate.** If NOT (`in_diff` AND `in_changed_code`): score 0, band
+      negligible, do not surface. Stop.
+
+      **Step 1 — dismissed gate.** If `verdict` is DISMISSED: score 0, band negligible, never
+      post inline. (A DISMISSED `security` finding still goes to the audit list — see policy.) Stop.
+
+      **Step 2 — core subtotal (read ONE cell; do not add the parts yourself).**
+
+      | verdict / evidence         | full | partial | none |
+      | -------------------------- | ---- | ------- | ---- |
+      | CONFIRMED / direct         | 100  | 92      | 78   |
+      | CONFIRMED / circumstantial | 90   | 82      | 68   |
+      | CONFIRMED / speculative    | 78   | 70      | 56   |
+      | LIKELY / direct            | 70   | 62      | 48   |
+      | LIKELY / circumstantial    | 60   | 52      | 38   |
+      | LIKELY / speculative       | 48   | 40      | 26   |
+
+      **Step 3 — severity concordance (one addition).** Rank high=3, medium=2, low=1. Let
+      d = |drafterRank − verifierRank|. Add: d==0 → +5, d==1 → +0, d==2 → −8.
+
+      **Step 4 — clamp.** score = min(100, max(0, subtotal + concordance)).
+
+      **Step 5 — band.** strong: score ≥ 80; moderate: 55–79; weak: 30–54; negligible: < 30.
+      (Only CONFIRMED can reach strong; LIKELY tops out at 75.)
+
+      ### Posting policy (first match wins; the 5-comment cap is applied last)
+
+      1. Out-of-scope, or DISMISSED non-security → do not surface.
+      2. **Security floor** — `category` security AND `verdict` CONFIRMED/LIKELY → ALWAYS post
+         inline, whatever the score. Never auto-suppress a security finding. Exempt from the cap.
+      3. **High-severity always-post** — verifier `severity` high AND `verdict` CONFIRMED/LIKELY
+         → ALWAYS post inline, whatever the band. Exempt from the cap.
+      4. **Default** — band strong or moderate (score ≥ 55) → post inline.
+      5. **Weak band (30–54), non-forced** — do NOT post inline; list under "Lower-confidence
+         findings (not posted inline)" with the score.
+      6. **Medium-severity floor** — a non-forced CONFIRMED/LIKELY finding in the negligible band (< 30)
+         whose verifier `severity` is `medium` is NOT dropped: list it in the lower-confidence summary.
+         (Confidence rewards drafter↔verifier severity agreement, so a one-notch disagreement can
+         push a finding down a band; this floor guarantees raising severity never makes a finding
+         *less* visible. Only negligible-band, low-severity findings are dropped entirely.)
+      7. **DISMISSED security** — do NOT post inline; list under "Dismissed security findings
+         (review manually)" citing the verifier's stated mitigation.
+      8. **Cap** — among inline comments from rule 4 that are NOT high-severity and NOT security,
+         keep at most 5 (highest score first; break ties by CONFIRMED before LIKELY, then higher
+         subtotal, then direct > circumstantial > speculative, then full > partial > none). Move
+         the rest to the rule-5 summary list. Forced comments (rules 2–3) never count against
+         the cap and are never displaced.
+
+      Label every inline comment with its confidence, e.g. `confidence: moderate (68/100)`.
+
       ## Decision Rules (MANDATORY — strict lookup, not a judgment call)
 
-      1. **Filter**: Remove findings where `in_changed_code == false` or `in_diff == false`
-      2. **Classify** (for informational labeling in the review summary):
+      1. **Filter**: Consider only findings whose confidence disposition is `inline` (see
+         Confidence Scoring). Out-of-scope, dropped, summary-only, and audit-only findings do
+         NOT drive the assessment label.
+      2. **Classify** the inline findings (for informational labeling in the review summary):
          - CRITICAL = high severity CONFIRMED/LIKELY
          - NOTABLE = medium severity CONFIRMED/LIKELY
          - MINOR = everything else
@@ -348,9 +427,16 @@ agents:
       ## Review: COMMENT
       ### Assessment: [🟢 APPROVE|🟡 NEEDS ATTENTION|🔴 CRITICAL]
       ### Findings
-      **[SEVERITY] file:line — issue**
+      **[SEVERITY] file:line — issue** (confidence: BAND SCORE/100)
       details
+
+      ### Lower-confidence findings (not posted inline)
+      - [SEVERITY] file:line — issue (confidence: BAND SCORE/100)
+
+      ### Dismissed security findings (review manually)
+      - file:line — issue (verifier mitigation: …)
       ```
+      Omit the "Lower-confidence" and "Dismissed security" sections when they have no entries.
 
     sub_agents:
       - drafter
@@ -741,6 +827,24 @@ agents:
         expressed without changing its behavior (e.g. `os.Remove(x)` → `_ = fileutil.Remove(x)`
         does not introduce the discarded error; a move/rename/wrap refactor introduces
         nothing). "Touched by the diff" is not the same as "introduced by the diff."
+      - `evidence_strength`: How strongly the provided code snippet ITSELF shows the bug.
+        - `direct`: the buggy line is in the snippet and the defect is visible right there.
+        - `circumstantial`: the snippet shows related or calling code, but not the defect itself.
+        - `speculative`: the snippet does not pin the bug; you are reasoning about code you cannot see.
+      - `context_completeness`: How complete the code context was for your judgment.
+        - `full`: every symbol/definition you needed to decide was present in the provided snippets.
+        - `partial`: you had to assume the behavior of some referenced code you could not see.
+        - `none`: the key code needed to confirm the bug was not retrievable from the snippet.
+
+      **Disjointness rule (REQUIRED):** if `context_completeness` is `none` you MUST NOT set
+      `evidence_strength` to `direct` — without the defining context you cannot have direct
+      evidence. Otherwise keep the two axes independent; do not collapse one into the other.
+
+      `evidence_strength` and `context_completeness` feed the deterministic confidence score
+      the orchestrator computes (see its "Confidence Scoring" section). Assign them honestly:
+      over-claiming `direct`/`full` inflates confidence, while reflexive `speculative`/`none`
+      hedging suppresses real bugs. They do not change your `verdict` — verify exactly as
+      before, then describe the evidence you actually had.
 
     structured_output:
       name: verification_verdicts
@@ -772,6 +876,14 @@ agents:
                   description: "Explanation of verdict"
                 in_changed_code:
                   type: boolean
+                evidence_strength:
+                  type: string
+                  enum: ["direct", "circumstantial", "speculative"]
+                  description: "How strongly the cited snippet itself shows the bug"
+                context_completeness:
+                  type: string
+                  enum: ["full", "partial", "none"]
+                  description: "How complete the code context was when judging"
               required:
                 [
                   "verdict",
@@ -781,6 +893,8 @@ agents:
                   "issue",
                   "details",
                   "in_changed_code",
+                  "evidence_strength",
+                  "context_completeness",
                 ]
               additionalProperties: false
         required: ["verdicts"]
diff --git a/review-pr/agents/refs/posting-format.md b/review-pr/agents/refs/posting-format.md
index e1c008b..2c10c71 100644
--- a/review-pr/agents/refs/posting-format.md
+++ b/review-pr/agents/refs/posting-format.md
@@ -25,7 +25,16 @@ with `echo` — this causes double-escaping of newlines (`\n` rendered as litera
 
 Build the review body and comments, then use `jq` to produce correctly-escaped JSON:
 ```bash
-# Review body is just the assessment badge — findings go in inline comments
+# Review body is the assessment badge, plus the lower-confidence and dismissed-security
+# summary sections when they have entries (high-confidence findings go in inline comments).
+# Append each section only when non-empty, e.g.:
+#   ### Assessment: 🟡 NEEDS ATTENTION
+#
+#   #### Lower-confidence findings (not posted inline)
+#   - [medium] file.go:42 — issue (confidence: weak 48/100)
+#
+#   #### Dismissed security findings (review manually)
+#   - file.go:88 — issue (verifier mitigation: …)
 REVIEW_BODY="### Assessment: 🟢 APPROVE"   # or 🟡 NEEDS ATTENTION / 🔴 CRITICAL
 
 # Start with an empty comments array
@@ -39,6 +48,8 @@ cat > /tmp/comment_body.md << 'COMMENT_BODY_EOF'
 
 Detailed explanation of the bug, trigger path, and impact.
 
+confidence: moderate (68/100)
+
 <!-- docker-agent-review -->
 COMMENT_BODY_EOF
 
diff --git a/src/score-confidence/__tests__/score-confidence.test.ts b/src/score-confidence/__tests__/score-confidence.test.ts
new file mode 100644
index 0000000..b353a13
--- /dev/null
+++ b/src/score-confidence/__tests__/score-confidence.test.ts
@@ -0,0 +1,722 @@
+// Copyright The Docker Agent Action authors
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * Unit tests for src/score-confidence.
+ *
+ * The model is pinned value-by-value: the 18-cell core subtotal table, the
+ * concordance term, the clamp, the band boundaries, both hard gates (scope and
+ * dismissed), the per-finding posting policy, and the cross-finding comment cap.
+ *
+ * The provable invariants from the design spec are asserted directly:
+ *   - strict monotonicity in evidence and context,
+ *   - only CONFIRMED can reach the strong band (LIKELY tops out at 75),
+ *   - DISMISSED and out-of-scope always score 0.
+ *
+ * The 12 worked examples from the locked spec are encoded as a data-driven
+ * fixture so any constant drift fails loudly.
+ */
+import { describe, expect, it } from 'vitest';
+import {
+  bandFor,
+  COMMENT_CAP,
+  type ContextCompleteness,
+  type EvidenceStrength,
+  type FindingInput,
+  MODERATE_THRESHOLD,
+  type ScorableVerdict,
+  type Severity,
+  STRONG_THRESHOLD,
+  scoreFinding,
+  scoreFindings,
+  WEAK_THRESHOLD,
+} from '../score-confidence.js';
+
+// ── Fixture helpers ───────────────────────────────────────────────────────────
+
+/**
+ * Build an in-scope, non-forced CONFIRMED finding. The defaults (direct evidence, full
+ * context, medium↔medium severity) score 100 (strong); override any field to exercise a rule.
+ */
+function makeFinding(overrides: Partial<FindingInput> = {}): FindingInput {
+  return {
+    file: 'pkg/app/handler.go',
+    line: 42,
+    category: 'logic_error',
+    verdict: 'CONFIRMED',
+    evidenceStrength: 'direct',
+    contextCompleteness: 'full',
+    drafterSeverity: 'medium',
+    verifierSeverity: 'medium',
+    inDiff: true,
+    inChangedCode: true,
+    ...overrides,
+  };
+}
+
+const EVIDENCE: EvidenceStrength[] = ['direct', 'circumstantial', 'speculative'];
+const CONTEXT: ContextCompleteness[] = ['full', 'partial', 'none'];
+const SCORABLE: ScorableVerdict[] = ['CONFIRMED', 'LIKELY'];
+const SEVERITIES: Severity[] = ['high', 'medium', 'low'];
+
+// ── Core subtotal table (verdict × evidence × context) ───────────────────────
+
+describe('core subtotal table', () => {
+  // The documented 2×3×3 table (18 cells) from the locked spec. With d0 concordance
+  // (medium↔medium → +5) the score is subtotal + 5 (clamped at 100), so we assert breakdown.subtotal.
+  const TABLE: Record<ScorableVerdict, Record<EvidenceStrength, [number, number, number]>> = {
+    CONFIRMED: {
+      direct: [100, 92, 78],
+      circumstantial: [90, 82, 68],
+      speculative: [78, 70, 56],
+    },
+    LIKELY: {
+      direct: [70, 62, 48],
+      circumstantial: [60, 52, 38],
+      speculative: [48, 40, 26],
+    },
+  };
+
+  for (const verdict of SCORABLE) {
+    for (const evidence of EVIDENCE) {
+      CONTEXT.forEach((context, ctxIdx) => {
+        const expected = TABLE[verdict][evidence][ctxIdx];
+        it(`${verdict}/${evidence}/${context} → subtotal ${expected}`, () => {
+          const r = scoreFinding(
+            makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }),
+          );
+          expect(r.breakdown.subtotal).toBe(expected);
+        });
+      });
+    }
+  }
+});
+
+// ── Concordance (drafter vs verifier severity) ───────────────────────────────
+
+describe('severity concordance', () => {
+  it('same severity (d0) → +5', () => {
+    const r = scoreFinding(makeFinding({ drafterSeverity: 'medium', verifierSeverity: 'medium' }));
+    expect(r.breakdown.severityDistance).toBe(0);
+    expect(r.breakdown.concordance).toBe(5);
+  });
+
+  it('one step apart (d1) → 0', () => {
+    const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'medium' }));
+    expect(r.breakdown.severityDistance).toBe(1);
+    expect(r.breakdown.concordance).toBe(0);
+  });
+
+  it('high vs low (d2) → −8', () => {
+    const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'low' }));
+    expect(r.breakdown.severityDistance).toBe(2);
+    expect(r.breakdown.concordance).toBe(-8);
+  });
+
+  it('concordance is symmetric (low vs high == high vs low)', () => {
+    const a = scoreFinding(makeFinding({ drafterSeverity: 'low', verifierSeverity: 'high' }));
+    const b = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'low' }));
+    expect(a.breakdown.concordance).toBe(b.breakdown.concordance);
+  });
+});
+
+// ── score = subtotal + concordance, clamped to [0,100] ───────────────────────
+
+describe('score composition and clamp', () => {
+  it('score = subtotal + concordance', () => {
+    // LIKELY/circumstantial/partial subtotal 52, d0 +5 → 57.
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'partial',
+      }),
+    );
+    expect(r.score).toBe(57);
+  });
+
+  it('clamps the high end at 100 (CONFIRMED/direct/full + d0 = 105 → 100)', () => {
+    const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high' }));
+    expect(r.breakdown.subtotal).toBe(100);
+    expect(r.breakdown.concordance).toBe(5);
+    expect(r.score).toBe(100);
+  });
+
+  it('never produces a negative in-scope score (min cell 26 − 8 = 18)', () => {
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'high',
+      }),
+    );
+    expect(r.score).toBe(18);
+  });
+});
+
+// ── Invariants ───────────────────────────────────────────────────────────────
+
+describe('invariant: strict monotonicity in evidence', () => {
+  for (const verdict of SCORABLE) {
+    for (const context of CONTEXT) {
+      it(`${verdict}/*/${context}: direct > circumstantial > speculative`, () => {
+        const sub = (evidence: EvidenceStrength) =>
+          scoreFinding(
+            makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }),
+          ).breakdown.subtotal;
+        expect(sub('direct')).toBeGreaterThan(sub('circumstantial'));
+        expect(sub('circumstantial')).toBeGreaterThan(sub('speculative'));
+      });
+    }
+  }
+});
+
+describe('invariant: monotonicity in context', () => {
+  for (const verdict of SCORABLE) {
+    for (const evidence of EVIDENCE) {
+      it(`${verdict}/${evidence}/*: full > partial > none`, () => {
+        const sub = (context: ContextCompleteness) =>
+          scoreFinding(
+            makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }),
+          ).breakdown.subtotal;
+        expect(sub('full')).toBeGreaterThan(sub('partial'));
+        expect(sub('partial')).toBeGreaterThan(sub('none'));
+      });
+    }
+  }
+});
+
+describe('invariant: only CONFIRMED can reach the strong band', () => {
+  it('LIKELY tops out at 75 (5 below the strong floor of 80)', () => {
+    let maxLikely = 0;
+    for (const evidence of EVIDENCE) {
+      for (const context of CONTEXT) {
+        for (const drafterSeverity of SEVERITIES) {
+          for (const verifierSeverity of SEVERITIES) {
+            const { score } = scoreFinding(
+              makeFinding({
+                verdict: 'LIKELY',
+                evidenceStrength: evidence,
+                contextCompleteness: context,
+                drafterSeverity,
+                verifierSeverity,
+              }),
+            );
+            maxLikely = Math.max(maxLikely, score);
+          }
+        }
+      }
+    }
+    expect(maxLikely).toBe(75);
+    expect(maxLikely).toBeLessThan(STRONG_THRESHOLD);
+  });
+
+  it('no LIKELY combination lands in the strong band', () => {
+    for (const evidence of EVIDENCE) {
+      for (const context of CONTEXT) {
+        const r = scoreFinding(
+          makeFinding({
+            verdict: 'LIKELY',
+            evidenceStrength: evidence,
+            contextCompleteness: context,
+            drafterSeverity: 'high',
+            verifierSeverity: 'high',
+          }),
+        );
+        expect(r.band).not.toBe('strong');
+      }
+    }
+  });
+
+  it('CONFIRMED/direct/full reaches the strong band', () => {
+    const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high' }));
+    expect(r.band).toBe('strong');
+  });
+});
+
+// ── Band boundaries ──────────────────────────────────────────────────────────
+
+describe('bandFor — boundaries are contiguous with no gaps', () => {
+  it.each([
+    [100, 'strong'],
+    [80, 'strong'],
+    [79, 'moderate'],
+    [55, 'moderate'],
+    [54, 'weak'],
+    [30, 'weak'],
+    [29, 'negligible'],
+    [0, 'negligible'],
+  ] as const)('score %i → %s', (score, band) => {
+    expect(bandFor(score)).toBe(band);
+  });
+
+  it('threshold constants line up with the band edges', () => {
+    expect(STRONG_THRESHOLD).toBe(80);
+    expect(MODERATE_THRESHOLD).toBe(55);
+    expect(WEAK_THRESHOLD).toBe(30);
+  });
+});
+
+// ── Hard gate: scope ─────────────────────────────────────────────────────────
+
+describe('scope hard gate', () => {
+  it('in_diff false → score 0, negligible, dropped', () => {
+    const r = scoreFinding(makeFinding({ inDiff: false }));
+    expect(r.score).toBe(0);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('drop');
+    expect(r.breakdown.gate).toBe('scope');
+  });
+
+  it('in_changed_code false → score 0, dropped (even for a would-be perfect score)', () => {
+    const r = scoreFinding(
+      makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high', inChangedCode: false }),
+    );
+    expect(r.score).toBe(0);
+    expect(r.disposition).toBe('drop');
+    expect(r.breakdown.gate).toBe('scope');
+  });
+
+  it('scope gate fires before the security floor (out-of-scope security is dropped)', () => {
+    const r = scoreFinding(makeFinding({ category: 'security', inChangedCode: false }));
+    expect(r.disposition).toBe('drop');
+    expect(r.forced).toBe(false);
+  });
+});
+
+// ── Hard gate: dismissed ─────────────────────────────────────────────────────
+
+describe('dismissed hard gate', () => {
+  it('DISMISSED non-security → score 0, dropped', () => {
+    const r = scoreFinding(makeFinding({ verdict: 'DISMISSED' }));
+    expect(r.score).toBe(0);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('drop');
+    expect(r.breakdown.gate).toBe('dismissed');
+  });
+
+  it('DISMISSED security → score 0 but routed to the audit list', () => {
+    const r = scoreFinding(makeFinding({ verdict: 'DISMISSED', category: 'security' }));
+    expect(r.score).toBe(0);
+    expect(r.disposition).toBe('audit');
+    expect(r.forced).toBe(false);
+  });
+});
+
+// ── Per-finding posting policy ───────────────────────────────────────────────
+
+describe('posting policy (per finding)', () => {
+  it('security finding posts inline even in the weak band (security floor)', () => {
+    // CONFIRMED/speculative/none, d2 → 56 − 8 = 48 (weak), but security forces inline.
+    const r = scoreFinding(
+      makeFinding({
+        category: 'security',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      }),
+    );
+    expect(r.band).toBe('weak');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+    expect(r.reason).toContain('security');
+  });
+
+  it('security finding posts inline even at a negligible score', () => {
+    const r = scoreFinding(
+      makeFinding({
+        category: 'security',
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'high',
+      }),
+    );
+    expect(r.score).toBe(18);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+  });
+
+  it('high-severity finding posts inline even in the weak band', () => {
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'high',
+      }),
+    );
+    // LIKELY/spec/none subtotal 26, d0 +5 = 31 → weak.
+    expect(r.band).toBe('weak');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+    expect(r.reason).toContain('high-severity');
+  });
+
+  it('non-forced moderate finding posts inline (not forced)', () => {
+    const r = scoreFinding(
+      makeFinding({ verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' }),
+    );
+    expect(r.score).toBe(75);
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(false);
+  });
+
+  it('non-forced weak finding → summary (not inline, not dropped)', () => {
+    // CONFIRMED/speculative/none, d2 → 48 (weak); verifier severity low, non-security.
+    const r = scoreFinding(
+      makeFinding({
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      }),
+    );
+    expect(r.band).toBe('weak');
+    expect(r.disposition).toBe('summary');
+    expect(r.forced).toBe(false);
+  });
+
+  it('non-forced negligible LOW-severity finding → dropped', () => {
+    // Verifier severity is low (no medium floor) and one step from the drafter
+    // (d1 → +0), so neither the high-severity nor the security override fires
+    // and the medium-severity visibility floor does not apply: a true drop.
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'medium',
+        verifierSeverity: 'low',
+      }),
+    );
+    expect(r.score).toBe(26);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('drop');
+  });
+
+  it('negligible MEDIUM-severity finding → summary (medium-severity visibility floor)', () => {
+    // Same negligible score, but verifier severity medium keeps it visible: a
+    // medium finding the verifier still believes in is never silently dropped.
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'medium',
+      }),
+    );
+    expect(r.score).toBe(26);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('summary');
+    expect(r.forced).toBe(false);
+  });
+
+  it('visibility never inverts when the verifier raises severity (low → medium)', () => {
+    // Regression guard for the concordance non-monotonicity: at fixed
+    // verdict/evidence/context, escalating verifier severity must not move a
+    // finding to a less-visible tier. low → summary (weak 31); medium → summary
+    // (negligible 26, floored). Neither is 'drop'.
+    const base = {
+      verdict: 'LIKELY',
+      evidenceStrength: 'speculative',
+      contextCompleteness: 'none',
+      drafterSeverity: 'low',
+    } as const;
+    const low = scoreFinding(makeFinding({ ...base, verifierSeverity: 'low' }));
+    const medium = scoreFinding(makeFinding({ ...base, verifierSeverity: 'medium' }));
+    expect(low.disposition).toBe('summary');
+    expect(medium.disposition).toBe('summary');
+    // Raising severity dropped the score (lost the +5 agreement bonus) but did
+    // NOT push the finding off the visible channels.
+    expect(medium.score).toBeLessThan(low.score);
+    expect(medium.disposition).not.toBe('drop');
+  });
+});
+
+// ── Cross-finding comment cap ────────────────────────────────────────────────
+
+describe('scoreFindings — comment cap and grouping', () => {
+  // Seven distinct-score non-forced CONFIRMED findings (medium severity, logic_error):
+  // direct/full=100, direct/partial=97, circ/full=95, circ/partial=87,
+  // direct/none=83, spec/partial=75, circ/none=73 (all + d0 concordance).
+  const NON_FORCED: Array<[EvidenceStrength, ContextCompleteness, number]> = [
+    ['direct', 'full', 100],
+    ['direct', 'partial', 97],
+    ['circumstantial', 'full', 95],
+    ['circumstantial', 'partial', 87],
+    ['direct', 'none', 83],
+    ['speculative', 'partial', 75],
+    ['circumstantial', 'none', 73],
+  ];
+
+  function nonForcedBatch(): FindingInput[] {
+    return NON_FORCED.map(([evidence, context], i) =>
+      makeFinding({
+        file: `pkg/f${i}.go`,
+        evidenceStrength: evidence,
+        contextCompleteness: context,
+      }),
+    );
+  }
+
+  it('caps non-forced inline comments at COMMENT_CAP, demoting the rest to summary', () => {
+    const report = scoreFindings(nonForcedBatch());
+    expect(report.inline).toHaveLength(COMMENT_CAP);
+    expect(report.summary).toHaveLength(NON_FORCED.length - COMMENT_CAP);
+    // The five highest scores survive; the two lowest are demoted.
+    expect(report.inline.map((s) => s.result.score)).toEqual([100, 97, 95, 87, 83]);
+    expect(report.summary.map((s) => s.result.score)).toEqual([75, 73]);
+  });
+
+  it('inline comments are ordered by descending confidence', () => {
+    const report = scoreFindings(nonForcedBatch());
+    const keys = report.inline.map((s) => s.result.sortKey);
+    expect(keys).toEqual([...keys].sort((a, b) => b - a));
+  });
+
+  it('forced comments are exempt from the cap and never displaced', () => {
+    const forced: FindingInput[] = [
+      makeFinding({
+        file: 'pkg/sec.go',
+        category: 'security',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      }),
+      makeFinding({
+        file: 'pkg/high.go',
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'high',
+      }),
+    ];
+    const report = scoreFindings([...nonForcedBatch(), ...forced]);
+    // 5 capped non-forced + 2 forced = 7 inline; the 2 forced are present despite low scores.
+    expect(report.inline).toHaveLength(COMMENT_CAP + forced.length);
+    const inlineFiles = report.inline.map((s) => s.input.file);
+    expect(inlineFiles).toContain('pkg/sec.go');
+    expect(inlineFiles).toContain('pkg/high.go');
+    // Cap still applied to the non-forced set only.
+    expect(report.summary).toHaveLength(NON_FORCED.length - COMMENT_CAP);
+  });
+
+  it('respects a custom comment cap', () => {
+    const report = scoreFindings(nonForcedBatch(), { commentCap: 2 });
+    expect(report.inline).toHaveLength(2);
+    expect(report.summary).toHaveLength(NON_FORCED.length - 2);
+  });
+
+  it('groups gated findings into audit and dropped', () => {
+    const report = scoreFindings([
+      makeFinding({ file: 'a.go', verdict: 'DISMISSED', category: 'security' }),
+      makeFinding({ file: 'b.go', verdict: 'DISMISSED' }),
+      makeFinding({ file: 'c.go', inChangedCode: false }),
+    ]);
+    expect(report.audit.map((s) => s.input.file)).toEqual(['a.go']);
+    expect(report.dropped.map((s) => s.input.file).sort()).toEqual(['b.go', 'c.go']);
+    expect(report.inline).toHaveLength(0);
+  });
+});
+
+// ── sortKey tie-break ────────────────────────────────────────────────────────
+
+describe('sortKey tie-break', () => {
+  it('breaks an equal-score tie in favour of CONFIRMED over LIKELY', () => {
+    // CONFIRMED/speculative/partial + d0 = 75; LIKELY/direct/full + d0 = 75.
+    const confirmed = scoreFinding(
+      makeFinding({ evidenceStrength: 'speculative', contextCompleteness: 'partial' }),
+    );
+    const likely = scoreFinding(
+      makeFinding({ verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' }),
+    );
+    expect(confirmed.score).toBe(75);
+    expect(likely.score).toBe(75);
+    expect(confirmed.sortKey).toBeGreaterThan(likely.sortKey);
+  });
+});
+
+// ── Input validation ─────────────────────────────────────────────────────────
+
+describe('input validation', () => {
+  it('throws on an invalid verdict', () => {
+    expect(() => scoreFinding(makeFinding({ verdict: 'MAYBE' as never }))).toThrow(
+      /invalid verdict/,
+    );
+  });
+
+  it('throws on an invalid evidence_strength', () => {
+    expect(() => scoreFinding(makeFinding({ evidenceStrength: 'weak' as never }))).toThrow(
+      /invalid evidenceStrength/,
+    );
+  });
+
+  it('throws on an invalid context_completeness', () => {
+    expect(() => scoreFinding(makeFinding({ contextCompleteness: 'some' as never }))).toThrow(
+      /invalid contextCompleteness/,
+    );
+  });
+
+  it('throws on an invalid category (a misspelled "Security" must not silently disable the floor)', () => {
+    expect(() => scoreFinding(makeFinding({ category: 'Security' as never }))).toThrow(
+      /invalid category/,
+    );
+  });
+});
+
+// ── Locked-spec worked examples (data-driven) ────────────────────────────────
+
+describe('locked-spec worked examples', () => {
+  // band names here use the confidence vocabulary (strong/moderate/weak/negligible);
+  // the spec's worked_examples field encoded them as high/medium/low/negligible.
+  const CASES: Array<{
+    name: string;
+    input: Partial<FindingInput>;
+    score: number;
+    band: string;
+    inline: boolean;
+  }> = [
+    {
+      name: 'CONFIRMED/direct/full, high/high → 100, strong, inline (high-severity)',
+      input: { drafterSeverity: 'high', verifierSeverity: 'high' },
+      score: 100,
+      band: 'strong',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/circumstantial/none, medium/medium → 73, moderate, inline',
+      input: { evidenceStrength: 'circumstantial', contextCompleteness: 'none' },
+      score: 73,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/speculative/none, high/low → 48, weak, summary',
+      input: {
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      },
+      score: 48,
+      band: 'weak',
+      inline: false,
+    },
+    {
+      name: 'LIKELY/direct/full, medium/medium → 75, moderate, inline',
+      input: { verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' },
+      score: 75,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'LIKELY/circumstantial/partial, medium/medium → 57, moderate, inline',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'partial',
+      },
+      score: 57,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      // Spec worked-example #6 listed verifierSev=high yet "dropped entirely",
+      // which contradicts the high-severity always-post rule. The score (26) and
+      // band (negligible) are correct; to demonstrate the intended non-forced
+      // drop we use a low verifier severity one step from the drafter (d1 → +0)
+      // — low severity is below the medium visibility floor, so it truly drops.
+      name: 'LIKELY/speculative/none, medium/low → 26, negligible, dropped (non-forced, low severity)',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'medium',
+        verifierSeverity: 'low',
+      },
+      score: 26,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'CONFIRMED/circumstantial/none, high/medium, security → 68, moderate, inline',
+      input: {
+        category: 'security',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'medium',
+      },
+      score: 68,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'LIKELY/speculative/none, low/high, security → 18, negligible, inline (security floor)',
+      input: {
+        category: 'security',
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'high',
+      },
+      score: 18,
+      band: 'negligible',
+      inline: true,
+    },
+    {
+      name: 'DISMISSED → 0, negligible, not inline',
+      input: { verdict: 'DISMISSED', drafterSeverity: 'high', verifierSeverity: 'high' },
+      score: 0,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'LIKELY/circumstantial/full, high/high → 65, moderate, inline (high-severity)',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'full',
+        drafterSeverity: 'high',
+        verifierSeverity: 'high',
+      },
+      score: 65,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/direct/full, high/high, OUT OF SCOPE → 0, negligible, not inline',
+      input: { drafterSeverity: 'high', verifierSeverity: 'high', inChangedCode: false },
+      score: 0,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'CONFIRMED/speculative/full, medium/medium → 83, strong, inline',
+      input: { evidenceStrength: 'speculative', contextCompleteness: 'full' },
+      score: 83,
+      band: 'strong',
+      inline: true,
+    },
+  ];
+
+  for (const c of CASES) {
+    it(c.name, () => {
+      const r = scoreFinding(makeFinding(c.input));
+      expect(r.score).toBe(c.score);
+      expect(r.band).toBe(c.band);
+      expect(r.disposition === 'inline').toBe(c.inline);
+    });
+  }
+});
diff --git a/src/score-confidence/index.ts b/src/score-confidence/index.ts
new file mode 100644
index 0000000..1148447
--- /dev/null
+++ b/src/score-confidence/index.ts
@@ -0,0 +1,109 @@
+// Copyright The Docker Agent Action authors
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * score-confidence CLI entrypoint.
+ *
+ * Usage:
+ *   node dist/score-confidence.js <findingsPath> [outputPath]
+ *
+ *   findingsPath  Path to a JSON file holding an array of merged finding records
+ *                 (drafter hypothesis + verifier verdict). Read-only.
+ *   outputPath    Where to write the confidence report JSON
+ *                 (default: /tmp/finding_confidence.json).
+ *
+ * Each input record uses the agent's snake_case field names:
+ *   {
+ *     "file": "pkg/auth/oidc.go",
+ *     "line": 72,
+ *     "category": "security",
+ *     "verdict": "CONFIRMED",
+ *     "evidence_strength": "direct",
+ *     "context_completeness": "full",
+ *     "drafter_severity": "high",
+ *     "verifier_severity": "high",
+ *     "in_diff": true,
+ *     "in_changed_code": true,
+ *     "issue": "…",          // optional, passed through to output
+ *     "details": "…"          // optional, passed through to output
+ *   }
+ *
+ * The output JSON groups findings by their final posting disposition
+ * (inline / summary / audit / dropped); each entry carries the original record
+ * plus { score, band, disposition, forced, reason, breakdown }. See
+ * score-confidence.ts for the scoring rules and posting policy.
+ */
+import { readFileSync, writeFileSync } from 'node:fs';
+import { type FindingInput, scoreFindings } from './score-confidence.js';
+
+const DEFAULT_OUTPUT_PATH = '/tmp/finding_confidence.json';
+
+/** Validate a snake_case record and map it to {@link FindingInput}; scope flags fail closed. */
+function parseRecord(raw: Record<string, unknown>, index: number): FindingInput {
+  if (!raw || typeof raw !== 'object') throw new Error(`finding[${index}] is not an object`);
+  const get = (key: string): unknown => raw[key];
+  const required = (key: string): unknown => {
+    if (get(key) == null) throw new Error(`finding[${index}] is missing required field "${key}"`);
+    return get(key);
+  };
+  const line = Number(required('line'));
+  if (!Number.isFinite(line)) throw new Error(`finding[${index}] has a non-numeric "line"`);
+  return {
+    file: String(required('file')),
+    line,
+    category: required('category') as FindingInput['category'],
+    verdict: required('verdict') as FindingInput['verdict'],
+    evidenceStrength: required('evidence_strength') as FindingInput['evidenceStrength'],
+    contextCompleteness: required('context_completeness') as FindingInput['contextCompleteness'],
+    drafterSeverity: required('drafter_severity') as FindingInput['drafterSeverity'],
+    verifierSeverity: required('verifier_severity') as FindingInput['verifierSeverity'],
+    inDiff: get('in_diff') === true,
+    inChangedCode: get('in_changed_code') === true,
+  };
+}
+
+/** CLI body: read the findings file, score each record, write the report grouped by disposition. */
+function main(): void {
+  const [, , findingsPath, outputPath = DEFAULT_OUTPUT_PATH] = process.argv;
+  if (!findingsPath) {
+    process.stderr.write('Usage: score-confidence <findingsPath> [outputPath]\n');
+    process.exit(1);
+  }
+  const parsed = JSON.parse(readFileSync(findingsPath, 'utf-8')) as unknown;
+  // A non-array payload is a malformed handoff, not "zero findings": fail loudly
+  // instead of writing an empty report that looks like a clean review.
+  if (!Array.isArray(parsed)) {
+    throw new Error(`${findingsPath} must contain a JSON array of findings`);
+  }
+  const records = parsed as Record<string, unknown>[];
+  const inputs = records.map(parseRecord);
+  // Identity map back to the raw records so passthrough fields (issue/details) survive.
+  const originals = new Map(inputs.map((input, i) => [input, records[i]] as const));
+  const report = scoreFindings(inputs);
+
+  const project = (group: typeof report.inline): unknown[] =>
+    group.map((s) => ({
+      ...(originals.get(s.input) ?? {}),
+      score: s.result.score,
+      band: s.result.band,
+      disposition: s.result.disposition,
+      forced: s.result.forced,
+      reason: s.result.reason,
+      breakdown: s.result.breakdown,
+    }));
+
+  const output = {
+    inline: project(report.inline),
+    summary: project(report.summary),
+    audit: project(report.audit),
+    dropped: project(report.dropped),
+  };
+  writeFileSync(outputPath, JSON.stringify(output), 'utf-8');
+}
+
+try {
+  main();
+} catch (err) {
+  process.stderr.write(`Error: ${err instanceof Error ? err.message : String(err)}\n`);
+  process.exit(1);
+}
diff --git a/src/score-confidence/score-confidence.ts b/src/score-confidence/score-confidence.ts
new file mode 100644
index 0000000..7a3c5c4
--- /dev/null
+++ b/src/score-confidence/score-confidence.ts
@@ -0,0 +1,554 @@
+// Copyright The Docker Agent Action authors
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * score-confidence — per-finding confidence scoring for the PR review pipeline.
+ *
+ * The reviewer pipeline is drafter → verifier → orchestrator. The drafter
+ * proposes bug findings; the verifier returns a verdict plus two evidence
+ * signals per finding; this module converts those signals into a precise,
+ * reproducible 0–100 confidence score, a band, and a posting disposition.
+ *
+ * This module is the **single source of truth** for the confidence model. The
+ * orchestrator agent (review-pr/agents/pr-review.yaml) mirrors the exact same
+ * rules as a strict lookup-table procedure so it can score findings inline
+ * without depending on the (gitignored) dist bundle at agent runtime. Any change
+ * to the weights, bands, threshold, or posting policy here MUST be reflected in
+ * the "Confidence Scoring" section of that agent prompt, and vice-versa. The
+ * unit tests pin every value so drift is caught.
+ *
+ * ## Criteria (multi-factor — no single signal decides a score)
+ *
+ *   1. verdict             — verifier agreement: CONFIRMED | LIKELY | DISMISSED
+ *   2. evidence_strength   — pattern/snippet match strength: direct | circumstantial | speculative
+ *   3. context_completeness— did the verifier see the code it needed: full | partial | none
+ *   4. severity concordance— agreement between drafter and verifier severity (rank distance)
+ *   5. scope               — in_diff (drafter) AND in_changed_code (verifier)
+ *   6. category / severity — security and high-severity drive POSTING policy, never the raw score
+ *
+ * ## Deterministic pipeline (exact order — implement verbatim, no conditional caps)
+ *
+ *   STEP 0 (scope gate):     NOT(in_diff && in_changed_code) → score 0, negligible, never post.
+ *   STEP 1 (dismissed gate): verdict === DISMISSED            → score 0, negligible, never post inline.
+ *   STEP 2 (core subtotal):  subtotal = CORE_SUBTOTAL[verdict][evidence][context]
+ *                            (a precomputed 3×3 table per scorable verdict; see below).
+ *   STEP 3 (concordance):    score_raw = subtotal + concordance(drafterSeverity, verifierSeverity)
+ *   STEP 4 (clamp):          score = clamp(score_raw, 0, 100)   ← the only clamp; there is no cap step.
+ *   STEP 5 (band):           bandFor(score)
+ *
+ * The core subtotal is authored additively as `verdict base + evidence + context`:
+ *
+ *                    verdict base:  CONFIRMED 70   LIKELY 40
+ *   evidence:  direct +18   circumstantial +8   speculative −4
+ *   context:   full   +12   partial        +4   none        −10
+ *
+ * yielding (rows = verdict/evidence, columns = full | partial | none):
+ *
+ *   CONFIRMED / direct         = [100, 92, 78]
+ *   CONFIRMED / circumstantial = [ 90, 82, 68]
+ *   CONFIRMED / speculative    = [ 78, 70, 56]
+ *   LIKELY    / direct         = [ 70, 62, 48]
+ *   LIKELY    / circumstantial = [ 60, 52, 38]
+ *   LIKELY    / speculative    = [ 48, 40, 26]
+ *
+ * Provable invariants (all unit-tested):
+ *   - Strictly monotone in evidence (direct > circumstantial > speculative) at fixed verdict/context.
+ *   - Monotone in context (full ≥ partial ≥ none) at fixed verdict/evidence.
+ *   - Only CONFIRMED can reach the strong band (≥80): LIKELY tops out at 75 (LIKELY/direct/full + d0),
+ *     a robust 5-point margin below the strong floor.
+ *   - DISMISSED and out-of-scope findings always score 0.
+ *   - Concordance (−8 worst case) never drives an in-scope score below 0 (min cell 26 − 8 = 18).
+ *
+ * Note on severity: the score deliberately incorporates drafter↔verifier severity *agreement*
+ * (concordance), which peaks when they match. It is therefore intentionally NOT monotone in
+ * verifier severity — a one-notch disagreement can nudge a borderline finding down a band. That
+ * is a legitimate confidence signal (confidence = "is it real", a different axis from severity),
+ * but it must never silently suppress a real bug, so the posting policy adds a medium-severity
+ * visibility floor (rule 6). Net guarantee (in-scope CONFIRMED/LIKELY findings): a higher verifier
+ * severity never lowers the visibility *floor* (low → may be dropped, medium → ≥ summary, high → inline).
+ *
+ * ## Posting policy (decided after scoring; first match wins; the cap is applied last)
+ *
+ *   1. Out-of-scope (any category) or DISMISSED non-security → drop (never posted inline).
+ *   2. Security floor:  category === security AND verdict ∈ {CONFIRMED, LIKELY}
+ *      → always inline, regardless of score/band, exempt from the cap.
+ *   3. High-severity:   verifierSeverity === high AND verdict ∈ {CONFIRMED, LIKELY}
+ *      → always inline, regardless of band, exempt from the cap.
+ *   4. Default:         band ∈ {strong, moderate}  → inline (subject to the cap).
+ *   5. Weak visibility: band === weak (30..54)      → summary list, not inline (no silent drop).
+ *   6. Medium floor:    negligible band, non-DISMISSED, verifierSeverity === medium → summary (kept visible).
+ *   7. Dismissed-security audit: DISMISSED security → audit list, not inline (human-reviewable).
+ *   8. Cap:             non-forced inline comments capped at COMMENT_CAP (5); overflow → summary.
+ *      Ranking keeps the highest sortKey first (score, then CONFIRMED>LIKELY, then subtotal,
+ *      then evidence, then context). Forced comments (rules 2,3) are never displaced.
+ */
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/** Verifier verdict on a finding. */
+export type Verdict = 'CONFIRMED' | 'LIKELY' | 'DISMISSED';
+
+/** Verdicts that enter the additive scoring path (DISMISSED is gated out first). */
+export type ScorableVerdict = Exclude<Verdict, 'DISMISSED'>;
+
+/** Verifier signal: how strongly the cited snippet shows the bug. */
+export type EvidenceStrength = 'direct' | 'circumstantial' | 'speculative';
+
+/** Verifier signal: how complete the code context was when judging. */
+export type ContextCompleteness = 'full' | 'partial' | 'none';
+
+/** Finding severity (shared by drafter and verifier). */
+export type Severity = 'high' | 'medium' | 'low';
+
+/** Drafter/verifier finding category. */
+export type Category =
+  | 'security'
+  | 'logic_error'
+  | 'resource_leak'
+  | 'concurrency'
+  | 'error_handling'
+  | 'data_integrity'
+  | 'other';
+
+/** Confidence band — deliberately distinct from the severity enum (independent axes). */
+export type ConfidenceBand = 'strong' | 'moderate' | 'weak' | 'negligible';
+
+/**
+ * Where a finding ends up:
+ *   - inline:  posted as an inline review comment
+ *   - summary: listed in the review summary as a lower-confidence finding (not inline)
+ *   - audit:   a DISMISSED security finding surfaced for human review (not inline)
+ *   - drop:    not surfaced at all (negligible low-severity / dismissed non-security / out-of-scope)
+ */
+export type Disposition = 'inline' | 'summary' | 'audit' | 'drop';
+
+/** A finding merged from the drafter hypothesis and the verifier verdict. */
+export interface FindingInput {
+  /** Repo-relative file path (passed through to output). */
+  file: string;
+  /** 1-indexed line number (passed through to output). */
+  line: number;
+  /** Finding category; `security` triggers the posting floor. */
+  category: Category;
+  /** Verifier verdict — the primary agreement signal. */
+  verdict: Verdict;
+  /** Verifier signal: snippet/pattern match strength. */
+  evidenceStrength: EvidenceStrength;
+  /** Verifier signal: code-context completeness. */
+  contextCompleteness: ContextCompleteness;
+  /** Severity the drafter originally assigned (for concordance). */
+  drafterSeverity: Severity;
+  /** Severity the verifier settled on (drives concordance + high-severity posting). */
+  verifierSeverity: Severity;
+  /** Drafter scope flag: finding lands on a `+` line. */
+  inDiff: boolean;
+  /** Verifier scope flag: this PR's changes introduce the problem. */
+  inChangedCode: boolean;
+}
+
+/** Transparent breakdown of how a score was reached (for logging / debugging). */
+export interface ConfidenceBreakdown {
+  /** Core table value (verdict × evidence × context); 0 when gated. */
+  subtotal: number;
+  /** Concordance term applied after the table: +5 | 0 | −8; 0 when gated. */
+  concordance: number;
+  /** Severity rank distance d = |rank(drafter) − rank(verifier)|; 0 when gated. */
+  severityDistance: number;
+  /** Which hard gate fired, if any. */
+  gate: 'scope' | 'dismissed' | null;
+}
+
+/** The confidence verdict for a single finding (pre-cap; see {@link scoreFindings}). */
+export interface ConfidenceResult {
+  /** 0–100 confidence score. */
+  score: number;
+  /** Band derived from {@link score}. */
+  band: ConfidenceBand;
+  /** Provisional posting disposition (the cross-finding cap may demote inline → summary). */
+  disposition: Disposition;
+  /** True when posted via the security or high-severity override (exempt from the cap). */
+  forced: boolean;
+  /** Human-readable reason for the disposition (which policy rule decided it). */
+  reason: string;
+  /**
+   * Descending sort key for the comment cap tie-break. Encodes, in priority order:
+   * score, then verdict (CONFIRMED>LIKELY), then subtotal, then evidence, then context.
+   * Higher = kept first when the cap trims non-forced inline comments.
+   */
+  sortKey: number;
+  /** How the score was computed. */
+  breakdown: ConfidenceBreakdown;
+}
+
+/** A scored finding: the original input paired with its confidence result. */
+export interface ScoredFinding {
+  input: FindingInput;
+  result: ConfidenceResult;
+}
+
+/** Grouped output of {@link scoreFindings}, after the cross-finding cap is applied. */
+export interface ConfidenceReport {
+  /** Every finding, in input order, with its final (post-cap) result. */
+  findings: ScoredFinding[];
+  /** Findings posted as inline comments (forced + capped default-band), sorted by descending sortKey. */
+  inline: ScoredFinding[];
+  /** Findings surfaced in the summary instead of inline (weak band, medium-severity floor, cap overflow). */
+  summary: ScoredFinding[];
+  /** DISMISSED security findings surfaced for human review. */
+  audit: ScoredFinding[];
+  /** Findings not surfaced at all (negligible low-severity / dismissed non-security / out-of-scope). */
+  dropped: ScoredFinding[];
+}
+
+/** Options for {@link scoreFindings}. */
+export interface ScoreFindingsOptions {
+  /** Max non-forced inline comments to keep (default {@link COMMENT_CAP}). */
+  commentCap?: number;
+}
+
+// ---------------------------------------------------------------------------
+// Model constants (the single source of truth — mirror in pr-review.yaml)
+// ---------------------------------------------------------------------------
+
+/** Verdict base points (DISMISSED is gated out before the table). */
+const VERDICT_BASE: Record<ScorableVerdict, number> = {
+  CONFIRMED: 70,
+  LIKELY: 40,
+};
+
+/** Evidence-strength delta added to the verdict base. */
+const EVIDENCE_DELTA: Record<EvidenceStrength, number> = {
+  direct: 18,
+  circumstantial: 8,
+  speculative: -4,
+};
+
+/** Context-completeness delta added to the verdict base. */
+const CONTEXT_DELTA: Record<ContextCompleteness, number> = {
+  full: 12,
+  partial: 4,
+  none: -10,
+};
+
+/** Severity rank used for the concordance distance. */
+const SEVERITY_RANK: Record<Severity, number> = {
+  high: 3,
+  medium: 2,
+  low: 1,
+};
+
+/** Verdict rank used only for the cap tie-break sort key. */
+const VERDICT_RANK: Record<Verdict, number> = {
+  CONFIRMED: 2,
+  LIKELY: 1,
+  DISMISSED: 0,
+};
+
+/** Evidence rank used only for the cap tie-break sort key. */
+const EVIDENCE_RANK: Record<EvidenceStrength, number> = {
+  direct: 2,
+  circumstantial: 1,
+  speculative: 0,
+};
+
+/** Context rank used only for the cap tie-break sort key. */
+const CONTEXT_RANK: Record<ContextCompleteness, number> = {
+  full: 2,
+  partial: 1,
+  none: 0,
+};
+
+/** Score at or above which a finding is `strong`. Only CONFIRMED can reach it. */
+export const STRONG_THRESHOLD = 80;
+
+/**
+ * Score at or above which a finding is at least `moderate`. This IS the default
+ * posting threshold — there is no separate constant, so the band floor and the
+ * "post by default" cutoff can never drift apart.
+ */
+export const MODERATE_THRESHOLD = 55;
+
+/** Score at or above which a finding is at least `weak` (surfaced in the summary). */
+export const WEAK_THRESHOLD = 30;
+
+/** Default posting threshold (alias of {@link MODERATE_THRESHOLD} for callers). */
+export const DEFAULT_POST_THRESHOLD = MODERATE_THRESHOLD;
+
+/** Maximum non-forced inline comments kept; overflow is routed to the summary list. */
+export const COMMENT_CAP = 5;
+
+// ---------------------------------------------------------------------------
+// Internal helpers
+// ---------------------------------------------------------------------------
+
+/** Narrow `value` to one of `allowed`, or throw an Error naming `field` and the valid options. */
+function assertEnum<T extends string>(value: unknown, allowed: readonly T[], field: string): T {
+  const match = allowed.find((candidate) => candidate === value);
+  if (match !== undefined) return match;
+  const shown = JSON.stringify(value);
+  const options = allowed.join(', ');
+  throw new Error(`invalid ${field}: ${shown} (expected one of ${options})`);
+}
+
+const clamp = (x: number, min: number, max: number): number => Math.max(min, Math.min(max, x));
+
+/** Additive core subtotal: verdict base + evidence delta + context delta (the documented table cell). */
+function coreSubtotal(
+  verdict: ScorableVerdict,
+  evidence: EvidenceStrength,
+  context: ContextCompleteness,
+): number {
+  return CONTEXT_DELTA[context] + EVIDENCE_DELTA[evidence] + VERDICT_BASE[verdict];
+}
+
+/**
+ * Severity concordance between drafter and verifier. `distance` is the absolute rank gap;
+ * `points` is +5 for equal severities, 0 for a one-step gap, and −8 for anything wider.
+ */
+function concordance(drafter: Severity, verifier: Severity): { distance: number; points: number } {
+  const gap = Math.abs(SEVERITY_RANK[drafter] - SEVERITY_RANK[verifier]);
+  if (gap === 0) return { distance: gap, points: 5 };
+  return { distance: gap, points: gap === 1 ? 0 : -8 };
+}
+
+/** Classify a score by the inclusive band floors (strong ≥ 80, moderate ≥ 55, weak ≥ 30). */
+export function bandFor(score: number): ConfidenceBand {
+  // Check the floors from highest to lowest; anything that reaches none of them is negligible.
+  if (score >= STRONG_THRESHOLD) return 'strong';
+  if (score >= MODERATE_THRESHOLD) return 'moderate';
+  return score >= WEAK_THRESHOLD ? 'weak' : 'negligible';
+}
+
+/**
+ * Pack the cap tie-break chain into one number that sorts descending: score, then verdict
+ * rank, then subtotal, then evidence rank, then context rank. Each field sits in its own
+ * decimal slot, which cannot overlap while score/subtotal stay in 0–100 and ranks in 0–2.
+ */
+function buildSortKey(
+  score: number,
+  verdict: Verdict,
+  subtotal: number,
+  evidence: EvidenceStrength,
+  context: ContextCompleteness,
+): number {
+  // Slot weights, most significant first: 10^7, 10^6, 10^3, 10^2, 10.
+  const scoreSlot = score * 10 ** 7;
+  const verdictSlot = VERDICT_RANK[verdict] * 10 ** 6;
+  const subtotalSlot = subtotal * 10 ** 3;
+  const evidenceSlot = EVIDENCE_RANK[evidence] * 10 ** 2;
+  const contextSlot = CONTEXT_RANK[context] * 10;
+  return scoreSlot + verdictSlot + subtotalSlot + evidenceSlot + contextSlot;
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Score a single finding and decide its provisional posting disposition.
+ *
+ * The disposition is provisional because the comment cap is a cross-finding
+ * decision: a non-forced `inline` finding may be demoted to `summary` by
+ * {@link scoreFindings}. Use {@link scoreFindings} for the final disposition.
+ *
+ * @throws Error if any enum field (category included) is missing or not an allowed value.
+ */
+export function scoreFinding(raw: FindingInput): ConfidenceResult {
+  const verdict = assertEnum(raw.verdict, ['CONFIRMED', 'LIKELY', 'DISMISSED'] as const, 'verdict');
+  const evidence = assertEnum(
+    raw.evidenceStrength,
+    ['direct', 'circumstantial', 'speculative'] as const,
+    'evidenceStrength',
+  );
+  const context = assertEnum(
+    raw.contextCompleteness,
+    ['full', 'partial', 'none'] as const,
+    'contextCompleteness',
+  );
+  const drafterSeverity = assertEnum(
+    raw.drafterSeverity,
+    ['high', 'medium', 'low'] as const,
+    'drafterSeverity',
+  );
+  const verifierSeverity = assertEnum(
+    raw.verifierSeverity,
+    ['high', 'medium', 'low'] as const,
+    'verifierSeverity',
+  );
+  // Validate category too: it gates the security floor and the dismissed-security
+  // audit, so a misspelled value must throw like every other enum rather than
+  // silently downgrade `isSecurity` to false.
+  const category = assertEnum(
+    raw.category,
+    [
+      'security',
+      'logic_error',
+      'resource_leak',
+      'concurrency',
+      'error_handling',
+      'data_integrity',
+      'other',
+    ] as const,
+    'category',
+  );
+  const isSecurity = category === 'security';
+  const inScope = raw.inDiff === true && raw.inChangedCode === true;
+  const sortKeyFor = (score: number, subtotal: number): number =>
+    buildSortKey(score, verdict, subtotal, evidence, context);
+
+  // STEP 0 — scope hard gate. Out-of-scope findings are dropped outright, whatever their category.
+  if (!inScope) {
+    return {
+      score: 0,
+      band: 'negligible',
+      disposition: 'drop',
+      forced: false,
+      reason: 'out-of-scope (not in_diff && in_changed_code)',
+      sortKey: sortKeyFor(0, 0),
+      breakdown: { subtotal: 0, concordance: 0, severityDistance: 0, gate: 'scope' },
+    };
+  }
+
+  // STEP 1 — dismissed hard gate. Score is 0, but a dismissed SECURITY finding is
+  // routed to the audit list (human-reviewable) rather than silently dropped.
+  if (verdict === 'DISMISSED') {
+    return {
+      score: 0,
+      band: 'negligible',
+      disposition: isSecurity ? 'audit' : 'drop',
+      forced: false,
+      reason: isSecurity ? 'dismissed security finding (audit)' : 'dismissed',
+      sortKey: sortKeyFor(0, 0),
+      breakdown: { subtotal: 0, concordance: 0, severityDistance: 0, gate: 'dismissed' },
+    };
+  }
+
+  // STEP 2–5 — core subtotal + concordance, clamp, then band.
+  const subtotal = coreSubtotal(verdict, evidence, context);
+  const { distance, points } = concordance(drafterSeverity, verifierSeverity);
+  const score = clamp(subtotal + points, 0, 100);
+  const band = bandFor(score);
+  const sortKey = sortKeyFor(score, subtotal);
+  const breakdown: ConfidenceBreakdown = {
+    subtotal,
+    concordance: points,
+    severityDistance: distance,
+    gate: null,
+  };
+
+  // Posting policy, first match wins; verdict is CONFIRMED/LIKELY here (cap applied in scoreFindings).
+  if (isSecurity) {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: true,
+      reason: 'security floor (never auto-suppressed)',
+      sortKey,
+      breakdown,
+    };
+  }
+  if (verifierSeverity === 'high') {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: true,
+      reason: 'high-severity always-post',
+      sortKey,
+      breakdown,
+    };
+  }
+  if (band === 'strong' || band === 'moderate') {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: false,
+      reason: `default band (${band})`,
+      sortKey,
+      breakdown,
+    };
+  }
+  if (band === 'weak') {
+    return {
+      score,
+      band,
+      disposition: 'summary',
+      forced: false,
+      reason: 'weak band (lower-confidence summary, not inline)',
+      sortKey,
+      breakdown,
+    };
+  }
+  // Negligible band. Confidence incorporates drafter↔verifier severity agreement, so
+  // it is intentionally NOT monotone in verifier severity — a one-notch disagreement can
+  // nudge a borderline finding down a band. To prevent that from ever *silently dropping*
+  // a finding the verifier still rates medium-or-worse, a medium-severity negligible
+  // finding is kept visible in the lower-confidence summary. (High is already force-posted
+  // above; only low-severity negligible findings are dropped as noise.)
+  if (verifierSeverity === 'medium') {
+    return {
+      score,
+      band,
+      disposition: 'summary',
+      forced: false,
+      reason: 'medium-severity visibility floor (kept in summary despite negligible confidence)',
+      sortKey,
+      breakdown,
+    };
+  }
+  return {
+    score,
+    band,
+    disposition: 'drop',
+    forced: false,
+    reason: 'negligible band (low severity)',
+    sortKey,
+    breakdown,
+  };
+}
+
+/**
+ * Score a batch of findings and group them by final disposition, applying the
+ * cross-finding comment cap: only the `commentCap` highest-ranked non-forced inline
+ * findings stay inline and the overflow is demoted to `summary`. Forced findings
+ * (security / high-severity) are exempt. A negative, fractional or NaN cap is first
+ * normalised to a non-negative integer. Throws if any finding fails enum validation.
+ */
+export function scoreFindings(
+  findings: FindingInput[],
+  options: ScoreFindingsOptions = {},
+): ConfidenceReport {
+  // Floor fractions, clamp negatives to 0, and map NaN to 0 (`|| 0`): a negative value
+  // handed straight to `slice` would count from the end and demote too few findings.
+  const commentCap = Math.max(0, Math.floor(options.commentCap ?? COMMENT_CAP)) || 0;
+  const scored: ScoredFinding[] = findings.map((input) => ({ input, result: scoreFinding(input) }));
+
+  // Rank the non-forced inline candidates; everything past the cap is demoted.
+  const nonForcedInline = scored
+    .filter((s) => s.result.disposition === 'inline' && !s.result.forced)
+    .sort((a, b) => b.result.sortKey - a.result.sortKey);
+  for (const s of nonForcedInline.slice(commentCap)) {
+    s.result = {
+      ...s.result,
+      disposition: 'summary',
+      reason: `over comment cap (${commentCap}); moved to lower-confidence summary`,
+    };
+  }
+
+  const byDisposition = (d: Disposition): ScoredFinding[] =>
+    scored
+      .filter((s) => s.result.disposition === d)
+      .sort((a, b) => b.result.sortKey - a.result.sortKey);
+
+  return {
+    findings: scored,
+    inline: byDisposition('inline'),
+    summary: byDisposition('summary'),
+    audit: byDisposition('audit'),
+    dropped: byDisposition('drop'),
+  };
+}
diff --git a/tsup.config.ts b/tsup.config.ts
index 3bb1665..3fe6d7f 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -31,6 +31,7 @@ const entry = {
   'mention-reply': src('mention-reply'),
   'migrate-consumer-refs': src('migrate-consumer-refs'),
   'post-mention-reply': src('post-mention-reply'),
+  'score-confidence': src('score-confidence'),
   'score-risk': src('score-risk'),
   security: src('security'),
   'signed-commit': src('signed-commit'),

From 85b15174761b8b57bc35d56bc5ccb6ee073a6973 Mon Sep 17 00:00:00 2001
From: Sayt-0 <louis-dalmorocompta@docker.com>
Date: Wed, 24 Jun 2026 23:07:06 +0200
Subject: [PATCH 2/2] fix(score-confidence): write CLI output to stdout by
 default

CodeQL js/insecure-temporary-file: the CLI defaulted its output to a
hardcoded /tmp path. Default to stdout instead (composable, no fixed
temp file) and write to a file only when the caller passes an explicit
output path.
---
 src/score-confidence/index.ts | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/score-confidence/index.ts b/src/score-confidence/index.ts
index 1148447..b06aa3f 100644
--- a/src/score-confidence/index.ts
+++ b/src/score-confidence/index.ts
@@ -9,8 +9,10 @@
  *
  *   findingsPath  Path to a JSON file holding an array of merged finding records
  *                 (drafter hypothesis + verifier verdict). Read-only.
- *   outputPath    Where to write the confidence report JSON
- *                 (default: /tmp/finding_confidence.json).
+ *   outputPath    Optional. When given, the confidence report JSON is written to
+ *                 this caller-controlled path; otherwise it is written to stdout
+ *                 (the default — keeps the tool composable and avoids writing to a
+ *                 fixed temp location).
  *
  * Each input record uses the agent's snake_case field names:
  *   {
@@ -36,8 +38,6 @@
 import { readFileSync, writeFileSync } from 'node:fs';
 import { type FindingInput, scoreFindings } from './score-confidence.js';
 
-const DEFAULT_OUTPUT_PATH = '/tmp/finding_confidence.json';
-
 /** Map one snake_case input record to the camelCase {@link FindingInput} shape. */
 function parseRecord(raw: Record<string, unknown>, index: number): FindingInput {
   const get = (key: string): unknown => raw[key];
@@ -63,7 +63,7 @@ function parseRecord(raw: Record<string, unknown>, index: number): FindingInput
 }
 
 function main(): void {
-  const [, , findingsPath, outputPath = DEFAULT_OUTPUT_PATH] = process.argv;
+  const [, , findingsPath, outputPath] = process.argv;
 
   if (!findingsPath) {
     process.stderr.write('Usage: score-confidence <findingsPath> [outputPath]\n');
@@ -98,7 +98,14 @@ function main(): void {
     dropped: project(report.dropped),
   };
 
-  writeFileSync(outputPath, JSON.stringify(output), 'utf-8');
+  const json = JSON.stringify(output);
+  // Default to stdout (composable, no fixed temp path); write to a file only when
+  // the caller supplies an explicit, caller-controlled output path.
+  if (outputPath) {
+    writeFileSync(outputPath, json, 'utf-8');
+  } else {
+    process.stdout.write(`${json}\n`);
+  }
 }
 
 try {