From baeaa3a96fd19fe11cd8398506e488009853792b Mon Sep 17 00:00:00 2001 From: Sayt-0 Date: Wed, 24 Jun 2026 23:00:50 +0200 Subject: [PATCH 1/2] feat(review-pr): add per-finding confidence scoring model Score each verified finding 0-100 from the verifier verdict, evidence strength, context completeness, drafter/verifier severity concordance, and scope. Bands (strong/moderate/weak/negligible) with a default posting threshold of 55 gate inline comments; security and high-severity CONFIRMED/LIKELY findings are always posted, weak-band findings go to a visible lower-confidence summary instead of being dropped, and a medium-severity floor keeps a still-believed finding visible. The model is implemented and unit-tested in src/score-confidence (single source of truth) and mirrored in the orchestrator prompt as a strict lookup table. The verifier now emits evidence_strength and context_completeness. --- AGENTS.md | 6 + review-pr/README.md | 14 +- .../agents/evals/confidence-scoring-1.json | 29 + review-pr/agents/pr-review.yaml | 134 +++- review-pr/agents/refs/posting-format.md | 13 +- .../__tests__/score-confidence.test.ts | 722 ++++++++++++++++++ src/score-confidence/index.ts | 109 +++ src/score-confidence/score-confidence.ts | 554 ++++++++++++++ tsup.config.ts | 1 + 9 files changed, 1570 insertions(+), 12 deletions(-) create mode 100644 review-pr/agents/evals/confidence-scoring-1.json create mode 100644 src/score-confidence/__tests__/score-confidence.test.ts create mode 100644 src/score-confidence/index.ts create mode 100644 src/score-confidence/score-confidence.ts diff --git a/AGENTS.md b/AGENTS.md index ea903c2..18aea19 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -43,6 +43,11 @@ Anything else here (workflows under `.github/workflows/`, scripts, tests) exists │ │ ├── index.ts # CLI entry → bundled to dist/filter-diff.js │ │ ├── filter-diff.ts # Core filterDiff() pure function + applyFilter() I/O wrapper. 
│ │ └── __tests__/ +│ ├── score-confidence/ # Per-finding confidence scoring for the PR review pipeline. +│ │ ├── index.ts # CLI entry → bundled to dist/score-confidence.js +│ │ ├── score-confidence.ts # Core scoreFinding()/scoreFindings() pure functions + posting policy. +│ │ │ # Source of truth for the model mirrored in pr-review.yaml. +│ │ └── __tests__/ │ ├── score-risk/ # Per-file risk scoring for the PR review pipeline. │ │ ├── index.ts # CLI entry → bundled to dist/score-risk.js │ │ ├── score-risk.ts # Core scoreFiles() pure function. @@ -167,6 +172,7 @@ The action runs untrusted input (PR titles, bodies, comments, diffs) through an - `pull_request` action `review_requested` when `github.event.requested_reviewer.login == 'docker-agent'` - `@docker-agent` mentions on PR/issue comments — these run the `.github/actions/mention-reply` handler (sets `should-reply` and builds the context prompt) and then the `review-pr/mention-reply` sub-action (referenced from a pinned SHA, not present as a local path on every commit). The `pr-review-mention-reply.yaml` agent handles the actual reply. - Diffs over 1500 lines are **chunked at file boundaries** in `review-pr/action.yml` (see "Split diff into chunks"). Per-file **risk scoring** (security paths, line counts, error-handling patterns) prioritizes verifier attention. +- Per-finding **confidence scoring** assigns each verified finding a precise 0–100 score (band: strong/moderate/weak/negligible) from the verifier's `verdict`, `evidence_strength`, and `context_completeness`, plus drafter↔verifier severity concordance and scope. `src/score-confidence/score-confidence.ts` is the **single source of truth** for the model (weights, bands, threshold, posting policy); the "Confidence Scoring" section of `review-pr/agents/pr-review.yaml` mirrors it as a strict lookup table so the orchestrator can apply it inline (the gitignored `dist/` is not available at agent runtime). Change one, change both — the unit tests pin every value. 
Security and high-severity CONFIRMED/LIKELY findings are always posted regardless of score; weak-band findings are surfaced in a summary rather than silently dropped. - Stale review threads on lines no longer in the diff are auto-resolved via GraphQL `resolveReviewThread`. Threads with no `` marker are never touched. ### Workflows (`.github/workflows/`) diff --git a/review-pr/README.md b/review-pr/README.md index 097d327..b7b40ad 100644 --- a/review-pr/README.md +++ b/review-pr/README.md @@ -284,6 +284,8 @@ but the error check happens after this line accesses `user.ID`. Consider moving the nil check before accessing user properties. +confidence: strong (92/100) + ``` @@ -298,9 +300,19 @@ When no issues are found: ### Review Pipeline ``` -AGENTS.md + PR Diff → Drafter (hypotheses) → Verifier (confirm) → Post Comments +AGENTS.md + PR Diff → Drafter (hypotheses) → Verifier (confirm + evidence signals) + → Confidence score (0–100) → Post Comments ``` +Each verified finding gets a precise **confidence score** (0–100) and a band +(strong / moderate / weak / negligible), computed deterministically from the +verifier's verdict, evidence strength, and context completeness, plus the +drafter↔verifier severity agreement. High-confidence findings are posted as +inline comments (labelled with their confidence); lower-confidence findings are +listed separately rather than dropped. Security and high-severity findings are +always surfaced regardless of score. The model is implemented and unit-tested in +[`src/score-confidence/`](../src/score-confidence/score-confidence.ts). 
+ ### Learning System When you reply to a review comment: diff --git a/review-pr/agents/evals/confidence-scoring-1.json b/review-pr/agents/evals/confidence-scoring-1.json new file mode 100644 index 0000000..3f78f7f --- /dev/null +++ b/review-pr/agents/evals/confidence-scoring-1.json @@ -0,0 +1,29 @@ +{ + "id": "f0c1e2d3-4a5b-6c7d-8e9f-0a1b2c3d4e5f", + "title": "Confidence scoring - per-finding score, band, and security floor (run 1)", + "evals": { + "setup": "apk add --no-cache github-cli", + "relevance": [ + "The agent ran 'echo $GITHUB_ACTIONS' before performing the review to detect the output mode", + "The agent output the review to the console as formatted markdown instead of posting via gh api", + "The drafter response is valid JSON containing a 'findings' array, a 'summary' field, and 'review_complete' set to true", + "At least one finding flags the missing redirect_uri validation as a security concern", + "The verifier returned a JSON response with a 'verdicts' array containing one verdict per finding, and each verdict includes an 'evidence_strength' value (direct, circumstantial, or speculative) and a 'context_completeness' value (full, partial, or none)", + "Each finding posted in the console output is labelled with a confidence band (strong, moderate, weak, or negligible) and a numeric score out of 100", + "The security finding about redirect_uri validation is surfaced in the review regardless of its confidence score (security findings are never auto-suppressed)", + "The review assessment label is '🔴 CRITICAL' or '🟡 NEEDS ATTENTION' because there is at least one confirmed or likely security/high-severity finding" + ] + }, + "messages": [ + { + "message": { + "agentName": "", + "message": { + "role": "user", + "content": "Review the following PR.\n\n## PR Information\n- **Title**: Add optional redirect URI to OAuth authorization flow\n- **Author**: jeanlaurent\n- **Branch**: custom-redirect-url → main\n- **Files Changed**: 6\n\n## PR Description\nAdds an 
optional redirect_uri field to GetAuthorizationURLRequest so callers can override the default OAuth callback URL. This allows apps to use custom URI schemes (e.g., myapp://auth/callback) for the OIDC login flow.\n\n### Changes\n- proto: Added optional redirect_uri field to GetAuthorizationURLRequest\n- auth/oidc: AuthorizationURL() accepts a redirectURI parameter, falls back to configured default when empty\n- auth/service: Reads redirect_uri from the request and passes it through\n- generated code: Regenerated Go and TypeScript protobuf files\n\n## Diff\n\nNote: Generated protobuf files (auth.pb.go, auth_pb.ts) are omitted — only hand-written code is shown.\n\n```diff\ndiff --git a/api/auth/v1/auth.proto b/api/auth/v1/auth.proto\nindex df6bf369..54dfc78a 100644\n--- a/api/auth/v1/auth.proto\n+++ b/api/auth/v1/auth.proto\n@@ -25,6 +25,11 @@ message GetAuthorizationURLRequest {\n // Optional state parameter for CSRF protection.\n // If not provided, the server will generate one.\n optional string state = 1;\n+\n+ // Optional redirect URI for the OAuth callback.\n+ // If not provided, the server will use the configured default redirect URI.\n+ // This allows mobile apps to use custom URI schemes (e.g., myapp://auth/callback).\n+ optional string redirect_uri = 2;\n }\n \n // GetAuthorizationURLResponse is the response message containing the authorization URL.\n@@ -53,6 +58,10 @@ message GetLogoutURLResponse {\n message ExchangeTokenRequest {\n // The authorization code received from the OIDC provider.\n string code = 1;\n+\n+ // Optional redirect URI that was used in the authorization request.\n+ // Must match the redirect_uri used in GetAuthorizationURL for the OAuth flow to succeed.\n+ optional string redirect_uri = 2;\n }\n \ndiff --git a/backend/internal/platformd/auth/oidc.go b/backend/internal/platformd/auth/oidc.go\nindex 0e14ad7e..c4c96499 100644\n--- a/backend/internal/platformd/auth/oidc.go\n+++ b/backend/internal/platformd/auth/oidc.go\n@@ -65,9 +65,14 @@ 
func NewOIDCClient(ctx context.Context, cfg *Config) (*OIDCClient, error) {\n }\n \n // AuthorizationURL builds the authorization URL for the OIDC login flow.\n-func (c *OIDCClient) AuthorizationURL(state string) string {\n+// If redirectURI is provided, it will be used instead of the configured default.\n+func (c *OIDCClient) AuthorizationURL(state string, redirectURI string) string {\n \tcfg := c.oauth2Config\n-\tcfg.RedirectURL = c.redirectURI\n+\tif redirectURI != \"\" {\n+\t\tcfg.RedirectURL = redirectURI\n+\t} else {\n+\t\tcfg.RedirectURL = c.redirectURI\n+\t}\n \treturn cfg.AuthCodeURL(state)\n }\n \n@@ -92,10 +97,16 @@ type TokenResponse struct {\n }\n \n // ExchangeCode exchanges an authorization code for tokens.\n-func (c *OIDCClient) ExchangeCode(ctx context.Context, code string) (*TokenResponse, error) {\n+// If redirectURI is provided, it will be used instead of the configured default.\n+// The redirect URI must match the one used in the authorization request.\n+func (c *OIDCClient) ExchangeCode(ctx context.Context, code string, redirectURI string) (*TokenResponse, error) {\n \t// Set the redirect URI for this specific exchange\n \tcfg := c.oauth2Config\n-\tcfg.RedirectURL = c.redirectURI\n+\tif redirectURI != \"\" {\n+\t\tcfg.RedirectURL = redirectURI\n+\t} else {\n+\t\tcfg.RedirectURL = c.redirectURI\n+\t}\n \n \ttoken, err := cfg.Exchange(ctx, code)\n \tif err != nil {\n\ndiff --git a/backend/internal/platformd/auth/service.go b/backend/internal/platformd/auth/service.go\nindex c2a95279..e3e355c9 100644\n--- a/backend/internal/platformd/auth/service.go\n+++ b/backend/internal/platformd/auth/service.go\n@@ -82,8 +82,11 @@ func (s *Service) GetAuthorizationURL(\n \t\t}\n \t}\n \n-\t// Build the authorization URL using the configured redirect URI\n-\tauthURL := s.oidcClient.AuthorizationURL(state)\n+\t// Get redirect URI from request, or use configured default\n+\tredirectURI := msg.GetRedirectUri()\n+\n+\t// Build the authorization URL\n+\tauthURL := 
s.oidcClient.AuthorizationURL(state, redirectURI)\n \n \treturn connect.NewResponse(&authv1.GetAuthorizationURLResponse{\n \t\tAuthorizationUrl: authURL,\n@@ -138,8 +141,11 @@ func (s *Service) ExchangeToken(\n \t\treturn nil, connect.NewError(connect.CodeInvalidArgument, ErrCodeRequired)\n \t}\n \n-\t// Exchange the code for Docker tokens using the configured redirect URI\n-\ttokenResp, err := s.oidcClient.ExchangeCode(ctx, code)\n+\t// Get redirect URI from request, or use configured default\n+\tredirectURI := msg.GetRedirectUri()\n+\n+\t// Exchange the code for Docker tokens\n+\ttokenResp, err := s.oidcClient.ExchangeCode(ctx, code, redirectURI)\n \tif err != nil {\n \t\tif errors.Is(err, ErrTokenExchange) {\n \t\t\treturn nil, connect.NewError(connect.CodeInvalidArgument, err)\n```", + "created_at": "2026-02-18T14:00:00-05:00" + } + } + } + ] +} diff --git a/review-pr/agents/pr-review.yaml b/review-pr/agents/pr-review.yaml index 5073879..ea5febf 100644 --- a/review-pr/agents/pr-review.yaml +++ b/review-pr/agents/pr-review.yaml @@ -211,14 +211,29 @@ agents: Do NOT approve — surface the raw findings so the author can evaluate them. Do NOT retry the delegation. (The fallback preserves the drafter's analysis for the author.) - 6. Parse the verifier's JSON response (a `verdicts` array). Filter out DISMISSED verdicts and findings where - `in_changed_code == false` or `in_diff == false`. - 7. **Verify line numbers** before posting (see below) + 6. Parse the verifier's JSON response (a `verdicts` array). Drop verdicts that are out + of scope (`in_changed_code == false` or `in_diff == false`). Keep the rest — including + DISMISSED — and assign each a **confidence score** using the Confidence Scoring section + below. The score (not a manual judgment) decides each finding's disposition: posted + inline, listed in the lower-confidence summary, sent to the dismissed-security audit, + or dropped. Each verdict carries `evidence_strength` and `context_completeness` for this. 
+ 7. **Verify line numbers** before posting (see below) — only for findings you will post + inline or list in the summary. 8. Apply the Decision Rules (see below) to determine the review verdict - 9. Build inline comments from CONFIRMED/LIKELY issues using each finding's `issue` - (one-line summary), `details` (full explanation), `severity`, `category`, `file`, - and `line` fields. Post the review. - 10. Always report ALL HIGH severity bugs. Limit MEDIUM/LOW to 5 comments max. + 9. Build the review from the confidence dispositions (see Confidence Scoring → Posting policy): + - **Inline comments** — every finding whose disposition is `inline`. Each comment uses + the finding's `issue` (one-line summary), `details` (full explanation), `severity`, + `category`, `file`, `line`, and a confidence label, e.g. `confidence: moderate (68/100)`. + - **Lower-confidence summary** — weak-band non-forced findings, negligible-band medium-severity + findings (medium-severity floor), and any pushed past the comment cap, listed under + "Lower-confidence findings (not posted inline)" with their scores. Never silently drop these. + - **Dismissed security audit** — DISMISSED `security` findings, listed under "Dismissed + security findings (review manually)" citing the verifier's stated mitigation. + Then post the review. + 10. Order inline comments by confidence, highest first. Forced comments (high-severity and + security CONFIRMED/LIKELY) are ALWAYS posted and never count against the cap. Among the + remaining (non-forced) inline comments, keep at most 5 (highest confidence first) and + move the overflow to the lower-confidence summary list. Find **real bugs in the changed code**, not style issues. If the changed code works correctly, approve it. @@ -294,10 +309,74 @@ agents: (from the `-X,Y` side of the hunk header). Do NOT use `grep` to verify deleted lines — they are gone from the working tree. 
+ ## Confidence Scoring (MANDATORY — strict lookup, not a judgment call) + + Assign every surviving finding a **confidence score (0–100)**, a **band**, and a + **posting disposition**. This is a deterministic lookup, never a subjective guess. The + authoritative implementation is `src/score-confidence/score-confidence.ts`; the rules + below mirror it exactly and MUST stay in sync with it. Confidence answers "how sure are + we the bug is real" — a separate axis from `severity` ("how bad it is if real"); report both. + + Inputs per finding: `verdict`, `evidence_strength`, `context_completeness` (all from the + verifier), the drafter vs verifier severity (for concordance), scope (`in_diff` AND + `in_changed_code`), and — for posting only — `category` (security) and verifier `severity` (high). + + **Step 0 — scope gate.** If NOT (`in_diff` AND `in_changed_code`): score 0, band + negligible, do not surface. Stop. + + **Step 1 — dismissed gate.** If `verdict` is DISMISSED: score 0, band negligible, never + post inline. (A DISMISSED `security` finding still goes to the audit list — see policy.) Stop. + + **Step 2 — core subtotal (read ONE cell; do not add the parts yourself).** + + | verdict / evidence | full | partial | none | + | -------------------------- | ---- | ------- | ---- | + | CONFIRMED / direct | 100 | 92 | 78 | + | CONFIRMED / circumstantial | 90 | 82 | 68 | + | CONFIRMED / speculative | 78 | 70 | 56 | + | LIKELY / direct | 70 | 62 | 48 | + | LIKELY / circumstantial | 60 | 52 | 38 | + | LIKELY / speculative | 48 | 40 | 26 | + + **Step 3 — severity concordance (one addition).** Rank high=3, medium=2, low=1. Let + d = |drafterRank − verifierRank|. Add: d==0 → +5, d==1 → +0, d==2 → −8. + + **Step 4 — clamp.** score = min(100, max(0, subtotal + concordance)). + + **Step 5 — band.** strong: score ≥ 80; moderate: 55–79; weak: 30–54; negligible: < 30. + (Only CONFIRMED can reach strong; LIKELY tops out at 75.) 
+ + ### Posting policy (first match wins; the 5-comment cap is applied last) + + 1. Out-of-scope, or DISMISSED non-security → do not surface. + 2. **Security floor** — `category` security AND `verdict` CONFIRMED/LIKELY → ALWAYS post + inline, whatever the score. Never auto-suppress a security finding. Exempt from the cap. + 3. **High-severity always-post** — verifier `severity` high AND `verdict` CONFIRMED/LIKELY + → ALWAYS post inline, whatever the band. Exempt from the cap. + 4. **Default** — band strong or moderate (score ≥ 55) → post inline. + 5. **Weak band (30–54), non-forced** — do NOT post inline; list under "Lower-confidence + findings (not posted inline)" with the score. + 6. **Medium-severity floor** — a non-forced finding in the negligible band (< 30) whose + verifier `severity` is `medium` is NOT dropped: list it in the lower-confidence summary. + (Confidence rewards drafter↔verifier severity agreement, so a one-notch disagreement can + push a finding down a band; this floor guarantees raising severity never makes a finding + *less* visible. Only negligible-band, low-severity findings are dropped entirely.) + 7. **DISMISSED security** — do NOT post inline; list under "Dismissed security findings + (review manually)" citing the verifier's stated mitigation. + 8. **Cap** — among inline comments from rule 4 that are NOT high-severity and NOT security, + keep at most 5 (highest score first; break ties by CONFIRMED before LIKELY, then higher + subtotal, then direct > circumstantial > speculative, then full > partial > none). Move + the rest to the rule-5 summary list. Forced comments (rules 2–3) never count against + the cap and are never displaced. + + Label every inline comment with its confidence, e.g. `confidence: moderate (68/100)`. + ## Decision Rules (MANDATORY — strict lookup, not a judgment call) - 1. **Filter**: Remove findings where `in_changed_code == false` or `in_diff == false` - 2. 
**Classify** (for informational labeling in the review summary): + 1. **Filter**: Consider only findings whose confidence disposition is `inline` (see + Confidence Scoring). Out-of-scope, dropped, summary-only, and audit-only findings do + NOT drive the assessment label. + 2. **Classify** the inline findings (for informational labeling in the review summary): - CRITICAL = high severity CONFIRMED/LIKELY - NOTABLE = medium severity CONFIRMED/LIKELY - MINOR = everything else @@ -348,9 +427,16 @@ agents: ## Review: COMMENT ### Assessment: [🟢 APPROVE|🟡 NEEDS ATTENTION|🔴 CRITICAL] ### Findings - **[SEVERITY] file:line — issue** + **[SEVERITY] file:line — issue** (confidence: BAND SCORE/100) details + + ### Lower-confidence findings (not posted inline) + - [SEVERITY] file:line — issue (confidence: weak SCORE/100) + + ### Dismissed security findings (review manually) + - file:line — issue (verifier mitigation: …) ``` + Omit the "Lower-confidence" and "Dismissed security" sections when they have no entries. sub_agents: - drafter @@ -741,6 +827,24 @@ agents: expressed without changing its behavior (e.g. `os.Remove(x)` → `_ = fileutil.Remove(x)` does not introduce the discarded error; a move/rename/wrap refactor introduces nothing). "Touched by the diff" is not the same as "introduced by the diff." + - `evidence_strength`: How strongly the provided code snippet ITSELF shows the bug. + - `direct`: the buggy line is in the snippet and the defect is visible right there. + - `circumstantial`: the snippet shows related or calling code, but not the defect itself. + - `speculative`: the snippet does not pin the bug; you are reasoning about code you cannot see. + - `context_completeness`: How complete the code context was for your judgment. + - `full`: every symbol/definition you needed to decide was present in the provided snippets. + - `partial`: you had to assume the behavior of some referenced code you could not see. 
+ - `none`: the key code needed to confirm the bug was not retrievable from the snippet. + + **Disjointness rule (REQUIRED):** if `context_completeness` is `none` you MUST NOT set + `evidence_strength` to `direct` — without the defining context you cannot have direct + evidence. Keep the two axes independent; do not collapse one into the other. + + `evidence_strength` and `context_completeness` feed the deterministic confidence score + the orchestrator computes (see its "Confidence Scoring" section). Assign them honestly: + over-claiming `direct`/`full` inflates confidence, while reflexive `speculative`/`none` + hedging suppresses real bugs. They do not change your `verdict` — verify exactly as + before, then describe the evidence you actually had. structured_output: name: verification_verdicts @@ -772,6 +876,14 @@ agents: description: "Explanation of verdict" in_changed_code: type: boolean + evidence_strength: + type: string + enum: ["direct", "circumstantial", "speculative"] + description: "How strongly the cited snippet itself shows the bug" + context_completeness: + type: string + enum: ["full", "partial", "none"] + description: "How complete the code context was when judging" required: [ "verdict", @@ -781,6 +893,8 @@ agents: "issue", "details", "in_changed_code", + "evidence_strength", + "context_completeness", ] additionalProperties: false required: ["verdicts"] diff --git a/review-pr/agents/refs/posting-format.md b/review-pr/agents/refs/posting-format.md index e1c008b..2c10c71 100644 --- a/review-pr/agents/refs/posting-format.md +++ b/review-pr/agents/refs/posting-format.md @@ -25,7 +25,16 @@ with `echo` — this causes double-escaping of newlines (`\n` rendered as litera Build the review body and comments, then use `jq` to produce correctly-escaped JSON: ```bash -# Review body is just the assessment badge — findings go in inline comments +# Review body is the assessment badge, plus the lower-confidence and dismissed-security +# summary sections when they 
have entries (high-confidence findings go in inline comments). +# Append each section only when non-empty, e.g.: +# ### Assessment: 🟡 NEEDS ATTENTION +# +# #### Lower-confidence findings (not posted inline) +# - [medium] file.go:42 — issue (confidence: weak 48/100) +# +# #### Dismissed security findings (review manually) +# - file.go:88 — issue (verifier mitigation: …) REVIEW_BODY="### Assessment: 🟢 APPROVE" # or 🟡 NEEDS ATTENTION / 🔴 CRITICAL # Start with an empty comments array @@ -39,6 +48,8 @@ cat > /tmp/comment_body.md << 'COMMENT_BODY_EOF' Detailed explanation of the bug, trigger path, and impact. +confidence: moderate (68/100) + COMMENT_BODY_EOF diff --git a/src/score-confidence/__tests__/score-confidence.test.ts b/src/score-confidence/__tests__/score-confidence.test.ts new file mode 100644 index 0000000..b353a13 --- /dev/null +++ b/src/score-confidence/__tests__/score-confidence.test.ts @@ -0,0 +1,722 @@ +// Copyright The Docker Agent Action authors +// SPDX-License-Identifier: Apache-2.0 + +/** + * Unit tests for src/score-confidence. + * + * The model is pinned value-by-value: the 18-cell core subtotal table, the + * concordance term, the clamp, the band boundaries, both hard gates (scope and + * dismissed), the per-finding posting policy, and the cross-finding comment cap. + * + * The provable invariants from the design spec are asserted directly: + * - strict monotonicity in evidence and context, + * - only CONFIRMED can reach the strong band (LIKELY tops out at 75), + * - DISMISSED and out-of-scope always score 0. + * + * The 12 worked examples from the locked spec are encoded as a data-driven + * fixture so any constant drift fails loudly. 
+ */ +import { describe, expect, it } from 'vitest'; +import { + bandFor, + COMMENT_CAP, + type ContextCompleteness, + type EvidenceStrength, + type FindingInput, + MODERATE_THRESHOLD, + type ScorableVerdict, + type Severity, + STRONG_THRESHOLD, + scoreFinding, + scoreFindings, + WEAK_THRESHOLD, +} from '../score-confidence.js'; + +// ── Fixture helpers ─────────────────────────────────────────────────────────── + +/** + * Build an in-scope, non-forced CONFIRMED finding. Defaults score well into the + * moderate/strong range; override any field to exercise a specific rule. + */ +function makeFinding(overrides: Partial<FindingInput> = {}): FindingInput { + return { + file: 'pkg/app/handler.go', + line: 42, + category: 'logic_error', + verdict: 'CONFIRMED', + evidenceStrength: 'direct', + contextCompleteness: 'full', + drafterSeverity: 'medium', + verifierSeverity: 'medium', + inDiff: true, + inChangedCode: true, + ...overrides, + }; +} + +const EVIDENCE: EvidenceStrength[] = ['direct', 'circumstantial', 'speculative']; +const CONTEXT: ContextCompleteness[] = ['full', 'partial', 'none']; +const SCORABLE: ScorableVerdict[] = ['CONFIRMED', 'LIKELY']; +const SEVERITIES: Severity[] = ['high', 'medium', 'low']; + +// ── Core subtotal table (verdict × evidence × context) ─────────────────────── + +describe('core subtotal table', () => { + // The documented 2×3×3 table from the locked spec. With d0 concordance + // (medium↔medium → +5) the score is subtotal + 5, so we assert breakdown.subtotal. 
+ const TABLE: Record<ScorableVerdict, Record<EvidenceStrength, [number, number, number]>> = { + CONFIRMED: { + direct: [100, 92, 78], + circumstantial: [90, 82, 68], + speculative: [78, 70, 56], + }, + LIKELY: { + direct: [70, 62, 48], + circumstantial: [60, 52, 38], + speculative: [48, 40, 26], + }, + }; + + for (const verdict of SCORABLE) { + for (const evidence of EVIDENCE) { + CONTEXT.forEach((context, ctxIdx) => { + const expected = TABLE[verdict][evidence][ctxIdx]; + it(`${verdict}/${evidence}/${context} → subtotal ${expected}`, () => { + const r = scoreFinding( + makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }), + ); + expect(r.breakdown.subtotal).toBe(expected); + }); + }); + } + } +}); + +// ── Concordance (drafter vs verifier severity) ─────────────────────────────── + +describe('severity concordance', () => { + it('same severity (d0) → +5', () => { + const r = scoreFinding(makeFinding({ drafterSeverity: 'medium', verifierSeverity: 'medium' })); + expect(r.breakdown.severityDistance).toBe(0); + expect(r.breakdown.concordance).toBe(5); + }); + + it('one step apart (d1) → 0', () => { + const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'medium' })); + expect(r.breakdown.severityDistance).toBe(1); + expect(r.breakdown.concordance).toBe(0); + }); + + it('high vs low (d2) → −8', () => { + const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'low' })); + expect(r.breakdown.severityDistance).toBe(2); + expect(r.breakdown.concordance).toBe(-8); + }); + + it('concordance is symmetric (low vs high == high vs low)', () => { + const a = scoreFinding(makeFinding({ drafterSeverity: 'low', verifierSeverity: 'high' })); + const b = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'low' })); + expect(a.breakdown.concordance).toBe(b.breakdown.concordance); + }); +}); + +// ── score = subtotal + concordance, clamped to [0,100] ─────────────────────── + +describe('score composition and clamp', () => { + it('score = subtotal + 
concordance', () => { + // LIKELY/circumstantial/partial subtotal 52, d0 +5 → 57. + const r = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: 'circumstantial', + contextCompleteness: 'partial', + }), + ); + expect(r.score).toBe(57); + }); + + it('clamps the high end at 100 (CONFIRMED/direct/full + d0 = 105 → 100)', () => { + const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high' })); + expect(r.breakdown.subtotal).toBe(100); + expect(r.breakdown.concordance).toBe(5); + expect(r.score).toBe(100); + }); + + it('never produces a negative in-scope score (min cell 26 − 8 = 18)', () => { + const r = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'low', + verifierSeverity: 'high', + }), + ); + expect(r.score).toBe(18); + }); +}); + +// ── Invariants ─────────────────────────────────────────────────────────────── + +describe('invariant: strict monotonicity in evidence', () => { + for (const verdict of SCORABLE) { + for (const context of CONTEXT) { + it(`${verdict}/*/${context}: direct > circumstantial > speculative`, () => { + const sub = (evidence: EvidenceStrength) => + scoreFinding( + makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }), + ).breakdown.subtotal; + expect(sub('direct')).toBeGreaterThan(sub('circumstantial')); + expect(sub('circumstantial')).toBeGreaterThan(sub('speculative')); + }); + } + } +}); + +describe('invariant: monotonicity in context', () => { + for (const verdict of SCORABLE) { + for (const evidence of EVIDENCE) { + it(`${verdict}/${evidence}/*: full > partial > none`, () => { + const sub = (context: ContextCompleteness) => + scoreFinding( + makeFinding({ verdict, evidenceStrength: evidence, contextCompleteness: context }), + ).breakdown.subtotal; + expect(sub('full')).toBeGreaterThan(sub('partial')); + expect(sub('partial')).toBeGreaterThan(sub('none')); + }); + } + } +}); 
+ +describe('invariant: only CONFIRMED can reach the strong band', () => { + it('LIKELY tops out at 75 (5 below the strong floor of 80)', () => { + let maxLikely = 0; + for (const evidence of EVIDENCE) { + for (const context of CONTEXT) { + for (const drafterSeverity of SEVERITIES) { + for (const verifierSeverity of SEVERITIES) { + const { score } = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: evidence, + contextCompleteness: context, + drafterSeverity, + verifierSeverity, + }), + ); + maxLikely = Math.max(maxLikely, score); + } + } + } + } + expect(maxLikely).toBe(75); + expect(maxLikely).toBeLessThan(STRONG_THRESHOLD); + }); + + it('no LIKELY combination lands in the strong band', () => { + for (const evidence of EVIDENCE) { + for (const context of CONTEXT) { + const r = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: evidence, + contextCompleteness: context, + drafterSeverity: 'high', + verifierSeverity: 'high', + }), + ); + expect(r.band).not.toBe('strong'); + } + } + }); + + it('CONFIRMED/direct/full reaches the strong band', () => { + const r = scoreFinding(makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high' })); + expect(r.band).toBe('strong'); + }); +}); + +// ── Band boundaries ────────────────────────────────────────────────────────── + +describe('bandFor — boundaries are contiguous with no gaps', () => { + it.each([ + [100, 'strong'], + [80, 'strong'], + [79, 'moderate'], + [55, 'moderate'], + [54, 'weak'], + [30, 'weak'], + [29, 'negligible'], + [0, 'negligible'], + ] as const)('score %i → %s', (score, band) => { + expect(bandFor(score)).toBe(band); + }); + + it('threshold constants line up with the band edges', () => { + expect(STRONG_THRESHOLD).toBe(80); + expect(MODERATE_THRESHOLD).toBe(55); + expect(WEAK_THRESHOLD).toBe(30); + }); +}); + +// ── Hard gate: scope ───────────────────────────────────────────────────────── + +describe('scope hard gate', () => { + it('in_diff false → score 0, 
negligible, dropped', () => { + const r = scoreFinding(makeFinding({ inDiff: false })); + expect(r.score).toBe(0); + expect(r.band).toBe('negligible'); + expect(r.disposition).toBe('drop'); + expect(r.breakdown.gate).toBe('scope'); + }); + + it('in_changed_code false → score 0, dropped (even for a would-be perfect score)', () => { + const r = scoreFinding( + makeFinding({ drafterSeverity: 'high', verifierSeverity: 'high', inChangedCode: false }), + ); + expect(r.score).toBe(0); + expect(r.disposition).toBe('drop'); + expect(r.breakdown.gate).toBe('scope'); + }); + + it('scope gate fires before the security floor (out-of-scope security is dropped)', () => { + const r = scoreFinding(makeFinding({ category: 'security', inChangedCode: false })); + expect(r.disposition).toBe('drop'); + expect(r.forced).toBe(false); + }); +}); + +// ── Hard gate: dismissed ───────────────────────────────────────────────────── + +describe('dismissed hard gate', () => { + it('DISMISSED non-security → score 0, dropped', () => { + const r = scoreFinding(makeFinding({ verdict: 'DISMISSED' })); + expect(r.score).toBe(0); + expect(r.band).toBe('negligible'); + expect(r.disposition).toBe('drop'); + expect(r.breakdown.gate).toBe('dismissed'); + }); + + it('DISMISSED security → score 0 but routed to the audit list', () => { + const r = scoreFinding(makeFinding({ verdict: 'DISMISSED', category: 'security' })); + expect(r.score).toBe(0); + expect(r.disposition).toBe('audit'); + expect(r.forced).toBe(false); + }); +}); + +// ── Per-finding posting policy ─────────────────────────────────────────────── + +describe('posting policy (per finding)', () => { + it('security finding posts inline even in the weak band (security floor)', () => { + // CONFIRMED/speculative/none, d2 → 56 − 8 = 48 (weak), but security forces inline. 
+    const r = scoreFinding(
+      makeFinding({
+        category: 'security',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      }),
+    );
+    expect(r.band).toBe('weak');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+    expect(r.reason).toContain('security');
+  });
+
+  it('security finding posts inline even at a negligible score', () => {
+    const r = scoreFinding(
+      makeFinding({
+        category: 'security',
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'high',
+      }),
+    );
+    expect(r.score).toBe(18);
+    expect(r.band).toBe('negligible');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+  });
+
+  it('high-severity finding posts inline even in the weak band', () => {
+    const r = scoreFinding(
+      makeFinding({
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'high',
+      }),
+    );
+    // LIKELY/spec/none subtotal 26, d0 +5 = 31 → weak.
+    expect(r.band).toBe('weak');
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(true);
+    expect(r.reason).toContain('high-severity');
+  });
+
+  it('non-forced moderate finding posts inline (not forced)', () => {
+    const r = scoreFinding(
+      makeFinding({ verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' }),
+    );
+    expect(r.score).toBe(75);
+    expect(r.disposition).toBe('inline');
+    expect(r.forced).toBe(false);
+  });
+
+  it('non-forced weak finding → summary (not inline, not dropped)', () => {
+    // CONFIRMED/speculative/none, d2 → 48 (weak), low verifier severity, non-security.
+ const r = scoreFinding( + makeFinding({ + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'high', + verifierSeverity: 'low', + }), + ); + expect(r.band).toBe('weak'); + expect(r.disposition).toBe('summary'); + expect(r.forced).toBe(false); + }); + + it('non-forced negligible LOW-severity finding → dropped', () => { + // Verifier severity is low (no medium floor) and one step from the drafter + // (d1 → +0), so neither the high-severity nor the security override fires + // and the medium-severity visibility floor does not apply: a true drop. + const r = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'medium', + verifierSeverity: 'low', + }), + ); + expect(r.score).toBe(26); + expect(r.band).toBe('negligible'); + expect(r.disposition).toBe('drop'); + }); + + it('negligible MEDIUM-severity finding → summary (medium-severity visibility floor)', () => { + // Same negligible score, but verifier severity medium keeps it visible: a + // medium finding the verifier still believes in is never silently dropped. + const r = scoreFinding( + makeFinding({ + verdict: 'LIKELY', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'low', + verifierSeverity: 'medium', + }), + ); + expect(r.score).toBe(26); + expect(r.band).toBe('negligible'); + expect(r.disposition).toBe('summary'); + expect(r.forced).toBe(false); + }); + + it('visibility never inverts when the verifier raises severity (low → medium)', () => { + // Regression guard for the concordance non-monotonicity: at fixed + // verdict/evidence/context, escalating verifier severity must not move a + // finding to a less-visible tier. low → summary (weak 31); medium → summary + // (negligible 26, floored). Neither is 'drop'. 
+ const base = { + verdict: 'LIKELY', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'low', + } as const; + const low = scoreFinding(makeFinding({ ...base, verifierSeverity: 'low' })); + const medium = scoreFinding(makeFinding({ ...base, verifierSeverity: 'medium' })); + expect(low.disposition).toBe('summary'); + expect(medium.disposition).toBe('summary'); + // Raising severity dropped the score (lost the +5 agreement bonus) but did + // NOT push the finding off the visible channels. + expect(medium.score).toBeLessThan(low.score); + expect(medium.disposition).not.toBe('drop'); + }); +}); + +// ── Cross-finding comment cap ──────────────────────────────────────────────── + +describe('scoreFindings — comment cap and grouping', () => { + // Seven distinct-score non-forced CONFIRMED findings (medium severity, logic_error): + // direct/full=100, direct/partial=97, circ/full=95, circ/partial=87, + // direct/none=83, spec/partial=75, circ/none=73 (all + d0 concordance). + const NON_FORCED: Array<[EvidenceStrength, ContextCompleteness, number]> = [ + ['direct', 'full', 100], + ['direct', 'partial', 97], + ['circumstantial', 'full', 95], + ['circumstantial', 'partial', 87], + ['direct', 'none', 83], + ['speculative', 'partial', 75], + ['circumstantial', 'none', 73], + ]; + + function nonForcedBatch(): FindingInput[] { + return NON_FORCED.map(([evidence, context], i) => + makeFinding({ + file: `pkg/f${i}.go`, + evidenceStrength: evidence, + contextCompleteness: context, + }), + ); + } + + it('caps non-forced inline comments at COMMENT_CAP, demoting the rest to summary', () => { + const report = scoreFindings(nonForcedBatch()); + expect(report.inline).toHaveLength(COMMENT_CAP); + expect(report.summary).toHaveLength(NON_FORCED.length - COMMENT_CAP); + // The five highest scores survive; the two lowest are demoted. 
+ expect(report.inline.map((s) => s.result.score)).toEqual([100, 97, 95, 87, 83]); + expect(report.summary.map((s) => s.result.score)).toEqual([75, 73]); + }); + + it('inline comments are ordered by descending confidence', () => { + const report = scoreFindings(nonForcedBatch()); + const keys = report.inline.map((s) => s.result.sortKey); + expect(keys).toEqual([...keys].sort((a, b) => b - a)); + }); + + it('forced comments are exempt from the cap and never displaced', () => { + const forced: FindingInput[] = [ + makeFinding({ + file: 'pkg/sec.go', + category: 'security', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'high', + verifierSeverity: 'low', + }), + makeFinding({ + file: 'pkg/high.go', + verdict: 'LIKELY', + evidenceStrength: 'speculative', + contextCompleteness: 'none', + drafterSeverity: 'high', + verifierSeverity: 'high', + }), + ]; + const report = scoreFindings([...nonForcedBatch(), ...forced]); + // 5 capped non-forced + 2 forced = 7 inline; the 2 forced are present despite low scores. + expect(report.inline).toHaveLength(COMMENT_CAP + forced.length); + const inlineFiles = report.inline.map((s) => s.input.file); + expect(inlineFiles).toContain('pkg/sec.go'); + expect(inlineFiles).toContain('pkg/high.go'); + // Cap still applied to the non-forced set only. 
+ expect(report.summary).toHaveLength(NON_FORCED.length - COMMENT_CAP); + }); + + it('respects a custom comment cap', () => { + const report = scoreFindings(nonForcedBatch(), { commentCap: 2 }); + expect(report.inline).toHaveLength(2); + expect(report.summary).toHaveLength(NON_FORCED.length - 2); + }); + + it('groups gated findings into audit and dropped', () => { + const report = scoreFindings([ + makeFinding({ file: 'a.go', verdict: 'DISMISSED', category: 'security' }), + makeFinding({ file: 'b.go', verdict: 'DISMISSED' }), + makeFinding({ file: 'c.go', inChangedCode: false }), + ]); + expect(report.audit.map((s) => s.input.file)).toEqual(['a.go']); + expect(report.dropped.map((s) => s.input.file).sort()).toEqual(['b.go', 'c.go']); + expect(report.inline).toHaveLength(0); + }); +}); + +// ── sortKey tie-break ──────────────────────────────────────────────────────── + +describe('sortKey tie-break', () => { + it('breaks an equal-score tie in favour of CONFIRMED over LIKELY', () => { + // CONFIRMED/speculative/partial + d0 = 75; LIKELY/direct/full + d0 = 75. 
+ const confirmed = scoreFinding( + makeFinding({ evidenceStrength: 'speculative', contextCompleteness: 'partial' }), + ); + const likely = scoreFinding( + makeFinding({ verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' }), + ); + expect(confirmed.score).toBe(75); + expect(likely.score).toBe(75); + expect(confirmed.sortKey).toBeGreaterThan(likely.sortKey); + }); +}); + +// ── Input validation ───────────────────────────────────────────────────────── + +describe('input validation', () => { + it('throws on an invalid verdict', () => { + expect(() => scoreFinding(makeFinding({ verdict: 'MAYBE' as never }))).toThrow( + /invalid verdict/, + ); + }); + + it('throws on an invalid evidence_strength', () => { + expect(() => scoreFinding(makeFinding({ evidenceStrength: 'weak' as never }))).toThrow( + /invalid evidenceStrength/, + ); + }); + + it('throws on an invalid context_completeness', () => { + expect(() => scoreFinding(makeFinding({ contextCompleteness: 'some' as never }))).toThrow( + /invalid contextCompleteness/, + ); + }); + + it('throws on an invalid category (a misspelled "Security" must not silently disable the floor)', () => { + expect(() => scoreFinding(makeFinding({ category: 'Security' as never }))).toThrow( + /invalid category/, + ); + }); +}); + +// ── Locked-spec worked examples (data-driven) ──────────────────────────────── + +describe('locked-spec worked examples', () => { + // band names here use the confidence vocabulary (strong/moderate/weak/negligible); + // the spec's worked_examples field encoded them as high/medium/low/negligible. 
+  const CASES: Array<{
+    name: string;
+    input: Partial<FindingInput>;
+    score: number;
+    band: string;
+    inline: boolean;
+  }> = [
+    {
+      name: 'CONFIRMED/direct/full, high/high → 100, strong, inline (high-severity)',
+      input: { drafterSeverity: 'high', verifierSeverity: 'high' },
+      score: 100,
+      band: 'strong',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/circumstantial/none, medium/medium → 73, moderate, inline',
+      input: { evidenceStrength: 'circumstantial', contextCompleteness: 'none' },
+      score: 73,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/speculative/none, high/low → 48, weak, summary',
+      input: {
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'low',
+      },
+      score: 48,
+      band: 'weak',
+      inline: false,
+    },
+    {
+      name: 'LIKELY/direct/full, medium/medium → 75, moderate, inline',
+      input: { verdict: 'LIKELY', evidenceStrength: 'direct', contextCompleteness: 'full' },
+      score: 75,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'LIKELY/circumstantial/partial, medium/medium → 57, moderate, inline',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'partial',
+      },
+      score: 57,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      // Spec worked-example #6 listed verifierSev=high yet "dropped entirely",
+      // which contradicts the high-severity always-post rule. The score (26) and
+      // band (negligible) are correct; to demonstrate the intended non-forced
+      // drop we use a low verifier severity one step from the drafter (d1 → +0)
+      // — low severity is below the medium visibility floor, so it truly drops.
+      name: 'LIKELY/speculative/none, medium/low → 26, negligible, dropped (non-forced, low severity)',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'medium',
+        verifierSeverity: 'low',
+      },
+      score: 26,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'CONFIRMED/circumstantial/none, high/medium, security → 68, moderate, inline',
+      input: {
+        category: 'security',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'none',
+        drafterSeverity: 'high',
+        verifierSeverity: 'medium',
+      },
+      score: 68,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'LIKELY/speculative/none, low/high, security → 18, negligible, inline (security floor)',
+      input: {
+        category: 'security',
+        verdict: 'LIKELY',
+        evidenceStrength: 'speculative',
+        contextCompleteness: 'none',
+        drafterSeverity: 'low',
+        verifierSeverity: 'high',
+      },
+      score: 18,
+      band: 'negligible',
+      inline: true,
+    },
+    {
+      name: 'DISMISSED → 0, negligible, not inline',
+      input: { verdict: 'DISMISSED', drafterSeverity: 'high', verifierSeverity: 'high' },
+      score: 0,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'LIKELY/circumstantial/full, high/high → 65, moderate, inline (high-severity)',
+      input: {
+        verdict: 'LIKELY',
+        evidenceStrength: 'circumstantial',
+        contextCompleteness: 'full',
+        drafterSeverity: 'high',
+        verifierSeverity: 'high',
+      },
+      score: 65,
+      band: 'moderate',
+      inline: true,
+    },
+    {
+      name: 'CONFIRMED/direct/full, high/high, OUT OF SCOPE → 0, negligible, not inline',
+      input: { drafterSeverity: 'high', verifierSeverity: 'high', inChangedCode: false },
+      score: 0,
+      band: 'negligible',
+      inline: false,
+    },
+    {
+      name: 'CONFIRMED/speculative/full, medium/medium → 83, strong, inline',
+      input: { evidenceStrength: 'speculative', contextCompleteness: 'full' },
+      score: 83,
+      band: 'strong',
+      inline: true,
+    },
+  ];
+
+  for (const c of CASES) {
+    it(c.name, () => {
+      const r = scoreFinding(makeFinding(c.input));
+      expect(r.score).toBe(c.score);
+      expect(r.band).toBe(c.band);
+      expect(r.disposition === 'inline').toBe(c.inline);
+    });
+  }
+});
diff --git a/src/score-confidence/index.ts b/src/score-confidence/index.ts
new file mode 100644
index 0000000..1148447
--- /dev/null
+++ b/src/score-confidence/index.ts
@@ -0,0 +1,135 @@
+// Copyright The Docker Agent Action authors
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * score-confidence CLI entrypoint.
+ *
+ * Usage:
+ *   node dist/score-confidence.js <findingsPath> [outputPath]
+ *
+ *   findingsPath  Path to a JSON file holding an array of merged finding records
+ *                 (drafter hypothesis + verifier verdict). Read-only.
+ *   outputPath    Where to write the confidence report JSON
+ *                 (default: /tmp/finding_confidence.json).
+ *
+ * Each input record uses the agent's snake_case field names:
+ *   {
+ *     "file": "pkg/auth/oidc.go",
+ *     "line": 72,
+ *     "category": "security",
+ *     "verdict": "CONFIRMED",
+ *     "evidence_strength": "direct",
+ *     "context_completeness": "full",
+ *     "drafter_severity": "high",
+ *     "verifier_severity": "high",
+ *     "in_diff": true,
+ *     "in_changed_code": true,
+ *     "issue": "…",     // optional, passed through to output
+ *     "details": "…"    // optional, passed through to output
+ *   }
+ *
+ * The output JSON groups findings by their final posting disposition
+ * (inline / summary / audit / dropped); each entry carries the original record
+ * plus { score, band, disposition, forced, reason, breakdown }. See
+ * score-confidence.ts for the scoring rules and posting policy.
+ */
+import { readFileSync, writeFileSync } from 'node:fs';
+import { type FindingInput, scoreFindings } from './score-confidence.js';
+
+const DEFAULT_OUTPUT_PATH = '/tmp/finding_confidence.json';
+
+/**
+ * Map one snake_case input record to the camelCase {@link FindingInput} shape.
+ *
+ * Only presence is checked here; enum-typed fields are cast, not validated.
+ * `in_diff` / `in_changed_code` count as true only when they are the literal
+ * boolean `true`, so a missing flag becomes false.
+ *
+ * @param raw - One element of the findings JSON array.
+ * @param index - Position of the record in the array (used in error messages).
+ * @returns The record in camelCase form (`file` via String(), `line` via Number()).
+ * @throws If a required field is `undefined` or `null`.
+ */
+function parseRecord(raw: Record<string, unknown>, index: number): FindingInput {
+  const get = (key: string): unknown => raw[key];
+  const require = (key: string): unknown => {
+    const value = get(key);
+    if (value === undefined || value === null) {
+      throw new Error(`finding[${index}] is missing required field "${key}"`);
+    }
+    return value;
+  };
+  return {
+    file: String(require('file')),
+    line: Number(require('line')),
+    category: require('category') as FindingInput['category'],
+    verdict: require('verdict') as FindingInput['verdict'],
+    evidenceStrength: require('evidence_strength') as FindingInput['evidenceStrength'],
+    contextCompleteness: require('context_completeness') as FindingInput['contextCompleteness'],
+    drafterSeverity: require('drafter_severity') as FindingInput['drafterSeverity'],
+    verifierSeverity: require('verifier_severity') as FindingInput['verifierSeverity'],
+    inDiff: get('in_diff') === true,
+    inChangedCode: get('in_changed_code') === true,
+  };
+}
+
+/**
+ * Read the findings file named on the command line, score every record, and
+ * write the grouped confidence report as JSON.
+ *
+ * Exits with status 1 and a usage line when no findings path is given.
+ * Read, JSON-parse, and record-validation errors propagate to the caller.
+ */
+function main(): void {
+  const [, , findingsPath, outputPath = DEFAULT_OUTPUT_PATH] = process.argv;
+
+  if (!findingsPath) {
+    process.stderr.write('Usage: score-confidence <findingsPath> [outputPath]\n');
+    process.exit(1);
+  }
+
+  const parsed = JSON.parse(readFileSync(findingsPath, 'utf-8')) as unknown;
+  // A non-array payload still yields an empty report (unchanged behaviour), but
+  // warn so it cannot be mistaken for a genuine "no findings" run.
+  const records = Array.isArray(parsed) ? (parsed as Record<string, unknown>[]) : [];
+  if (!Array.isArray(parsed)) {
+    process.stderr.write(`Warning: ${findingsPath} is not a JSON array; scoring 0 findings\n`);
+  }
+  const inputs = records.map(parseRecord);
+  const report = scoreFindings(inputs);
+
+  // Re-attach the original records so passthrough fields (issue/details) survive,
+  // grouped by final posting disposition. Looked up by object identity, so this
+  // assumes scoreFindings hands back the same input objects it was given.
+  const originalByInput = new Map(inputs.map((input, i) => [input, records[i]] as const));
+  const project = (group: typeof report.inline): unknown[] =>
+    group.map((s) => {
+      const original = originalByInput.get(s.input) ?? {};
+      return {
+        ...original,
+        score: s.result.score,
+        band: s.result.band,
+        disposition: s.result.disposition,
+        forced: s.result.forced,
+        reason: s.result.reason,
+        breakdown: s.result.breakdown,
+      };
+    });
+
+  const output = {
+    inline: project(report.inline),
+    summary: project(report.summary),
+    audit: project(report.audit),
+    dropped: project(report.dropped),
+  };
+
+  writeFileSync(outputPath, JSON.stringify(output), 'utf-8');
+}
+
+// Top-level guard: any failure is reported on stderr and becomes exit status 1.
+try {
+  main();
+} catch (err) {
+  process.stderr.write(`Error: ${err instanceof Error ? err.message : String(err)}\n`);
+  process.exit(1);
+}
diff --git a/src/score-confidence/score-confidence.ts b/src/score-confidence/score-confidence.ts
new file mode 100644
index 0000000..7a3c5c4
--- /dev/null
+++ b/src/score-confidence/score-confidence.ts
@@ -0,0 +1,554 @@
+// Copyright The Docker Agent Action authors
+// SPDX-License-Identifier: Apache-2.0
+
+/**
+ * score-confidence — per-finding confidence scoring for the PR review pipeline.
+ *
+ * The reviewer pipeline is drafter → verifier → orchestrator. The drafter
+ * proposes bug findings; the verifier returns a verdict plus two evidence
+ * signals per finding; this module converts those signals into a precise,
+ * reproducible 0–100 confidence score, a band, and a posting disposition.
+ *
+ * This module is the **single source of truth** for the confidence model. The
+ * orchestrator agent (review-pr/agents/pr-review.yaml) mirrors the exact same
+ * rules as a strict lookup-table procedure so it can score findings inline
+ * without depending on the (gitignored) dist bundle at agent runtime. Any change
+ * to the weights, bands, threshold, or posting policy here MUST be reflected in
+ * the "Confidence Scoring" section of that agent prompt, and vice-versa. The
+ * unit tests pin every value so drift is caught.
+ *
+ * ## Criteria (multi-factor — no single signal decides a score)
+ *
+ *   1. verdict             — verifier agreement: CONFIRMED | LIKELY | DISMISSED
+ *   2. evidence_strength   — pattern/snippet match strength: direct | circumstantial | speculative
+ *   3. context_completeness— did the verifier see the code it needed: full | partial | none
+ *   4. severity concordance— agreement between drafter and verifier severity (rank distance)
+ *   5. scope               — in_diff (drafter) AND in_changed_code (verifier)
+ *   6. category / severity — security and high-severity drive POSTING policy, never the raw score
+ *
+ * ## Deterministic pipeline (exact order — implement verbatim, no conditional caps)
+ *
+ *   STEP 0 (scope gate):      NOT(in_diff && in_changed_code) → score 0, negligible, never post.
+ *   STEP 1 (dismissed gate):  verdict === DISMISSED → score 0, negligible, never post inline.
+ *   STEP 2 (core subtotal):   subtotal = CORE_SUBTOTAL[verdict][evidence][context]
+ *                             (a precomputed 3×3 table per scorable verdict; see below).
+ *   STEP 3 (concordance):     score_raw = subtotal + concordance(drafterSeverity, verifierSeverity)
+ *   STEP 4 (clamp):           score = clamp(score_raw, 0, 100)   ← the only clamp; there is no cap step.
+ *   STEP 5 (band):            bandFor(score)
+ *
+ * The core subtotal is authored additively as `verdict base + evidence + context`:
+ *
+ *   verdict base:  CONFIRMED 70   LIKELY 40
+ *   evidence:      direct +18   circumstantial +8   speculative −4
+ *   context:       full +12   partial +4   none −10
+ *
+ * yielding (rows = verdict/evidence, columns = full | partial | none):
+ *
+ *   CONFIRMED / direct         = [100, 92, 78]
+ *   CONFIRMED / circumstantial = [ 90, 82, 68]
+ *   CONFIRMED / speculative    = [ 78, 70, 56]
+ *   LIKELY    / direct         = [ 70, 62, 48]
+ *   LIKELY    / circumstantial = [ 60, 52, 38]
+ *   LIKELY    / speculative    = [ 48, 40, 26]
+ *
+ * Provable invariants (all unit-tested):
+ *   - Strictly monotone in evidence (direct > circumstantial > speculative) at fixed verdict/context.
+ *   - Monotone in context (full ≥ partial ≥ none) at fixed verdict/evidence.
+ * - Only CONFIRMED can reach the strong band (≥80): LIKELY tops out at 75 (LIKELY/direct/full + d0), + * a robust 5-point margin below the strong floor. + * - DISMISSED and out-of-scope findings always score 0. + * - Concordance (−8 worst case) never drives an in-scope score below 0 (min cell 26 − 8 = 18). + * + * Note on severity: the score deliberately incorporates drafter↔verifier severity *agreement* + * (concordance), which peaks when they match. It is therefore intentionally NOT monotone in + * verifier severity — a one-notch disagreement can nudge a borderline finding down a band. That + * is a legitimate confidence signal (confidence = "is it real", a different axis from severity), + * but it must never silently suppress a real bug, so the posting policy adds a medium-severity + * visibility floor (rule 6). Net guarantee: increasing verifier severity never *lowers* a + * finding's visibility tier (low → drop/summary, medium → at least summary, high → inline). + * + * ## Posting policy (decided after scoring; first match wins; the cap is applied last) + * + * 1. Out-of-scope / DISMISSED non-security → drop (never posted inline). + * 2. Security floor: category === security AND verdict ∈ {CONFIRMED, LIKELY} + * → always inline, regardless of score/band, exempt from the cap. + * 3. High-severity: verifierSeverity === high AND verdict ∈ {CONFIRMED, LIKELY} + * → always inline, regardless of band, exempt from the cap. + * 4. Default: band ∈ {strong, moderate} → inline (subject to the cap). + * 5. Weak visibility: band === weak (30..54) → summary list, not inline (no silent drop). + * 6. Medium floor: negligible band but verifierSeverity === medium → summary (kept visible). + * 7. Dismissed-security audit: DISMISSED security → audit list, not inline (human-reviewable). + * 8. Cap: non-forced inline comments capped at COMMENT_CAP (5); overflow → summary. 
+ *      Ranking keeps the highest sortKey first (score, then CONFIRMED>LIKELY, then subtotal,
+ *      then evidence, then context). Forced comments (rules 2,3) are never displaced.
+ */
+
+// ---------------------------------------------------------------------------
+// Types
+// ---------------------------------------------------------------------------
+
+/** Verifier verdict on a finding. */
+export type Verdict = 'CONFIRMED' | 'LIKELY' | 'DISMISSED';
+
+/** Verdicts that enter the additive scoring path (DISMISSED is gated out first). */
+export type ScorableVerdict = Exclude<Verdict, 'DISMISSED'>;
+
+/** Verifier signal: how strongly the cited snippet shows the bug. */
+export type EvidenceStrength = 'direct' | 'circumstantial' | 'speculative';
+
+/** Verifier signal: how complete the code context was when judging. */
+export type ContextCompleteness = 'full' | 'partial' | 'none';
+
+/** Finding severity (shared by drafter and verifier). */
+export type Severity = 'high' | 'medium' | 'low';
+
+/** Drafter/verifier finding category. */
+export type Category =
+  | 'security'
+  | 'logic_error'
+  | 'resource_leak'
+  | 'concurrency'
+  | 'error_handling'
+  | 'data_integrity'
+  | 'other';
+
+/** Confidence band — deliberately distinct from the severity enum (independent axes). */
+export type ConfidenceBand = 'strong' | 'moderate' | 'weak' | 'negligible';
+
+/**
+ * Where a finding ends up:
+ *   - inline:  posted as an inline review comment
+ *   - summary: listed in the review summary as a lower-confidence finding (not inline)
+ *   - audit:   a DISMISSED security finding surfaced for human review (not inline)
+ *   - drop:    not surfaced at all (negligible / dismissed non-security / out-of-scope)
+ */
+export type Disposition = 'inline' | 'summary' | 'audit' | 'drop';
+
+/** A finding merged from the drafter hypothesis and the verifier verdict. */
+export interface FindingInput {
+  /** Repo-relative file path (passed through to output). */
+  file: string;
+  /** 1-indexed line number (passed through to output). */
+  line: number;
+  /** Finding category; `security` triggers the posting floor. */
+  category: Category;
+  /** Verifier verdict — the primary agreement signal. */
+  verdict: Verdict;
+  /** Verifier signal: snippet/pattern match strength. */
+  evidenceStrength: EvidenceStrength;
+  /** Verifier signal: code-context completeness. */
+  contextCompleteness: ContextCompleteness;
+  /** Severity the drafter originally assigned (for concordance). */
+  drafterSeverity: Severity;
+  /** Severity the verifier settled on (drives concordance + high-severity posting). */
+  verifierSeverity: Severity;
+  /** Drafter scope flag: finding lands on a `+` line. */
+  inDiff: boolean;
+  /** Verifier scope flag: this PR's changes introduce the problem. */
+  inChangedCode: boolean;
+}
+
+/** Transparent breakdown of how a score was reached (for logging / debugging). */
+export interface ConfidenceBreakdown {
+  /** Core table value (verdict × evidence × context); 0 when gated. */
+  subtotal: number;
+  /** Concordance term applied after the table: +5 | 0 | −8; 0 when gated. */
+  concordance: number;
+  /** Severity rank distance d = |rank(drafter) − rank(verifier)|; 0 when gated. */
+  severityDistance: number;
+  /** Which hard gate fired, if any. */
+  gate: 'scope' | 'dismissed' | null;
+}
+
+/** The confidence verdict for a single finding (pre-cap; see {@link scoreFindings}). */
+export interface ConfidenceResult {
+  /** 0–100 confidence score. */
+  score: number;
+  /** Band derived from {@link score}. */
+  band: ConfidenceBand;
+  /** Provisional posting disposition (the cross-finding cap may demote inline → summary). */
+  disposition: Disposition;
+  /** True when posted via the security or high-severity override (exempt from the cap). */
+  forced: boolean;
+  /** Human-readable reason for the disposition (which policy rule decided it). */
+  reason: string;
+  /**
+   * Descending sort key for the comment cap tie-break. Encodes, in priority order:
+   * score, then verdict (CONFIRMED>LIKELY), then subtotal, then evidence, then context.
+   * Higher = kept first when the cap trims non-forced inline comments.
+   */
+  sortKey: number;
+  /** How the score was computed. */
+  breakdown: ConfidenceBreakdown;
+}
+
+/** A scored finding: the original input paired with its confidence result. */
+export interface ScoredFinding {
+  input: FindingInput;
+  result: ConfidenceResult;
+}
+
+/** Grouped output of {@link scoreFindings}, after the cross-finding cap is applied. */
+export interface ConfidenceReport {
+  /** Every finding, in input order, with its final (post-cap) result. */
+  findings: ScoredFinding[];
+  /** Findings posted as inline comments (forced first, then capped default-band), sorted by confidence. */
+  inline: ScoredFinding[];
+  /** Lower-confidence findings surfaced in the summary instead of inline (weak band + cap overflow). */
+  summary: ScoredFinding[];
+  /** DISMISSED security findings surfaced for human review. */
+  audit: ScoredFinding[];
+  /** Findings not surfaced at all (negligible / dismissed non-security / out-of-scope). */
+  dropped: ScoredFinding[];
+}
+
+/** Options for {@link scoreFindings}. */
+export interface ScoreFindingsOptions {
+  /** Max non-forced inline comments to keep (default {@link COMMENT_CAP}). */
+  commentCap?: number;
+}
+
+// ---------------------------------------------------------------------------
+// Model constants (the single source of truth — mirror in pr-review.yaml)
+// ---------------------------------------------------------------------------
+
+/** Verdict base points (DISMISSED is gated out before the table). */
+const VERDICT_BASE: Record<ScorableVerdict, number> = {
+  CONFIRMED: 70,
+  LIKELY: 40,
+};
+
+/** Evidence-strength delta added to the verdict base. */
+const EVIDENCE_DELTA: Record<EvidenceStrength, number> = {
+  direct: 18,
+  circumstantial: 8,
+  speculative: -4,
+};
+
+/** Context-completeness delta added to the verdict base. */
+const CONTEXT_DELTA: Record<ContextCompleteness, number> = {
+  full: 12,
+  partial: 4,
+  none: -10,
+};
+
+/** Severity rank used for the concordance distance. */
+const SEVERITY_RANK: Record<Severity, number> = {
+  high: 3,
+  medium: 2,
+  low: 1,
+};
+
+/** Verdict rank used only for the cap tie-break sort key. */
+const VERDICT_RANK: Record<Verdict, number> = {
+  CONFIRMED: 2,
+  LIKELY: 1,
+  DISMISSED: 0,
+};
+
+/** Evidence rank used only for the cap tie-break sort key. */
+const EVIDENCE_RANK: Record<EvidenceStrength, number> = {
+  direct: 2,
+  circumstantial: 1,
+  speculative: 0,
+};
+
+/** Context rank used only for the cap tie-break sort key. */
+const CONTEXT_RANK: Record<ContextCompleteness, number> = {
+  full: 2,
+  partial: 1,
+  none: 0,
+};
+
+/** Score at or above which a finding is `strong`. Only CONFIRMED can reach it. */
+export const STRONG_THRESHOLD = 80;
+
+/**
+ * Score at or above which a finding is at least `moderate`. This IS the default
+ * posting threshold — {@link DEFAULT_POST_THRESHOLD} is only an alias of it, so the
+ * band floor and the "post by default" cutoff can never drift apart.
+ */
+export const MODERATE_THRESHOLD = 55;
+
+/** Score at or above which a finding is at least `weak` (surfaced in the summary). */
+export const WEAK_THRESHOLD = 30;
+
+/** Default posting threshold (alias of {@link MODERATE_THRESHOLD} for callers). */
+export const DEFAULT_POST_THRESHOLD = MODERATE_THRESHOLD;
+
+/** Maximum non-forced inline comments kept; overflow is routed to the summary list. */
+export const COMMENT_CAP = 5;
+
+// ---------------------------------------------------------------------------
+// Internal helpers
+// ---------------------------------------------------------------------------
+
+function assertEnum<T extends string>(value: unknown, allowed: readonly T[], field: string): T {
+  if (typeof value !== 'string' || !allowed.includes(value as T)) {
+    throw new Error(
+      `invalid ${field}: ${JSON.stringify(value)} (expected one of ${allowed.join(', ')})`,
+    );
+  }
+  return value as T;
+}
+
+const clamp = (n: number, lo: number, hi: number): number => Math.max(lo, Math.min(hi, n));
+
+/** Core subtotal for a scorable verdict — the precomputed 3×3 table value. */
+function coreSubtotal(
+  verdict: ScorableVerdict,
+  evidence: EvidenceStrength,
+  context: ContextCompleteness,
+): number {
+  return VERDICT_BASE[verdict] + EVIDENCE_DELTA[evidence] + CONTEXT_DELTA[context];
+}
+
+/**
+ * Concordance term: agreement between the drafter's and verifier's severity.
+ * d = |rank(drafter) − rank(verifier)|; same → +5, one step → 0, opposite → −8.
+ */
+function concordance(drafter: Severity, verifier: Severity): { distance: number; points: number } {
+  const distance = Math.abs(SEVERITY_RANK[drafter] - SEVERITY_RANK[verifier]);
+  const points = distance === 0 ? 5 : distance === 1 ? 0 : -8;
+  return { distance, points };
+}
+
+/** Map a 0–100 score to its band. Boundaries: 80 / 55 / 30 (contiguous, no gaps). */
+export function bandFor(score: number): ConfidenceBand {
+  if (score >= STRONG_THRESHOLD) return 'strong';
+  if (score >= MODERATE_THRESHOLD) return 'moderate';
+  if (score >= WEAK_THRESHOLD) return 'weak';
+  return 'negligible';
+}
+
+/**
+ * Build the descending cap tie-break sort key. The decimal slots never overlap
+ * given the value ranges (score 0–100, ranks 0–2, subtotal 0–100), so a plain
+ * numeric sort reproduces the spec's tie-break chain exactly.
+ */
+function buildSortKey(
+  score: number,
+  verdict: Verdict,
+  subtotal: number,
+  evidence: EvidenceStrength,
+  context: ContextCompleteness,
+): number {
+  // Each term gets its own decimal slot, most significant first: score (×10^7),
+  // verdict rank (×10^6), subtotal (×10^3), evidence rank (×10^2), context rank (×10).
+  // NOTE(review): the slots only stay disjoint while 0 <= subtotal < 1000; the
+  // subtotal handed in by scoreFinding is not clamped — confirm the table values
+  // guarantee that range.
+  return (
+    score * 10 ** 7 +
+    VERDICT_RANK[verdict] * 10 ** 6 +
+    subtotal * 10 ** 3 +
+    EVIDENCE_RANK[evidence] * 10 ** 2 +
+    CONTEXT_RANK[context] * 10
+  );
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+/**
+ * Score a single finding and decide its provisional posting disposition.
+ *
+ * The disposition is provisional because the comment cap is a cross-finding
+ * decision: a non-forced `inline` finding may be demoted to `summary` by
+ * {@link scoreFindings}. Use {@link scoreFindings} for the final disposition.
+ *
+ * @param raw Finding record to score; all six enum fields are validated before
+ *   any gate is evaluated, so an invalid field throws even for a finding that
+ *   would otherwise be dropped.
+ * @returns The clamped 0–100 score, its band, the provisional disposition,
+ *   whether the finding is force-posted, a human-readable reason, the cap
+ *   tie-break sort key, and the score breakdown.
+ * @throws if any enum field is missing or invalid.
+ */
+export function scoreFinding(raw: FindingInput): ConfidenceResult {
+  const verdict = assertEnum(raw.verdict, ['CONFIRMED', 'LIKELY', 'DISMISSED'] as const, 'verdict');
+  const evidence = assertEnum(
+    raw.evidenceStrength,
+    ['direct', 'circumstantial', 'speculative'] as const,
+    'evidenceStrength',
+  );
+  const context = assertEnum(
+    raw.contextCompleteness,
+    ['full', 'partial', 'none'] as const,
+    'contextCompleteness',
+  );
+  const drafterSeverity = assertEnum(
+    raw.drafterSeverity,
+    ['high', 'medium', 'low'] as const,
+    'drafterSeverity',
+  );
+  const verifierSeverity = assertEnum(
+    raw.verifierSeverity,
+    ['high', 'medium', 'low'] as const,
+    'verifierSeverity',
+  );
+  // Validate category too: it gates the security floor and the dismissed-security
+  // audit, so a misspelled value must throw like every other enum rather than
+  // silently downgrade `isSecurity` to false.
+  const category = assertEnum(
+    raw.category,
+    [
+      'security',
+      'logic_error',
+      'resource_leak',
+      'concurrency',
+      'error_handling',
+      'data_integrity',
+      'other',
+    ] as const,
+    'category',
+  );
+  const isSecurity = category === 'security';
+  // Both flags must be exactly `true`; a missing or non-boolean value counts as
+  // out of scope.
+  const inScope = raw.inDiff === true && raw.inChangedCode === true;
+  const sortKeyFor = (score: number, subtotal: number): number =>
+    buildSortKey(score, verdict, subtotal, evidence, context);
+
+  // STEP 0 — scope hard gate. Out-of-scope findings never post inline.
+  // This gate runs before the dismissed gate, so an out-of-scope dismissed
+  // security finding is dropped here rather than routed to the audit list.
+  if (!inScope) {
+    return {
+      score: 0,
+      band: 'negligible',
+      disposition: 'drop',
+      forced: false,
+      reason: 'out-of-scope (not in_diff && in_changed_code)',
+      sortKey: sortKeyFor(0, 0),
+      breakdown: { subtotal: 0, concordance: 0, severityDistance: 0, gate: 'scope' },
+    };
+  }
+
+  // STEP 1 — dismissed hard gate. Score is 0, but a dismissed SECURITY finding is
+  // routed to the audit list (human-reviewable) rather than silently dropped.
+  if (verdict === 'DISMISSED') {
+    return {
+      score: 0,
+      band: 'negligible',
+      disposition: isSecurity ? 'audit' : 'drop',
+      forced: false,
+      reason: isSecurity ? 'dismissed security finding (audit)' : 'dismissed',
+      sortKey: sortKeyFor(0, 0),
+      breakdown: { subtotal: 0, concordance: 0, severityDistance: 0, gate: 'dismissed' },
+    };
+  }
+
+  // STEP 2–4 — core subtotal + concordance, then clamp.
+  // Only the final score is clamped; `subtotal` is kept unclamped in both the
+  // breakdown and the sort key.
+  const subtotal = coreSubtotal(verdict, evidence, context);
+  const { distance, points } = concordance(drafterSeverity, verifierSeverity);
+  const score = clamp(subtotal + points, 0, 100);
+  const band = bandFor(score);
+  const sortKey = sortKeyFor(score, subtotal);
+  const breakdown: ConfidenceBreakdown = {
+    subtotal,
+    concordance: points,
+    severityDistance: distance,
+    gate: null,
+  };
+
+  // Posting policy (per-finding part; the cap is applied in scoreFindings).
+  // The two forced checks come before any band check, so they post inline
+  // regardless of the computed band.
+  if (isSecurity) {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: true,
+      reason: 'security floor (never auto-suppressed)',
+      sortKey,
+      breakdown,
+    };
+  }
+  if (verifierSeverity === 'high') {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: true,
+      reason: 'high-severity always-post',
+      sortKey,
+      breakdown,
+    };
+  }
+  if (band === 'strong' || band === 'moderate') {
+    return {
+      score,
+      band,
+      disposition: 'inline',
+      forced: false,
+      reason: `default band (${band})`,
+      sortKey,
+      breakdown,
+    };
+  }
+  if (band === 'weak') {
+    return {
+      score,
+      band,
+      disposition: 'summary',
+      forced: false,
+      reason: 'weak band (lower-confidence summary, not inline)',
+      sortKey,
+      breakdown,
+    };
+  }
+  // Negligible band. Confidence incorporates drafter↔verifier severity agreement, so
+  // it is intentionally NOT monotone in verifier severity — a one-notch disagreement can
+  // nudge a borderline finding down a band. To prevent that from ever *silently dropping*
+  // a finding the verifier still rates medium-or-worse, a medium-severity negligible
+  // finding is kept visible in the lower-confidence summary. (High is already force-posted
+  // above; only low-severity negligible findings are dropped as noise.)
+  if (verifierSeverity === 'medium') {
+    return {
+      score,
+      band,
+      disposition: 'summary',
+      forced: false,
+      reason: 'medium-severity visibility floor (kept in summary despite negligible confidence)',
+      sortKey,
+      breakdown,
+    };
+  }
+  return {
+    score,
+    band,
+    disposition: 'drop',
+    forced: false,
+    reason: 'negligible band (low severity)',
+    sortKey,
+    breakdown,
+  };
+}
+
+/**
+ * Score a batch of findings and produce the final grouped report, applying the
+ * cross-finding comment cap: non-forced inline comments are limited to
+ * `commentCap`, keeping the highest-confidence ones; the overflow is demoted to
+ * the summary list. Forced comments (security / high-severity) are exempt and
+ * never displaced.
+ *
+ * @param findings Finding records to score; each one is passed to scoreFinding,
+ *   so an invalid enum field in any record throws.
+ * @param options Optional settings. `commentCap` defaults to COMMENT_CAP; a
+ *   negative value is treated as 0, i.e. every non-forced inline finding is
+ *   demoted to the summary list.
+ * @returns All scored findings in input order, plus one list per final
+ *   disposition, each sorted by descending sort key.
+ */
+export function scoreFindings(
+  findings: FindingInput[],
+  options: ScoreFindingsOptions = {},
+): ConfidenceReport {
+  // Clamp at 0: Array.prototype.slice treats a negative start as an offset from
+  // the END, so a negative cap would demote only the last few candidates instead
+  // of all of them.
+  const commentCap = Math.max(0, options.commentCap ?? COMMENT_CAP);
+  const scored: ScoredFinding[] = findings.map((input) => ({ input, result: scoreFinding(input) }));
+
+  // Identify non-forced inline candidates and demote everything past the cap.
+  // `filter` returns a fresh array, so sorting it does not reorder `scored`.
+  const nonForcedInline = scored
+    .filter((s) => s.result.disposition === 'inline' && !s.result.forced)
+    .sort((a, b) => b.result.sortKey - a.result.sortKey);
+
+  const demoted = new Set(nonForcedInline.slice(commentCap));
+  for (const s of demoted) {
+    // Score, band, sort key and breakdown are kept; only the routing changes.
+    s.result = {
+      ...s.result,
+      disposition: 'summary',
+      reason: `over comment cap (${commentCap}); moved to lower-confidence summary`,
+    };
+  }
+
+  // Group by the FINAL disposition (after cap demotion), highest sort key first.
+  const byDisposition = (d: Disposition): ScoredFinding[] =>
+    scored
+      .filter((s) => s.result.disposition === d)
+      .sort((a, b) => b.result.sortKey - a.result.sortKey);
+
+  return {
+    findings: scored,
+    inline: byDisposition('inline'),
+    summary: byDisposition('summary'),
+    audit: byDisposition('audit'),
+    dropped: byDisposition('drop'),
+  };
+}
diff --git a/tsup.config.ts b/tsup.config.ts
index 3bb1665..3fe6d7f 100644
--- a/tsup.config.ts
+++ b/tsup.config.ts
@@ -31,6 +31,7 @@ const entry = {
   'mention-reply': src('mention-reply'),
   'migrate-consumer-refs': src('migrate-consumer-refs'),
   'post-mention-reply': src('post-mention-reply'),
+  'score-confidence': src('score-confidence'),
   'score-risk': src('score-risk'),
   security: src('security'),
   'signed-commit': src('signed-commit'),
From 85b15174761b8b57bc35d56bc5ccb6ee073a6973 Mon Sep 17 00:00:00 2001
From: Sayt-0
Date: Wed, 24 Jun 2026 23:07:06 +0200
Subject: [PATCH 2/2] fix(score-confidence): write CLI output to stdout by default

CodeQL js/insecure-temporary-file: the CLI defaulted its output to a hardcoded /tmp path. Default to stdout instead (composable, no fixed temp file) and write to a file only when the caller passes an explicit output path.
--- src/score-confidence/index.ts | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/score-confidence/index.ts b/src/score-confidence/index.ts index 1148447..b06aa3f 100644 --- a/src/score-confidence/index.ts +++ b/src/score-confidence/index.ts @@ -9,8 +9,10 @@ * * findingsPath Path to a JSON file holding an array of merged finding records * (drafter hypothesis + verifier verdict). Read-only. - * outputPath Where to write the confidence report JSON - * (default: /tmp/finding_confidence.json). + * outputPath Optional. When given, the confidence report JSON is written to + * this caller-controlled path; otherwise it is written to stdout + * (the default — keeps the tool composable and avoids writing to a + * fixed temp location). * * Each input record uses the agent's snake_case field names: * { @@ -36,8 +38,6 @@ import { readFileSync, writeFileSync } from 'node:fs'; import { type FindingInput, scoreFindings } from './score-confidence.js'; -const DEFAULT_OUTPUT_PATH = '/tmp/finding_confidence.json'; - /** Map one snake_case input record to the camelCase {@link FindingInput} shape. */ function parseRecord(raw: Record<string, unknown>, index: number): FindingInput { const get = (key: string): unknown => raw[key]; @@ -63,7 +63,7 @@ function parseRecord(raw: Record<string, unknown>, index: number): FindingInput } function main(): void { - const [, , findingsPath, outputPath = DEFAULT_OUTPUT_PATH] = process.argv; + const [, , findingsPath, outputPath] = process.argv; if (!findingsPath) { process.stderr.write('Usage: score-confidence <findingsPath> [outputPath]\n'); @@ -98,7 +98,14 @@ function main(): void { dropped: project(report.dropped), }; - writeFileSync(outputPath, JSON.stringify(output), 'utf-8'); + const json = JSON.stringify(output); + // Default to stdout (composable, no fixed temp path); write to a file only when + // the caller supplies an explicit, caller-controlled output path. 
+ if (outputPath) { + writeFileSync(outputPath, json, 'utf-8'); + } else { + process.stdout.write(`${json}\n`); + } } try {