From 5f6af0b3fdfa91d25be063588adff4ac949e892e Mon Sep 17 00:00:00 2001 From: T Date: Mon, 8 Jun 2026 20:20:11 -0500 Subject: [PATCH 1/3] fix(ingestion,cli): make a broken parser fail loud, not silently produce a symbol-free graph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The WASM-resolver bug (PR #201) shipped invisibly for ~5 days because a globally-broken parser degraded silently: every file's parse returned empty captures, `analyze` built a File/Directory-only skeleton graph, printed a node count, and exited 0. Nothing in the pipeline distinguished "parser is dead" from "repo legitimately has no symbols." This hardens that whole failure class plus the adjacent gaps the investigation surfaced. #1 Zero-symbol guard (run-level backstop). parse.ts now exposes treeSitterFileCount + treeSitterSymbolCount on ParseOutput; the orchestrator trips `zeroSymbolGuardTripped` (exported pure predicate shouldTripZeroSymbolGuard: >=5 tree-sitter files AND 0 symbols), pushes a loud warning, and analyze maps it to a distinct advisory exit code 3. Configs-only / cobol-only / unsupported-language repos report 0 tree-sitter files and never trip; CodeElement stubs are excluded so an import-only repo can't mask a break. #2 Distinguish global parser death from per-file failure. New WasmRuntimeUnavailableError (name set so it survives Piscina's structured clone). ensureWasmRuntime now THROWS it when the vendored grammar dir is missing / web-tree-sitter.wasm won't init (a deployment breakage), instead of the old ambiguous soft `undefined`. openWasmParser + parseOne rethrow it; the parse phase aborts the run with an actionable message naming the missing dir. The soft "web-tree-sitter package absent" path stays a no-throw hand-off to #1. Per-file errors (syntax, timeout, one missing grammar) still warn+skip. #3 Resolver drift-guard test. Extended the asset-resolver drift guard to cover ALL 5 shipped asset trees (added vendor/wasms + java to plugin-assets/ ci-templates/config), so a future fixed-offset regression in any resolver fails a test instead of shipping silently. #4 tsup rm-before-copy. copyTree now rm's each leaf dest before cp, so an incremental/watch rebuild can't leave a renamed/deleted asset dir behind in dist/ (the stale opencodehub-* skill dirs I hit mid-rename). Safe against the nested-dest hazard — every onSuccess dest is a distinct leaf. #5 Verifier asserts a real symbol. verify-global-install.sh now requires `codehub query 'Greet'` to return a Function/Class/Method row from greeter.go (a uniquely-cased Go func), instead of "any hit on 'export default'" — which passed on the 0-symbol skeleton (the stderr header alone satisfied the old gate). Verified end-to-end against the built bundle: happy path → exit 0, 16 nodes / 24 edges; vendor/wasms hidden → run ABORTS (exit 1) with the named-dir message, no skeleton graph persisted (was: silent exit 0). Tests: ingestion 594/594, cli 316/0/11-platform-skip; typecheck + lint + banned-strings clean. --- packages/cli/src/asset-resolver.test.ts | 29 +++++++++ packages/cli/src/commands/analyze.ts | 18 ++++++ packages/cli/src/index.ts | 6 +- packages/cli/tsup.config.ts | 11 +++- .../ingestion/src/parse/parse-worker.test.ts | 20 +++++++ packages/ingestion/src/parse/parse-worker.ts | 11 +++- packages/ingestion/src/parse/vendor-wasms.ts | 2 +- .../ingestion/src/parse/wasm-runtime.test.ts | 52 ++++++++++++++++ packages/ingestion/src/parse/wasm-runtime.ts | 60 +++++++++++++++++-- .../src/pipeline/orchestrator.test.ts | 29 ++++++++- .../ingestion/src/pipeline/orchestrator.ts | 50 ++++++++++++++++ .../src/pipeline/phases/ownership.test.ts | 2 + .../ingestion/src/pipeline/phases/parse.ts | 36 +++++++++++ scripts/verify-global-install.sh | 28 ++++++--- 14 files changed, 337 insertions(+), 17 deletions(-) create mode 100644 packages/ingestion/src/parse/wasm-runtime.test.ts diff --git a/packages/cli/src/asset-resolver.test.ts b/packages/cli/src/asset-resolver.test.ts index f5b1cefe..419d7d9a 100644 --- a/packages/cli/src/asset-resolver.test.ts +++ b/packages/cli/src/asset-resolver.test.ts @@ -38,6 +38,11 @@ const CI_TEMPLATE_CANDIDATES = [ ["ci-templates"], ["src", "commands", "ci-templates"], ] as const; +// vendor/wasms is resolved by doctor.ts's inline walk-up (resolveVendorWasmsDir, +// accepts on manifest.json) and by @opencodehub/ingestion's vendor-wasms.ts. +const VENDOR_WASMS_CANDIDATES = [["vendor", "wasms"]] as const; +// java/cobol_to_scip.java is resolved by cobol-proleap-setup.ts's inline walk-up. +const JAVA_WRAPPER_CANDIDATE = [["java", "cobol_to_scip.java"]] as const; test("resolveAsset: flat post-collapse bundle — finds dist/plugin-assets as a sibling", async () => { const root = await mkTree(); @@ -209,4 +214,28 @@ test("drift guard: production resolver candidates hit real files in the built di join(dist, "config", "betterleaks.default.toml"), "betterleaks default config", ); + + // vendored grammar WASMs — doctor.ts's resolveVendorWasmsDir walks up probing + // vendor/wasms and accepts on manifest.json. Also the runtime parse path + // (ingestion vendor-wasms.ts) loads from here. A fixed-offset regression would + // resolve outside dist/ and this assert.equal would fail. + const wasms = resolveAsset(VENDOR_WASMS_CANDIDATES, { startDir: dist }); + assert.equal(wasms, join(dist, "vendor", "wasms"), "ingestion vendored grammars"); + assert.ok( + statSync(join(wasms as string, "manifest.json")).isFile(), + "vendor/wasms/manifest.json must ship (doctor + parse pipeline accept on it)", + ); + assert.ok( + statSync(join(wasms as string, "web-tree-sitter.wasm")).isFile(), + "vendor/wasms/web-tree-sitter.wasm runtime must ship", + ); + + // COBOL ProLeap JVM bridge source — cobol-proleap-setup.ts walks up probing + // java/cobol_to_scip.java. + const javaSrc = resolveAsset(JAVA_WRAPPER_CANDIDATE, { startDir: dist, kind: "file" }); + assert.equal( + javaSrc, + join(dist, "java", "cobol_to_scip.java"), + "cobol proleap wrapper java source", + ); }); diff --git a/packages/cli/src/commands/analyze.ts b/packages/cli/src/commands/analyze.ts index ad02ff3b..983adca1 100644 --- a/packages/cli/src/commands/analyze.ts +++ b/packages/cli/src/commands/analyze.ts @@ -168,6 +168,12 @@ export interface AnalyzeSummary { readonly durationMs: number; readonly upToDate: boolean; readonly warnings: readonly string[]; + /** + * Set when the parse phase produced zero code symbols from a non-trivial + * number of tree-sitter files (likely a globally-broken parser). The CLI maps + * this to a distinct advisory exit code so CI catches a silent-skeleton run. + */ + readonly zeroSymbolGuard?: boolean; } export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promise { @@ -309,6 +315,17 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi logWarnings(result.warnings, opts.verbose === true); + // Surface the zero-symbol guard prominently — the detailed message is already + // in result.warnings, but logWarnings collapses grouped warnings, so emit an + // explicit run-level banner that can't be missed (and that the exit-code + // mapping in index.ts keys off via the returned summary flag). + if (result.zeroSymbolGuardTripped === true) { + log( + "codehub analyze: WARNING — extracted 0 code symbols from a non-trivial source tree; " + + "the parser is likely broken (see the parse warning above). Run 'codehub doctor'.", + ); + } + // Persist to the composed graph + temporal store. Storage is always // graph.lbug (graph-tier) + temporal.duckdb sidecar (cochanges, summary // cache); the temporal-tier writes (`bulkLoadCochanges`, @@ -497,6 +514,7 @@ export async function runAnalyze(path: string, opts: AnalyzeOptions = {}): Promi durationMs, upToDate: false, warnings: result.warnings, + ...(result.zeroSymbolGuardTripped === true ? { zeroSymbolGuard: true } : {}), }; } diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 2c122d3a..c62c2893 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -149,7 +149,7 @@ program const embeddingsWorkers = parseWorkerCount(workersRaw); const embeddingsBatchSize = parsePositiveInt(opts["embeddingsBatchSize"]); - await mod.runAnalyze(path ?? process.cwd(), { + const analyzeSummary = await mod.runAnalyze(path ?? process.cwd(), { force: opts["force"] === true, embeddings: opts["embeddings"] === true, embeddingsVariant: opts["embeddingsInt8"] === true ? "int8" : "fp32", @@ -177,6 +177,10 @@ program strictDetectors: opts["strictDetectors"] === true, ...(allowBuildScripts !== undefined ? { allowBuildScripts } : {}), }); + // Advisory exit code 3: analyze built a graph but extracted zero code + // symbols (likely a broken parser). Distinct from the generic failure + // exit 1 so CI can detect a silent-skeleton run without parsing logs. + if (analyzeSummary.zeroSymbolGuard === true) process.exitCode = 3; }); program diff --git a/packages/cli/tsup.config.ts b/packages/cli/tsup.config.ts index 56f114fb..61d699bb 100644 --- a/packages/cli/tsup.config.ts +++ b/packages/cli/tsup.config.ts @@ -1,4 +1,4 @@ -import { cp, mkdir } from "node:fs/promises"; +import { cp, mkdir, rm } from "node:fs/promises"; import { dirname, join } from "node:path"; import { fileURLToPath } from "node:url"; import { defineConfig } from "tsup"; @@ -59,6 +59,15 @@ const distDir = join(here, "dist"); const EXTERNAL = [/^[^.]/]; async function copyTree(from: string, to: string): Promise { + // rm the leaf dest FIRST so a rename/delete in the source tree doesn't leave + // a stale entry behind in `dist/`. `cp(force:true)` overwrites files present + // in both trees but never removes dest-only entries, so without this an + // incremental/watch rebuild accumulates renamed dirs (e.g. a renamed skill + // left both old and new names under dist/plugin-assets/skills/). `clean:true` + // only wipes dist on a full build, not in --watch. rm the exact leaf `to` + // (never `dirname(to)`) — every onSuccess dest is a distinct non-nested leaf, + // and plugin-assets is copied as one tree, so this can't clobber a sibling. + await rm(to, { recursive: true, force: true }); await mkdir(dirname(to), { recursive: true }); await cp(from, to, { recursive: true, force: true, errorOnExist: false }); } diff --git a/packages/ingestion/src/parse/parse-worker.test.ts b/packages/ingestion/src/parse/parse-worker.test.ts index b437ab18..38b8d7d8 100644 --- a/packages/ingestion/src/parse/parse-worker.test.ts +++ b/packages/ingestion/src/parse/parse-worker.test.ts @@ -4,6 +4,7 @@ import { describe, it } from "node:test"; import type { LanguageId } from "@opencodehub/core-types"; import { MAX_FILE_BYTES, parseOne } from "./parse-worker.js"; import type { ParseCapture, ParseTask } from "./types.js"; +import { WasmRuntimeUnavailableError } from "./wasm-runtime.js"; function task(content: Buffer, language: LanguageId = "typescript"): ParseTask { return { filePath: "src/sample.ts", content, language }; @@ -75,6 +76,25 @@ describe("parseOne — error to warning mapping", () => { assert.equal(result.warnings?.[0], "plain string failure"); }); + + it("RETHROWS a global WasmRuntimeUnavailableError instead of mapping it to a warning", async () => { + // A global runtime death is broken-for-every-file; it must abort the run, + // not become a per-file warning that hides behind a 0-symbol skeleton graph. + await assert.rejects( + parseOne(task(Buffer.from("x")), async () => { + throw new WasmRuntimeUnavailableError("vendor/wasms missing"); + }), + /vendor\/wasms missing/, + ); + }); + + it("keeps an ordinary per-file Error as a warning (not the global sentinel)", async () => { + const result = await parseOne(task(Buffer.from("x")), () => { + throw new Error("one bad file"); + }); + assert.deepEqual(result.captures, []); + assert.equal(result.warnings?.[0], "one bad file"); + }); }); describe("parseOne — happy path", () => { diff --git a/packages/ingestion/src/parse/parse-worker.ts b/packages/ingestion/src/parse/parse-worker.ts index b69689fa..e0a6abe9 100644 --- a/packages/ingestion/src/parse/parse-worker.ts +++ b/packages/ingestion/src/parse/parse-worker.ts @@ -29,7 +29,11 @@ import { Buffer } from "node:buffer"; import { performance } from "node:perf_hooks"; import type { LanguageId, ParseBatch, ParseCapture, ParseResult, ParseTask } from "./types.js"; import { getUnifiedQuery } from "./unified-queries.js"; -import { openWasmParser, type WasmParserHandle } from "./wasm-runtime.js"; +import { + openWasmParser, + type WasmParserHandle, + WasmRuntimeUnavailableError, +} from "./wasm-runtime.js"; const PER_FILE_TIMEOUT_MS = 30_000; /** Pre-parse byte cap. Inputs above this are skipped before any WASM call. */ @@ -101,6 +105,11 @@ export async function parseOne( ...(warnings.length > 0 ? { warnings } : {}), }; } catch (err) { + // A global runtime failure means the parser is dead for every file — let it + // propagate so the batch (and the run) abort loudly, instead of recording N + // identical per-file warnings and a silently symbol-free graph. Per-file + // errors (syntax, timeout, one bad grammar) stay warnings and skip the file. + if (err instanceof WasmRuntimeUnavailableError) throw err; const message = err instanceof Error ? err.message : String(err); warnings.push(message); return { diff --git a/packages/ingestion/src/parse/vendor-wasms.ts b/packages/ingestion/src/parse/vendor-wasms.ts index fa4bedf5..c19a07fa 100644 --- a/packages/ingestion/src/parse/vendor-wasms.ts +++ b/packages/ingestion/src/parse/vendor-wasms.ts @@ -31,7 +31,7 @@ import { statSync } from "node:fs"; import { dirname, resolve } from "node:path"; import { fileURLToPath } from "node:url"; -function isDirSync(p: string): boolean { +export function isDirSync(p: string): boolean { try { return statSync(p).isDirectory(); } catch { diff --git a/packages/ingestion/src/parse/wasm-runtime.test.ts b/packages/ingestion/src/parse/wasm-runtime.test.ts new file mode 100644 index 00000000..21459667 --- /dev/null +++ b/packages/ingestion/src/parse/wasm-runtime.test.ts @@ -0,0 +1,52 @@ +/** + * Unit tests for the WASM-runtime global-failure detection. + * + * The load-bearing behavior: a globally-broken parse runtime (vendored + * `vendor/wasms/` missing, or web-tree-sitter.wasm un-loadable) must surface as + * a thrown {@link WasmRuntimeUnavailableError} — NOT a soft per-file skip that + * lets `analyze` complete with a symbol-free skeleton graph and exit 0. + * + * `VENDOR_WASMS_DIR` is computed once at module load, so the global-detection + * logic is split into the pure `assertRuntimeAvailable(dirExists, dir)` seam, + * which we test directly without standing up Emscripten or mutating the const. + */ + +import { strict as assert } from "node:assert"; +import { describe, it } from "node:test"; +import { + assertRuntimeAvailable, + vendorWasmsDirExists, + WasmRuntimeUnavailableError, +} from "./wasm-runtime.js"; + +describe("assertRuntimeAvailable — global runtime probe", () => { + it("throws WasmRuntimeUnavailableError naming the dir when vendored grammars are missing", () => { + assert.throws( + () => assertRuntimeAvailable(false, "/nope/vendor/wasms"), + (err: unknown) => { + assert.ok(err instanceof WasmRuntimeUnavailableError); + assert.equal((err as Error).name, "WasmRuntimeUnavailableError"); + assert.match((err as Error).message, /\/nope\/vendor\/wasms/); + return true; + }, + ); + }); + + it("does not throw when the vendored directory exists", () => { + assert.doesNotThrow(() => assertRuntimeAvailable(true, "/anywhere")); + }); + + it("sets a name that survives structured-clone across the worker boundary", () => { + // Piscina loses the subclass prototype across threads; the main thread must + // be able to match on `name`. Constructing fresh confirms the name is set. + const err = new WasmRuntimeUnavailableError("boom"); + assert.equal(err.name, "WasmRuntimeUnavailableError"); + }); + + it("reports the real vendored dir as present in this (built) package", () => { + // Sanity: the standalone ingestion build ships vendor/wasms, so the probe + // is true here. This also guards the resolver — a regression that pointed + // VENDOR_WASMS_DIR outside the package would flip this to false. + assert.equal(vendorWasmsDirExists(), true); + }); +}); diff --git a/packages/ingestion/src/parse/wasm-runtime.ts b/packages/ingestion/src/parse/wasm-runtime.ts index 4a864800..3308f1ff 100644 --- a/packages/ingestion/src/parse/wasm-runtime.ts +++ b/packages/ingestion/src/parse/wasm-runtime.ts @@ -17,10 +17,50 @@ import { createRequire } from "node:module"; import path from "node:path"; import type { LanguageId } from "./types.js"; -import { resolveVendorWasmsDir } from "./vendor-wasms.js"; +import { isDirSync, resolveVendorWasmsDir } from "./vendor-wasms.js"; const requireFn = createRequire(import.meta.url); +/** + * Thrown when the WASM parse runtime cannot be brought up at all — the + * vendored `vendor/wasms/` directory is missing or `web-tree-sitter.wasm` + * fails to initialize. This is a GLOBAL, deployment-level failure (a broken + * install / vendoring regression), distinct from a per-file parse error or a + * single missing grammar. Callers let it propagate so a run aborts loudly + * instead of silently producing a symbol-free graph. + * + * `name` is set explicitly so the identity survives Piscina's structured-clone + * across the worker boundary (the subclass prototype does not) — match on + * `err.name === "WasmRuntimeUnavailableError"` on the main thread. + */ +export class WasmRuntimeUnavailableError extends Error { + constructor(message: string) { + super(message); + this.name = "WasmRuntimeUnavailableError"; + } +} + +/** Whether the vendored `vendor/wasms/` directory exists in this deployment. */ +export function vendorWasmsDirExists(): boolean { + return isDirSync(VENDOR_WASMS_DIR); +} + +/** + * Guard that the WASM runtime's vendored assets are present. Throws + * {@link WasmRuntimeUnavailableError} (naming `dir`) when they are not. Split + * out as a pure function so the global-failure detection is unit-testable + * without standing up Emscripten or mutating the module-load-time + * `VENDOR_WASMS_DIR` const. + */ +export function assertRuntimeAvailable(dirExists: boolean, dir: string): void { + if (!dirExists) { + throw new WasmRuntimeUnavailableError( + `web-tree-sitter runtime failed to initialize; vendored grammar directory not found at ${dir} ` + + `(reinstall @opencodehub/cli, or re-vendor with scripts/build-vendor-wasms.sh in a source checkout)`, + ); + } +} + // Resolve `vendor/wasms/` regardless of the emitted layout — the standalone // ingestion build (`dist/parse/`), the flat `@opencodehub/cli` bundle // (`dist/` root, no `parse/` subdir), the test build, or a source checkout. @@ -173,7 +213,11 @@ export async function openWasmParser(lang: LanguageId): Promise { } catch { return undefined; } + // The package is installed, so a failure from here on is a GLOBAL deployment + // breakage (missing vendored runtime), not the soft "not installed" case — + // throw so the run aborts loudly rather than degrading to a 0-symbol graph. + assertRuntimeAvailable(vendorWasmsDirExists(), VENDOR_WASMS_DIR); try { if (typeof mod.Parser.init === "function") { const runtimeWasm = path.resolve(VENDOR_WASMS_DIR, "web-tree-sitter.wasm"); @@ -207,8 +255,12 @@ export async function ensureWasmRuntime(): Promise { locateFile: () => runtimeWasm, }); } - } catch { - return undefined; + } catch (cause) { + const detail = cause instanceof Error ? cause.message : String(cause); + throw new WasmRuntimeUnavailableError( + `web-tree-sitter runtime failed to initialize; web-tree-sitter.wasm not loadable from ${VENDOR_WASMS_DIR} ` + + `(reinstall @opencodehub/cli, or re-vendor with scripts/build-vendor-wasms.sh in a source checkout): ${detail}`, + ); } wasmRuntime = { Parser: mod.Parser, diff --git a/packages/ingestion/src/pipeline/orchestrator.test.ts b/packages/ingestion/src/pipeline/orchestrator.test.ts index 8431f61a..8b8d9e67 100644 --- a/packages/ingestion/src/pipeline/orchestrator.test.ts +++ b/packages/ingestion/src/pipeline/orchestrator.test.ts @@ -4,7 +4,7 @@ import { mkdtemp, rm } from "node:fs/promises"; import { tmpdir } from "node:os"; import path from "node:path"; import { after, before, describe, it } from "node:test"; -import { runIngestion } from "./orchestrator.js"; +import { runIngestion, shouldTripZeroSymbolGuard } from "./orchestrator.js"; import type { PipelineContext, PipelineOptions, PipelinePhase, PreviousGraph } from "./types.js"; describe("runIngestion (end-to-end)", () => { @@ -368,4 +368,31 @@ describe("runIngestion (determinism with communities + processes)", () => { assert.ok(hasCommunity, "Community node missing"); assert.ok(hasProcess, "Process node missing"); }); + + it("does NOT trip the zero-symbol guard on a healthy repo with real symbols", async () => { + const result = await runIngestion(repo, { skipGit: true }); + assert.notEqual( + result.zeroSymbolGuardTripped, + true, + "guard must stay quiet when symbols extracted", + ); + const fnCount = [...result.graph.nodes()].filter((n) => n.kind === "Function").length; + assert.ok(fnCount >= 1, "the TS fixture must yield at least one Function node"); + }); +}); + +describe("shouldTripZeroSymbolGuard", () => { + it("trips only when enough tree-sitter files yielded zero symbols", () => { + // A globally-broken parser: many files, zero symbols. + assert.equal(shouldTripZeroSymbolGuard(100, 0), true); + assert.equal(shouldTripZeroSymbolGuard(5, 0), true); + // Below the floor — a tiny repo is too small to distinguish broken-vs-sparse. + assert.equal(shouldTripZeroSymbolGuard(4, 0), false); + // Healthy — symbols were extracted. + assert.equal(shouldTripZeroSymbolGuard(5, 3), false); + assert.equal(shouldTripZeroSymbolGuard(100, 1), false); + // Legitimately-empty (configs-only / cobol-only / unsupported) — no + // tree-sitter files at all, so the guard never fires. + assert.equal(shouldTripZeroSymbolGuard(0, 0), false); + }); }); diff --git a/packages/ingestion/src/pipeline/orchestrator.ts b/packages/ingestion/src/pipeline/orchestrator.ts index aee35fe5..2679142f 100644 --- a/packages/ingestion/src/pipeline/orchestrator.ts +++ b/packages/ingestion/src/pipeline/orchestrator.ts @@ -96,6 +96,36 @@ export interface RunPipelineResult { * runs it (the phase internally short-circuits when gated off). */ readonly summarize?: SummarizePhaseOutput; + /** + * Set when the parse phase routed a non-trivial number of files to + * tree-sitter grammars but extracted ZERO code symbols — the signature of a + * globally-broken parser (e.g. a WASM grammar/resolver regression) that would + * otherwise complete "successfully" with a File/Directory-only skeleton + * graph. A run-level backstop to the per-file warnings (which collapse and + * never affect the verdict). See {@link shouldTripZeroSymbolGuard}. + */ + readonly zeroSymbolGuardTripped?: boolean; +} + +/** + * Below this many tree-sitter files, a real-but-tiny repo is too small to + * confidently distinguish "broken parser" from "genuinely few symbols", so the + * guard stays quiet. The failure mode it targets is a GLOBAL break across a + * real codebase, where the file count is comfortably above this floor. + */ +const ZERO_SYMBOL_MIN_FILES = 5; + +/** + * Predicate for the zero-symbol guard: trips when the repo presented at least + * {@link ZERO_SYMBOL_MIN_FILES} tree-sitter-parseable files yet produced zero + * code symbols. Pure + exported so the threshold logic is unit-testable + * without standing up the parser. + */ +export function shouldTripZeroSymbolGuard( + treeSitterFileCount: number, + treeSitterSymbolCount: number, +): boolean { + return treeSitterFileCount >= ZERO_SYMBOL_MIN_FILES && treeSitterSymbolCount === 0; } export interface RunIngestionOptions extends PipelineOptions { @@ -225,6 +255,25 @@ export async function runIngestion( if (phaseMemDebug) { process.stderr.write(`[phase-telemetry] graphHash-end dur=${Date.now() - hashStart}ms\n`); } + + // Run-level backstop: a globally-broken parser yields empty captures for + // every file, so extraction produces a File/Directory-only skeleton and the + // run "succeeds". Per-file warnings collapse and never affect the verdict — + // so aggregate here and surface a loud, greppable warning. (The hard-runtime + // case aborts earlier via WasmRuntimeUnavailableError; this catches the + // residual soft case, e.g. web-tree-sitter genuinely absent.) + const zeroSymbolGuardTripped = + parse !== undefined && + shouldTripZeroSymbolGuard(parse.treeSitterFileCount, parse.treeSitterSymbolCount); + if (zeroSymbolGuardTripped && parse !== undefined) { + warnings.push( + `parse: extracted 0 code symbols from ${parse.treeSitterFileCount} tree-sitter source file(s) ` + + `(expected Function/Class/Method/etc). The parser is likely globally broken — the graph holds ` + + `only File/Directory nodes. Re-run with --verbose for per-file parse warnings, or run ` + + `'codehub doctor' to check the vendored grammars.`, + ); + } + return { graph, graphHash: hashed, @@ -251,6 +300,7 @@ export async function runIngestion( ...(scan !== undefined ? { scan } : {}), ...(cochange !== undefined ? { cochange } : {}), ...(summarize !== undefined ? { summarize } : {}), + ...(zeroSymbolGuardTripped ? { zeroSymbolGuardTripped: true } : {}), }; } diff --git a/packages/ingestion/src/pipeline/phases/ownership.test.ts b/packages/ingestion/src/pipeline/phases/ownership.test.ts index e8371ab1..55469e54 100644 --- a/packages/ingestion/src/pipeline/phases/ownership.test.ts +++ b/packages/ingestion/src/pipeline/phases/ownership.test.ts @@ -73,6 +73,8 @@ function buildParseOutput(defs: ReadonlyMap n + d.length, 0), }; } diff --git a/packages/ingestion/src/pipeline/phases/parse.ts b/packages/ingestion/src/pipeline/phases/parse.ts index e0b3ae6e..5ab88b6b 100644 --- a/packages/ingestion/src/pipeline/phases/parse.ts +++ b/packages/ingestion/src/pipeline/phases/parse.ts @@ -97,6 +97,22 @@ export interface ParseOutput { readonly cacheHits: number; /** Number of files that bypassed the cache and were parsed fresh. */ readonly cacheMisses: number; + /** + * Number of files routed to a tree-sitter grammar (i.e. `parseCandidates`, + * which excludes cobol — it goes through the regex provider). This is the + * "should have produced symbols" denominator for the zero-symbol guard; a + * configs-only / cobol-only / unsupported-language repo reports 0 here and + * so never trips the guard. + */ + readonly treeSitterFileCount: number; + /** + * Total extracted definitions (Function/Class/Method/etc) from tree-sitter + * files. Counts only `definitionsByFile` entries — NOT cobol CodeElements or + * external import stubs (those are emitted directly to the graph and would + * mask a broken parser). 0 with a non-trivial {@link treeSitterFileCount} + * means symbol extraction globally failed. + */ + readonly treeSitterSymbolCount: number; } export const PARSE_PHASE_NAME = "parse"; @@ -218,6 +234,15 @@ async function runParse( const pool = new ParsePool({ maxThreads: poolMaxThreads() }); try { parseResults = await pool.dispatch(tasks); + } catch (err) { + // A global WASM-runtime failure (missing vendored grammars) rejects the + // dispatch — abort the parse phase loudly. Piscina serializes the worker + // error as a plain Error, so match on `name`, not `instanceof`. Re-throw + // with a phase prefix; any other dispatch error propagates unchanged. + if (err instanceof Error && err.name === "WasmRuntimeUnavailableError") { + throw new Error(`parse: WASM runtime unavailable — ${err.message}`); + } + throw err; } finally { await pool.destroy(); } @@ -691,6 +716,15 @@ async function runParse( } } + // Sum definitions extracted from tree-sitter files only (keyed by relPath of + // a parseCandidate). Excludes cobol CodeElements + external import stubs, + // which are added straight to the graph and would mask a globally-broken + // parser. 0 here with a non-trivial treeSitterFileCount = symbols failed. + let treeSitterSymbolCount = 0; + for (const f of parseCandidates) { + treeSitterSymbolCount += definitionsByFile.get(f.relPath)?.length ?? 0; + } + return { definitionsByFile, callsByFile, @@ -704,6 +738,8 @@ async function runParse( fileCount: parseCandidates.length + cobolCandidates.length, cacheHits: hits.length, cacheMisses: missFiles.length, + treeSitterFileCount: parseCandidates.length, + treeSitterSymbolCount, }; } diff --git a/scripts/verify-global-install.sh b/scripts/verify-global-install.sh index e24330c6..060f05fb 100755 --- a/scripts/verify-global-install.sh +++ b/scripts/verify-global-install.sh @@ -333,16 +333,28 @@ else rm -f "$ANALYZE_LOG" fi - # ------------------------------------------------------------------ smoke: codehub query 'export default' - # The query phase exits 0 even on zero hits, so the gate is "1+ hits". + # ------------------------------------------------------------------ smoke: known fixture symbol parses + # Assert the analyzer extracted a REAL symbol, not just that BM25 returned + # some text. `Greet` is the top-level Go func in greeter.go — uniquely cased + # (lowercase `greet` lives in greeter.ts AND greeter.py; `Greeting` is in all + # three), never an external/dependency ref, and a no-receiver Go func is + # always kind Function. We require a printed table ROW whose KIND is + # Function/Class/Method on the same line as the greeter.go FILE column. This + # FAILS on a 0-symbol skeleton graph, where `query` prints only the stderr + # header and zero rows — the exact regression the old 'export default' / + # "1+ hits" gate let through (the header alone satisfied "non-empty"). if [ -d "$FIXTURE_DIR" ]; then - QUERY_OUT=$(cd "$FIXTURE_DIR" && codehub query 'export default' 2>&1 || true) - if printf '%s' "$QUERY_OUT" | grep -qiE 'no results|0 results|0 hits|no matches'; then - fail "smoke: codehub query 'export default' returned no hits" - elif [ -n "$QUERY_OUT" ]; then - pass "smoke: codehub query 'export default' returned at least one hit" + QUERY_OUT=$(cd "$FIXTURE_DIR" && codehub query 'Greet' 2>&1 || true) + # Columns are padded with two spaces (SCORE KIND NAME FILE SOURCES); a line + # carrying both a real KIND token and greeter.go proves symbol extraction. + # Anchored on the KIND token so the stderr header line (which contains + # "Greet" but no KIND column and no greeter.go) cannot match. + if printf '%s\n' "$QUERY_OUT" | grep -qE '(^|[[:space:]])(Function|Class|Method)[[:space:]].*greeter\.go'; then + pass "smoke: codehub query 'Greet' returned a real symbol row (Function/Class/Method) from greeter.go" else - fail "smoke: codehub query 'export default' returned empty output" + fail "smoke: codehub query 'Greet' did not return a Function/Class/Method row from greeter.go — parser produced no real symbols" + note "query output:" + printf '%s\n' "$QUERY_OUT" | head -20 | sed 's/^/ /' >&2 || true fi fi fi From 8a5f890ca9d30d508f948e954fccafac42e1c0b2 Mon Sep 17 00:00:00 2001 From: T Date: Mon, 8 Jun 2026 20:27:12 -0500 Subject: [PATCH 2/3] docs(repo): record silent-degradation run-level-guard lesson --- .erpaval/INDEX.md | 1 + ...ent-degradation-needs-a-run-level-guard.md | 105 ++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 .erpaval/solutions/best-practices/silent-degradation-needs-a-run-level-guard.md diff --git a/.erpaval/INDEX.md b/.erpaval/INDEX.md index fe301b0c..5a05b0f6 100644 --- a/.erpaval/INDEX.md +++ b/.erpaval/INDEX.md @@ -33,6 +33,7 @@ development sessions. Solutions are reusable; specs are per-feature. - [Replace raw-SQL escape hatches with typed finders on the storage interface](solutions/architecture-patterns/typed-finders-replace-raw-sql-in-consumers.md) — 108 raw-SQL sites collapse into 15 named finders. Adapters internalize dialect; consumers stay backend-agnostic. Liskov-clean parity harness via public-method rebuilder. - [Parallel Act subagents on a shared git tree — interleaving + cherry-pick discipline](solutions/best-practices/parallel-act-subagents-with-shared-git-tree.md) — verify branch state, spawn on non-overlapping packages, watch for stale dist + phantom test counts, watch the test-fixup tail. - [Fixed-offset asset resolvers break on bundle collapse](solutions/best-practices/fixed-offset-asset-resolvers-break-on-bundle-collapse.md) — after a tsup collapse flattens `dist/commands/x.js`→`dist/x-.js`, EVERY `import.meta.url` resolver with a fixed `..` offset shifts by a level. #189 fixed only doctor.ts; 6 others shipped broken ~5 days (init/ci-init/setup/betterleaks + 2 ingestion WASM resolvers), two SILENTLY (`analyze` → 0 symbols, exit 0). Convert all to walk-up probes; the CLI bundles deps from `dist/` so rebuild in dep order; verify from a packed-tarball global install, not the hot node_modules. +- [Silent degradation needs a run-level guard](solutions/best-practices/silent-degradation-needs-a-run-level-guard.md) — a `catch→warn→continue` loop hides a GLOBAL stage failure: every item fails identically, the run exits 0 with empty output. Fix is a PAIR — (a) a thrown sentinel distinguishing global-init failure from per-item failure (set `.name`, Piscina drops the prototype across threads), (b) a run-level "processed N but produced 0 expected outputs" guard with a distinct exit code. Pick the denominator carefully (exclude regex-provider/placeholder output) and make smoke tests assert a KNOWN symbol, not "any non-empty result". - [Squash-merge masks pre-existing repo-wide debt](solutions/best-practices/squash-merge-masks-pre-existing-debt.md) — first action on a fresh branch from main is `mise run check` BEFORE starting work; lint rules / transitive deps / cross-package test assertions drift across squash boundaries even when per-commit gating was green inside the prior PR. - [No spec-coordinate leakage into source](solutions/best-practices/no-spec-coordinate-leakage-into-source.md) — ERPAVal `AC-*`, `M-*`, `W-*`, `CL-*` prefixes belong in commits, PR bodies, ADR refs sections — NOT in JSDoc, inline comments, CLI flag help, MCP tool descriptions, or test names. Sweep `rg -n "AC-[A-Z]-[0-9]" packages/` before every PR-open; LLM clients pick up the leakage and start citing it back. - [release: published events need PAT or inline](solutions/conventions/release-published-event-needs-pat-or-inline.md) — release-please-action with default `GITHUB_TOKEN` does NOT fire downstream `release: [published]` workflows; inline asset-attach in `release-please.yml` gated on `steps.release.outputs.release_created`. Fixed AC-D-4; sbom.yml has same latent bug for follow-on. diff --git a/.erpaval/solutions/best-practices/silent-degradation-needs-a-run-level-guard.md b/.erpaval/solutions/best-practices/silent-degradation-needs-a-run-level-guard.md new file mode 100644 index 00000000..5c9406fc --- /dev/null +++ b/.erpaval/solutions/best-practices/silent-degradation-needs-a-run-level-guard.md @@ -0,0 +1,105 @@ +--- +name: silent-degradation-needs-a-run-level-guard +description: A pipeline stage that catches per-item errors and continues (returns empty, logs a per-item warning) will hide a GLOBAL failure of that stage — every item fails identically, the run completes, prints a count, and exits 0. Per-item warnings collapse and never become a verdict. The fix is a pair — (a) at the failure site, distinguish a global/init failure from a per-item failure and THROW on global; (b) a run-level aggregate guard ("processed N items but produced 0 of the expected output") that maps to a distinct exit code. Smoke tests must assert real output, not "any non-empty result". +metadata: + type: bug + category: best-practices +tags: [analyze, parse, wasm, silent-failure, exit-code, error-handling, piscina, smoke-test, guard] +discovered: 2026-06-08 +session: session-analyze-hardening +related: + - fixed-offset-asset-resolvers-break-on-bundle-collapse + - doctor-probe-drift-after-rip-and-replace + - kill-escalation-races-the-exit-handler +--- + +# Silent degradation needs a run-level guard, not just per-item warnings + +## What bit us + +The fixed-offset WASM-resolver bug ([[fixed-offset-asset-resolvers-break-on-bundle-collapse]]) +shipped invisibly for ~5 days. The resolver bug was the *cause*; the reason it +was *invisible* is a separate, more general defect worth its own lesson: + +`codehub analyze` parses every source file in a Piscina worker. The worker's +per-file `try/catch` (parse-worker.ts) mapped **any** thrown error to +`{captures: [], warnings: [msg]}` and continued. So when the grammar runtime was +globally dead (vendored `vendor/wasms/` unresolvable in the flat bundle), EVERY +file took the catch, produced zero captures, and the pipeline built a +File/Directory-only **skeleton graph** — then printed `"5 nodes"` and **exited +0**. The per-file warnings existed but `logWarnings` collapsed them into +`"parse: N warnings (use --verbose)"`, and nothing aggregated them into a +run-level verdict or an exit code. A broken parser looked exactly like a healthy +parse of a symbol-free repo. + +## The shape of the bug class + +Any stage with this shape is vulnerable: + +``` +for (const item of items) { + try { results.push(await process(item)); } + catch (e) { warnings.push(e.message); results.push(EMPTY); } // continue +} +return results; // a GLOBAL failure = N identical EMPTY results = silent success +``` + +The per-item resilience (good — one malformed file shouldn't fail the run) is +exactly what masks a global failure (bad — a dead runtime should fail the run). + +## The fix is a PAIR, not one change + +You need both halves; either alone is insufficient. + +**(a) At the failure site, distinguish global from per-item and THROW on global.** +Introduce a sentinel error (`WasmRuntimeUnavailableError`) for the +init/deployment-level failure (vendored dir missing, runtime won't init). Throw +it from the init path; rethrow it through the per-item catch *before* the +generic warning-mapping. A global failure then aborts the run loudly with an +actionable message, instead of becoming N warnings. Per-item errors (one bad +file) still warn-and-skip. + +**(b) A run-level aggregate guard as the backstop.** Even with (a), a softer +degradation can slip through (e.g. the runtime package is genuinely absent and +the init path returns a soft `undefined` by design). So add a run-level +predicate: "the run processed >= K items that SHOULD have produced output, but +produced 0" → push a loud warning AND set a **distinct advisory exit code** +(here: 3, separate from generic-failure 1) so CI can detect a silent-skeleton +run without scraping logs. Make the predicate a pure exported function and +table-test the threshold. + +Keep the two orthogonal: (a) keys off the thrown error, (b) keys off the output +count. They must not double-fire on the same condition — (a) handles hard death, +(b) handles the residual soft case. + +## Gotchas that matter + +- **Piscina structured-clones the rejection across threads** — a custom + `class XError extends Error` arrives on the main thread as a plain `Error` + with the prototype lost. Set `this.name` in the constructor and match on + `err.name === "XError"` on the main thread (NOT `instanceof`). `instanceof` + is fine *inside* the worker, before serialization. +- **Pick the denominator carefully.** "Files scanned" is the wrong count if some + legitimately produce no output (here: cobol routes through a regex provider, + not tree-sitter; config-only/empty repos have zero parseable files). The guard + must key off "items that SHOULD have produced output" (tree-sitter files), or + it false-positives on legitimately-empty repos. And don't count placeholder + output (external import stubs are `CodeElement` nodes — counting them as + "symbols" would let an import-only repo mask a broken parser). +- **Smoke tests must assert REAL output, not "any non-empty result".** The + global-install verifier queried `'export default'` and passed on "≥1 hit" — + but the query command's stderr header (`query: "..." (0 results)`) is itself + non-empty, so the gate passed on a 0-symbol graph. Tighten to a KNOWN symbol + from a KNOWN fixture with the expected KIND (`Function`) and FILE — something + that can only appear if extraction actually ran. +- **A separate `doctor`-style health command does NOT protect the hot path.** + `doctor` had a vendored-wasms check, but it's a manual command never run by + `analyze`. Don't assume an existing health probe covers the pipeline. + +## The one-line takeaway + +Per-item error resilience hides global stage failure. Whenever you write +`catch → warn → continue` in a loop, ask "what does a GLOBAL failure of this +stage look like?" — and add (a) a thrown sentinel for the global case + (b) a +run-level "produced 0 of the expected output" guard with its own exit code. +Then make the smoke test assert real output, not a non-empty string. From 02c4a8821281ab3dab16423c6e9f1f080e69263e Mon Sep 17 00:00:00 2001 From: T Date: Tue, 9 Jun 2026 17:29:34 -0500 Subject: [PATCH 3/3] fix(ingestion): make the fail-loud parser contract symmetric + regression-proof MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up hardening on this PR's own contract, from a review pass. Three gaps, none a behavior regression today — but each let the contract silently rot: 1. `buildParserForLanguage` (complexity phase, main thread) is the one runtime-seam caller whose contract flipped from "return undefined" to "may throw WasmRuntimeUnavailableError" without being updated. Today the throw correctly propagates (no try/catch swallows it) and aborts the run — but only INCIDENTALLY. Document the invariant load-bearingly at the complexity seam ("do NOT wrap in a try/catch that folds the throw into the undefined skip path") and correct the stale `ensureWasmRuntime` docstring that still described the old all-undefined contract. No logic change. 2. `parse.ts` dispatch catch matched `err.name === "WasmRuntimeUnavailableError"` — but Piscina's structured-clone transport resets a custom Error subclass's `.name` to "Error" across the worker boundary (verified empirically against piscina 5.1.4), so that branch was DEAD. The run still aborted (runner wraps the thrown error as `Phase 'parse' failed: …`, and `.message` survives), but the actionable reinstall/re-vendor prefix never fired. Match on the stable message token both throw sites emit instead, and document why. 3. No test pinned the complexity propagation, so a future "defensive" refactor could re-swallow the throw with the suite staying green. Add two tests via a minimal injectable-builder seam (mirrors parse-worker's parseFn idiom; inert in production — defaults to buildParserForLanguage, fresh cache only when injected): one asserts a global sentinel REJECTS runComplexity (verified load-bearing — fails if the throw is swallowed into `skipped`), one asserts the soft undefined case still degrades to a per-language skip. ingestion 581/581, cli 316 pass / 0 fail / 11 platform-skip; typecheck + biome clean. Net production logic change: zero (one corrected docstring, one intent comment, one dead-branch→live-branch matcher fix, and a test-only seam). --- packages/ingestion/src/parse/wasm-runtime.ts | 16 +++- .../src/pipeline/phases/complexity.test.ts | 84 ++++++++++++++++++- .../src/pipeline/phases/complexity.ts | 47 +++++++++-- .../ingestion/src/pipeline/phases/parse.ts | 20 +++-- 4 files changed, 151 insertions(+), 16 deletions(-) diff --git a/packages/ingestion/src/parse/wasm-runtime.ts b/packages/ingestion/src/parse/wasm-runtime.ts index 3308f1ff..2fd57618 100644 --- a/packages/ingestion/src/parse/wasm-runtime.ts +++ b/packages/ingestion/src/parse/wasm-runtime.ts @@ -224,9 +224,19 @@ export async function openWasmParser(lang: LanguageId): Promise { } }); }); + +/** + * Build the {ctx, scan, parse} inputs for {@link runComplexity} from a real + * single-`if` Python fixture (one callable, `foo`). Stops before the complexity + * phase so the test can drive it directly with an injected parser builder. + */ +async function prepareComplexityInputs( + repo: string, +): Promise<{ ctx: PipelineContext; scan: ScanOutput; parse: ParseOutput }> { + await fs.writeFile( + path.join(repo, "m.py"), + ["def foo(x):", " if x:", " return 1", " return 0", ""].join("\n"), + ); + const ctx: PipelineContext = { + repoPath: repo, + options: { skipGit: true }, + graph: new KnowledgeGraph(), + phaseOutputs: new Map(), + }; + const scan = (await scanPhase.run(ctx, new Map())) as ScanOutput; + const structure = await structurePhase.run( + ctx, + new Map([[SCAN_PHASE_NAME, scan]]), + ); + const parse = (await parsePhase.run( + ctx, + new Map([ + [SCAN_PHASE_NAME, scan], + [STRUCTURE_PHASE_NAME, structure], + ]), + )) as ParseOutput; + return { ctx, scan, parse }; +} + +describe(`${COMPLEXITY_PHASE_NAME}Phase — fail-loud parser contract`, () => { + it("RETHROWS a global WasmRuntimeUnavailableError instead of swallowing it into skipped", async () => { + // A GLOBAL runtime death is broken-for-every-file; complexity must let it + // propagate and abort the run — NOT fold it into the soft per-language skip + // path and resolve with skipped>0 (which would let a broken install produce + // a symbol-free graph on an all-cache-hit run, where parse never opened the + // runtime to abort first). Mirror of parse-worker.test.ts's sentinel test. + const repo = await mkdtemp(path.join(tmpdir(), "och-complexity-failloud-")); + try { + const { ctx, scan, parse } = await prepareComplexityInputs(repo); + assert.ok(parse.definitionsByFile.size >= 1, "fixture must yield a callable to annotate"); + await assert.rejects( + runComplexity(ctx, parse, scan, () => { + throw new WasmRuntimeUnavailableError( + "web-tree-sitter runtime failed to initialize; test", + ); + }), + /web-tree-sitter runtime failed to initialize/, + ); + } finally { + await rm(repo, { recursive: true, force: true }); + } + }); + + it("treats an undefined parser (soft case) as a per-language skip, not a throw", async () => { + // The SOFT case (web-tree-sitter package genuinely absent, or no grammar) + // returns undefined → complexity degrades to a best-effort skip and still + // resolves. Locks the documented split so a future edit can't collapse the + // two cases in either direction. + const repo = await mkdtemp(path.join(tmpdir(), "och-complexity-soft-")); + try { + const { ctx, scan, parse } = await prepareComplexityInputs(repo); + const out = await runComplexity(ctx, parse, scan, async () => undefined); + assert.equal(out.symbolsAnnotated, 0, "no parser → nothing annotated"); + assert.ok(out.skipped >= 1, "the soft case skips the callable rather than throwing"); + } finally { + await rm(repo, { recursive: true, force: true }); + } + }); +}); diff --git a/packages/ingestion/src/pipeline/phases/complexity.ts b/packages/ingestion/src/pipeline/phases/complexity.ts index 4bcae85a..84ef9440 100644 --- a/packages/ingestion/src/pipeline/phases/complexity.ts +++ b/packages/ingestion/src/pipeline/phases/complexity.ts @@ -90,16 +90,41 @@ interface WasmParserLike { parse(source: string): WasmTree | null; } -async function getParser(lang: LanguageId): Promise { - const cached = parserCache.get(lang); +/** + * Builder seam for the per-language WASM parser. Defaults to the production + * {@link buildParserForLanguage}; injectable so the fail-loud contract (a + * GLOBAL runtime failure must propagate, the SOFT case must skip) is testable + * without standing up Emscripten. Mirrors the injectable-default-param idiom + * used by `parseOne` in parse-worker.ts. + */ +type ParserBuilder = (lang: LanguageId) => Promise; + +const defaultParserBuilder: ParserBuilder = (lang) => + buildParserForLanguage(lang) as Promise; + +async function getParser( + lang: LanguageId, + cache: Map, + build: ParserBuilder = defaultParserBuilder, +): Promise { + const cached = cache.get(lang); if (cached === null) return undefined; if (cached !== undefined) return cached; - const parser = (await buildParserForLanguage(lang)) as WasmParserLike | undefined; + // build() returns undefined for the SOFT case (web-tree-sitter package + // genuinely absent, or no grammar for this lang) → cache null and skip this + // language (best-effort, mirrors the parse phase). It THROWS + // WasmRuntimeUnavailableError on a GLOBAL runtime break; complexity + // intentionally lets that propagate (fail-loud) — do NOT wrap this in a + // try/catch that folds the throw into the undefined skip-path, or a broken + // install would silently produce a symbol-free graph. The parse phase aborts + // first on the common path; this is the backstop on an all-cache-hit run + // where parse re-used cached extractions and never opened the runtime. + const parser = await build(lang); if (parser === undefined) { - parserCache.set(lang, null); + cache.set(lang, null); return undefined; } - parserCache.set(lang, parser); + cache.set(lang, parser); return parser; } @@ -495,11 +520,19 @@ const CALLABLE_KINDS: ReadonlySet = new Set([ "Constructor", ]); -async function runComplexity( +export async function runComplexity( ctx: PipelineContext, parse: ParseOutput, scan: ScanOutput, + // Test seam: inject a parser builder to exercise the fail-loud contract + // without Emscripten. In production this is undefined → the module-global + // parserCache + defaultParserBuilder are used (cross-run caching preserved). + // When injected, a fresh cache scopes the call so a prior cached null can't + // shadow the injected behavior and the test can't pollute the global cache. + build?: ParserBuilder, ): Promise { + const cache = build === undefined ? parserCache : new Map(); + const buildFn = build ?? defaultParserBuilder; const absByRel = new Map(); const langByRel = new Map(); for (const f of scan.files) { @@ -532,7 +565,7 @@ async function runComplexity( continue; } - const parser = await getParser(lang); + const parser = await getParser(lang, cache, buildFn); if (parser === undefined) { skipped += callableDefs.length; continue; diff --git a/packages/ingestion/src/pipeline/phases/parse.ts b/packages/ingestion/src/pipeline/phases/parse.ts index 5ab88b6b..32c74923 100644 --- a/packages/ingestion/src/pipeline/phases/parse.ts +++ b/packages/ingestion/src/pipeline/phases/parse.ts @@ -235,11 +235,21 @@ async function runParse( try { parseResults = await pool.dispatch(tasks); } catch (err) { - // A global WASM-runtime failure (missing vendored grammars) rejects the - // dispatch — abort the parse phase loudly. Piscina serializes the worker - // error as a plain Error, so match on `name`, not `instanceof`. Re-throw - // with a phase prefix; any other dispatch error propagates unchanged. - if (err instanceof Error && err.name === "WasmRuntimeUnavailableError") { + // A global WASM-runtime failure (missing vendored grammars / un-loadable + // web-tree-sitter.wasm) rejects the dispatch — abort the parse phase + // loudly. Piscina's structured-clone transport across the worker boundary + // resets a custom Error subclass's `.name` back to "Error" (verified + // against piscina 5.1.4), so the WasmRuntimeUnavailableError identity is + // GONE by the time it arrives here — only `.message` survives. Match on + // the stable message token both throw sites emit, NOT on `.name` / + // `instanceof`. Re-throw with a phase prefix; any other dispatch error + // propagates unchanged. (The run aborts either way — the runner wraps a + // thrown phase error as `Phase 'parse' failed: …` — but this keeps the + // actionable reinstall/re-vendor guidance front-and-center.) + if ( + err instanceof Error && + /web-tree-sitter runtime failed to initialize/.test(err.message) + ) { throw new Error(`parse: WASM runtime unavailable — ${err.message}`); } throw err;