From dd8abbafeb85a3ed5cdeb11de98e93ba9d0c3b29 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 14 May 2026 16:25:16 -0600 Subject: [PATCH 1/2] fix(native): backfill new dropped-language files on quiet incrementals MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The post-orchestrator gate (`isFullBuild || removedCount>0 || changedCount>0`) left a silent gap: when a brand-new file in an extension outside the Rust binary's `file_collector` (e.g. on an older binary) is added on an otherwise-quiet incremental, the orchestrator reports no activity and backfill is skipped — the file never enters `nodes`/`file_hashes` until a forced full rebuild. Extract the missing-file detection from `backfillNativeDroppedFiles` into `detectDroppedLanguageGap`, run it once before the gate, and pass the result to `backfillNativeDroppedFiles`. The scan extends both code paths: - The `result.earlyExit` branch now also repairs the gap before returning, so no-op rebuilds with stale `nodes` rows for dropped-extension files are healed instead of left silent. - The non-earlyExit gate adds `gap.missingAbs.length > 0` so quiet incrementals with a real gap trigger backfill. No redundant fs walks: incrementals where the orchestrator signals fire would have done the same walk inside the old `backfillNativeDroppedFiles` anyway. The new cost on a truly-quiet incremental with no gap is one `collectFiles` walk + two cheap DB queries (early-return before any parsing/WAL handoff). Closes #1083, closes #1091. --- src/domain/graph/builder/pipeline.ts | 98 ++++++++++------ .../integration/dropped-language-gap.test.ts | 110 ++++++++++++++++++ 2 files changed, 174 insertions(+), 34 deletions(-) create mode 100644 tests/integration/dropped-language-gap.test.ts diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index f31afa77..1416631a 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -658,6 +658,14 @@ async function tryNativeOrchestrator( if (result.earlyExit) { info('No changes detected'); + // Even on no-op rebuilds, dropped-language files added since the last + // full build are still missing from `nodes`/`file_hashes` (#1083). The + // orchestrator's file_collector skipped them, so its earlyExit doesn't + // imply DB consistency. Run the gap repair before returning. + const gap = detectDroppedLanguageGap(ctx); + if (gap.missingAbs.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); + } closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); return 'early-exit'; } @@ -753,37 +761,55 @@ async function tryNativeOrchestrator( // stale native binaries). WASM handles those — backfill via WASM so both // engines process the same file set (#967). // - // Runs on full builds and on incrementals when the orchestrator reports - // any file activity (removals or changes). The orchestrator's - // `detect_removed_files` filter (#1070) skips files outside its narrower - // file_collector, so on a current binary a no-op rebuild reports - // `removedCount=0` and `changedCount=0`, making the backfill call pure - // overhead (fs walk + 2 DB queries + 48-file WASM re-parse). Legacy - // binaries lacking the filter still report `removedCount>0` and get the - // gap-repair behavior #1068 introduced. Triggering on `changedCount>0` - // narrows (but does not fully close) the gap where a brand-new - // unsupported-extension file is added on an otherwise-quiet incremental - // — see #1091 for the residual gap. + // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for + // both gating and the backfill itself. On dirty incrementals/full builds + // the orchestrator signals trigger backfill, so the walk happens once + // (instead of redundantly inside backfill). On quiet incrementals we + // still pay the walk so we can detect brand-new files in dropped-language + // extensions — a gap that the orchestrator's `detect_removed_files` + // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap + // because the expensive part (WASM re-parse of the missing set) is + // gated below. const removedCount = result.removedCount ?? 0; const changedCount = result.changedCount ?? 0; - if (result.isFullBuild || removedCount > 0 || changedCount > 0) { - await backfillNativeDroppedFiles(ctx); + const gap = detectDroppedLanguageGap(ctx); + if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); } closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); return formatNativeTimingResult(p, structurePatchMs, analysisTiming); } +/** Files the native orchestrator silently dropped — the working set for backfill. */ +interface DroppedLanguageGap { + /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ + missingRel: string[]; + /** Absolute paths, aligned by index with `missingRel`. */ + missingAbs: string[]; +} + /** - * Backfill files that the native orchestrator silently dropped during parse. - * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * Detect files the native orchestrator silently dropped. + * + * Walks the filesystem and compares against `nodes` + `file_hashes`. A file + * is "missing" if it's absent from EITHER table — both must be present for + * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. + * legacy DBs where `nodes` was populated but `file_hashes` was not). + * + * Restricted to files with an installed WASM grammar; extensions in + * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal + * installs) can't be parsed by either engine, so they're not a native + * regression — excluding them keeps the warn count in + * `backfillNativeDroppedFiles` meaningful. + * + * Cheap (no DB handoff, no parsing): used both to gate the backfill call + * and as its working set. NativeDbProxy supports `.prepare().all()`, so + * this works whether `ctx.db` is a proxy or a real better-sqlite3 + * connection — letting us skip the close-native / reopen-better-sqlite3 + * cost when there's nothing to backfill. */ -async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { - // Compute the missing-file set FIRST, before any expensive DB handoff. - // NativeDbProxy supports .prepare().all(), so the upfront query works - // whether ctx.db is a proxy or a real better-sqlite3 connection. On - // incremental no-op rebuilds nothing is missing, so we want to early-return - // without paying the close-native / reopen-better-sqlite3 cost. +function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); const expected = new Set( collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), @@ -794,12 +820,6 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { .all() as Array<{ file: string }>; const existingNodes = new Set(existingNodeRows.map((r) => r.file)); - // Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight - // (#1054) rejects on `file_hashes` gaps, and the two tables can diverge - // (e.g. a DB written by old code where `nodes` was populated but - // `file_hashes` was not). Treating "in nodes but not in file_hashes" as - // missing closes the gap so the backfill repairs the file_hashes row even - // when the node row already exists. let existingHashes = new Set(); try { const existingHashRows = ctx.db @@ -810,26 +830,36 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { // file_hashes table may not exist on legacy DBs; treat as fully missing // so the backfill writes rows on the upsert path below. debug( - `backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, + `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, ); } - // Restrict backfill to files with an installed WASM grammar. Extensions in - // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on - // minimal installs) can't be parsed by either engine, so they're not a - // native regression — excluding them keeps the warn count meaningful. const installedExts = getInstalledWasmExtensions(); const missingRel: string[] = []; const missingAbs: string[] = []; for (const rel of expected) { - // A file is "missing" if it's absent from EITHER nodes OR file_hashes. - // Both must be present for fast-skip to work correctly. if (existingNodes.has(rel) && existingHashes.has(rel)) continue; const ext = path.extname(rel).toLowerCase(); if (!installedExts.has(ext)) continue; missingRel.push(rel); missingAbs.push(path.join(ctx.rootDir, rel)); } + return { missingRel, missingAbs }; +} + +/** + * Backfill files that the native orchestrator silently dropped during parse. + * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * + * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller + * can use the same scan for both gating and the actual backfill — avoiding + * a redundant fs walk when the orchestrator's signals already triggered. + */ +async function backfillNativeDroppedFiles( + ctx: PipelineContext, + gap: DroppedLanguageGap, +): Promise { + const { missingRel, missingAbs } = gap; if (missingAbs.length === 0) return; // Now that we know there's work to do, hand off to better-sqlite3 (needed diff --git a/tests/integration/dropped-language-gap.test.ts b/tests/integration/dropped-language-gap.test.ts new file mode 100644 index 00000000..0b7579f2 --- /dev/null +++ b/tests/integration/dropped-language-gap.test.ts @@ -0,0 +1,110 @@ +/** + * Regression test for #1083: backfill must run on a quiet incremental when + * the DB has a gap (a file present on disk but missing from `nodes`). + * + * Issue scenario: a brand-new file with a dropped-language extension is added + * on an incremental pass. The orchestrator's narrower file_collector doesn't + * see the file, so it reports `changedCount=0`, `removedCount=0`, + * `isFullBuild=false`. The pre-#1083 gate would skip backfill and the file + * would never appear in the graph until a full rebuild. + * + * We simulate that DB state directly (instead of relying on a specific binary + * version's dropped-extension set, which varies as extractors get ported to + * Rust): delete a file's `nodes` row but keep its `file_hashes` row intact + * so the orchestrator's content-hash tier sees the file as unchanged and + * processes nothing. Then run an incremental and assert the gap is repaired. + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { buildGraph } from '../../src/domain/graph/builder.js'; +import { isNativeAvailable } from '../../src/infrastructure/native.js'; + +const FIXTURE_DIR = path.join(import.meta.dirname, '..', 'fixtures', 'sample-project'); + +const hasNative = isNativeAvailable(); +const requireParity = !!process.env.CODEGRAPH_PARITY; +const describeOrSkip = requireParity || hasNative ? describe : describe.skip; + +function copyDirSync(src: string, dest: string) { + fs.mkdirSync(dest, { recursive: true }); + for (const entry of fs.readdirSync(src, { withFileTypes: true })) { + const s = path.join(src, entry.name); + const d = path.join(dest, entry.name); + if (entry.isDirectory()) copyDirSync(s, d); + else fs.copyFileSync(s, d); + } +} + +describeOrSkip('Dropped-language gap repair on quiet incremental (#1083)', () => { + let projectDir: string; + let dbPath: string; + let tmpBase: string; + const targetFile = 'math.js'; + + beforeAll(async () => { + tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-1083-')); + projectDir = path.join(tmpBase, 'proj'); + copyDirSync(FIXTURE_DIR, projectDir); + dbPath = path.join(projectDir, '.codegraph', 'graph.db'); + + // Full build → every file has nodes + file_hashes rows. + await buildGraph(projectDir, { + engine: 'native', + incremental: false, + skipRegistry: true, + }); + + // Simulate the issue's DB state: a file is on disk and tracked in + // `file_hashes` (so the orchestrator's hash tier sees it as unchanged + // and reports `changedCount=0`), but its `kind='file'` node row is + // absent — the same shape produced when an old binary's collector + // never inserts dropped-extension files in the first place. + // + // Foreign keys are disabled for this surgical delete; edges referencing + // the file node are left intact so we only test the gap-detection path. + const db = new Database(dbPath); + db.pragma('foreign_keys = OFF'); + db.prepare("DELETE FROM nodes WHERE kind='file' AND file = ?").run(targetFile); + db.close(); + + // Bump the file's mtime without changing content. This forces the + // JS-side fast-skip pre-flight (#1054) to fall through to the + // orchestrator (mtime mismatch with file_hashes), while the + // orchestrator's content-hash tier still classifies the file as + // unchanged (metadata-only update) — so `changedCount=0` and + // `removedCount=0`. That's the exact orchestrator-state the issue + // describes for a brand-new dropped-language file. + const targetAbs = path.join(projectDir, targetFile); + const future = new Date(Date.now() + 5000); + fs.utimesSync(targetAbs, future, future); + + // Incremental rebuild — must detect the gap and re-insert the row. + await buildGraph(projectDir, { + engine: 'native', + incremental: true, + skipRegistry: true, + }); + }, 60_000); + + afterAll(() => { + try { + if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true }); + } catch { + /* ignore */ + } + }); + + it('repairs the missing file node on quiet incremental', () => { + const db = new Database(dbPath, { readonly: true }); + const row = db + .prepare("SELECT name, kind, file FROM nodes WHERE kind='file' AND file = ?") + .get(targetFile) as { name: string; kind: string; file: string } | undefined; + db.close(); + expect(row).toBeDefined(); + expect(row?.file).toBe(targetFile); + }); +}); From 6f21bfea72cc90e889231fb499bb8d165a11580f Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 14 May 2026 23:18:36 -0600 Subject: [PATCH 2/2] test: cover both earlyExit branches in dropped-language gap test (#1123) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile flagged that the test only exercised one of the two backfill call sites added by this PR. Split the test into two describeOrSkip blocks that explicitly target each path: - earlyExit=true: mtime bump with unchanged content → orchestrator's content-hash tier classifies as metadata-only → after filtering, parse_changes/removed are empty → returns early_exit=true → backfill fires from pipeline.ts:665-667. - earlyExit=false: real content change to a sibling file (utils.js) drives changedCount=1 → orchestrator returns early_exit=false → the non-early-exit gate (now augmented with gap.missingAbs.length > 0) drives backfill from pipeline.ts:776-778. Verified via debug logs: scenario 1 emits 'No changes detected' (orchestrator earlyExit branch), scenario 2 emits 'Incremental: 1 changed, 0 removed' (main path). Both end with the gap repaired. --- .../integration/dropped-language-gap.test.ts | 153 ++++++++++++++---- 1 file changed, 119 insertions(+), 34 deletions(-) diff --git a/tests/integration/dropped-language-gap.test.ts b/tests/integration/dropped-language-gap.test.ts index 0b7579f2..f69fdf96 100644 --- a/tests/integration/dropped-language-gap.test.ts +++ b/tests/integration/dropped-language-gap.test.ts @@ -13,6 +13,14 @@ * Rust): delete a file's `nodes` row but keep its `file_hashes` row intact * so the orchestrator's content-hash tier sees the file as unchanged and * processes nothing. Then run an incremental and assert the gap is repaired. + * + * Two scenarios cover both backfill call sites added by #1083: + * 1. earlyExit=true branch (pipeline.ts:659–671): content hash unchanged → + * orchestrator returns early_exit=true → backfill runs from that branch. + * 2. earlyExit=false branch (pipeline.ts:776–778): content changed in some + * other file → orchestrator returns early_exit=false with changedCount>0 + * → backfill runs from the main gate (which now also fires when + * `gap.missingAbs.length > 0`). */ import fs from 'node:fs'; @@ -39,45 +47,74 @@ function copyDirSync(src: string, dest: string) { } } -describeOrSkip('Dropped-language gap repair on quiet incremental (#1083)', () => { +/** + * Shared setup: full build → delete the target's `nodes` row → return paths + * so each scenario can stage its own incremental trigger before rebuilding. + */ +async function setupGappedDb(label: string, targetFile: string) { + const tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), `codegraph-1083-${label}-`)); + const projectDir = path.join(tmpBase, 'proj'); + copyDirSync(FIXTURE_DIR, projectDir); + const dbPath = path.join(projectDir, '.codegraph', 'graph.db'); + + // Full build → every file has nodes + file_hashes rows. + await buildGraph(projectDir, { + engine: 'native', + incremental: false, + skipRegistry: true, + }); + + // Simulate the issue's DB state: a file is on disk and tracked in + // `file_hashes`, but its `kind='file'` node row is absent — the same + // shape produced when an old binary's collector never inserts + // dropped-extension files in the first place. + // + // Foreign keys are disabled for this surgical delete; edges referencing + // the file node are left intact so we only test the gap-detection path. + const db = new Database(dbPath); + db.pragma('foreign_keys = OFF'); + db.prepare("DELETE FROM nodes WHERE kind='file' AND file = ?").run(targetFile); + db.close(); + + return { tmpBase, projectDir, dbPath }; +} + +function readFileNodeRow(dbPath: string, file: string) { + const db = new Database(dbPath, { readonly: true }); + const row = db + .prepare("SELECT name, kind, file FROM nodes WHERE kind='file' AND file = ?") + .get(file) as { name: string; kind: string; file: string } | undefined; + db.close(); + return row; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Scenario 1: orchestrator early_exit=true path +// +// Bump mtime without changing content. The JS-side fast-skip pre-flight +// (#1054) sees the mtime mismatch and falls through to the orchestrator. +// The orchestrator's mtime tier also fails, so it hashes the content; the +// hash matches → classifies as a metadata-only change → after the +// metadata_only filter, parse_changes is empty and removed is empty → +// returns early_exit=true. Backfill must run from pipeline.ts:665–667. +// ───────────────────────────────────────────────────────────────────────────── +describeOrSkip('Dropped-language gap repair on quiet incremental (#1083) — earlyExit=true', () => { let projectDir: string; let dbPath: string; let tmpBase: string; const targetFile = 'math.js'; beforeAll(async () => { - tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), 'codegraph-1083-')); - projectDir = path.join(tmpBase, 'proj'); - copyDirSync(FIXTURE_DIR, projectDir); - dbPath = path.join(projectDir, '.codegraph', 'graph.db'); - - // Full build → every file has nodes + file_hashes rows. - await buildGraph(projectDir, { - engine: 'native', - incremental: false, - skipRegistry: true, - }); - - // Simulate the issue's DB state: a file is on disk and tracked in - // `file_hashes` (so the orchestrator's hash tier sees it as unchanged - // and reports `changedCount=0`), but its `kind='file'` node row is - // absent — the same shape produced when an old binary's collector - // never inserts dropped-extension files in the first place. - // - // Foreign keys are disabled for this surgical delete; edges referencing - // the file node are left intact so we only test the gap-detection path. - const db = new Database(dbPath); - db.pragma('foreign_keys = OFF'); - db.prepare("DELETE FROM nodes WHERE kind='file' AND file = ?").run(targetFile); - db.close(); + ({ tmpBase, projectDir, dbPath } = await setupGappedDb('quiet', targetFile)); // Bump the file's mtime without changing content. This forces the // JS-side fast-skip pre-flight (#1054) to fall through to the // orchestrator (mtime mismatch with file_hashes), while the // orchestrator's content-hash tier still classifies the file as - // unchanged (metadata-only update) — so `changedCount=0` and - // `removedCount=0`. That's the exact orchestrator-state the issue - // describes for a brand-new dropped-language file. + // unchanged (metadata-only update) — so `changedCount=0`, + // `removedCount=0`, and `earlyExit=true`. That's the exact + // orchestrator-state the issue describes for a brand-new + // dropped-language file. const targetAbs = path.join(projectDir, targetFile); const future = new Date(Date.now() + 5000); fs.utimesSync(targetAbs, future, future); @@ -98,13 +135,61 @@ describeOrSkip('Dropped-language gap repair on quiet incremental (#1083)', () => } }); - it('repairs the missing file node on quiet incremental', () => { - const db = new Database(dbPath, { readonly: true }); - const row = db - .prepare("SELECT name, kind, file FROM nodes WHERE kind='file' AND file = ?") - .get(targetFile) as { name: string; kind: string; file: string } | undefined; - db.close(); + it('repairs the missing file node when the orchestrator returns earlyExit=true', () => { + const row = readFileNodeRow(dbPath, targetFile); expect(row).toBeDefined(); expect(row?.file).toBe(targetFile); }); }); + +// ───────────────────────────────────────────────────────────────────────────── +// Scenario 2: orchestrator early_exit=false path with a real change +// +// One file's content actually changes (so the orchestrator reports +// changedCount=1, removedCount=0, earlyExit=false) AND a separate file has +// a gap (nodes row deleted, file_hashes intact). Without the +// `gap.missingAbs.length > 0` clause added by #1083, the existing gate +// would still fire (changedCount > 0), so this scenario verifies the +// non-early-exit branch still threads the pre-computed gap through to +// backfill and repairs the missing row alongside the regular changed-file +// processing. +// ───────────────────────────────────────────────────────────────────────────── +describeOrSkip('Dropped-language gap repair on dirty incremental (#1083) — earlyExit=false', () => { + let projectDir: string; + let dbPath: string; + let tmpBase: string; + const gapFile = 'math.js'; // gap target — node row deleted + const dirtyFile = 'utils.js'; // change driver — content modified + + beforeAll(async () => { + ({ tmpBase, projectDir, dbPath } = await setupGappedDb('dirty', gapFile)); + + // Append a comment to a *different* file. This makes the orchestrator's + // content hash diverge for that file, producing changedCount=1 and + // earlyExit=false. The gapFile's node row is still missing — the + // non-early-exit path's gate (with the new `gap.missingAbs.length > 0` + // clause) must drive backfill for the gapped file. + const dirtyAbs = path.join(projectDir, dirtyFile); + fs.appendFileSync(dirtyAbs, '\n// touched by dropped-language-gap test #1083\n'); + + await buildGraph(projectDir, { + engine: 'native', + incremental: true, + skipRegistry: true, + }); + }, 60_000); + + afterAll(() => { + try { + if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true }); + } catch { + /* ignore */ + } + }); + + it('repairs the missing file node alongside a real change (earlyExit=false)', () => { + const row = readFileNodeRow(dbPath, gapFile); + expect(row).toBeDefined(); + expect(row?.file).toBe(gapFile); + }); +});