diff --git a/src/domain/graph/builder/pipeline.ts b/src/domain/graph/builder/pipeline.ts index f31afa77..1416631a 100644 --- a/src/domain/graph/builder/pipeline.ts +++ b/src/domain/graph/builder/pipeline.ts @@ -658,6 +658,14 @@ async function tryNativeOrchestrator( if (result.earlyExit) { info('No changes detected'); + // Even on no-op rebuilds, dropped-language files added since the last + // full build are still missing from `nodes`/`file_hashes` (#1083). The + // orchestrator's file_collector skipped them, so its earlyExit doesn't + // imply DB consistency. Run the gap repair before returning. + const gap = detectDroppedLanguageGap(ctx); + if (gap.missingAbs.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); + } closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); return 'early-exit'; } @@ -753,37 +761,55 @@ async function tryNativeOrchestrator( // stale native binaries). WASM handles those — backfill via WASM so both // engines process the same file set (#967). // - // Runs on full builds and on incrementals when the orchestrator reports - // any file activity (removals or changes). The orchestrator's - // `detect_removed_files` filter (#1070) skips files outside its narrower - // file_collector, so on a current binary a no-op rebuild reports - // `removedCount=0` and `changedCount=0`, making the backfill call pure - // overhead (fs walk + 2 DB queries + 48-file WASM re-parse). Legacy - // binaries lacking the filter still report `removedCount>0` and get the - // gap-repair behavior #1068 introduced. Triggering on `changedCount>0` - // narrows (but does not fully close) the gap where a brand-new - // unsupported-extension file is added on an otherwise-quiet incremental - // — see #1091 for the residual gap. + // Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for + // both gating and the backfill itself. On dirty incrementals/full builds + // the orchestrator signals trigger backfill, so the walk happens once + // (instead of redundantly inside backfill). On quiet incrementals we + // still pay the walk so we can detect brand-new files in dropped-language + // extensions — a gap that the orchestrator's `detect_removed_files` + // filter (#1070) leaves open (#1083, #1091). The pre-check is cheap + // because the expensive part (WASM re-parse of the missing set) is + // gated below. const removedCount = result.removedCount ?? 0; const changedCount = result.changedCount ?? 0; - if (result.isFullBuild || removedCount > 0 || changedCount > 0) { - await backfillNativeDroppedFiles(ctx); + const gap = detectDroppedLanguageGap(ctx); + if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) { + await backfillNativeDroppedFiles(ctx, gap); } closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb }); return formatNativeTimingResult(p, structurePatchMs, analysisTiming); } +/** Files the native orchestrator silently dropped — the working set for backfill. */ +interface DroppedLanguageGap { + /** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */ + missingRel: string[]; + /** Absolute paths, aligned by index with `missingRel`. */ + missingAbs: string[]; +} + /** - * Backfill files that the native orchestrator silently dropped during parse. - * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * Detect files the native orchestrator silently dropped. + * + * Walks the filesystem and compares against `nodes` + `file_hashes`. A file + * is "missing" if it's absent from EITHER table — both must be present for + * the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g. + * legacy DBs where `nodes` was populated but `file_hashes` was not). + * + * Restricted to files with an installed WASM grammar; extensions in + * `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal + * installs) can't be parsed by either engine, so they're not a native + * regression — excluding them keeps the warn count in + * `backfillNativeDroppedFiles` meaningful. + * + * Cheap (no DB handoff, no parsing): used both to gate the backfill call + * and as its working set. NativeDbProxy supports `.prepare().all()`, so + * this works whether `ctx.db` is a proxy or a real better-sqlite3 + * connection — letting us skip the close-native / reopen-better-sqlite3 + * cost when there's nothing to backfill. */ -async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { - // Compute the missing-file set FIRST, before any expensive DB handoff. - // NativeDbProxy supports .prepare().all(), so the upfront query works - // whether ctx.db is a proxy or a real better-sqlite3 connection. On - // incremental no-op rebuilds nothing is missing, so we want to early-return - // without paying the close-native / reopen-better-sqlite3 cost. +function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap { const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set()); const expected = new Set( collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))), @@ -794,12 +820,6 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { .all() as Array<{ file: string }>; const existingNodes = new Set(existingNodeRows.map((r) => r.file)); - // Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight - // (#1054) rejects on `file_hashes` gaps, and the two tables can diverge - // (e.g. a DB written by old code where `nodes` was populated but - // `file_hashes` was not). Treating "in nodes but not in file_hashes" as - // missing closes the gap so the backfill repairs the file_hashes row even - // when the node row already exists. let existingHashes = new Set(); try { const existingHashRows = ctx.db @@ -810,26 +830,36 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise { // file_hashes table may not exist on legacy DBs; treat as fully missing // so the backfill writes rows on the upsert path below. debug( - `backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, + `detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`, ); } - // Restrict backfill to files with an installed WASM grammar. Extensions in - // LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on - // minimal installs) can't be parsed by either engine, so they're not a - // native regression — excluding them keeps the warn count meaningful. const installedExts = getInstalledWasmExtensions(); const missingRel: string[] = []; const missingAbs: string[] = []; for (const rel of expected) { - // A file is "missing" if it's absent from EITHER nodes OR file_hashes. - // Both must be present for fast-skip to work correctly. if (existingNodes.has(rel) && existingHashes.has(rel)) continue; const ext = path.extname(rel).toLowerCase(); if (!installedExts.has(ext)) continue; missingRel.push(rel); missingAbs.push(path.join(ctx.rootDir, rel)); } + return { missingRel, missingAbs }; +} + +/** + * Backfill files that the native orchestrator silently dropped during parse. + * Falls back to WASM + inserts file/symbol nodes so engine counts match (#967). + * + * Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller + * can use the same scan for both gating and the actual backfill — avoiding + * a redundant fs walk when the orchestrator's signals already triggered. + */ +async function backfillNativeDroppedFiles( + ctx: PipelineContext, + gap: DroppedLanguageGap, +): Promise { + const { missingRel, missingAbs } = gap; if (missingAbs.length === 0) return; // Now that we know there's work to do, hand off to better-sqlite3 (needed diff --git a/tests/integration/dropped-language-gap.test.ts b/tests/integration/dropped-language-gap.test.ts new file mode 100644 index 00000000..f69fdf96 --- /dev/null +++ b/tests/integration/dropped-language-gap.test.ts @@ -0,0 +1,195 @@ +/** + * Regression test for #1083: backfill must run on a quiet incremental when + * the DB has a gap (a file present on disk but missing from `nodes`). + * + * Issue scenario: a brand-new file with a dropped-language extension is added + * on an incremental pass. The orchestrator's narrower file_collector doesn't + * see the file, so it reports `changedCount=0`, `removedCount=0`, + * `isFullBuild=false`. The pre-#1083 gate would skip backfill and the file + * would never appear in the graph until a full rebuild. + * + * We simulate that DB state directly (instead of relying on a specific binary + * version's dropped-extension set, which varies as extractors get ported to + * Rust): delete a file's `nodes` row but keep its `file_hashes` row intact + * so the orchestrator's content-hash tier sees the file as unchanged and + * processes nothing. Then run an incremental and assert the gap is repaired. + * + * Two scenarios cover both backfill call sites added by #1083: + * 1. earlyExit=true branch (pipeline.ts:659–671): content hash unchanged → + * orchestrator returns early_exit=true → backfill runs from that branch. + * 2. earlyExit=false branch (pipeline.ts:776–778): content changed in some + * other file → orchestrator returns early_exit=false with changedCount>0 + * → backfill runs from the main gate (which now also fires when + * `gap.missingAbs.length > 0`). + */ + +import fs from 'node:fs'; +import os from 'node:os'; +import path from 'node:path'; +import Database from 'better-sqlite3'; +import { afterAll, beforeAll, describe, expect, it } from 'vitest'; +import { buildGraph } from '../../src/domain/graph/builder.js'; +import { isNativeAvailable } from '../../src/infrastructure/native.js'; + +const FIXTURE_DIR = path.join(import.meta.dirname, '..', 'fixtures', 'sample-project'); + +const hasNative = isNativeAvailable(); +const requireParity = !!process.env.CODEGRAPH_PARITY; +const describeOrSkip = requireParity || hasNative ? describe : describe.skip; + +function copyDirSync(src: string, dest: string) { + fs.mkdirSync(dest, { recursive: true }); + for (const entry of fs.readdirSync(src, { withFileTypes: true })) { + const s = path.join(src, entry.name); + const d = path.join(dest, entry.name); + if (entry.isDirectory()) copyDirSync(s, d); + else fs.copyFileSync(s, d); + } +} + +/** + * Shared setup: full build → delete the target's `nodes` row → return paths + * so each scenario can stage its own incremental trigger before rebuilding. + */ +async function setupGappedDb(label: string, targetFile: string) { + const tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), `codegraph-1083-${label}-`)); + const projectDir = path.join(tmpBase, 'proj'); + copyDirSync(FIXTURE_DIR, projectDir); + const dbPath = path.join(projectDir, '.codegraph', 'graph.db'); + + // Full build → every file has nodes + file_hashes rows. + await buildGraph(projectDir, { + engine: 'native', + incremental: false, + skipRegistry: true, + }); + + // Simulate the issue's DB state: a file is on disk and tracked in + // `file_hashes`, but its `kind='file'` node row is absent — the same + // shape produced when an old binary's collector never inserts + // dropped-extension files in the first place. + // + // Foreign keys are disabled for this surgical delete; edges referencing + // the file node are left intact so we only test the gap-detection path. + const db = new Database(dbPath); + db.pragma('foreign_keys = OFF'); + db.prepare("DELETE FROM nodes WHERE kind='file' AND file = ?").run(targetFile); + db.close(); + + return { tmpBase, projectDir, dbPath }; +} + +function readFileNodeRow(dbPath: string, file: string) { + const db = new Database(dbPath, { readonly: true }); + const row = db + .prepare("SELECT name, kind, file FROM nodes WHERE kind='file' AND file = ?") + .get(file) as { name: string; kind: string; file: string } | undefined; + db.close(); + return row; +} + +// ───────────────────────────────────────────────────────────────────────────── +// Scenario 1: orchestrator early_exit=true path +// +// Bump mtime without changing content. The JS-side fast-skip pre-flight +// (#1054) sees the mtime mismatch and falls through to the orchestrator. +// The orchestrator's mtime tier also fails, so it hashes the content; the +// hash matches → classifies as a metadata-only change → after the +// metadata_only filter, parse_changes is empty and removed is empty → +// returns early_exit=true. Backfill must run from pipeline.ts:665–667. +// ───────────────────────────────────────────────────────────────────────────── +describeOrSkip('Dropped-language gap repair on quiet incremental (#1083) — earlyExit=true', () => { + let projectDir: string; + let dbPath: string; + let tmpBase: string; + const targetFile = 'math.js'; + + beforeAll(async () => { + ({ tmpBase, projectDir, dbPath } = await setupGappedDb('quiet', targetFile)); + + // Bump the file's mtime without changing content. This forces the + // JS-side fast-skip pre-flight (#1054) to fall through to the + // orchestrator (mtime mismatch with file_hashes), while the + // orchestrator's content-hash tier still classifies the file as + // unchanged (metadata-only update) — so `changedCount=0`, + // `removedCount=0`, and `earlyExit=true`. That's the exact + // orchestrator-state the issue describes for a brand-new + // dropped-language file. + const targetAbs = path.join(projectDir, targetFile); + const future = new Date(Date.now() + 5000); + fs.utimesSync(targetAbs, future, future); + + // Incremental rebuild — must detect the gap and re-insert the row. + await buildGraph(projectDir, { + engine: 'native', + incremental: true, + skipRegistry: true, + }); + }, 60_000); + + afterAll(() => { + try { + if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true }); + } catch { + /* ignore */ + } + }); + + it('repairs the missing file node when the orchestrator returns earlyExit=true', () => { + const row = readFileNodeRow(dbPath, targetFile); + expect(row).toBeDefined(); + expect(row?.file).toBe(targetFile); + }); +}); + +// ───────────────────────────────────────────────────────────────────────────── +// Scenario 2: orchestrator early_exit=false path with a real change +// +// One file's content actually changes (so the orchestrator reports +// changedCount=1, removedCount=0, earlyExit=false) AND a separate file has +// a gap (nodes row deleted, file_hashes intact). Without the +// `gap.missingAbs.length > 0` clause added by #1083, the existing gate +// would still fire (changedCount > 0), so this scenario verifies the +// non-early-exit branch still threads the pre-computed gap through to +// backfill and repairs the missing row alongside the regular changed-file +// processing. +// ───────────────────────────────────────────────────────────────────────────── +describeOrSkip('Dropped-language gap repair on dirty incremental (#1083) — earlyExit=false', () => { + let projectDir: string; + let dbPath: string; + let tmpBase: string; + const gapFile = 'math.js'; // gap target — node row deleted + const dirtyFile = 'utils.js'; // change driver — content modified + + beforeAll(async () => { + ({ tmpBase, projectDir, dbPath } = await setupGappedDb('dirty', gapFile)); + + // Append a comment to a *different* file. This makes the orchestrator's + // content hash diverge for that file, producing changedCount=1 and + // earlyExit=false. The gapFile's node row is still missing — the + // non-early-exit path's gate (with the new `gap.missingAbs.length > 0` + // clause) must drive backfill for the gapped file. + const dirtyAbs = path.join(projectDir, dirtyFile); + fs.appendFileSync(dirtyAbs, '\n// touched by dropped-language-gap test #1083\n'); + + await buildGraph(projectDir, { + engine: 'native', + incremental: true, + skipRegistry: true, + }); + }, 60_000); + + afterAll(() => { + try { + if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true }); + } catch { + /* ignore */ + } + }); + + it('repairs the missing file node alongside a real change (earlyExit=false)', () => { + const row = readFileNodeRow(dbPath, gapFile); + expect(row).toBeDefined(); + expect(row?.file).toBe(gapFile); + }); +});