Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 64 additions & 34 deletions src/domain/graph/builder/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -658,6 +658,14 @@ async function tryNativeOrchestrator(

if (result.earlyExit) {
info('No changes detected');
// Even on no-op rebuilds, dropped-language files added since the last
// full build are still missing from `nodes`/`file_hashes` (#1083). The
// orchestrator's file_collector skipped them, so its earlyExit doesn't
// imply DB consistency. Run the gap repair before returning.
const gap = detectDroppedLanguageGap(ctx);
if (gap.missingAbs.length > 0) {
await backfillNativeDroppedFiles(ctx, gap);
}
closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
return 'early-exit';
}
Expand Down Expand Up @@ -753,37 +761,55 @@ async function tryNativeOrchestrator(
// stale native binaries). WASM handles those — backfill via WASM so both
// engines process the same file set (#967).
//
// Runs on full builds and on incrementals when the orchestrator reports
// any file activity (removals or changes). The orchestrator's
// `detect_removed_files` filter (#1070) skips files outside its narrower
// file_collector, so on a current binary a no-op rebuild reports
// `removedCount=0` and `changedCount=0`, making the backfill call pure
// overhead (fs walk + 2 DB queries + 48-file WASM re-parse). Legacy
// binaries lacking the filter still report `removedCount>0` and get the
// gap-repair behavior #1068 introduced. Triggering on `changedCount>0`
// narrows (but does not fully close) the gap where a brand-new
// unsupported-extension file is added on an otherwise-quiet incremental
// — see #1091 for the residual gap.
// Detect the gap once (fs walk + 2 DB queries, ~20–30ms) and use it for
// both gating and the backfill itself. On dirty incrementals/full builds
// the orchestrator signals trigger backfill, so the walk happens once
// (instead of redundantly inside backfill). On quiet incrementals we
// still pay the walk so we can detect brand-new files in dropped-language
// extensions — a gap that the orchestrator's `detect_removed_files`
// filter (#1070) leaves open (#1083, #1091). The pre-check is cheap
// because the expensive part (WASM re-parse of the missing set) is
// gated below.
const removedCount = result.removedCount ?? 0;
const changedCount = result.changedCount ?? 0;
if (result.isFullBuild || removedCount > 0 || changedCount > 0) {
await backfillNativeDroppedFiles(ctx);
const gap = detectDroppedLanguageGap(ctx);
if (result.isFullBuild || removedCount > 0 || changedCount > 0 || gap.missingAbs.length > 0) {
await backfillNativeDroppedFiles(ctx, gap);
}

closeDbPair({ db: ctx.db, nativeDb: ctx.nativeDb });
return formatNativeTimingResult(p, structurePatchMs, analysisTiming);
}

/** Files the native orchestrator silently dropped — the working set for backfill. */
interface DroppedLanguageGap {
/** Relative paths (normalized) of files missing from `nodes` or `file_hashes`. */
missingRel: string[];
/** Absolute paths, aligned by index with `missingRel`. */
missingAbs: string[];
}

/**
* Backfill files that the native orchestrator silently dropped during parse.
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
* Detect files the native orchestrator silently dropped.
*
* Walks the filesystem and compares against `nodes` + `file_hashes`. A file
* is "missing" if it's absent from EITHER table — both must be present for
* the fast-skip pre-flight (#1054) to work, and the two can diverge (e.g.
* legacy DBs where `nodes` was populated but `file_hashes` was not).
*
* Restricted to files with an installed WASM grammar; extensions in
* `LANGUAGE_REGISTRY` without a shipped grammar (e.g. groovy on minimal
* installs) can't be parsed by either engine, so they're not a native
* regression — excluding them keeps the warn count in
* `backfillNativeDroppedFiles` meaningful.
*
* Cheap (no DB handoff, no parsing): used both to gate the backfill call
* and as its working set. NativeDbProxy supports `.prepare().all()`, so
* this works whether `ctx.db` is a proxy or a real better-sqlite3
* connection — letting us skip the close-native / reopen-better-sqlite3
* cost when there's nothing to backfill.
*/
async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
// Compute the missing-file set FIRST, before any expensive DB handoff.
// NativeDbProxy supports .prepare().all(), so the upfront query works
// whether ctx.db is a proxy or a real better-sqlite3 connection. On
// incremental no-op rebuilds nothing is missing, so we want to early-return
// without paying the close-native / reopen-better-sqlite3 cost.
function detectDroppedLanguageGap(ctx: PipelineContext): DroppedLanguageGap {
const collected = collectFilesUtil(ctx.rootDir, [], ctx.config, new Set<string>());
const expected = new Set(
collected.files.map((f) => normalizePath(path.relative(ctx.rootDir, f))),
Expand All @@ -794,12 +820,6 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
.all() as Array<{ file: string }>;
const existingNodes = new Set(existingNodeRows.map((r) => r.file));

// Belt-and-suspenders: also check `file_hashes`. The fast-skip pre-flight
// (#1054) rejects on `file_hashes` gaps, and the two tables can diverge
// (e.g. a DB written by old code where `nodes` was populated but
// `file_hashes` was not). Treating "in nodes but not in file_hashes" as
// missing closes the gap so the backfill repairs the file_hashes row even
// when the node row already exists.
let existingHashes = new Set<string>();
try {
const existingHashRows = ctx.db
Expand All @@ -810,26 +830,36 @@ async function backfillNativeDroppedFiles(ctx: PipelineContext): Promise<void> {
// file_hashes table may not exist on legacy DBs; treat as fully missing
// so the backfill writes rows on the upsert path below.
debug(
`backfillNativeDroppedFiles: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`,
`detectDroppedLanguageGap: file_hashes read failed (table may not exist): ${toErrorMessage(e)}`,
);
}

// Restrict backfill to files with an installed WASM grammar. Extensions in
// LANGUAGE_REGISTRY without a shipped grammar file (e.g. groovy, erlang on
// minimal installs) can't be parsed by either engine, so they're not a
// native regression — excluding them keeps the warn count meaningful.
const installedExts = getInstalledWasmExtensions();
const missingRel: string[] = [];
const missingAbs: string[] = [];
for (const rel of expected) {
// A file is "missing" if it's absent from EITHER nodes OR file_hashes.
// Both must be present for fast-skip to work correctly.
if (existingNodes.has(rel) && existingHashes.has(rel)) continue;
const ext = path.extname(rel).toLowerCase();
if (!installedExts.has(ext)) continue;
missingRel.push(rel);
missingAbs.push(path.join(ctx.rootDir, rel));
}
return { missingRel, missingAbs };
}

/**
* Backfill files that the native orchestrator silently dropped during parse.
* Falls back to WASM + inserts file/symbol nodes so engine counts match (#967).
*
* Accepts a pre-computed `gap` from `detectDroppedLanguageGap` so the caller
* can use the same scan for both gating and the actual backfill — avoiding
* a redundant fs walk when the orchestrator's signals already triggered.
*/
async function backfillNativeDroppedFiles(
ctx: PipelineContext,
gap: DroppedLanguageGap,
): Promise<void> {
const { missingRel, missingAbs } = gap;
if (missingAbs.length === 0) return;

// Now that we know there's work to do, hand off to better-sqlite3 (needed
Expand Down
195 changes: 195 additions & 0 deletions tests/integration/dropped-language-gap.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
/**
* Regression test for #1083: backfill must run on a quiet incremental when
* the DB has a gap (a file present on disk but missing from `nodes`).
*
* Issue scenario: a brand-new file with a dropped-language extension is added
* on an incremental pass. The orchestrator's narrower file_collector doesn't
* see the file, so it reports `changedCount=0`, `removedCount=0`,
* `isFullBuild=false`. The pre-#1083 gate would skip backfill and the file
* would never appear in the graph until a full rebuild.
*
* We simulate that DB state directly (instead of relying on a specific binary
* version's dropped-extension set, which varies as extractors get ported to
* Rust): delete a file's `nodes` row but keep its `file_hashes` row intact
* so the orchestrator's content-hash tier sees the file as unchanged and
* processes nothing. Then run an incremental and assert the gap is repaired.
*
* Two scenarios cover both backfill call sites added by #1083:
* 1. earlyExit=true branch (pipeline.ts:659–671): content hash unchanged →
* orchestrator returns early_exit=true → backfill runs from that branch.
* 2. earlyExit=false branch (pipeline.ts:776–778): content changed in some
* other file → orchestrator returns early_exit=false with changedCount>0
* → backfill runs from the main gate (which now also fires when
* `gap.missingAbs.length > 0`).
*/

import fs from 'node:fs';
import os from 'node:os';
import path from 'node:path';
import Database from 'better-sqlite3';
import { afterAll, beforeAll, describe, expect, it } from 'vitest';
import { buildGraph } from '../../src/domain/graph/builder.js';
import { isNativeAvailable } from '../../src/infrastructure/native.js';

const FIXTURE_DIR = path.join(import.meta.dirname, '..', 'fixtures', 'sample-project');

const hasNative = isNativeAvailable();
const requireParity = !!process.env.CODEGRAPH_PARITY;
const describeOrSkip = requireParity || hasNative ? describe : describe.skip;

function copyDirSync(src: string, dest: string) {
fs.mkdirSync(dest, { recursive: true });
for (const entry of fs.readdirSync(src, { withFileTypes: true })) {
const s = path.join(src, entry.name);
const d = path.join(dest, entry.name);
if (entry.isDirectory()) copyDirSync(s, d);
else fs.copyFileSync(s, d);
}
}

/**
* Shared setup: full build → delete the target's `nodes` row → return paths
* so each scenario can stage its own incremental trigger before rebuilding.
*/
async function setupGappedDb(label: string, targetFile: string) {
const tmpBase = fs.mkdtempSync(path.join(os.tmpdir(), `codegraph-1083-${label}-`));
const projectDir = path.join(tmpBase, 'proj');
copyDirSync(FIXTURE_DIR, projectDir);
const dbPath = path.join(projectDir, '.codegraph', 'graph.db');

// Full build → every file has nodes + file_hashes rows.
await buildGraph(projectDir, {
engine: 'native',
incremental: false,
skipRegistry: true,
});

// Simulate the issue's DB state: a file is on disk and tracked in
// `file_hashes`, but its `kind='file'` node row is absent — the same
// shape produced when an old binary's collector never inserts
// dropped-extension files in the first place.
//
// Foreign keys are disabled for this surgical delete; edges referencing
// the file node are left intact so we only test the gap-detection path.
const db = new Database(dbPath);
db.pragma('foreign_keys = OFF');
db.prepare("DELETE FROM nodes WHERE kind='file' AND file = ?").run(targetFile);
db.close();

return { tmpBase, projectDir, dbPath };
}

function readFileNodeRow(dbPath: string, file: string) {
const db = new Database(dbPath, { readonly: true });
const row = db
.prepare("SELECT name, kind, file FROM nodes WHERE kind='file' AND file = ?")
.get(file) as { name: string; kind: string; file: string } | undefined;
db.close();
return row;
}

// ─────────────────────────────────────────────────────────────────────────────
// Scenario 1: orchestrator early_exit=true path
//
// Bump mtime without changing content. The JS-side fast-skip pre-flight
// (#1054) sees the mtime mismatch and falls through to the orchestrator.
// The orchestrator's mtime tier also fails, so it hashes the content; the
// hash matches → classifies as a metadata-only change → after the
// metadata_only filter, parse_changes is empty and removed is empty →
// returns early_exit=true. Backfill must run from pipeline.ts:665–667.
// ─────────────────────────────────────────────────────────────────────────────
describeOrSkip('Dropped-language gap repair on quiet incremental (#1083) — earlyExit=true', () => {
let projectDir: string;
let dbPath: string;
let tmpBase: string;
const targetFile = 'math.js';

beforeAll(async () => {
({ tmpBase, projectDir, dbPath } = await setupGappedDb('quiet', targetFile));

// Bump the file's mtime without changing content. This forces the
// JS-side fast-skip pre-flight (#1054) to fall through to the
// orchestrator (mtime mismatch with file_hashes), while the
// orchestrator's content-hash tier still classifies the file as
// unchanged (metadata-only update) — so `changedCount=0`,
// `removedCount=0`, and `earlyExit=true`. That's the exact
// orchestrator-state the issue describes for a brand-new
// dropped-language file.
const targetAbs = path.join(projectDir, targetFile);
const future = new Date(Date.now() + 5000);
fs.utimesSync(targetAbs, future, future);

// Incremental rebuild — must detect the gap and re-insert the row.
await buildGraph(projectDir, {
engine: 'native',
incremental: true,
skipRegistry: true,
});
}, 60_000);

afterAll(() => {
try {
if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true });
} catch {
/* ignore */
}
});

it('repairs the missing file node when the orchestrator returns earlyExit=true', () => {
const row = readFileNodeRow(dbPath, targetFile);
expect(row).toBeDefined();
expect(row?.file).toBe(targetFile);
});
});

// ─────────────────────────────────────────────────────────────────────────────
// Scenario 2: orchestrator early_exit=false path with a real change
//
// One file's content actually changes (so the orchestrator reports
// changedCount=1, removedCount=0, earlyExit=false) AND a separate file has
// a gap (nodes row deleted, file_hashes intact). Without the
// `gap.missingAbs.length > 0` clause added by #1083, the existing gate
// would still fire (changedCount > 0), so this scenario verifies the
// non-early-exit branch still threads the pre-computed gap through to
// backfill and repairs the missing row alongside the regular changed-file
// processing.
// ─────────────────────────────────────────────────────────────────────────────
describeOrSkip('Dropped-language gap repair on dirty incremental (#1083) — earlyExit=false', () => {
let projectDir: string;
let dbPath: string;
let tmpBase: string;
const gapFile = 'math.js'; // gap target — node row deleted
const dirtyFile = 'utils.js'; // change driver — content modified

beforeAll(async () => {
({ tmpBase, projectDir, dbPath } = await setupGappedDb('dirty', gapFile));

// Append a comment to a *different* file. This makes the orchestrator's
// content hash diverge for that file, producing changedCount=1 and
// earlyExit=false. The gapFile's node row is still missing — the
// non-early-exit path's gate (with the new `gap.missingAbs.length > 0`
// clause) must drive backfill for the gapped file.
const dirtyAbs = path.join(projectDir, dirtyFile);
fs.appendFileSync(dirtyAbs, '\n// touched by dropped-language-gap test #1083\n');

await buildGraph(projectDir, {
engine: 'native',
incremental: true,
skipRegistry: true,
});
}, 60_000);

afterAll(() => {
try {
if (tmpBase) fs.rmSync(tmpBase, { recursive: true, force: true });
} catch {
/* ignore */
}
});

it('repairs the missing file node alongside a real change (earlyExit=false)', () => {
const row = readFileNodeRow(dbPath, gapFile);
expect(row).toBeDefined();
expect(row?.file).toBe(gapFile);
});
});
Loading