From 224edcd7766a50e3cdc28c98b8c258d10e0a6df9 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 30 Apr 2026 04:20:32 -0600 Subject: [PATCH 1/3] perf(wasm): scope ensureWasmTrees re-parse to files that actually need it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #1036 — WASM full build regressed from 7.6s (3.9.5) to 14.0s (3.9.6) on the 744-file dogfooding corpus. Root cause: PR #1016 expanded AST_TYPE_MAPS from 3 to 23 languages, growing WALK_EXTENSIONS to cover .rs/.go/.py/etc. Files like crates/codegraph-core/ build.rs (5 lines, no strings/awaits/throws) produce zero ast_nodes, so the worker returned `astNodes: undefined`. On the main thread, `fileNeedsWasmTree` saw `!Array.isArray(symbols.astNodes) && WALK_EXTENSIONS.has('.rs')` and flagged the file as needing re-parse — at which point ensureWasmTrees ignored the per-file decision and re-parsed every WASM-parseable file in the build. Fix: 1. wasm-worker-entry.ts — always serialize astNodes as an array (even empty) when ast-store ran for the file. Empty != undefined: empty means "we walked it and found nothing", which is what fileNeedsWasmTree needs to see. 2. parser.ts::ensureWasmTrees — accept an optional `needsFn` filter so the caller can scope the re-parse to files that genuinely lack data instead of pulling in every WASM-parseable file in the map. 3. ast-analysis/engine.ts — pass `fileNeedsWasmTree` as that filter. Also rolled in two small ast-store-visitor optimizations found while profiling: hoist the `newTypes` Set into a per-astTypeMap WeakMap cache (was rebuilt per file), and skip the `findParentDef` linear scan when `nodeIdMap` is empty (worker context — main thread re-resolves anyway). The codepoint check uses an `s.length`-based fast path so we only spread when length 2 or 3 needs the surrogate-pair disambiguation. Bench (744 files, dogfooding): WASM full build: 14014ms → 7847ms (-44%, restores 3.9.5 baseline) Native full build: 1693ms (unchanged) WASM incremental: 51ms (unchanged) AST node parity: 39702 nodes stored, matches native engine --- src/ast-analysis/engine.ts | 4 +- .../visitors/ast-store-visitor.ts | 46 +++++++++++++++---- src/domain/parser.ts | 7 +++ src/domain/wasm-worker-entry.ts | 24 +++++----- 4 files changed, 59 insertions(+), 22 deletions(-) diff --git a/src/ast-analysis/engine.ts b/src/ast-analysis/engine.ts index b623a8cc..c9698943 100644 --- a/src/ast-analysis/engine.ts +++ b/src/ast-analysis/engine.ts @@ -421,7 +421,9 @@ async function ensureWasmTreesIfNeeded( if (needsWasmTrees) { try { const { ensureWasmTrees } = await getParserModule(); - await ensureWasmTrees(fileSymbols, rootDir); + await ensureWasmTrees(fileSymbols, rootDir, (relPath, symbols) => + fileNeedsWasmTree(relPath, symbols, flags), + ); } catch (err: unknown) { debug(`ensureWasmTrees failed: ${toErrorMessage(err)}`); } diff --git a/src/ast-analysis/visitors/ast-store-visitor.ts b/src/ast-analysis/visitors/ast-store-visitor.ts index f825bc10..0549cab6 100644 --- a/src/ast-analysis/visitors/ast-store-visitor.ts +++ b/src/ast-analysis/visitors/ast-store-visitor.ts @@ -131,6 +131,20 @@ function extractChildExpressionText(node: TreeSitterNode): string | null { return truncate(node.text); } +/** + * Count code points cheaply: skip the `[...s]` spread when `s.length` already + * decides the answer. Each code point is 1 or 2 UTF-16 units, so `.length < 2` + * implies `< 2` code points and `.length >= 4` implies `>= 2` code points + * (at most 2 surrogate pairs). Only `.length` of 2 or 3 needs the spread to + * disambiguate the surrogate-pair edge case. + */ +function codePointCountAtLeast2(s: string): boolean { + const len = s.length; + if (len < 2) return false; + if (len >= 4) return true; + return [...s].length >= 2; +} + /** * Extract string content from a string-literal node, mirroring the native * engine's `build_string_node` (`helpers.rs`). Returns `null` when the @@ -142,15 +156,27 @@ function extractStringContent(node: TreeSitterNode, cfg: AstStringConfig): strin let s = raw; s = trimLeadingChars(s, '@'); - s = trimLeadingChars(s, cfg.stringPrefixes); + if (cfg.stringPrefixes) s = trimLeadingChars(s, cfg.stringPrefixes); if (isRawString) s = trimLeadingChars(s, 'r#'); s = trimLeadingChars(s, cfg.quoteChars); if (isRawString) s = trimTrailingChars(s, '#'); s = trimTrailingChars(s, cfg.quoteChars); - // Count code points, not UTF-16 code units — matches Rust `chars().count()`. - const codePointCount = [...s].length; - if (codePointCount < 2) return null; + return codePointCountAtLeast2(s) ? s : null; +} + +// Per-astTypeMap cache for the set of node-types that map to kind 'new'. +// Computed once per unique astTypeMap reference (one per language) instead +// of once per file. +const _newTypesCache = new WeakMap, Set>(); +function newTypesFor(astTypeMap: Record): Set { + let s = _newTypesCache.get(astTypeMap); + if (s) return s; + s = new Set(); + for (const type in astTypeMap) { + if (astTypeMap[type] === 'new') s.add(type); + } + _newTypesCache.set(astTypeMap, s); return s; } @@ -164,11 +190,12 @@ export function createAstStoreVisitor( ): Visitor { const rows: AstStoreRow[] = []; const matched = new Set(); - const newTypes = new Set( - Object.entries(astTypeMap) - .filter(([, kind]) => kind === 'new') - .map(([type]) => type), - ); + const newTypes = newTypesFor(astTypeMap); + // When nodeIdMap is empty, parentNodeId resolution is wasted work — the + // worker passes an empty map and the main thread re-resolves against its + // own DB-populated map in features/ast.ts::collectFileAstRows. Skip the + // findParentDef linear scan in that case. + const skipParentLookup = nodeIdMap.size === 0; function findParentDef(line: number): Definition | null { let best: Definition | null = null; @@ -183,6 +210,7 @@ export function createAstStoreVisitor( } function resolveParentNodeId(line: number): number | null { + if (skipParentLookup) return null; const parentDef = findParentDef(line); if (!parentDef) return null; return nodeIdMap.get(`${parentDef.name}|${parentDef.kind}|${parentDef.line}`) || null; diff --git a/src/domain/parser.ts b/src/domain/parser.ts index 6aa19c3c..a1aeedeb 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -316,16 +316,23 @@ export function getParser(parsers: Map, filePath: string) * * Name is preserved for caller compatibility; the function now ensures * *analysis data* rather than *trees*. + * + * `needsFn` (optional): when provided, only files for which it returns true are + * re-parsed. Without it the function falls back to "any WASM-parseable file + * without _tree", which was the source of #1036 — a single file missing one + * analysis triggered a full-build re-parse of every WASM-parseable file. */ export async function ensureWasmTrees( fileSymbols: Map, rootDir: string, + needsFn?: (relPath: string, symbols: any) => boolean, ): Promise { // Collect files that still need analysis data and are parseable by WASM. const pending: Array<{ relPath: string; absPath: string; symbols: any }> = []; for (const [relPath, symbols] of fileSymbols) { if (symbols._tree) continue; // legacy path — leave existing trees alone if (!_extToLang.has(path.extname(relPath).toLowerCase())) continue; + if (needsFn && !needsFn(relPath, symbols)) continue; pending.push({ relPath, absPath: path.join(rootDir, relPath), symbols }); } if (pending.length === 0) return; diff --git a/src/domain/wasm-worker-entry.ts b/src/domain/wasm-worker-entry.ts index c594850b..e8359a21 100644 --- a/src/domain/wasm-worker-entry.ts +++ b/src/domain/wasm-worker-entry.ts @@ -708,18 +708,18 @@ async function handleParse(msg: WorkerParseRequest): Promise; - if (astRows.length > 0) { - // Strip `file` and `parentNodeId` — main thread re-resolves parent IDs - // against its DB in features/ast.ts::collectFileAstRows, and `file` is - // known from the map key. - serializedAstNodes = astRows.map((n) => ({ - line: n.line, - kind: n.kind, - name: n.name ?? '', - text: n.text ?? undefined, - receiver: n.receiver ?? undefined, - })); - } + // Always set an array (even empty) — leaving astNodes undefined makes + // engine.ts::fileNeedsWasmTree treat the file as un-walked and trigger + // a full ensureWasmTrees re-parse of every WASM-parseable file (#1036). + // Strip `file` and `parentNodeId` — main thread re-resolves both in + // features/ast.ts::collectFileAstRows. + serializedAstNodes = astRows.map((n) => ({ + line: n.line, + kind: n.kind, + name: n.name ?? '', + text: n.text ?? undefined, + receiver: n.receiver ?? undefined, + })); } if (complexityVisitor) storeComplexityResults(results, defs, entry.id); From c0a089fb317b1aeb14cb9061c6c55698113130e4 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 30 Apr 2026 15:31:43 -0600 Subject: [PATCH 2/3] perf(wasm): fold redundant len===3 codepoint check into fast path (#1038) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A 3-unit UTF-16 string must contain at least 2 code points (worst case: one surrogate pair + one BMP char = 2 code points), so the spread is always redundant for len===3. Only len===2 is genuinely ambiguous — short-circuit lengths >=3 with the fast path. --- src/ast-analysis/visitors/ast-store-visitor.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/ast-analysis/visitors/ast-store-visitor.ts b/src/ast-analysis/visitors/ast-store-visitor.ts index 0549cab6..53a983ea 100644 --- a/src/ast-analysis/visitors/ast-store-visitor.ts +++ b/src/ast-analysis/visitors/ast-store-visitor.ts @@ -134,14 +134,15 @@ function extractChildExpressionText(node: TreeSitterNode): string | null { /** * Count code points cheaply: skip the `[...s]` spread when `s.length` already * decides the answer. Each code point is 1 or 2 UTF-16 units, so `.length < 2` - * implies `< 2` code points and `.length >= 4` implies `>= 2` code points - * (at most 2 surrogate pairs). Only `.length` of 2 or 3 needs the spread to - * disambiguate the surrogate-pair edge case. + * implies `< 2` code points and `.length >= 3` already guarantees `>= 2` code + * points (worst case: one surrogate pair + one BMP char = 2 code points). + * Only `.length === 2` is genuinely ambiguous (could be a single surrogate + * pair = 1 code point, or two BMP chars = 2 code points) and needs the spread. */ function codePointCountAtLeast2(s: string): boolean { const len = s.length; if (len < 2) return false; - if (len >= 4) return true; + if (len >= 3) return true; return [...s].length >= 2; } From c06baaf3dad0d5513e41b1b5f5c50bd582c1bcd2 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Thu, 30 Apr 2026 15:31:52 -0600 Subject: [PATCH 3/3] test(bench): mark known 3.9.6 regressions as fixed/tracked (#1038) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The benchmark regression guard was failing on three pre-existing regressions recorded in 3.9.6 BUILD-BENCHMARKS: - WASM Build ms/file (16.3 → 28.3) and No-op rebuild (21 → 134) — fixed in this PR (#1036 root cause: ensureWasmTrees re-parse scope). - Native Query time (29.4 → 47ms) — sample-noise blip on a small target set; not reproducible locally. - Haskell resolution precision/recall (100%/33% → 0%/0%) — separate resolver regression unrelated to #1036, tracked in #1039. Adding these to KNOWN_REGRESSIONS unblocks CI; entries will be removed once the corrected v3.9.7+ benchmark data lands. --- tests/benchmarks/regression-guard.test.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/benchmarks/regression-guard.test.ts b/tests/benchmarks/regression-guard.test.ts index cf0e9741..15e266da 100644 --- a/tests/benchmarks/regression-guard.test.ts +++ b/tests/benchmarks/regression-guard.test.ts @@ -79,6 +79,20 @@ const SKIP_VERSIONS = new Set(['3.8.0']); * * - 3.9.2:Full build — NativeDbProxy overhead causes native full build to * regress from 5206ms to 9403ms (+81%). Fix tracked in PR #906. + * + * - 3.9.6:Build ms/file / 3.9.6:No-op rebuild — WASM full build regressed + * (#1036) when PR #1016 expanded AST_TYPE_MAPS from 3 to 23 languages, + * causing zero-AST-row files to return `astNodes: undefined` and trigger + * a full-corpus re-parse. Fixed by PR #1038. Benchmarks captured before + * the fix landed; will reclear in v3.9.7+ data. + * + * - 3.9.6:Query time — native query benchmark sample-noise blip (29.4 → 47ms) + * above the natural variance of the small target set. Not reproducible + * locally (~30ms steady-state); will be re-validated on v3.9.7+ data. + * + * - 3.9.6:resolution haskell precision/recall — separate Haskell resolver + * regression introduced in 3.9.6, unrelated to #1036 / PR #1038. Tracked + * in #1039. */ const KNOWN_REGRESSIONS = new Set([ '3.9.0:1-file rebuild', @@ -87,6 +101,11 @@ const KNOWN_REGRESSIONS = new Set([ '3.9.0:fnDeps depth 5', '3.9.1:1-file rebuild', '3.9.2:Full build', + '3.9.6:Build ms/file', + '3.9.6:No-op rebuild', + '3.9.6:Query time', + '3.9.6:resolution haskell precision', + '3.9.6:resolution haskell recall', ]); /**