From 514c407e47681470dfab62ab4fdcdba77d3cf391 Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Wed, 20 May 2026 15:17:31 +0800 Subject: [PATCH 1/8] feat(cpp): add typeHierarchy/Detail-driven base+signature resolution MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace fragile C++ text heuristics with standard LSP requests where possible. Changes: - lsp: add DocumentSymbol.Detail field, propagate Detail + SelectionRange through flattenDocumentSymbols (previously stripped). - cpp/lsp: new CppProvider implementing PrepareTypeHierarchy / TypeHierarchySupertypes / TypeHierarchySubtypes / Hover / Implementation / WorkspaceSearchSymbols. Registered for uniast.Cpp. - collect: collectImpl now resolves C++ bases via typeHierarchy as the primary path; falls back to text-level BaseClassRefs only when typeHierarchy returns nothing (typical for unresolved external templates). Saves ~50 lines of fragile Definition+alias-follow logic per class. - collect: extractCppCallSig prefers clangd-canonical signature from sym.Detail ("void(int)") over source-scanned text ("void(int x)"), which is more stable across multi-line parameter lists. - collect: dedupCppFunction uses signature-aware key (cppBaseName on full mid.Name) — overloads no longer collapse in Type.Methods, and multi-line .cpp parameter formatting no longer breaks .h/.cpp merge. Bug fixes carried in this commit: 1. (T,Text)-keyed positive dep cache (collectDepsForEntity) collapsed distinct same-name calls onto one receiver. Cache reduced to negative only. 2. BaseClassRefs preserves the full qualified base name instead of just the last segment, so unresolved third_party::Provider and other_pkg::Provider remain distinguishable. 3. IsUsingAlias kind-aware: using-declarations importing functions/members (using Base::foo;) are no longer mis-routed through the alias-redirect path. 4. dedupCppFunction uses bracket-depth-aware cppFnHasBody helper instead of strings.Contains(content, "{"), so header decls with lambda/braced-init default arguments no longer get falsely classified as definitions. 5. baseSym/sym self-resolution detected by Location.Include containment rather than short-name equality, so app::Provider extending common::Provider isn't dropped as "resolved to self". 6. NVI synthesis dedup keyed by local signature (handle(int)) instead of short name (handle), so overloads survive inheritance. 7. Sysroot (cstdlib) + build-time codegen (build_generated) modules emit only edges, no nodes — scanner skips build64_release; addSymbol drops sysroot/codegen syms before they enter c.syms; export skips body emission for the same buckets. Testdata + tests: - testdata/cpp/{2_inheritance, 3_aliases, 4_inline_methods, 5_external_calls, 6_nvi_dispatch, 7_provider_chain, 8_overloaded_methods, 9_crossns_basename, 10_unresolved_basens, 11_using_decl, 13_methods_key_consistency} for regression coverage. - 15-case unit test for cppFnHasBody covering declarations vs definitions, lambda defaults, braced-init defaults, strings, comments. - Integration tests in lang/parse_cpp_test.go for each fix. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/cpp_known_issues.md | 99 ++ lang/collect/collect.go | 598 +++++++++-- lang/collect/cpp_body_test.go | 58 ++ lang/collect/export.go | 728 +++++++++++++- lang/cpp/lib.go | 8 +- lang/cpp/lsp/cpp_lsp.go | 110 +++ lang/cpp/spec.go | 743 +++++++++++++- lang/lsp/client.go | 23 +- lang/lsp/lsp.go | 28 +- lang/lsp/lsp_methods.go | 376 ++++--- lang/parse.go | 4 + lang/parse_cpp_test.go | 934 ++++++++++++++++++ lang/register/provider.go | 3 +- main.go | 67 +- testdata/cpp/10_unresolved_basens/main.cpp | 26 + testdata/cpp/11_using_decl/base.h | 12 + testdata/cpp/11_using_decl/derived.h | 20 + testdata/cpp/11_using_decl/main.cpp | 7 + .../cpp/13_methods_key_consistency/main.cpp | 6 + .../cpp/13_methods_key_consistency/types.h | 18 + testdata/cpp/2_inheritance/main.cpp | 16 + testdata/cpp/2_inheritance/shapes.cpp | 17 + testdata/cpp/2_inheritance/shapes.h | 75 ++ testdata/cpp/3_aliases/main.cpp | 13 + testdata/cpp/3_aliases/types.h | 35 + testdata/cpp/4_inline_methods/handlers.h | 49 + testdata/cpp/4_inline_methods/main.cpp | 10 + testdata/cpp/5_external_calls/main.cpp | 38 + testdata/cpp/6_nvi_dispatch/main.cpp | 7 + testdata/cpp/6_nvi_dispatch/nvi.cpp | 22 + testdata/cpp/6_nvi_dispatch/nvi.h | 47 + testdata/cpp/7_provider_chain/concrete.cpp | 25 + testdata/cpp/7_provider_chain/concrete.h | 65 ++ testdata/cpp/7_provider_chain/main.cpp | 10 + testdata/cpp/7_provider_chain/provider.h | 44 + testdata/cpp/8_overloaded_methods/main.cpp | 8 + testdata/cpp/8_overloaded_methods/overloads.h | 24 + testdata/cpp/9_crossns_basename/app.h | 18 + testdata/cpp/9_crossns_basename/common.h | 14 + testdata/cpp/9_crossns_basename/main.cpp | 6 + 40 files changed, 4142 insertions(+), 269 deletions(-) create mode 100644 docs/cpp_known_issues.md create mode 100644 lang/collect/cpp_body_test.go create mode 100644 lang/cpp/lsp/cpp_lsp.go create mode 100644 lang/parse_cpp_test.go create mode 100644 testdata/cpp/10_unresolved_basens/main.cpp create mode 100644 testdata/cpp/11_using_decl/base.h create mode 100644 testdata/cpp/11_using_decl/derived.h create mode 100644 testdata/cpp/11_using_decl/main.cpp create mode 100644 testdata/cpp/13_methods_key_consistency/main.cpp create mode 100644 testdata/cpp/13_methods_key_consistency/types.h create mode 100644 testdata/cpp/2_inheritance/main.cpp create mode 100644 testdata/cpp/2_inheritance/shapes.cpp create mode 100644 testdata/cpp/2_inheritance/shapes.h create mode 100644 testdata/cpp/3_aliases/main.cpp create mode 100644 testdata/cpp/3_aliases/types.h create mode 100644 testdata/cpp/4_inline_methods/handlers.h create mode 100644 testdata/cpp/4_inline_methods/main.cpp create mode 100644 testdata/cpp/5_external_calls/main.cpp create mode 100644 testdata/cpp/6_nvi_dispatch/main.cpp create mode 100644 testdata/cpp/6_nvi_dispatch/nvi.cpp create mode 100644 testdata/cpp/6_nvi_dispatch/nvi.h create mode 100644 testdata/cpp/7_provider_chain/concrete.cpp create mode 100644 testdata/cpp/7_provider_chain/concrete.h create mode 100644 testdata/cpp/7_provider_chain/main.cpp create mode 100644 testdata/cpp/7_provider_chain/provider.h create mode 100644 testdata/cpp/8_overloaded_methods/main.cpp create mode 100644 testdata/cpp/8_overloaded_methods/overloads.h create mode 100644 testdata/cpp/9_crossns_basename/app.h create mode 100644 testdata/cpp/9_crossns_basename/common.h create mode 100644 testdata/cpp/9_crossns_basename/main.cpp diff --git a/docs/cpp_known_issues.md b/docs/cpp_known_issues.md new file mode 100644 index 00000000..b106d8aa --- /dev/null +++ b/docs/cpp_known_issues.md @@ -0,0 +1,99 @@ +# C++ Collector Known Issues + +记录 freq_service v7 parse 之后用户发现的 6 个问题。每条带:现象、复现位置、根因猜想、修复进度。 + +## 1. main.cpp 没有 method(实为:模板函数调用没进 FC) + +**现象(修订)**:`main` 函数节点是存在的,但 FunctionCalls / MethodCalls 都是 0。content 里能看到 `cppservice::main<...>(argc, argv)` 模板调用。 + +**实证**:v7 AST `service/main.cpp` 唯一函数 `main(int argc, char** argv)`,FC=0, MC=0。Content `return cppservice::main(argc, argv);`。 + +**根因猜想**:模板函数调用的 token 在 clangd 里类型是 `function`/`template`,未被识别为 reference token,导致 Definition 查询不发生,dep 收集失败。 + +## 2. `cppservice::Strategy::process(RequestContext& ctx)` 没有定义 + +**现象**:节点存在,但其 Content 字段为空 / FileLine 指向不存在 / 是 declaration-only 残留。 + +**复现**:v7 AST 搜 `Strategy::process` 节点。Strategy 是 freq_service 内部基类(参考 NVI 模式)。 + +**根因猜想**:Strategy 是一个非纯虚基类,header 有声明,cpp 中有定义;但 abcoder 把 cpp 里的定义弄丢了,剩了 header 那份;或 header 那份被 declaration-only 过滤吃掉,导致整条记录消失。 + +## 3. `FeathubProvider::cppservice::Provider::provide(...)` 既无 call 也无 construct + +**现象**:节点存在,但 FunctionCalls、MethodCalls、Types(构造调用) 全空。 + +**复现**:v7 AST 搜 `FeathubProvider::cppservice::Provider::provide`。 + +**根因猜想**:(a) 名字形态怪异(`Class::OtherNS::OtherClass::method`),说明 NodeID 拼装出错;(b) 节点是 NVI 合成的 ImplHead 包裹件,原 deps 没有挂上来;(c) ImplHead 路径绕过了 deps 收集。 + +## 4. Provider 相关的没有 Implement + +**现象**:`XxxProvider extends Provider` 时,AST 上 XxxProvider Type 的 Implements 字段为空,断了继承链。 + +**复现**:v7 AST 看 freq_service Provider 派生类。 + +**根因猜想**:`Provider` 基类是模板(`template class Provider`),BaseClassTokens 可能在模板参数里走偏;或基类 token 被 isReferenceTypeToken 判错;或 Implements 写入 races。 + +## 5. 继承时 `Provider::provide` 应改名为 `XXXProvider::provide`,很多没改 + +**现象**:派生类有自己的 override,但在 AST 中仍以基类 NodeID(`Provider::provide`)出现,没绑到派生类 NodeID 上。 + +**复现**:v7 AST 搜 `Provider::provide` 节点。应该看到 `XxxProvider::provide` 而非泛型基类。 + +**根因猜想**:CppSpec 在 method 归属时把 method 挂到了 SKMethod 报告的 containerName,而 clangd 偶尔会把 override 报成 base class 的 method;也可能是 Method.Receiver 解析失败 fallback 到了 base。 + +**实证补充(问题 4/5 共同的)**:在 v7 freq_service 中检查所有 Provider 派生类的 Implements 字段,只有继承裸 `Provider`(非模板)的 FeathubProvider 的 Implements 是 `['cppservice::Provider']`,其余继承 `SimpleProvider`、`CircuitBreakerProvider` 的派生类 Implements 全部为空 — 说明 BaseClassTokens 在模板基类场景下没找到正确的 base type token(很可能 isReferenceTypeToken 把模板参数当成 base type,或在 `<...>` 区间外的 base 名字漏过)。 + +## 6. 前向声明出现在 Types 里 + +**现象**:`class AwemeUserProfileFeatureProvider;` 这类前向声明产生了一个 Type 节点(无字段、无方法),跟真正的 class 定义并列。 + +**复现**:v7 AST 搜该名字,会看到一个 Type 但 Methods/SubStruct/Fields 全空。 + +**根因猜想**:clangd 会把 forward decl 报成单独的 DocumentSymbol(SKClass,Range 只覆盖 `class X;` 这一行)。export.go 没有区分 forward-only 与定义,两者都写入了 Types 表。 + +**实证**:单单 `service/core/freq_resource_manager.h` 里就有 107 个 Type 节点,**106 个**是前向声明(Methods=0、Fields=0、Implements=[]、content 形如 `class XxxProvider`,注意末尾 `;` 被 declarationText 截掉了)。仅 1 个是真定义。 + +--- + +## 修复计划 + +1. 调查阶段:用 v7 AST 把每条问题的具体节点 dump 出来,看 Content / Identity / 字段。 +2. 修复阶段:分条改 collect / spec / export,并优先动作小的(forward decl 过滤、main 函数过滤误伤、declaration-only 误伤 main/Strategy);继承重命名/Implements/ImplHead-deps 涉及 NVI 合成与 receiver 解析,要小心。 +3. 测试阶段:每修一条,在 `testdata/cpp/` 下增加 minimal repro 样本,在 `lang/parse_cpp_test.go` 加 assert,回归测试中跑通。 + +--- + +## 修复进度 + +### ✓ 已修 + +- **问题 6(forward decl 误入 Types)** — `lang/collect/export.go` 在 SKClass/SKStruct emit 之前判断 content 是否以 `class `/`struct ` 开头且不含 `{`,是则跳过。typedef 不受影响。 +- **问题 4/5(模板基类 Implements)** — clangd 对外部未解析的模板基类不上报 base token,旧的 token-index 路径漏掉它们。`lang/cpp/spec.go` 新增 `BaseClassRefs(sym)` 从源码 `declarationText` 直接 parse base 名字 + 计算文件位置,`lang/collect/collect.go::collectImpl` 优先用该路径;当 Definition 解不出(fallback 回 self)时,记录到 `cppUnresolvedBases`;`lang/collect/export.go` 把它们转成 light Identity 加到 Type.Implements。 +- **问题 2(Strategy::process declaration-only 节点缺失)** — 之前的 declaration-only 过滤把外部头里只有 declaration 的 method 一并丢弃。改用 cross-pkg dedup:保留 *body 那份*;当只有 declaration 没有 definition 时(外部 cpp 未加载、纯虚等),保留 declaration。Collector 新增 `cppFnEmittedBody`/`cppFnLocation` 两个 map 来识别原始 hasBody 状态(不受 ImplHead 包装影响)。 +- **NodeID 拼装 bug(问题 3 的一部分)** — `synthesizeInheritedMethodsCpp` 用 `LastIndex("::")` 找 receiver 分割点,但 `(` 之后的参数(`const std::string&`)里也有 `::`,落到错误位置。改成只在 `(` 之前的 head 内找。 + +- **问题 3(NVI inline body 内 deps 缺失)** ✓ 修复 + - 根因:外部 `LoadExternalSymbol` 加载的 method/function sym 进入 `c.syms`,但 `collectDepsForEntity` 只对 workspace 内 `entity_syms` 调用。`needProcessExternal` 对 C++ 又要求 `SKObject`(clangd 永远不报),所以外部 method 永远无 deps。 + - 修复:`Collector` 增加 `cppExtDepsQueue`,在 `getSymbolByLocation` 的 external-load 分支里把每个 SKMethod/SKFunction sym 入队;`Collect` 结尾跑一遍批量 `collectDepsForEntity`(并发,去重)。直接 inline 会触发 `getSymbolByToken` → 新外部 load → 新 `collectDepsForEntity` 的无界递归。 + - v17 验证:`cppservice::Provider::provide` MC=1 → `cppservice::Remote::call`;合成的 `cppservice::freq::FeathubProvider::provide` 继承到同一边。 + +- **header-inline 方法 body deps 不收集** ✓ 修复 + - 根因:method 的 `functionInfo.Method.Receiver.Location` fallback 时被设成 *整个 receiver class body* range(`collect.go:2462` 的 GetParent 路径)。`collectDepsForEntity` 用 `Receiver.Location.Include(token.Location)` skip receiver-qualifier token —— 对 inline-in-class method,class body 包含整个 method body,body 内 *所有* token 都被这条规则误 skip。 + - 修复:`collect.go::collectDepsForEntity` 区分两种情况:当 sym.Location 落在 receiver class body 内(即 inline),只 skip 字面匹配 receiver class 名字的 token;当 sym 在 receiver class body 外(即 out-of-line),保留原 Include-based skip 行为。 + +- **空 package 清理** ✓ — 修复 inline body deps 之后,dedup 把 .cpp 里的 method 全部移到 .h pkg,留下 *空的 .cpp pkg*。export 末尾扫一遍 `repo.Modules`,删 `len(Functions)==len(Types)==len(Vars)==0` 的 pkg。 +- **Type.Methods 回填合成 method** ✓ — `synthesizeInheritedMethodsCpp` 末尾把合成的 NodeID 加到 derived type 的 Methods map(之前只写到 pkg.Functions,Type 的 Methods 字段缺这条 entry)。 +- **method 归属重新指到 .h pkg** ✓ — 当存在同 NodeID 的 .h decl + .cpp def 时,合并:保留 .h 的 pkg 位置,但把 content/FC/MC 等 body 字段从 .cpp 覆盖过来。 + +### 后续修复 + +- **问题 1(main 模板函数调用未进 FC)** ✓ 修复 + - 真正根因不在 `IsEntityToken`(`run` token type=`function` 是 entity OK),而是 inline body deps 的 receiver-skip 副作用 — `main` 本身没受影响,但同套修复让 inline body 内 token 不再被误 skip 后,所有 function-call edge(含模板)都能正确收集。testdata 7 main() 调 `common::run(argc, argv)` 现在产生 FC=1 → `common::run`。 + +- **`using NS::Name;` 别名 NodeID 形态错乱** ✓ 修复 + - 根因:`using common::Provider;` 这种 using-declaration 被 clangd 当成一个 phantom `Class` DocumentSymbol,Range 只覆盖 `Provider` 8 字符;BaseClassRefs 的 fakeTok Definition 命中这个 phantom sym,lightIdentityForExternal 拼出 `app::Provider::Provider`。 + - 修复 1:`spec.go::IsUsingAlias` 扩展支持 `using NS::Name;`(不要求有 `=`)。 + - 修复 2:`collect.go::collectImpl` 中拿到 baseSym 后检测 *单行 + 字符数 ≈ name 长* 的 narrow Range —— 是 alias decl 标志 —— 再调一次 `c.cli.Definition` follow 到真目标,最多 4 跳。 + - 修复 3:新增 `Collector.resolveAlias`(cyclic-safe)走 `c.aliasRedirect` 链。 + - testdata 7 `BareNameProvider` 现在 `Implements=['common::Provider']`、`Methods=['do_provide', 'provide']`。 diff --git a/lang/collect/collect.go b/lang/collect/collect.go index dbd4bcfe..4512b736 100644 --- a/lang/collect/collect.go +++ b/lang/collect/collect.go @@ -51,6 +51,37 @@ type CollectOption struct { Excludes []string LoadByPackages bool BuildFlags []string + // Sysroots is a list of filesystem prefixes whose contents should be + // classified under the `cstdlib` module (typically toolchain sysroots + // containing libstdc++/glibc/clang builtins). Currently honoured by the + // C++ spec only. + Sysroots []string +} + +type cppFnLoc struct { + mod string + pkg string + hasBody bool + isHeader bool +} + +// isCppHeaderPkg reports whether a C++ pkg path looks like a header file +// (`.h`, `.hpp`, `.hh`, `.hxx`). Used to bias the .h-decl-vs-.cpp-def +// dedup toward keeping the header entry. +func isCppHeaderPkg(s string) bool { + return strings.HasSuffix(s, ".h") || + strings.HasSuffix(s, ".hpp") || + strings.HasSuffix(s, ".hh") || + strings.HasSuffix(s, ".hxx") +} + +// canonicalizePath resolves a path to its realpath form. Returns the +// input unchanged when EvalSymlinks fails or yields an empty string. +func canonicalizePath(p string) string { + if real, err := filepath.EvalSymlinks(p); err == nil && real != "" { + return real + } + return p } type Collector struct { @@ -59,11 +90,31 @@ type Collector struct { repo string + // mu guards the collector's maps; never held across LSP RPCs. + mu sync.Mutex + syms map[Location]*DocumentSymbol // symbol => (receiver,impl,func) funcs map[*DocumentSymbol]functionInfo + // cppFnEmitted records, per emitted Function NodeID, where it landed + // and whether the original (pre-ImplHead-wrap) content had a body. + // Used by export-time dedup to merge a header decl with the .cpp + // definition under the header pkg. + cppFnEmitted map[string]cppFnLoc + + // cppUnresolvedBases: type symbol -> base names that clangd couldn't + // resolve (typically external/unloaded templated bases). Export wraps + // them as light Identities under module=external so the Implements + // edge survives even without a definition. + cppUnresolvedBases map[*DocumentSymbol][]string + + // cppExtDepsQueue queues external method/function syms for a final + // deps pass after Collect — running inline would recurse through + // more loads. + cppExtDepsQueue []*DocumentSymbol + // symbol => [deps] deps map[*DocumentSymbol][]dependency @@ -84,6 +135,13 @@ type Collector struct { // receivers cache: receiver symbol => list of method symbols receivers map[*DocumentSymbol][]*DocumentSymbol + // aliasRedirect: typedef/using alias symbol -> the underlying type's + // symbol. Used at export time to substitute references to the alias + // with the real type's Identity, and to suppress emission of the alias + // itself as a standalone Type entry. C++ only for now. Guarded by c.mu + // like the rest of the collector state. + aliasRedirect map[*DocumentSymbol]*DocumentSymbol + // file content cache to reduce IO in Export fileContentCache map[string]string @@ -100,17 +158,45 @@ type Collector struct { CollectOption } +// collectorConcurrency caps the parallelism for processSymbol and the dep +// collection token loop. 32 matches clangd's default -j=32, letting every +// worker thread on the server side be saturated by an in-flight RPC. +const collectorConcurrency = 32 + // UseJavaIPC sets the Java IPC converter caches as the source of truth for Java collecting. // When enabled, Java Collect will not rely on LSP (no Definition/SemanticTokens). func (c *Collector) UseJavaIPC(conv *javaipc.Converter) { c.javaIPC = conv } + +// resolveAlias walks c.aliasRedirect (cyclic-safe) to find the real symbol +// behind a `using Y = X;` / `using NS::Name;` declaration. Returns the +// input unchanged when no redirect applies. +func (c *Collector) resolveAlias(sym *DocumentSymbol) *DocumentSymbol { + if sym == nil { + return nil + } + c.mu.Lock() + defer c.mu.Unlock() + visited := map[*DocumentSymbol]bool{sym: true} + for { + target, ok := c.aliasRedirect[sym] + if !ok || target == nil || visited[target] { + return sym + } + visited[target] = true + sym = target + } +} + // addImplementsRel records that `from` implements `iface`. Idempotent on (from, iface). func (c *Collector) addImplementsRel(from *DocumentSymbol, iface *DocumentSymbol, tokenLoc Location) { if from == nil || iface == nil { return } + c.mu.Lock() + defer c.mu.Unlock() for _, existing := range c.implementsRel[from] { if existing.Symbol == iface { return @@ -119,6 +205,50 @@ func (c *Collector) addImplementsRel(from *DocumentSymbol, iface *DocumentSymbol c.implementsRel[from] = append(c.implementsRel[from], dependency{Location: tokenLoc, Symbol: iface}) } +// collectCppBasesViaTypeHierarchy uses LSP typeHierarchy to resolve the +// supertypes of a C++ class symbol, recording an implements-rel for each +// resolved base. Returns true when typeHierarchy returned at least one +// supertype — the caller can then skip the text-level BaseClassRefs +// fallback. Returns false on RPC failure / empty results. +// +// typeHierarchy is the LSP-blessed way to ask the server "what does this +// class extend" — clangd has the parsed C++ AST so it knows about +// using-declarations, type aliases, and namespace-qualified bases +// without us reparsing the declaration text. +func (c *Collector) collectCppBasesViaTypeHierarchy(ctx context.Context, sym *DocumentSymbol) bool { + if c.cli == nil { + return false + } + // SelectionRange (the name token range) is what clangd expects; + // fall back to Location.Range.Start if not populated. + pos := sym.Location.Range.Start + if sym.SelectionRange != nil { + pos = sym.SelectionRange.Start + } + items, err := c.cli.PrepareTypeHierarchy(ctx, sym.Location.URI, pos) + if err != nil || len(items) == 0 { + return false + } + resolvedAny := false + for _, item := range items { + supers, err := c.cli.TypeHierarchySupertypes(ctx, item) + if err != nil { + continue + } + for _, sup := range supers { + loc := Location{URI: sup.URI, Range: sup.SelectionRange} + fakeTok := Token{Location: loc, Type: "class", Text: sup.Name} + baseSym, err := c.getSymbolByLocation(ctx, loc, 0, fakeTok) + if err != nil || baseSym == nil || baseSym == sym { + continue + } + c.addImplementsRel(sym, baseSym, loc) + resolvedAny = true + } + } + return resolvedAny +} + type methodInfo struct { Receiver dependency `json:"receiver"` Interface *dependency `json:"implement,omitempty"` // which interface it implements @@ -153,7 +283,19 @@ func switchSpec(l uniast.Language, repo string) LanguageSpec { } } +// ApplyCollectOptionToSpec forwards language-specific entries from +// CollectOption to the underlying LanguageSpec. Currently routes +// `--sysroot` paths into CppSpec; other languages are no-ops. +func (c *Collector) ApplyCollectOptionToSpec() { + if cs, ok := c.spec.(interface{ SetSysroots([]string) }); ok && len(c.Sysroots) > 0 { + cs.SetSysroots(c.Sysroots) + } +} + func NewCollector(repo string, cli *LSPClient) *Collector { + // Canonicalise: prefix-based "is this file inside the repo?" checks + // must match the realpath form clangd reports in sym.Location.URI. + repo = canonicalizePath(repo) ret := &Collector{ repo: repo, cli: cli, @@ -162,10 +304,13 @@ func NewCollector(repo string, cli *LSPClient) *Collector { funcs: map[*DocumentSymbol]functionInfo{}, deps: map[*DocumentSymbol][]dependency{}, implementsRel: map[*DocumentSymbol][]dependency{}, + cppFnEmitted: map[string]cppFnLoc{}, + cppUnresolvedBases: map[*DocumentSymbol][]string{}, vars: map[*DocumentSymbol]dependency{}, files: map[string]*uniast.File{}, fileContentCache: make(map[string]string), symsByFile: make(map[DocumentURI][]*DocumentSymbol), + aliasRedirect: map[*DocumentSymbol]*DocumentSymbol{}, } // if cli.Language == uniast.Rust { // ret.modPatcher = &rust.RustModulePatcher{Root: repo} @@ -221,9 +366,18 @@ func (c *Collector) Collect(ctx context.Context) error { if c.spec.IsEntitySymbol(*sym) { entity_syms = append(entity_syms, sym) } - if c.Language != uniast.Java { - c.processSymbol(ctx, sym, 1) + } + if c.Language != uniast.Java { + var psg errgroup.Group + psg.SetLimit(collectorConcurrency) + for _, sym := range root_syms { + sym := sym + psg.Go(func() error { + c.processSymbol(ctx, sym, 1) + return nil + }) } + _ = psg.Wait() } // collect internal references @@ -258,87 +412,185 @@ func (c *Collector) Collect(ctx context.Context) error { // } // } - // collect dependencies + // collect dependencies — parallel per entity symbol. processSymbol above + // already finished, so c.funcs/c.vars are read-only here. Writes to + // c.deps and c.syms are routed through c.mu / addSymbol. + var deg errgroup.Group + deg.SetLimit(collectorConcurrency) for _, sym := range entity_syms { - next_token: - - for i, token := range sym.Tokens { - // only entity token need to be collect (std token is only collected when NeedStdSymbol is true) - if !c.spec.IsEntityToken(token) { + sym := sym + deg.Go(func() error { + c.collectDepsForEntity(ctx, sym) + return nil + }) + } + _ = deg.Wait() + + // C++: needProcessExternal is gated on SKObject (clangd never reports + // that for C++), so external method/function bodies — including NVI + // base `provide()` → `do_provide()` calls — would otherwise have + // empty FC/MC edges. Drain the queue here. + if c.Language == uniast.Cpp { + c.mu.Lock() + queue := c.cppExtDepsQueue + c.cppExtDepsQueue = nil + c.mu.Unlock() + // De-dup; an entity may have been queued by multiple call sites. + seen := map[*DocumentSymbol]bool{} + uniq := queue[:0] + for _, sym := range queue { + if seen[sym] { continue } - - // skip function's params - if sym.Kind == SKFunction || sym.Kind == SKMethod { - if finfo, ok := c.funcs[sym]; ok { - if finfo.Method != nil { - if finfo.Method.Receiver.Location.Include(token.Location) { - continue next_token - } - } - if finfo.Inputs != nil { - if _, ok := finfo.Inputs[i]; ok { - continue next_token - } + seen[sym] = true + uniq = append(uniq, sym) + } + var eg errgroup.Group + eg.SetLimit(collectorConcurrency) + for _, sym := range uniq { + sym := sym + eg.Go(func() error { + if len(sym.Tokens) == 0 { + tokens, err := c.cli.SemanticTokens(ctx, sym.Location) + if err == nil { + sym.Tokens = tokens } - if finfo.Outputs != nil { - if _, ok := finfo.Outputs[i]; ok { + } + c.collectDepsForEntity(ctx, sym) + return nil + }) + } + _ = eg.Wait() + } + return nil +} + +// collectDepsForEntity resolves all dep tokens of a single entity symbol and +// appends them to c.deps[sym]. The caller is responsible for ensuring +// processSymbol has already run on sym (c.funcs/c.vars populated) and that +// sym.Location.URI is open in clangd so Definition queries can be answered. +func (c *Collector) collectDepsForEntity(ctx context.Context, sym *DocumentSymbol) { + localDeps := make([]dependency, 0, 8) + // Negative cache only: a token (type, text) that already failed to + // resolve once is unlikely to resolve on a later occurrence in the + // same body, so we skip the redundant Definition() RPC. + // We do NOT positive-cache: in C++, three sibling method calls like + // `a.process(c)`, `s.process(c)`, `g.process(c)` share token + // (type=method, text=process) but resolve to three different + // definitions; a positive cache would collapse them onto the first + // receiver's method and silently mis-route the call edges. + type tokKey struct{ typ, text string } + negativeResolved := make(map[tokKey]bool, 16) + + // Snapshot funcs/vars info for this sym once. After processSymbol + // has finished, these maps are only written when external recursion + // runs in other goroutines, but never for THIS sym key. + c.mu.Lock() + finfo, hasFn := c.funcs[sym] + vardep, hasVar := c.vars[sym] + c.mu.Unlock() +next_token: + for i, token := range sym.Tokens { + // only entity token need to be collect (std token is only collected when NeedStdSymbol is true) + if !c.spec.IsEntityToken(token) { + continue + } + + // skip function's params + if sym.Kind == SKFunction || sym.Kind == SKMethod { + if hasFn { + if finfo.Method != nil { + rl := finfo.Method.Receiver.Location + // Receiver.Location can legitimately be either the + // qualifier token (for out-of-line methods, narrow) + // or the whole class body (fallback for inline-in- + // class methods where the language spec couldn't + // find a qualifier in the signature). For the latter + // the body contains real call tokens, so we must NOT + // blanket-skip everything under Include — only the + // literal receiver name should be skipped. + if rl.Include(sym.Location) { + // inline: token text-match check + if finfo.Method.Receiver.Symbol != nil && + token.Text == finfo.Method.Receiver.Symbol.Name { continue next_token } + } else if rl.Include(token.Location) { + // out-of-line: receiver qualifier token range + continue next_token } - if finfo.TypeParams != nil { - if _, ok := finfo.TypeParams[i]; ok { - continue next_token - } + } + if finfo.Inputs != nil { + if _, ok := finfo.Inputs[i]; ok { + continue next_token } } - } - // skip variable's type - if sym.Kind == SKVariable || sym.Kind == SKConstant { - if dep, ok := c.vars[sym]; ok { - if dep.Location.Include(token.Location) { + if finfo.Outputs != nil { + if _, ok := finfo.Outputs[i]; ok { continue next_token } } - } - - // go to definition - dep, err := c.getSymbolByToken(ctx, token) - if err != nil || dep == nil { - if token.Type == "method_invocation" || token.Type == "static_method_invocation" { - // 外部依赖无法从LSP 中查询到定义,先不报错 - continue + if finfo.TypeParams != nil { + if _, ok := finfo.TypeParams[i]; ok { + continue next_token + } } - log.Error("dep token %v not found: %v\n", token, err) - continue } - - // NOTICE: some internal symbols may not been get by DocumentSymbols, thus we let Unknown symbol pass - if dep.Kind == SKUnknown && c.internal(dep.Location) { - // try get symbol kind by token - sk := c.spec.TokenKind(token) - if sk != SKUnknown { - dep.Kind = sk - dep.Name = token.Text + } + // skip variable's type + if sym.Kind == SKVariable || sym.Kind == SKConstant { + if hasVar { + if vardep.Location.Include(token.Location) { + continue next_token } } + } - // remove local symbols - if sym.Location.Include(dep.Location) { + // go to definition — only consult negative cache to avoid retrying + // tokens that already failed once. Positive results are NOT cached + // because same (type, text) can resolve to different definitions + // per occurrence (e.g. different receivers for the same short + // method name). + key := tokKey{typ: token.Type, text: token.Text} + if negativeResolved[key] { + continue + } + dep, err := c.getSymbolByToken(ctx, token) + if err != nil || dep == nil { + negativeResolved[key] = true + if token.Type == "method_invocation" || token.Type == "static_method_invocation" { + // 外部依赖无法从LSP 中查询到定义,先不报错 continue - } else { - c.addSymbol(dep.Location, dep) } + log.Error("dep token %v not found: %v\n", token, err) + continue + } + // NOTICE: some internal symbols may not been get by DocumentSymbols, thus we let Unknown symbol pass + if dep.Kind == SKUnknown && c.internal(dep.Location) { + // try get symbol kind by token + sk := c.spec.TokenKind(token) + if sk != SKUnknown { + dep.Kind = sk + dep.Name = token.Text + } + } - c.deps[sym] = append(c.deps[sym], dependency{ - Location: token.Location, - Symbol: dep, - }) - + // remove local symbols + if sym.Location.Include(dep.Location) { + continue } - } + c.addSymbol(dep.Location, dep) - return nil + localDeps = append(localDeps, dependency{ + Location: token.Location, + Symbol: dep, + }) + } + if len(localDeps) > 0 { + c.mu.Lock() + c.deps[sym] = append(c.deps[sym], localDeps...) + c.mu.Unlock() + } } // ScannerByJavaIPC collects Java symbols/deps from IPC Java Parser caches. @@ -1940,10 +2192,22 @@ func (c *Collector) internal(loc Location) bool { } func (c *Collector) addSymbol(loc Location, sym *DocumentSymbol) { + // Drop C++ cstdlib (sysroot) and build_generated (codegen) symbols + // at the storage gate so they never enter c.syms / c.symsByFile and + // thus never get iterated by export. Callers receive the *sym they + // constructed back and can still build edges by Identity. + if c.Language == uniast.Cpp { + if mod, _, err := c.spec.NameSpace(loc.URI.File(), nil); err == nil && + (mod == "cstdlib" || mod == "build_generated") { + return + } + } + c.mu.Lock() if _, ok := c.syms[loc]; !ok { c.syms[loc] = sym c.symsByFile[loc.URI] = append(c.symsByFile[loc.URI], sym) } + c.mu.Unlock() } func (c *Collector) getSymbolByToken(ctx context.Context, tok Token) (*DocumentSymbol, error) { @@ -2018,14 +2282,36 @@ func (c *Collector) getSymbolByLocation(ctx context.Context, loc Location, depth if !(from.Type == "typeParameter" && c.Language == uniast.Cpp) { // 1. already loaded // Optimization: only search in symbols of the same file - if fileSyms, ok := c.symsByFile[loc.URI]; ok { - if sym := c.findMatchingSymbolIn(loc, fileSyms); sym != nil { + c.mu.Lock() + fileSyms, ok := c.symsByFile[loc.URI] + // copy the slice header so iteration is safe after unlock + var snap []*DocumentSymbol + if ok { + snap = append(snap, fileSyms...) + } + c.mu.Unlock() + if ok { + if sym := c.findMatchingSymbolIn(loc, snap); sym != nil { return sym, nil } } } - if c.LoadExternalSymbol && !c.internal(loc) && (c.NeedStdSymbol || !c.spec.IsStdToken(from)) { + // Short-circuit for C++ sysroot (cstdlib) and build-time generated + // (build_generated) files: skip the deep external-load path + // (DocumentSymbols + per-sym Locate/SemanticTokens + processSymbol + // + cppExtDepsQueue). Edges still resolve via a fake Unknown symbol + // in the !LoadExternalSymbol branch below; export then emits the + // edge by Identity without creating a node. + skipExternalLoad := false + if c.LoadExternalSymbol && c.Language == uniast.Cpp && !c.internal(loc) { + if mod, _, err := c.spec.NameSpace(loc.URI.File(), nil); err == nil && + (mod == "cstdlib" || mod == "build_generated") { + skipExternalLoad = true + } + } + + if !skipExternalLoad && c.LoadExternalSymbol && !c.internal(loc) && (c.NeedStdSymbol || !c.spec.IsStdToken(from)) { // 2. load external symbol from its file syms, err := c.cli.DocumentSymbols(ctx, loc.URI) if err != nil { @@ -2033,21 +2319,23 @@ func (c *Collector) getSymbolByLocation(ctx context.Context, loc Location, depth } // load the other external symbols in that file for _, sym := range syms { - // save symbol first - if _, ok := c.syms[sym.Location]; !ok { - content, err := c.cli.Locate(sym.Location) - if err != nil { - return nil, err - } - sym.Text = content - c.addSymbol(sym.Location, sym) + c.mu.Lock() + _, already := c.syms[sym.Location] + c.mu.Unlock() + if already { + continue + } + content, err := c.cli.Locate(sym.Location) + if err != nil { + return nil, err } + sym.Text = content + c.addSymbol(sym.Location, sym) } // load more external symbols if depth permits if depth >= 0 { // process target symbol for _, sym := range syms { - // check if need process if c.needProcessExternal(sym) { // collect tokens before process tokens, err := c.cli.SemanticTokens(ctx, sym.Location) @@ -2057,6 +2345,13 @@ func (c *Collector) getSymbolByLocation(ctx context.Context, loc Location, depth sym.Tokens = tokens c.processSymbol(ctx, sym, depth-1) } + // Defer external C++ method/function deps to a final pass + // (inline collection would recurse via getSymbolByToken). + if c.Language == uniast.Cpp && (sym.Kind == SKMethod || sym.Kind == SKFunction) { + c.mu.Lock() + c.cppExtDepsQueue = append(c.cppExtDepsQueue, sym) + c.mu.Unlock() + } } } rsym := c.findMatchingSymbolIn(loc, slices.Collect(maps.Values(syms))) @@ -2220,22 +2515,121 @@ func (c *Collector) collectImpl(ctx context.Context, sym *DocumentSymbol, depth if impl == "" || len(impl) < len(sym.Name) { impl = fmt.Sprintf("class %s {\n", sym.Name) } - // search all methods + // C++ multi-inheritance: pick up additional base classes beyond the + // single `inter` slot exposed by ImplSymbol. For each base class token, + // resolve it and record an implements-rel so the Type's Implements + // field is populated. + if c.Language == uniast.Cpp { + // Primary path: typeHierarchy. clangd knows the real C++ AST so + // it returns properly resolved supertypes (including across + // using-declarations and aliases) without us having to text-parse + // the base clause. Falls back to BaseClassRefs below when + // typeHierarchy returns nothing — typical for bases clangd can't + // resolve (external templates) where we need the raw text name + // for unresolved-Implements recording. + thyResolved := c.collectCppBasesViaTypeHierarchy(ctx, sym) + // Fallback / supplemental: source-text base extraction. + // BaseClassRefs always returns one entry per base specifier, so + // we can issue a Definition query at the recovered file position + // for anything typeHierarchy missed (or use it to record + // unresolved bases). + if mi, ok := c.spec.(interface { + BaseClassRefs(sym DocumentSymbol) []cpp.BaseClassRef + }); ok && !thyResolved { + for _, ref := range mi.BaseClassRefs(*sym) { + fakeTok := Token{ + Location: ref.Loc, + Type: "class", + Text: ref.Name, + } + baseSym, err := c.getSymbolByTokenWithLimit(ctx, fakeTok, depth) + // `using NS::Name;` declarations are reported by clangd as + // a Class sym whose Range covers only the `Name` token — + // not a real class definition. Detect that narrow range + // and follow one more Definition hop to reach the real + // target (avoiding NodeIDs like `Cur::Provider::Provider`). + for hops := 0; baseSym != nil && hops < 4; hops++ { + r := baseSym.Location.Range + if r.End.Line != r.Start.Line { + break // multi-line: real definition + } + if int(r.End.Character)-int(r.Start.Character) > len(baseSym.Name)+2 { + break // wide enough to be a real def, not just a name + } + defs, derr := c.cli.Definition(ctx, baseSym.Location.URI, baseSym.Location.Range.Start) + if derr != nil || len(defs) == 0 { + break + } + if defs[0] == baseSym.Location { + break + } + next, nerr := c.getSymbolByLocation(ctx, defs[0], depth, fakeTok) + if nerr != nil || next == nil || next == baseSym { + break + } + baseSym = next + } + // When the fake-token location can't be resolved by clangd + // (typical for unresolved external template bases), the + // Definition query either fails or falls back to the + // enclosing symbol (the class itself). Detect "resolved + // to self" by Location containment, NOT by short-name + // equality — short names can legitimately collide across + // namespaces (e.g. `app::Provider : public common::Provider`). + selfResolved := baseSym == sym || + (baseSym != nil && baseSym.Location.URI == sym.Location.URI && + sym.Location.Include(baseSym.Location)) + if err != nil || baseSym == nil || selfResolved { + c.mu.Lock() + c.cppUnresolvedBases[sym] = append(c.cppUnresolvedBases[sym], ref.Name) + c.mu.Unlock() + continue + } + // Follow `using NS::Name;` / `using Y = X;` aliases so the + // recorded base points at the real type, not at the alias + // symbol (which would yield NodeIDs like `Cur::Provider::Provider`). + baseSym = c.resolveAlias(baseSym) + c.addImplementsRel(sym, baseSym, ref.Loc) + } + } else if mi, ok := c.spec.(interface { + BaseClassTokens(sym DocumentSymbol) []int + }); ok { + for _, idx := range mi.BaseClassTokens(*sym) { + if idx < 0 || idx >= len(sym.Tokens) { + continue + } + baseTok := sym.Tokens[idx] + baseSym, err := c.getSymbolByTokenWithLimit(ctx, baseTok, depth) + if err != nil || baseSym == nil { + continue + } + c.addImplementsRel(sym, baseSym, baseTok.Location) + } + } + } + + // search all methods — snapshot c.syms first so we hold mu only briefly, + // then mutate c.funcs under mu. + c.mu.Lock() + candidates := make([]*DocumentSymbol, 0, len(c.syms)) for _, method := range c.syms { - // NOTICE: some class method (ex: XXType::new) are SKFunction, but still collect its receiver if (method.Kind == SKMethod || method.Kind == SKFunction) && sym.Location.Include(method.Location) { - if _, ok := c.funcs[method]; !ok { - c.funcs[method] = functionInfo{} - } - f := c.funcs[method] - f.Method = &methodInfo{ - Receiver: *rd, - Interface: ind, - ImplHead: impl, - } - c.funcs[method] = f + candidates = append(candidates, method) } } + for _, method := range candidates { + if _, ok := c.funcs[method]; !ok { + c.funcs[method] = functionInfo{} + } + f := c.funcs[method] + f.Method = &methodInfo{ + Receiver: *rd, + Interface: ind, + ImplHead: impl, + } + c.funcs[method] = f + } + c.mu.Unlock() } func (c *Collector) needProcessExternal(sym *DocumentSymbol) bool { @@ -2243,6 +2637,25 @@ func (c *Collector) needProcessExternal(sym *DocumentSymbol) bool { } func (c *Collector) processSymbol(ctx context.Context, sym *DocumentSymbol, depth int) { + // C++ `using Y = X;` / `using NS::Name;` aliases: resolve target once + // and stash a redirect so subsequent dep lookups / Export resolve + // through to the real type. + if c.Language == uniast.Cpp { + if ai, ok := c.spec.(interface { + AliasTargetTokenIndex(sym DocumentSymbol) int + }); ok { + idx := ai.AliasTargetTokenIndex(*sym) + if idx >= 0 && idx < len(sym.Tokens) { + target, err := c.getSymbolByTokenWithLimit(ctx, sym.Tokens[idx], depth) + if err == nil && target != nil && target != sym { + c.mu.Lock() + c.aliasRedirect[sym] = target + c.mu.Unlock() + } + } + } + } + // method info: receiver, implementee hasImpl := c.spec.HasImplSymbol() if hasImpl { @@ -2260,6 +2673,17 @@ func (c *Collector) processSymbol(ctx context.Context, sym *DocumentSymbol, dept log.Error("get receiver symbol for token %v failed: %v\n", rec, err) } } + // C++ fallback: clangd reports inline methods with an unqualified + // Name (e.g. `process(...)` instead of `ApiHandler::process(...)`), + // so FunctionSymbol can't locate a receiver token in the signature. + // Recover the receiver from the enclosing class/struct in the + // documentSymbol tree, otherwise distinct methods of distinct + // external classes collapse to namespace-level overloads. + if rd == nil && c.Language == uniast.Cpp && c.cli != nil { + if p := c.cli.GetParent(sym); p != nil && (p.Kind == SKClass || p.Kind == SKStruct || p.Kind == SKInterface) { + rd = &dependency{Location: p.Location, Symbol: p} + } + } tsyms, ts := c.getDepsWithLimit(ctx, sym, tps, depth-1) ipsyms, is := c.getDepsWithLimit(ctx, sym, ips, depth-1) opsyms, os := c.getDepsWithLimit(ctx, sym, ops, depth-1) @@ -2325,14 +2749,18 @@ func (c *Collector) processSymbol(ctx context.Context, sym *DocumentSymbol, dept log.Error("get type symbol for token %s failed:%v\n", sym.Tokens[i], err) return } + c.mu.Lock() c.vars[sym] = dependency{ Location: sym.Tokens[i].Location, Symbol: tsym, } + c.mu.Unlock() } } func (c *Collector) updateFunctionInfo(sym *DocumentSymbol, tsyms, ipsyms, opsyms map[int]dependency, ts, is, os []dependency, rsym *dependency, lastToken int) { + c.mu.Lock() + defer c.mu.Unlock() if _, ok := c.funcs[sym]; !ok { c.funcs[sym] = functionInfo{} } diff --git a/lang/collect/cpp_body_test.go b/lang/collect/cpp_body_test.go new file mode 100644 index 00000000..d21b84bc --- /dev/null +++ b/lang/collect/cpp_body_test.go @@ -0,0 +1,58 @@ +// Copyright 2025 CloudWeGo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collect + +import "testing" + +func TestCppFnHasBody(t *testing.T) { + cases := []struct { + name string + content string + want bool + }{ + // Declarations + {"plain decl", `void f();`, false}, + {"decl with default arg", `int g(int x = 0);`, false}, + {"decl with lambda default — the headline repro case", + `void f(std::function cb = []{});`, false}, + {"decl with brace-init default", + `void f(std::vector v = {1, 2, 3});`, false}, + {"decl with templated function-type default", + `void f(std::function cb = [](int x) { return x*2; });`, false}, + {"decl with trailing override/noexcept", `int handle(int x) const noexcept;`, false}, + {"comment containing brace", `// f { ... +void f();`, false}, + {"block comment containing brace", `/* {} */ void f();`, false}, + {"string literal containing brace", `void f(const char* s = "hello {world}");`, false}, + + // Definitions + {"plain definition", `void f() {}`, true}, + {"definition with body", `int g(int x) { return x + 1; }`, true}, + {"ctor with member init list", + `Foo::Foo(int x) : member_(x) {}`, true}, + {"definition with lambda default + body", + `void f(std::function cb = []{}) { cb(); }`, true}, + {"definition with brace-init default + body", + `void f(std::vector v = {1, 2, 3}) { use(v); }`, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := cppFnHasBody(tc.content) + if got != tc.want { + t.Errorf("cppFnHasBody(%q) = %v, want %v", tc.content, got, tc.want) + } + }) + } +} diff --git a/lang/collect/export.go b/lang/collect/export.go index bad885ac..adfb4c84 100644 --- a/lang/collect/export.go +++ b/lang/collect/export.go @@ -35,6 +35,30 @@ type dependency struct { Symbol *DocumentSymbol `json:"symbol"` } +// lightIdentityForExternal builds a Identity for an external symbol we don't +// want to fully export (no Type/Function entry produced) but still want to +// reference from the current symbol's Implements / similar edge fields. +// Returns nil if not enough info is available. +func (c *Collector) lightIdentityForExternal(sym *DocumentSymbol) *uniast.Identity { + if sym == nil || sym.Name == "" { + return nil + } + file := sym.Location.URI.File() + mod, pkg, err := c.spec.NameSpace(file, nil) + if err != nil || mod == "" { + return nil + } + name := sym.Name + // For C++ classes, the short name from clangd doesn't include namespace; + // best-effort prepend the lexical scope from sibling symbols. Skip if it + // already qualifies. + if c.Language == uniast.Cpp { + name = applyCppScopePrefix(c.scopePrefix(sym), name) + } + id := uniast.NewIdentity(mod, pkg, name) + return &id +} + func (c *Collector) fileLine(loc Location) uniast.FileLine { var rel string if c.internal(loc) { @@ -78,6 +102,178 @@ func (c *Collector) fileLine(loc Location) uniast.FileLine { } } +// cppFnHasBody returns true when `content` represents a function +// DEFINITION (with body) rather than a DECLARATION. A naive +// strings.Contains(content, "{") check misclassifies header declarations +// whose parameter list contains `{`-bearing constructs — for example a +// default argument with a lambda or a braced-init: +// +// void f(std::function cb = []{}); +// void g(std::vector v = {1, 2, 3}); +// +// In both cases the `{` lives INSIDE the parameter list (paren depth > 0). +// A true function body is the `{` that appears AFTER the parameter list's +// closing `)` (paren depth back to 0). We walk the text skipping strings, +// chars, and comments, track paren depth, and only count `{` once the +// parameter list has closed. +func cppFnHasBody(content string) bool { + parenDepth := 0 + sawParamClose := false + const ( + stNormal = iota + stLineComment + stBlockComment + stString + stChar + ) + st := stNormal + for i := 0; i < len(content); i++ { + c := content[i] + switch st { + case stLineComment: + if c == '\n' { + st = stNormal + } + continue + case stBlockComment: + if c == '*' && i+1 < len(content) && content[i+1] == '/' { + i++ + st = stNormal + } + continue + case stString: + if c == '\\' && i+1 < len(content) { + i++ + continue + } + if c == '"' { + st = stNormal + } + continue + case stChar: + if c == '\\' && i+1 < len(content) { + i++ + continue + } + if c == '\'' { + st = stNormal + } + continue + } + // stNormal + if c == '/' && i+1 < len(content) { + if content[i+1] == '/' { + st = stLineComment + i++ + continue + } + if content[i+1] == '*' { + st = stBlockComment + i++ + continue + } + } + switch c { + case '"': + st = stString + case '\'': + st = stChar + case '(': + parenDepth++ + case ')': + if parenDepth > 0 { + parenDepth-- + } + if parenDepth == 0 { + sawParamClose = true + } + case '{': + if sawParamClose && parenDepth == 0 { + return true + } + } + } + return false +} + +// dedupCppFunction handles header-decl-vs-cpp-def collapsing for a single +// emitted Function. Returns true when the caller should drop `obj` (the +// existing entry already won, possibly with body fields copied over from +// `obj`). Returns false when `obj` should be written into pkg.Functions +// as the canonical version. +func (c *Collector) dedupCppFunction(repo *uniast.Repository, name, mod, path, content string, obj *uniast.Function) bool { + hasBody := cppFnHasBody(content) + isHeader := isCppHeaderPkg(path) + cur := cppFnLoc{mod: mod, pkg: path, hasBody: hasBody, isHeader: isHeader} + prev, hasPrev := c.cppFnEmitted[name] + if !hasPrev { + c.cppFnEmitted[name] = cur + return false + } + prevPkg := func() *uniast.Package { + if pm := repo.Modules[prev.mod]; pm != nil { + return pm.Packages[prev.pkg] + } + return nil + } + switch { + case prev.hasBody && !hasBody && prev.isHeader: + return true // header already has body + case prev.hasBody && !hasBody && !prev.isHeader: + // Move .cpp body into the .h entry we're about to emit. + if pp := prevPkg(); pp != nil { + if src, ok := pp.Functions[name]; ok { + copyFnBodyFields(obj, src) + delete(pp.Functions, name) + } + } + case hasBody && !prev.hasBody && prev.isHeader: + // .h decl already emitted — copy our body fields into it. + if pp := prevPkg(); pp != nil { + if hdr, ok := pp.Functions[name]; ok { + copyFnBodyFields(hdr, obj) + prev.hasBody = true + c.cppFnEmitted[name] = prev + } + } + return true + case hasBody && !prev.hasBody && !prev.isHeader: + if pp := prevPkg(); pp != nil { + delete(pp.Functions, name) + } + default: + // Same body status — prefer the header. + if prev.isHeader && !isHeader { + return true + } + if isHeader && !prev.isHeader { + if pp := prevPkg(); pp != nil { + delete(pp.Functions, name) + } + break + } + if prev.mod == mod && prev.pkg == path { + return true + } + } + c.cppFnEmitted[name] = cur + return false +} + +// copyFnBodyFields copies the body-derived fields of src onto dst, +// leaving dst's identity / receiver / location-anchoring intact. Used by +// dedupCppFunction to merge a .cpp definition into a .h declaration. +func copyFnBodyFields(dst, src *uniast.Function) { + dst.Content = src.Content + dst.FunctionCalls = src.FunctionCalls + dst.MethodCalls = src.MethodCalls + dst.GlobalVars = src.GlobalVars + dst.Types = src.Types + dst.Params = src.Params + dst.Results = src.Results + dst.Signature = src.Signature +} + func newModule(name string, dir string, lang uniast.Language) *uniast.Module { ret := uniast.NewModule(name, dir, lang) return ret @@ -129,13 +325,45 @@ func (c *Collector) Export(ctx context.Context) (*uniast.Repository, error) { } } - // export symbols + // External C++ methods skip processSymbol (gated by needProcessExternal) + // so c.funcs has no entry and Type.Methods stays empty. Recover the + // receiver from the documentSymbol parent. + if c.Language == uniast.Cpp && c.cli != nil { + for _, sym := range c.syms { + if sym.Kind != SKMethod { + continue + } + if fi, ok := c.funcs[sym]; ok && fi.Method != nil && fi.Method.Receiver.Symbol != nil { + continue + } + p := c.cli.GetParent(sym) + if p == nil || (p.Kind != SKClass && p.Kind != SKStruct && p.Kind != SKInterface) { + continue + } + c.receivers[p] = append(c.receivers[p], sym) + fi := c.funcs[sym] + if fi.Method == nil { + fi.Method = &methodInfo{} + } + fi.Method.Receiver = dependency{Location: p.Location, Symbol: p} + c.funcs[sym] = fi + } + } + log.Info("Export: exporting %d symbols...\n", len(c.syms)) visited := make(map[*DocumentSymbol]*uniast.Identity) for _, symbol := range c.syms { _, _ = c.exportSymbol(&repo, symbol, "", visited) } + // Synthesize inherited methods per derived class so the call graph can + // be walked through NVI/virtual dispatch. Outgoing this-edges in the + // synthesized body are devirtualized to derived overrides where present. + if c.Language == uniast.Cpp { + log.Info("Export: synthesizing inherited C++ methods...\n") + c.synthesizeInheritedMethodsCpp(&repo) + } + log.Info("Export: connecting files to packages...\n") for fp, f := range c.files { rel, err := filepath.Rel(c.repo, fp) @@ -167,6 +395,24 @@ func (c *Collector) Export(ctx context.Context) (*uniast.Repository, error) { f.Package = pkgpath } + // Drop packages that ended up empty after dedup / method-relocation. + // For C++ this commonly happens to .cpp packages whose only entries + // were method definitions relocated into their .h owner package. + for _, m := range repo.Modules { + if m == nil { + continue + } + for pkgPath, pkg := range m.Packages { + if pkg == nil { + delete(m.Packages, pkgPath) + continue + } + if len(pkg.Functions) == 0 && len(pkg.Types) == 0 && len(pkg.Vars) == 0 { + delete(m.Packages, pkgPath) + } + } + } + return &repo, nil } @@ -270,6 +516,32 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol return } + // `using Y = X;` redirects to the real target; the alias itself is never + // emitted. Unresolved aliases (e.g. `using A = double`) are dropped below. + if len(c.aliasRedirect) > 0 { + seen := map[*DocumentSymbol]bool{symbol: true} + cur := symbol + for { + nxt, ok := c.aliasRedirect[cur] + if !ok || nxt == nil || seen[nxt] { + break + } + seen[nxt] = true + cur = nxt + } + if cur != symbol { + return c.exportSymbol(repo, cur, refName, visited) + } + } + if c.Language == uniast.Cpp { + if us, ok := c.spec.(interface { + IsUsingAlias(sym DocumentSymbol) bool + }); ok && us.IsUsingAlias(*symbol) { + e = ErrExternalSymbol + return + } + } + // 判断是否为本地符号 // 只有符号是“定义”,或者符号是“本地方法”时,才需要完整导出 // 其他情况(如外部引用、或对本地非顶层符号的引用)都只导出标识符 @@ -344,12 +616,8 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol name = c.extractCppCallSig(symbol) } - // join name with namespace - if ns := c.scopePrefix(symbol); ns != "" { - if !strings.HasPrefix(name, ns+"::") { - name = ns + "::" + name - } - } + // join name with namespace + class chain + name = applyCppScopePrefix(c.scopePrefix(symbol), name) } tmp := uniast.NewIdentity(mod, path, name) @@ -378,6 +646,15 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol // Save to visited ONLY WHEN no errors occur visited[symbol] = id + // cstdlib (sysroot) and build_generated (codegen) modules carry + // only edges by design — collect already drops these syms from + // c.syms (see addSymbol), so this branch only fires for *recursive* + // emission via dep edges (Function.FunctionCalls / Types / etc.). + // Return the Identity so the edge still resolves; skip body emission. + if c.Language == uniast.Cpp && (mod == "cstdlib" || mod == "build_generated") { + return + } + // Walk down from repo struct if repo.Modules[mod] == nil { repo.Modules[mod] = newModule(mod, "", c.Language) @@ -548,6 +825,22 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol } depid, err := c.exportSymbol(repo, dep.Symbol, tok, visited) if err != nil { + // Preserve external call edges as lightweight Identities + // so the call graph remains walkable in default mode. + if errors.Is(err, ErrExternalSymbol) && + (dep.Symbol.Kind == SKFunction || dep.Symbol.Kind == SKMethod) { + if ext := c.lightIdentityForExternal(dep.Symbol); ext != nil { + pdep := uniast.NewDependency(*ext, c.fileLine(dep.Location)) + if dep.Symbol.Kind == SKFunction { + obj.FunctionCalls = uniast.InsertDependency(obj.FunctionCalls, pdep) + } else { + if obj.MethodCalls == nil { + obj.MethodCalls = make([]uniast.Dependency, 0, len(deps)) + } + obj.MethodCalls = uniast.InsertDependency(obj.MethodCalls, pdep) + } + } + } continue } pdep := uniast.NewDependency(*depid, c.fileLine(dep.Location)) @@ -576,14 +869,51 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol } } obj.Identity = *id - pkg.Functions[id.Name] = obj + // C++: a method is reported by clangd as two DocumentSymbols, one + // at the header declaration and one at the .cpp definition. We + // want exactly one Function in the AST per NodeID, preferring the + // one that carries the body (and thus FC/MC edges). When the + // definition is unreachable (external base classes, pure-virtual + // C++ dedup: clangd reports the .h declaration and .cpp definition + // as two DocumentSymbols. Collapse them into a single entry under + // the .h pkg (the "owner") carrying the .cpp body's edges. + // `content` is raw symbol text — ImplHead-wrapped obj.Content can + // look like it has a body even when the original was a decl. + dropForDup := false + if c.Language == uniast.Cpp && (k == SKMethod || k == SKFunction) { + dropForDup = c.dedupCppFunction(repo, id.Name, mod, path, content, obj) + } + if !dropForDup { + pkg.Functions[id.Name] = obj + } // Type case SKStruct, SKTypeParameter, SKInterface, SKEnum, SKClass: + // Forward declarations (`class X;` / `struct X;`) are reported by + // clangd as separate DocumentSymbols. Skip them — the real + // definition is emitted by a different pass under its own pkg. + // (typedef X Y; also goes through SKClass/SKStruct in clangd but + // must NOT be filtered — it never starts with the class/struct + // keyword.) + if c.Language == uniast.Cpp && (k == SKClass || k == SKStruct) && + !strings.Contains(content, "{") { + trim := strings.TrimSpace(content) + if strings.HasPrefix(trim, "class ") || strings.HasPrefix(trim, "struct ") { + break + } + } + tkind := mapKind(k) + // C++ typedefs are reported by clangd with SymbolKind Class/Struct, + // but they're aliases. Tag them as Typedef so downstream consumers + // can distinguish from real types. `using Y = X;` is handled + // upstream (alias redirect) and never reaches here. + if c.Language == uniast.Cpp && strings.HasPrefix(strings.TrimSpace(content), "typedef ") { + tkind = uniast.TypeKindTypedef + } obj := &uniast.Type{ FileLine: fileLine, Content: content, - TypeKind: mapKind(k), + TypeKind: tkind, Exported: public, } // Implements relationship is preserved as a first-class field rather @@ -597,12 +927,51 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol } iid, err := c.exportSymbol(repo, rel.Symbol, tok, visited) if err != nil { + // External base classes are valuable signal even when + // the full external symbol isn't loaded — emit a + // lightweight Identity so consumers still see the + // inheritance edge. (Only do this when we actually + // have enough info to form a stable Identity.) + if errors.Is(err, ErrExternalSymbol) { + if ext := c.lightIdentityForExternal(rel.Symbol); ext != nil { + obj.Implements = append(obj.Implements, *ext) + implSyms[rel.Symbol] = true + } + } continue } obj.Implements = append(obj.Implements, *iid) implSyms[rel.Symbol] = true } } + // C++: bases that clangd couldn't resolve (typical for external + // template bases like `SimpleProvider<...>`) get recorded by + // collect as plain names. Synthesise a light Identity per name. + if c.Language == uniast.Cpp { + seen := map[string]bool{} + for _, base := range obj.Implements { + seen[base.Name] = true + } + for _, baseName := range c.cppUnresolvedBases[symbol] { + // Skip if a real (or already-light) edge to this name exists. + dup := false + for k := range seen { + if k == baseName || strings.HasSuffix(k, "::"+baseName) { + dup = true + break + } + } + if dup { + continue + } + obj.Implements = append(obj.Implements, uniast.Identity{ + ModPath: "external", + PkgPath: "external", + Name: baseName, + }) + seen[baseName] = true + } + } // collect deps if deps := c.deps[symbol]; deps != nil { for _, dep := range deps { @@ -638,9 +1007,15 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol if err != nil { continue } - // NOTICE: use method name as key here + // NOTICE: use method name as key here. + // For C++: derive the key from the constructed Identity + // Name (mid.Name), which includes the full signature + // (`ns::Class::handle(int x)`), then strip the receiver + // scope via cppBaseName. This produces "handle(int x)" + // vs "handle(long x)" so overloads don't collapse onto a + // single short-name key. if c.Language == uniast.Cpp { - methodName := c.cppBaseName(method.Name) + methodName := c.cppBaseName(mid.Name) _, methodExist := obj.Methods[methodName] isHeaderMethod := strings.HasSuffix(method.Location.URI.File(), ".h") if methodExist && isHeaderMethod { @@ -707,9 +1082,19 @@ func (c *Collector) scopePrefix(sym *DocumentSymbol) string { if p == nil { break } - if p.Kind == lsp.SKNamespace { + // Walk over enclosing namespaces AND class/struct/interface scopes so + // inline methods of external classes get the full receiver qualifier + // (e.g. `cppservice::ApiHandler::process`). Without the class hop, + // every method in an external header collapses to `ns::method` and + // distinct methods of distinct classes clash. + switch p.Kind { + case lsp.SKNamespace, lsp.SKClass, lsp.SKStruct, lsp.SKInterface: if p.Name != "" { - parts = append([]string{p.Name}, parts...) + n := p.Name + if i := strings.IndexByte(n, '<'); i >= 0 { // strip template args: Foo -> Foo + n = n[:i] + } + parts = append([]string{n}, parts...) } } cur = p @@ -717,6 +1102,288 @@ func (c *Collector) scopePrefix(sym *DocumentSymbol) string { return strings.Join(parts, "::") // "a::b" } +// applyCppScopePrefix prepends the lexical scope (namespace+class chain) to +// the bare symbol name, but only the portion that's actually missing. +// +// Examples (prefix = "cppservice::ApiHandler"): +// +// "process(...)" -> "cppservice::ApiHandler::process(...)" +// "ApiHandler::process(...)" -> "cppservice::ApiHandler::process(...)" +// "cppservice::ApiHandler::process()" -> unchanged +func applyCppScopePrefix(prefix, name string) string { + if prefix == "" { + return name + } + parts := strings.Split(prefix, "::") + for i := 0; i < len(parts); i++ { + suffix := strings.Join(parts[i:], "::") + "::" + if strings.HasPrefix(name, suffix) { + if i == 0 { + return name + } + return strings.Join(parts[:i], "::") + "::" + name + } + } + return prefix + "::" + name +} + +// synthesizeInheritedMethodsCpp walks every C++ Type with `Implements` and +// emits a derived-class view of each inherited method, mirroring the way +// Model B describes inheritance: a synthesized `D::m` symbol that owns +// the body of `B::m` but is bound to `D`'s vtable. Virtual dispatch inside +// the inherited body is devirtualized — calls to `B::n` get rewritten to +// `D::n` when `D` overrides `n` — so a walker can step from +// `D::inherited` straight into `D::override` without losing the chain at +// `B::n`. +// +// We deliberately do not rewrite caller-side edges here: the receiver's +// static type at each call site is not currently propagated through the +// collector, so we cannot reliably choose the right `D` per call. That +// rewrite is a follow-up; the synthesized symbols are still useful on +// their own (they materialize the inheritance closure in the AST, and +// downstream tooling can map B::m → {D::m} via the Implements graph). +func (c *Collector) synthesizeInheritedMethodsCpp(repo *uniast.Repository) { + // Build a flat index of every concrete function in the repo keyed by + // its Identity, so we can look up methods of a base class B in O(1). + type funcSlot struct { + mod *uniast.Module + pkg *uniast.Package + fn *uniast.Function + } + byReceiver := map[uniast.Identity][]*funcSlot{} + // typesById lets a synthesised method back-fill the derived class's + // Type.Methods map. + typesById := map[uniast.Identity]*uniast.Type{} + for _, mod := range repo.Modules { + for _, pkg := range mod.Packages { + for _, fn := range pkg.Functions { + if fn.IsMethod && fn.Receiver != nil { + byReceiver[fn.Receiver.Type] = append(byReceiver[fn.Receiver.Type], &funcSlot{mod: mod, pkg: pkg, fn: fn}) + } + } + for _, ty := range pkg.Types { + typesById[ty.Identity] = ty + } + } + } + + // methodLocalSig returns the receiver-scope-stripped signature + // "method(params)" — overload-distinguishing key for dedup so e.g. + // `Base::foo(int)` and `Base::foo(string)` don't collide. + methodLocalSig := func(qualified string) string { + head := qualified + tail := "" + if i := strings.Index(qualified, "("); i >= 0 { + head = qualified[:i] + tail = qualified[i:] + } + if i := strings.LastIndex(head, "::"); i >= 0 { + head = head[i+2:] + } + return head + tail + } + // methodScope strips the trailing "::method(params)" giving the + // receiver namespace+class prefix. + methodScope := func(qualified string) string { + s := qualified + if i := strings.Index(s, "("); i >= 0 { + s = s[:i] + } + if i := strings.LastIndex(s, "::"); i >= 0 { + return s[:i] + } + return "" + } + // Replace the receiver scope of a qualified method name with `newScope`, + // keeping the trailing "::method(params)" intact. Used to rewrite + // `B::n(args)` -> `D::n(args)` inside synthesized bodies. + withScope := func(qualified, newScope string) string { + head := qualified + tail := "" + if i := strings.Index(head, "("); i >= 0 { + tail = head[i:] + head = head[:i] + } + short := head + if i := strings.LastIndex(head, "::"); i >= 0 { + short = head[i+2:] + } + if newScope == "" { + return short + tail + } + return newScope + "::" + short + tail + } + + // Collect transitive bases of a type (BFS over Implements). + transitiveBases := func(t *uniast.Type) []uniast.Identity { + seen := map[uniast.Identity]bool{} + var out []uniast.Identity + queue := append([]uniast.Identity(nil), t.Implements...) + for len(queue) > 0 { + b := queue[0] + queue = queue[1:] + if seen[b] { + continue + } + seen[b] = true + out = append(out, b) + // follow base's bases via repo + if bm := repo.Modules[b.ModPath]; bm != nil { + if bp := bm.Packages[b.PkgPath]; bp != nil { + if bt := bp.Types[b.Name]; bt != nil { + queue = append(queue, bt.Implements...) + } + } + } + } + return out + } + + // Iterate over each Type in each module/package; types-snapshot first + // to avoid mutating the map while iterating. + type typeSlot struct { + mod *uniast.Module + pkg *uniast.Package + ty *uniast.Type + } + var allTypes []typeSlot + for _, mod := range repo.Modules { + for _, pkg := range mod.Packages { + for _, ty := range pkg.Types { + if len(ty.Implements) > 0 { + allTypes = append(allTypes, typeSlot{mod, pkg, ty}) + } + } + } + } + + for _, ts := range allTypes { + D := ts.ty + // D's own methods, indexed by local signature (short name + param + // list) so overloads like `foo(int)` vs `foo(string)` are treated + // as distinct. + dOwnBySig := map[string]bool{} + for _, f := range byReceiver[D.Identity] { + dOwnBySig[methodLocalSig(f.fn.Name)] = true + } + // Methods already synthesized this round, keyed by local sig, + // to avoid duplicates when both diamond bases provide the same + // overload. + synthBySig := map[string]bool{} + + dScope := D.Identity.Name // "ns::ClassD" + + for _, baseID := range transitiveBases(D) { + baseMethods := byReceiver[baseID] + // C++ headers + .cpp produce two records with identical + // Identity.Name (declaration in header, definition in cpp). + // Pick the one with a richer call graph so the synthesis + // downstream isn't sensitive to map-iteration order. Keyed + // by local sig so overloads survive. + bestBySig := map[string]*funcSlot{} + callCount := func(f *uniast.Function) int { + return len(f.MethodCalls) + len(f.FunctionCalls) + len(f.GlobalVars) + len(f.Types) + } + for _, bf := range baseMethods { + s := methodLocalSig(bf.fn.Name) + cur, ok := bestBySig[s] + if !ok || callCount(bf.fn) > callCount(cur.fn) { + bestBySig[s] = bf + } + } + for sig, bf := range bestBySig { + if dOwnBySig[sig] || synthBySig[sig] { + continue + } + // Construct synthesized Identity: "::". + // Search for the receiver-name `::` only in the head (before + // the parameter list) — parameter types like `const + // std::string&` contain `::` of their own and would otherwise + // trick LastIndex into picking the wrong split point. + tail := bf.fn.Name + head := bf.fn.Name + paramStart := len(bf.fn.Name) + if j := strings.Index(bf.fn.Name, "("); j >= 0 { + head = bf.fn.Name[:j] + paramStart = j + } + if i := strings.LastIndex(head, "::"); i >= 0 { + tail = bf.fn.Name[i+2:] + } else { + tail = bf.fn.Name[paramStart:] // method has no scope at all + tail = head + tail + } + newName := dScope + "::" + tail + newId := uniast.NewIdentity(ts.mod.Name, ts.pkg.PkgPath, newName) + if _, exists := ts.pkg.Functions[newName]; exists { + synthBySig[sig] = true + continue + } + + // Deep-copy the function, devirtualizing this-edges where D + // overrides the called signature. Match by local sig so we + // don't redirect `Base::foo(string)` to `D::foo(string)` + // when D only overrode `foo(int)`. + rewriteScope := func(target uniast.Identity) uniast.Identity { + if methodScope(target.Name) != baseID.Name { + return target + } + if !dOwnBySig[methodLocalSig(target.Name)] { + return target + } + // D has its own override of this signature: redirect + // target to D::. + return uniast.NewIdentity(D.Identity.ModPath, D.Identity.PkgPath, withScope(target.Name, dScope)) + } + cloneDeps := func(in []uniast.Dependency) []uniast.Dependency { + if len(in) == 0 { + return nil + } + out := make([]uniast.Dependency, 0, len(in)) + for _, d := range in { + nd := d + nd.Identity = rewriteScope(d.Identity) + out = append(out, nd) + } + return out + } + + synth := &uniast.Function{ + FileLine: bf.fn.FileLine, + Content: bf.fn.Content, + Signature: bf.fn.Signature, + Exported: bf.fn.Exported, + IsMethod: true, + Receiver: &uniast.Receiver{IsPointer: false, Type: D.Identity}, + MethodCalls: cloneDeps(bf.fn.MethodCalls), + FunctionCalls: cloneDeps(bf.fn.FunctionCalls), + GlobalVars: cloneDeps(bf.fn.GlobalVars), + Types: cloneDeps(bf.fn.Types), + Params: cloneDeps(bf.fn.Params), + Results: cloneDeps(bf.fn.Results), + Identity: newId, + } + ts.pkg.Functions[newId.Name] = synth + synthBySig[sig] = true + // Back-fill the derived Type's Methods map so consumers + // can discover the inherited method through D.Methods. + // Key derived from the synthesised Identity Name via + // cppBaseName so it matches the native-method key style + // ("handle(int x)") and overloads don't collapse. + if dt := typesById[D.Identity]; dt != nil { + if dt.Methods == nil { + dt.Methods = map[string]uniast.Identity{} + } + key := c.cppBaseName(newId.Name) + if _, exists := dt.Methods[key]; !exists { + dt.Methods[key] = newId + } + } + } + } + } +} + func (c *Collector) cppBaseName(n string) string { n = strings.TrimSpace(n) if i := strings.LastIndex(n, "::"); i >= 0 { @@ -730,12 +1397,45 @@ func (c *Collector) cppBaseName(n string) string { return strings.TrimSpace(n) } -// extractCppCallSig returns "sym.Name(params)" where params is extracted from sym.Text. +// extractCppCallSig returns "sym.Name(params)". +// +// Fast path: clangd populates DocumentSymbol.Detail with the canonical +// signature (e.g. "void(int x)") for methods and functions — use that +// directly. Fall back to text-level extraction only when Detail is empty +// (older LSPs or non-clangd providers). func (c *Collector) extractCppCallSig(sym *lsp.DocumentSymbol) (ret string) { name := strings.TrimSpace(sym.Name) if name == "" { return "" } + if detail := strings.TrimSpace(sym.Detail); detail != "" { + // clangd's Detail is the bare type "void(int x)" — we want + // "name(int x)". Find the params parenthesis group. + if i := strings.IndexByte(detail, '('); i >= 0 { + params := detail[i:] + // Trim trailing return-type-style annotations after a + // matching ')' close (e.g. " const" / " noexcept" / " -> X"). + depth := 0 + end := -1 + for j := 0; j < len(params); j++ { + switch params[j] { + case '(': + depth++ + case ')': + depth-- + if depth == 0 { + end = j + 1 + } + } + if end > 0 { + break + } + } + if end > 0 { + return name + params[:end] + } + } + } text := sym.Text if text == "" { return name + "()" diff --git a/lang/cpp/lib.go b/lang/cpp/lib.go index 026dff07..9194449d 100644 --- a/lang/cpp/lib.go +++ b/lang/cpp/lib.go @@ -29,7 +29,13 @@ func InstallLanguageServer() (string, error) { } func GetDefaultLSP() (lang uniast.Language, name string) { - return uniast.Cpp, "clangd-18 --background-index=false -j=32 --clang-tidy=false" + // background-index=false: clangd's persisted index only covers files in + // compile_commands.json (the project's own TUs), a small slice of the + // cross-TU work abcoder triggers. Disabling it skips disk writes and + // background indexer scheduling overhead. + // pch-storage=disk: keeps preambles on disk (clangd cleans them on + // graceful exit). memory mode is ~17% faster but doubles peak RSS. + return uniast.Cpp, "clangd-18 --background-index=false --pch-storage=disk -j=32 --clang-tidy=false" } func CheckRepo(repo string) (string, time.Duration) { diff --git a/lang/cpp/lsp/cpp_lsp.go b/lang/cpp/lsp/cpp_lsp.go new file mode 100644 index 00000000..bb9d4c88 --- /dev/null +++ b/lang/cpp/lsp/cpp_lsp.go @@ -0,0 +1,110 @@ +// Copyright 2025 CloudWeGo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cpp_lsp implements the LanguageServiceProvider interface for +// clangd-backed C++ workspaces. It exists to wire LSP requests that the +// generic LSPClient doesn't issue directly — most importantly the +// textDocument/prepareTypeHierarchy + typeHierarchy/supertypes pair, +// which lets the C++ collector skip its own brittle text-level base +// class parsing. +package cpp_lsp + +import ( + "context" + + "github.com/cloudwego/abcoder/lang/lsp" +) + +type CppProvider struct{} + +// Hover is not currently used by the C++ collector. clangd supports the +// standard textDocument/hover but we have no consumer. +func (p *CppProvider) Hover(ctx context.Context, cli *lsp.LSPClient, uri lsp.DocumentURI, line, character int) (*lsp.Hover, error) { + params := lsp.TextDocumentPositionParams{ + TextDocument: lsp.TextDocumentIdentifier{URI: uri}, + Position: lsp.Position{Line: line, Character: character}, + } + var result lsp.Hover + if err := cli.Call(ctx, "textDocument/hover", params, &result); err != nil { + return nil, err + } + return &result, nil +} + +// Implementation - textDocument/implementation. clangd implements this +// for resolving derived overrides. +func (p *CppProvider) Implementation(ctx context.Context, cli *lsp.LSPClient, uri lsp.DocumentURI, pos lsp.Position) ([]lsp.Location, error) { + params := lsp.TextDocumentPositionParams{ + TextDocument: lsp.TextDocumentIdentifier{URI: uri}, + Position: pos, + } + var result []lsp.Location + if err := cli.Call(ctx, "textDocument/implementation", params, &result); err != nil { + return nil, err + } + return result, nil +} + +// WorkspaceSearchSymbols - workspace/symbol. +func (p *CppProvider) WorkspaceSearchSymbols(ctx context.Context, cli *lsp.LSPClient, query string) ([]lsp.SymbolInformation, error) { + params := struct { + Query string `json:"query"` + }{Query: query} + var result []lsp.SymbolInformation + if err := cli.Call(ctx, "workspace/symbol", params, &result); err != nil { + return nil, err + } + return result, nil +} + +// PrepareTypeHierarchy - textDocument/prepareTypeHierarchy. +// Given a position on a class identifier, returns the hierarchy item(s) +// rooted at that class. +func (p *CppProvider) PrepareTypeHierarchy(ctx context.Context, cli *lsp.LSPClient, uri lsp.DocumentURI, pos lsp.Position) ([]lsp.TypeHierarchyItem, error) { + params := lsp.TextDocumentPositionParams{ + TextDocument: lsp.TextDocumentIdentifier{URI: uri}, + Position: pos, + } + var result []lsp.TypeHierarchyItem + if err := cli.Call(ctx, "textDocument/prepareTypeHierarchy", params, &result); err != nil { + return nil, err + } + return result, nil +} + +// TypeHierarchySupertypes - typeHierarchy/supertypes. +// Returns the bases (direct supertypes) of the given hierarchy item. +// This is what we use to replace the text-level BaseClassRefs parsing. +func (p *CppProvider) TypeHierarchySupertypes(ctx context.Context, cli *lsp.LSPClient, item lsp.TypeHierarchyItem) ([]lsp.TypeHierarchyItem, error) { + params := struct { + Item lsp.TypeHierarchyItem `json:"item"` + }{Item: item} + var result []lsp.TypeHierarchyItem + if err := cli.Call(ctx, "typeHierarchy/supertypes", params, &result); err != nil { + return nil, err + } + return result, nil +} + +// TypeHierarchySubtypes - typeHierarchy/subtypes. +func (p *CppProvider) TypeHierarchySubtypes(ctx context.Context, cli *lsp.LSPClient, item lsp.TypeHierarchyItem) ([]lsp.TypeHierarchyItem, error) { + params := struct { + Item lsp.TypeHierarchyItem `json:"item"` + }{Item: item} + var result []lsp.TypeHierarchyItem + if err := cli.Call(ctx, "typeHierarchy/subtypes", params, &result); err != nil { + return nil, err + } + return result, nil +} diff --git a/lang/cpp/spec.go b/lang/cpp/spec.go index 15fbb1a5..44f5ecb5 100644 --- a/lang/cpp/spec.go +++ b/lang/cpp/spec.go @@ -16,16 +16,94 @@ package cpp import ( "fmt" + "os" "path/filepath" "strings" + "sync" lsp "github.com/cloudwego/abcoder/lang/lsp" "github.com/cloudwego/abcoder/lang/uniast" "github.com/cloudwego/abcoder/lang/utils" ) +// clangd semantic-token type names. +const ( + tokClass = "class" + tokStruct = "struct" + tokType = "type" + tokInterface = "interface" + tokConcept = "concept" + tokEnum = "enum" + tokEnumMember = "enumMember" + tokFunction = "function" + tokMethod = "method" + tokMacro = "macro" + tokVariable = "variable" + tokParameter = "parameter" + tokTypeParameter = "typeParameter" + tokNamespace = "namespace" + tokComment = "comment" + tokModifier = "modifier" + tokBracket = "bracket" + tokLabel = "label" + tokOperator = "operator" + tokProperty = "property" + tokUnknown = "unknown" +) + +// clangd semantic-token modifier names. +const ( + modDeclaration = "declaration" + modDefinition = "definition" + modGlobalScope = "globalScope" + modDefaultLibrary = "defaultLibrary" +) + +// isReferenceTypeToken reports whether tok references (not declares) a type — +// eligible as a base class or alias target. +func isReferenceTypeToken(tok lsp.Token) bool { + switch tok.Type { + case tokClass, tokStruct, tokType: + default: + return false + } + for _, m := range tok.Modifiers { + if m == modDeclaration || m == modDefinition { + return false + } + } + return true +} + type CppSpec struct { - repo string + repo string // repository root absolute realpath + selfName string // "host/org/repo" of the repo being parsed + + // User-declared sysroot prefixes. Any file path under one of these is + // bucketed under module `cstdlib`. Set via the abcoder `--sysroot` + // flag (repeatable). Order doesn't matter — first match wins. + sysroots []string + + repoMu sync.Mutex + repoMods map[string]repoInfo // dir containing .git -> resolved repo info + + // clangd often reports Range covering only the identifier, so alias / + // base-class detection falls back to reading source. declCache memoises + // declarationText per (URI, line, col). + srcMu sync.Mutex + srcLines map[string][]string + declCache map[declKey]string +} + +type declKey struct { + uri lsp.DocumentURI + line int + col int +} + +type repoInfo struct { + root string // repository root on disk + name string // "org/repo" derived from remote.origin.url, falls back to base(root) } func (c *CppSpec) ProtectedSymbolKinds() []lsp.SymbolKind { @@ -33,7 +111,127 @@ func (c *CppSpec) ProtectedSymbolKinds() []lsp.SymbolKind { } func NewCppSpec() *CppSpec { - return &CppSpec{} + return &CppSpec{ + repoMods: map[string]repoInfo{}, + srcLines: map[string][]string{}, + declCache: map[declKey]string{}, + } +} + +// SetSysroots registers path prefixes whose contents are classified as +// the `cstdlib` module. Entries are realpath-resolved to match clangd URIs. +func (c *CppSpec) SetSysroots(roots []string) { + c.sysroots = c.sysroots[:0] + for _, r := range roots { + if r = strings.TrimSpace(r); r == "" { + continue + } + c.sysroots = append(c.sysroots, canonicalizeAbs(r)) + } +} + +// canonicalizeAbs returns an absolute, realpath-resolved version of p. +// Either step is allowed to fail; the remaining transformations still +// apply. Used so path-prefix comparisons (repo, sysroots) match what +// clangd reports in sym.Location.URI. +func canonicalizeAbs(p string) string { + if abs, err := filepath.Abs(p); err == nil { + p = abs + } + if real, err := filepath.EvalSymlinks(p); err == nil && real != "" { + p = real + } + return p +} + +// sourceLine returns line N (0-based) from the file at uri, reading and +// caching the file content on first access. Returns "" on any error or if +// N is out of range. Safe for concurrent use. +func (c *CppSpec) sourceLine(uri lsp.DocumentURI, n int) string { + path := uri.File() + c.srcMu.Lock() + lines, ok := c.srcLines[path] + c.srcMu.Unlock() + if !ok { + b, err := os.ReadFile(path) + if err != nil { + return "" + } + lines = strings.Split(string(b), "\n") + c.srcMu.Lock() + c.srcLines[path] = lines + c.srcMu.Unlock() + } + if n < 0 || n >= len(lines) { + return "" + } + return lines[n] +} + +// declarationText returns the declaration source from sym's start line up to +// the next `;` or `{` at template depth 0. clangd's Range can start at the +// identifier or on a preceding comment line, so we read from the start of the +// line and skip line-comment-only lines. +func (c *CppSpec) declarationText(sym lsp.DocumentSymbol) string { + startLine := sym.Location.Range.Start.Line + key := declKey{uri: sym.Location.URI, line: startLine, col: sym.Location.Range.Start.Character} + c.srcMu.Lock() + cached, ok := c.declCache[key] + c.srcMu.Unlock() + if ok { + return cached + } + const maxLines = 8 + var buf strings.Builder + depth := 0 + wroteContent := false + for off := 0; off < maxLines; off++ { + raw := c.sourceLine(sym.Location.URI, startLine+off) + if raw == "" && off > 0 { + if wroteContent { + buf.WriteByte('\n') + } + continue + } + line := raw + if i := strings.Index(line, "//"); i >= 0 { + line = line[:i] + } + if strings.TrimSpace(line) == "" { + if wroteContent { + buf.WriteByte('\n') + } + continue + } + for i := 0; i < len(line); i++ { + ch := line[i] + switch ch { + case '<': + depth++ + case '>': + if depth > 0 { + depth-- + } + case ';', '{': + if depth == 0 { + buf.WriteByte(ch) + out := buf.String() + c.srcMu.Lock() + c.declCache[key] = out + c.srcMu.Unlock() + return out + } + } + buf.WriteByte(ch) + } + wroteContent = true + buf.WriteByte('\n') + } + out := buf.String() + c.srcMu.Lock() + c.declCache[key] = out + c.srcMu.Unlock() + return out } func (c *CppSpec) FileImports(content []byte) ([]uniast.Import, error) { @@ -42,34 +240,214 @@ func (c *CppSpec) FileImports(content []byte) ([]uniast.Import, error) { // XXX: maybe multi module support for C++? func (c *CppSpec) WorkSpace(root string) (map[string]string, error) { - c.repo = root - rets := map[string]string{} - absPath, err := filepath.Abs(root) - if err != nil { + if _, err := filepath.Abs(root); err != nil { return nil, fmt.Errorf("failed to get absolute path: %w", err) } - rets["current"] = absPath - return rets, nil + // Path comparisons must match clangd's URI form (realpath). + c.repo = canonicalizeAbs(root) + absPath := c.repo + + // Derive a module name for this repo from its own .git/config; fall back + // to the directory basename so the JSON never gets the generic "current". + name := "" + if info, err := os.Stat(filepath.Join(absPath, ".git")); err == nil && info.IsDir() { + name = readGitOriginOrgRepo(filepath.Join(absPath, ".git", "config")) + } + if name == "" { + name = filepath.Base(absPath) + } + c.selfName = name + + return map[string]string{name: absPath}, nil } // returns: modname, pathpath, error // Multiple symbols with the same name could occur (for example in the Linux kernel). // The identify is mod::pkg::name. So we use the pkg (the file name) to distinguish them. func (c *CppSpec) NameSpace(path string, file *uniast.File) (string, string, error) { - // external lib: only standard library (system headers), in /usr/ - if !strings.HasPrefix(path, c.repo) { - if strings.HasPrefix(path, "/usr") { - // assume it is c system library - return "cstdlib", "cstdlib", nil + // Build-time generated artifacts (IDL/proto/blade glue under + // build64_release). Route to a dedicated module so they're + // distinguishable from hand-written project sources. + if i := strings.Index(path, "/build64_release/"); i >= 0 { + return "build_generated", path[i+len("/build64_release/"):], nil + } + if hasPathPrefix(path, c.repo) { + rel, _ := filepath.Rel(c.repo, path) + return c.selfName, rel, nil + } + // User-declared sysroot(s): bucket every header/source under them as + // `cstdlib`. Stripping the sysroot prefix keeps pkg paths stable across + // machines that install toolchains in different locations. + for _, sr := range c.sysroots { + if sr == "" { + continue + } + if hasPathPrefix(path, sr) { + rel, _ := filepath.Rel(sr, path) + return "cstdlib", rel, nil } - return "external", "external", nil } + if info, ok := c.lookupExternalRepo(path); ok { + relpath, err := filepath.Rel(info.root, path) + if err != nil { + relpath = path + } + return info.name, relpath, nil + } + return "external", path, nil +} - relpath, _ := filepath.Rel(c.repo, path) - return "current", relpath, nil +// hasPathPrefix is HasPrefix that respects path boundaries: "/a/foo" is not a +// prefix of "/a/foobar". This avoids false matches when one repo's path is a +// textual prefix of another's (e.g. /freq vs /freq_service). +func hasPathPrefix(p, root string) bool { + if !strings.HasPrefix(p, root) { + return false + } + if len(p) == len(root) { + return true + } + return p[len(root)] == filepath.Separator +} + +// lookupExternalRepo walks upward from path until it finds a directory holding +// a .git entry, parses remote.origin.url to derive "org/repo", and caches the +// result. Returns ok=false if no enclosing git repo is found. +func (c *CppSpec) lookupExternalRepo(path string) (repoInfo, bool) { + dir := filepath.Dir(path) + visited := []string{} + for { + if dir == "" || dir == "/" || dir == "." { + break + } + c.repoMu.Lock() + info, hit := c.repoMods[dir] + c.repoMu.Unlock() + if hit { + c.cacheChain(visited, info) + if info.root == "" { + return repoInfo{}, false + } + return info, true + } + gitPath := filepath.Join(dir, ".git") + if fi, err := os.Stat(gitPath); err == nil && fi.IsDir() { + name := readGitOriginOrgRepo(filepath.Join(gitPath, "config")) + if name == "" { + name = filepath.Base(dir) + } + info = repoInfo{root: dir, name: name} + c.repoMu.Lock() + c.repoMods[dir] = info + c.repoMu.Unlock() + c.cacheChain(visited, info) + return info, true + } + visited = append(visited, dir) + parent := filepath.Dir(dir) + if parent == dir { + break + } + dir = parent + } + // negative-cache so future lookups under the same path don't re-walk + neg := repoInfo{} + c.repoMu.Lock() + for _, d := range visited { + c.repoMods[d] = neg + } + c.repoMu.Unlock() + return repoInfo{}, false +} + +func (c *CppSpec) cacheChain(dirs []string, info repoInfo) { + if len(dirs) == 0 { + return + } + c.repoMu.Lock() + defer c.repoMu.Unlock() + for _, d := range dirs { + c.repoMods[d] = info + } +} + +// readGitOriginOrgRepo parses a git config file and returns "host/org/repo" +// derived from the [remote "origin"] url. Supports both ssh and https forms: +// +// git@code.byted.org:data/cppservice -> code.byted.org/data/cppservice +// https://code.byted.org/data-arch/feathub.git -> code.byted.org/data-arch/feathub +// +// When the host can't be determined, returns "org/repo" without prefix. +func readGitOriginOrgRepo(cfgPath string) string { + data, err := os.ReadFile(cfgPath) + if err != nil { + return "" + } + inOrigin := false + for _, raw := range strings.Split(string(data), "\n") { + line := strings.TrimSpace(raw) + if strings.HasPrefix(line, "[") { + inOrigin = strings.Contains(line, `remote "origin"`) + continue + } + if !inOrigin { + continue + } + eq := strings.IndexByte(line, '=') + if eq < 0 || strings.TrimSpace(line[:eq]) != "url" { + continue + } + u := strings.TrimSpace(line[eq+1:]) + u = strings.TrimSuffix(u, ".git") + host := "" + // Prefer scheme-based parse first so `https://...` isn't + // mistaken for the ssh `user@host:path` form (both contain `:`). + if i := strings.Index(u, "://"); i >= 0 { + // https://host/org/repo + rest := u[i+3:] + if at := strings.LastIndex(rest, "@"); at >= 0 { + rest = rest[at+1:] // strip user@ + } + if slash := strings.IndexByte(rest, '/'); slash >= 0 { + host = rest[:slash] + u = rest[slash+1:] + } else { + u = rest + } + } else if i := strings.LastIndex(u, ":"); i >= 0 && !strings.Contains(u[:i], "/") { + // ssh: git@host:org/repo + head := u[:i] + if at := strings.LastIndex(head, "@"); at >= 0 { + host = head[at+1:] + } else { + host = head + } + u = u[i+1:] + } + parts := strings.Split(u, "/") + var orgRepo string + if n := len(parts); n >= 2 { + orgRepo = parts[n-2] + "/" + parts[n-1] + } else { + orgRepo = u + } + if host != "" { + return host + "/" + orgRepo + } + return orgRepo + } + return "" } func (c *CppSpec) ShouldSkip(path string) bool { + // Build-time generated artifacts (IDL/proto/blade codegen under + // build64_release). Skipping at scanner level avoids the heavy + // DocumentSymbols + per-sym Locate/SemanticTokens cost. Edges from + // in-repo source that reference these headers still resolve via the + // fake-Unknown fallback in getSymbolByLocation. + if strings.Contains(path, "/build64_release/") { + return true + } if (strings.HasSuffix(path, ".cpp") && !strings.HasSuffix(path, "_test.cpp")) || strings.HasSuffix(path, ".h") { return false } @@ -77,7 +455,7 @@ func (c *CppSpec) ShouldSkip(path string) bool { } func (c *CppSpec) IsDocToken(tok lsp.Token) bool { - return tok.Type == "comment" + return tok.Type == tokComment } func (c *CppSpec) DeclareTokenOfSymbol(sym lsp.DocumentSymbol) int { @@ -86,7 +464,7 @@ func (c *CppSpec) DeclareTokenOfSymbol(sym lsp.DocumentSymbol) int { continue } for _, m := range t.Modifiers { - if m == "declaration" { + if m == modDeclaration { return i } } @@ -96,40 +474,44 @@ func (c *CppSpec) DeclareTokenOfSymbol(sym lsp.DocumentSymbol) int { func (c *CppSpec) IsEntityToken(tok lsp.Token) bool { for _, m := range tok.Modifiers { - if m == "declaration" || m == "definition" { + if m == modDeclaration || m == modDefinition { return false } } - - return tok.Type == "class" || tok.Type == "function" || tok.Type == "method" || tok.Type == "variable" + return tok.Type == tokClass || tok.Type == tokFunction || tok.Type == tokMethod || tok.Type == tokVariable } func (c *CppSpec) IsStdToken(tok lsp.Token) bool { - panic("TODO") + for _, m := range tok.Modifiers { + if m == modDefaultLibrary { + return true + } + } + return false } func (c *CppSpec) TokenKind(tok lsp.Token) lsp.SymbolKind { switch tok.Type { - case "class": + case tokClass: return lsp.SKClass - case "enum": + case tokEnum: return lsp.SKEnum - case "enumMember": + case tokEnumMember: return lsp.SKEnumMember - case "function", "macro": + case tokFunction, tokMacro: return lsp.SKFunction // rust spec does not treat parameter as a variable - case "parameter": + case tokParameter: return lsp.SKVariable - case "typeParameter": + case tokTypeParameter: return lsp.SKTypeParameter - case "method": + case tokMethod: return lsp.SKMethod - case "namespace": + case tokNamespace: return lsp.SKNamespace - case "variable": + case tokVariable: return lsp.SKVariable - case "interface", "concept", "modifier", "type", "bracket", "comment", "label", "operator", "property", "unknown": + case tokInterface, tokConcept, tokModifier, tokType, tokBracket, tokComment, tokLabel, tokOperator, tokProperty, tokUnknown: return lsp.SKUnknown } panic(fmt.Sprintf("Weird token type: %s at %+v\n", tok.Type, tok.Location)) @@ -150,7 +532,7 @@ func (c *CppSpec) IsPublicSymbol(sym lsp.DocumentSymbol) bool { return false } for _, m := range sym.Tokens[id].Modifiers { - if m == "globalScope" { + if m == modGlobalScope { return true } } @@ -181,7 +563,7 @@ func (c *CppSpec) ImplSymbol(sym lsp.DocumentSymbol) (int, int, int) { continue } switch tok.Type { - case "class", "struct": + case tokClass, tokStruct: return inter, i, fn } } @@ -189,6 +571,289 @@ func (c *CppSpec) ImplSymbol(sym lsp.DocumentSymbol) (int, int, int) { return inter, -1, fn } +// IsTypedefSymbol reports whether the symbol is a `typedef X Y;` declaration. +// Typedefs stay in the AST as Type entries with TypeKind=typedef. +func (c *CppSpec) IsTypedefSymbol(sym lsp.DocumentSymbol) bool { + return strings.HasPrefix(strings.TrimSpace(c.declarationText(sym)), "typedef ") +} + +// IsUsingAlias reports whether the symbol is one of the two C++ alias forms: +// - `using Y = X;` — pure rename, introduces no new type +// - `using NS::Name;` — using-declaration that imports `NS::Name` +// into the current scope +// +// References to either should resolve to the underlying type X / NS::Name; +// the alias itself must be dropped from the AST (otherwise inheritance +// edges of the form `class D : public Y` would produce a phantom Type +// node `::Y::Name` instead of pointing at the real base). +// +// `using namespace foo;` (a namespace-import directive) is NOT an alias; +// it doesn't introduce a name binding into the type system. +func (c *CppSpec) IsUsingAlias(sym lsp.DocumentSymbol) bool { + trim := strings.TrimSpace(c.declarationText(sym)) + if !strings.HasPrefix(trim, "using ") { + return false + } + // using namespace foo; — name-import directive, not an alias. + if strings.HasPrefix(trim, "using namespace ") { + return false + } + // using Y = X; — always a type alias (introduces new type binding). + if strings.Contains(trim, "=") { + return true + } + // using NS::Name; — using-declaration. Only treat it as an alias when + // the imported entity is a TYPE (class/struct/enum/typeParameter/ + // interface). For non-type imports like `using Base::foo;` (member + // function) or `using std::swap;` (free function), this is name- + // import only — collapsing it to an alias and dropping the symbol + // makes those overloads disappear from the AST. + switch sym.Kind { + case lsp.SKClass, lsp.SKStruct, lsp.SKEnum, lsp.SKTypeParameter, lsp.SKInterface: + return true + } + return false +} + +// AliasTargetTokenIndex returns the token index of the aliased type's +// reference for a `using Y = X;` symbol — the token whose Definition leads +// to the real type. Returns -1 when not a using-alias or the target can't +// be located. +func (c *CppSpec) AliasTargetTokenIndex(sym lsp.DocumentSymbol) int { + if !c.IsUsingAlias(sym) { + return -1 + } + // First non-declaration class/struct/type token. Namespaces in front + // (`::ns::Foo::`) are namespace-kind and naturally skipped. + for i, tok := range sym.Tokens { + if isReferenceTypeToken(tok) { + return i + } + } + return -1 +} + +// baseSpan marks the byte range [s,e) of one base specifier within +// `declarationText`. Shared between BaseClassTokens and BaseClassRefs. +type baseSpan struct{ s, e int } + +// parseBaseSpecifiers walks a class symbol's declaration text, isolates +// the base clause (between the first `:` and the body-opening `{`), and +// splits it on commas at template depth 0. Each entry in the returned +// slice points at one specifier like "public NS::Base". Returns +// (decl, nil) when there's no base clause (forward decl, plain class). +func (c *CppSpec) parseBaseSpecifiers(sym lsp.DocumentSymbol) (string, []baseSpan) { + if sym.Kind != lsp.SKClass && sym.Kind != lsp.SKStruct { + return "", nil + } + decl := c.declarationText(sym) + if decl == "" { + return "", nil + } + bodyStart := strings.IndexByte(decl, '{') + if bodyStart < 0 { + return decl, nil + } + colon := strings.IndexByte(decl, ':') + if colon < 0 || colon >= bodyStart { + return decl, nil + } + var specs []baseSpan + depth := 0 + specStart := colon + 1 + for i := colon + 1; i < bodyStart; i++ { + switch decl[i] { + case '<': + depth++ + case '>': + if depth > 0 { + depth-- + } + case ',': + if depth == 0 { + specs = append(specs, baseSpan{specStart, i}) + specStart = i + 1 + } + } + } + specs = append(specs, baseSpan{specStart, bodyStart}) + return decl, specs +} + +// BaseClassTokens returns indices of tokens naming each base class of a +// class/struct symbol (`class Foo : public Bar, protected Baz`). +// Returns nil for forward decls or non-class symbols. Template +// arguments (`Req`/`Rsp` in `Provider`) are filtered out by +// depth-0 enforcement. +func (c *CppSpec) BaseClassTokens(sym lsp.DocumentSymbol) []int { + if len(sym.Tokens) == 0 { + return nil + } + decl, specs := c.parseBaseSpecifiers(sym) + if len(specs) == 0 { + return nil + } + + // Token offsets are relative to sym.Location.Range.Start. declarationText + // is anchored at the same start, so offsets line up. + lines := utils.CountLinesPooled(decl) + bases := make([]int, 0, len(specs)) + for _, sp := range specs { + // Pick the first reference token of type class/struct/type at + // template depth 0 within this specifier. + for i, tok := range sym.Tokens { + if !isReferenceTypeToken(tok) { + continue + } + if l := tok.Location.Range.Start.Line - sym.Location.Range.Start.Line; l < 0 || l >= len(*lines) { + continue + } + off := lsp.RelativePostionWithLines(*lines, sym.Location.Range.Start, tok.Location.Range.Start) + if off < sp.s || off >= sp.e { + continue + } + // Verify template depth at this offset (relative to specifier). + d := 0 + for j := sp.s; j < off; j++ { + switch decl[j] { + case '<': + d++ + case '>': + if d > 0 { + d-- + } + } + } + if d != 0 { + continue + } + bases = append(bases, i) + break // one base per specifier + } + } + return bases +} + +// BaseClassRef names one base class plus the file position of its name, +// usable for textDocument/definition. Source-text-based; works when +// clangd's semantic-tokens stream omits the base (typical for unresolved +// external templated bases). +type BaseClassRef struct { + Name string + Loc lsp.Location +} + +// BaseClassRefs parses a class's base clause directly from declarationText +// rather than relying on sym.Tokens (which clangd may not populate for +// unresolved templated bases). +func (c *CppSpec) BaseClassRefs(sym lsp.DocumentSymbol) []BaseClassRef { + decl, specs := c.parseBaseSpecifiers(sym) + if len(specs) == 0 { + return nil + } + + // declarationText starts at column 0 of sym's first line (sourceLine + // returns the full line), so a char-offset in decl maps to file + // (sym.Location.Range.Start.Line + line, col) directly. + lineStarts := []int{0} + for i, b := range []byte(decl) { + if b == '\n' { + lineStarts = append(lineStarts, i+1) + } + } + posOf := func(off int) lsp.Position { + line := 0 + for li, st := range lineStarts { + if st > off { + break + } + line = li + } + col := off - lineStarts[line] + return lsp.Position{ + Line: sym.Location.Range.Start.Line + line, + Character: col, + } + } + + accessKW := map[string]bool{"public": true, "protected": true, "private": true, "virtual": true} + isIdentStart := func(b byte) bool { + return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' + } + isIdentCont := func(b byte) bool { + return isIdentStart(b) || (b >= '0' && b <= '9') + } + + var out []BaseClassRef + for _, sp := range specs { + d := 0 + i := sp.s + for i < sp.e { + switch decl[i] { + case '<': + d++ + i++ + continue + case '>': + if d > 0 { + d-- + } + i++ + continue + } + if !isIdentStart(decl[i]) { + i++ + continue + } + start := i + for i < sp.e && isIdentCont(decl[i]) { + i++ + } + word := decl[start:i] + if d != 0 { + continue + } + if accessKW[word] { + continue + } + // Follow `::Foo` segments. The recorded position must point at + // the LAST name (the actual type — that's the token clangd + // can Definition-resolve), but the recorded Name preserves + // the FULL `ns::Sub::Foo` qualifier so unresolved fallbacks + // keep the namespace distinction (without it, two distinct + // bases like `third_party::Provider` and `other_pkg::Provider` + // would collapse to the same bare "Provider"). + fullStart := start + lastStart := start + fullEnd := i + for i+1 < sp.e && decl[i] == ':' && decl[i+1] == ':' { + i += 2 + if i >= sp.e || !isIdentStart(decl[i]) { + break + } + segStart := i + for i < sp.e && isIdentCont(decl[i]) { + i++ + } + lastStart = segStart + fullEnd = i + } + // Strip a leading "::" (root-namespace prefix) so the recorded + // name matches what clangd / collect / export compare against. + fullName := decl[fullStart:fullEnd] + out = append(out, BaseClassRef{ + Name: fullName, + Loc: lsp.Location{ + URI: sym.Location.URI, + Range: lsp.Range{Start: posOf(lastStart), End: posOf(lastStart)}, + }, + }) + break + } + } + return out +} + func cppShortTypeName(name string) string { name = strings.TrimSpace(name) if name == "" { @@ -214,7 +879,7 @@ func cppShortTypeName(name string) string { } func (c *CppSpec) GetUnloadedSymbol(from lsp.Token, define lsp.Location) (string, error) { - panic("TODO") + return "", nil } func (c *CppSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []int) { @@ -232,7 +897,7 @@ func (c *CppSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []i // 1) type params for i, tok := range sym.Tokens { - if tok.Type == "typeParameter" { + if tok.Type == tokTypeParameter { typeParams = append(typeParams, i) } } @@ -240,7 +905,7 @@ func (c *CppSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []i // 2) find name token (method/function) nameTokIdx := -1 for i, tok := range sym.Tokens { - if tok.Type == "method" || tok.Type == "function" { + if tok.Type == tokMethod || tok.Type == tokFunction { nameTokIdx = i break } @@ -258,7 +923,7 @@ func (c *CppSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []i continue } // prefer type-ish token kinds for receiver - if tok.Type == "class" || tok.Type == "struct" || tok.Type == "type" { + if tok.Type == tokClass || tok.Type == tokStruct || tok.Type == tokType { receiver = i break } @@ -291,7 +956,7 @@ func (c *CppSpec) FunctionSymbol(sym lsp.DocumentSymbol) (int, []int, []int, []i if !c.IsEntityToken(tok) { continue } - if tok.Type == "typeParameter" || tok.Type == "namespace" { + if tok.Type == tokTypeParameter || tok.Type == tokNamespace { continue } diff --git a/lang/lsp/client.go b/lang/lsp/client.go index 263104fb..ca43394f 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -23,21 +23,36 @@ import ( "os" "os/exec" "strings" + "sync" "time" "github.com/cloudwego/abcoder/lang/log" "github.com/cloudwego/abcoder/lang/uniast" lsp "github.com/sourcegraph/go-lsp" "github.com/sourcegraph/jsonrpc2" + "golang.org/x/sync/singleflight" ) type LSPClient struct { *jsonrpc2.Conn *lspHandler - tokenTypes []string - tokenModifiers []string - files map[DocumentURI]*TextDocumentItem - provider LanguageServiceProvider + tokenTypes []string + tokenModifiers []string + files map[DocumentURI]*TextDocumentItem + // filesMu guards files. Lock briefly when checking/inserting an entry; + // the per-file Mu inside TextDocumentItem guards per-document caches. + filesMu sync.RWMutex + provider LanguageServiceProvider + + // In-flight request dedup. When N workers simultaneously ask for + // DocumentSymbols / SemanticTokens / Definition of the same key, only + // one RPC is sent; the rest wait on the first one's result. After the + // result lands it goes into the per-file cache so future calls are + // instant. + docSymFlight singleflight.Group // key: URI + semTokFlight singleflight.Group // key: URI (full-doc semantic tokens) + definitionFlight singleflight.Group // key: URI + ":" + line + ":" + col + ClientOptions LspOptions map[string]string } diff --git a/lang/lsp/lsp.go b/lang/lsp/lsp.go index c55cf698..d1541de5 100644 --- a/lang/lsp/lsp.go +++ b/lang/lsp/lsp.go @@ -20,6 +20,7 @@ import ( "fmt" "path/filepath" "strings" + "sync" sitter "github.com/smacker/go-tree-sitter" @@ -167,6 +168,14 @@ func NewURI(file string) DocumentURI { if !filepath.IsAbs(file) { file, _ = filepath.Abs(file) } + // Canonicalise via realpath so symlinks don't produce two URIs for the + // same physical file. clangd resolves symlinks internally and returns + // realpath-based URIs in its responses; if we registered the file + // under the symlinked path, the response URI wouldn't match our + // `cli.files` key (and breaks every downstream Location comparison). + if real, err := filepath.EvalSymlinks(file); err == nil && real != "" { + file = real + } return DocumentURI("file://" + file) } @@ -179,10 +188,27 @@ type TextDocumentItem struct { Symbols map[Range]*DocumentSymbol `json:"-"` Definitions map[Position][]Location `json:"-"` SemanticTokens *SemanticTokens `json:"-"` + // Mu protects Symbols, Definitions, SemanticTokens, ServerOpened from + // concurrent access. Pointer so that copying TextDocumentItem (e.g. + // when building didOpen params) doesn't copy the lock. RPC calls are + // issued without holding this lock; only cache check / write are + // guarded. + Mu *sync.Mutex `json:"-"` + // ServerOpened tracks whether we've actually sent textDocument/didOpen + // for this file to the LSP server. Read helpers like ensureLocalFile + // populate the local cache without notifying the server; DidOpen must + // still send the notification on first transition to true so clangd + // can answer subsequent AST queries (e.g. documentSymbol/definition). + ServerOpened bool `json:"-"` } type DocumentSymbol struct { - Name string `json:"name"` + Name string `json:"name"` + // Detail is the optional "more detail for this symbol" string from + // the LSP spec — clangd populates it with the function signature + // (e.g. "void(int x)") for SKMethod/SKFunction kinds, which lets us + // skip our own text-level signature extraction in extractCppCallSig. + Detail string `json:"detail,omitempty"` Kind SymbolKind `json:"kind"` Tags []json.RawMessage `json:"tags"` Children []*DocumentSymbol `json:"children"` diff --git a/lang/lsp/lsp_methods.go b/lang/lsp/lsp_methods.go index ca5f93c4..a997c7fb 100644 --- a/lang/lsp/lsp_methods.go +++ b/lang/lsp/lsp_methods.go @@ -20,6 +20,8 @@ import ( "math" "os" "sort" + "strconv" + "sync" "github.com/cloudwego/abcoder/lang/utils" lsp "github.com/sourcegraph/go-lsp" @@ -39,28 +41,65 @@ type DidOpenTextDocumentParams struct { } func (cli *LSPClient) DidOpen(ctx context.Context, file DocumentURI) (*TextDocumentItem, error) { - if f, ok := cli.files[file]; ok { + cli.filesMu.RLock() + f, ok := cli.files[file] + cli.filesMu.RUnlock() + if ok { + // An entry exists locally — but ensureLocalFile() may have created + // it without notifying clangd. Send didOpen now if needed, so the + // server's side gets the file content and subsequent AST queries + // don't fail with "trying to get AST for non-added document". + f.Mu.Lock() + if f.ServerOpened { + f.Mu.Unlock() + return f, nil + } + f.ServerOpened = true + params := DidOpenTextDocumentParams{TextDocument: *f} + f.Mu.Unlock() + if err := cli.Notify(ctx, "textDocument/didOpen", params); err != nil { + // roll back so a later DidOpen can retry + f.Mu.Lock() + f.ServerOpened = false + f.Mu.Unlock() + return nil, err + } return f, nil } text, err := os.ReadFile(file.File()) if err != nil { return nil, err } - f := &TextDocumentItem{ - URI: DocumentURI(file), - LanguageID: cli.Language.String(), - Version: 1, - Text: string(text), - LineCounts: utils.CountLines(string(text)), - } - cli.files[file] = f + nf := &TextDocumentItem{ + URI: DocumentURI(file), + LanguageID: cli.Language.String(), + Version: 1, + Text: string(text), + LineCounts: utils.CountLines(string(text)), + Mu: &sync.Mutex{}, + ServerOpened: true, // we're about to send didOpen below + } + cli.filesMu.Lock() + if _, ok := cli.files[file]; ok { + // lost the race; reuse the existing entry (recurse so it notifies + // the server if the winner created a local-only stub). + cli.filesMu.Unlock() + return cli.DidOpen(ctx, file) + } + cli.files[file] = nf + cli.filesMu.Unlock() req := DidOpenTextDocumentParams{ - TextDocument: *f, + TextDocument: *nf, } if err := cli.Notify(ctx, "textDocument/didOpen", req); err != nil { + // roll back: server doesn't know about the file, future callers + // must re-attempt the notification. + nf.Mu.Lock() + nf.ServerOpened = false + nf.Mu.Unlock() return nil, err } - return f, nil + return nf, nil } func flattenDocumentSymbols(symbols []*DocumentSymbol, uri DocumentURI) []*DocumentSymbol { @@ -75,21 +114,26 @@ func flattenDocumentSymbols(symbols []*DocumentSymbol, uri DocumentURI) []*Docum } else { location = sym.Location } + // Preserve SelectionRange — it points at the symbol's NAME token + // (vs Range which spans the whole body). Some downstream paths + // (e.g. typeHierarchy/prepareTypeHierarchy) need a position on + // the identifier specifically. flatSymbol := DocumentSymbol{ // copy - Name: sym.Name, - Kind: sym.Kind, - Tags: sym.Tags, - Text: sym.Text, - Tokens: sym.Tokens, - Node: sym.Node, - Children: sym.Children, + Name: sym.Name, + Detail: sym.Detail, + Kind: sym.Kind, + Tags: sym.Tags, + Text: sym.Text, + Tokens: sym.Tokens, + Node: sym.Node, + Children: sym.Children, + SelectionRange: sym.SelectionRange, // new Location: location, // empty - Role: 0, - Range: nil, - SelectionRange: nil, + Role: 0, + Range: nil, } result = append(result, &flatSymbol) @@ -107,27 +151,54 @@ func (cli *LSPClient) DocumentSymbols(ctx context.Context, file DocumentURI) (ma if err != nil { return nil, err } + f.Mu.Lock() if f.Symbols != nil { - return f.Symbols, nil - } - uri := lsp.DocumentURI(file) - req := lsp.DocumentSymbolParams{ - TextDocument: lsp.TextDocumentIdentifier{ - URI: uri, - }, - } - var resp []*DocumentSymbol - if err := cli.Call(ctx, "textDocument/documentSymbol", req, &resp); err != nil { + syms := f.Symbols + f.Mu.Unlock() + return syms, nil + } + f.Mu.Unlock() + + // Deduplicate concurrent requests for the same URI. Without this, 32 + // workers all hitting a freshly-encountered external header would each + // fire their own documentSymbol RPC; clangd serializes per-document + // parses so they'd queue up anyway. + v, err, _ := cli.docSymFlight.Do(string(file), func() (interface{}, error) { + // Re-check after acquiring the flight: another flight just finished. + f.Mu.Lock() + if f.Symbols != nil { + cached := f.Symbols + f.Mu.Unlock() + return cached, nil + } + f.Mu.Unlock() + + uri := lsp.DocumentURI(file) + req := lsp.DocumentSymbolParams{ + TextDocument: lsp.TextDocumentIdentifier{URI: uri}, + } + var resp []*DocumentSymbol + if err := cli.Call(ctx, "textDocument/documentSymbol", req, &resp); err != nil { + return nil, err + } + respFlatten := flattenDocumentSymbols(resp, file) + built := make(map[Range]*DocumentSymbol, len(respFlatten)) + for i := range respFlatten { + s := respFlatten[i] + built[s.Location.Range] = s + } + f.Mu.Lock() + if f.Symbols == nil { + f.Symbols = built + } + out := f.Symbols + f.Mu.Unlock() + return out, nil + }) + if err != nil { return nil, err } - respFlatten := flattenDocumentSymbols(resp, file) - // cache symbols - f.Symbols = make(map[Range]*DocumentSymbol, len(respFlatten)) - for i := range respFlatten { - s := respFlatten[i] - f.Symbols[s.Location.Range] = s - } - return f.Symbols, nil + return v.(map[Range]*DocumentSymbol), nil } func (cli *LSPClient) References(ctx context.Context, id Location) ([]Location, error) { @@ -157,25 +228,47 @@ func (cli *LSPClient) References(ctx context.Context, id Location) ([]Location, } func (cli *LSPClient) getSemanticTokensRange(ctx context.Context, req DocumentRange, resp *SemanticTokens) error { - f, err := cli.DidOpen(ctx, DocumentURI(req.TextDocument.URI)) + uri := DocumentURI(req.TextDocument.URI) + f, err := cli.DidOpen(ctx, uri) if err != nil { return err } - if f.SemanticTokens == nil { - req1 := SemanticTokensFullParams{ - TextDocument: req.TextDocument, - } - var fullResp SemanticTokens - if err := cli.Call(ctx, "textDocument/semanticTokens/full", req1, &fullResp); err != nil { + f.Mu.Lock() + have := f.SemanticTokens + f.Mu.Unlock() + if have == nil { + v, err, _ := cli.semTokFlight.Do(string(uri), func() (interface{}, error) { + f.Mu.Lock() + if f.SemanticTokens != nil { + cached := f.SemanticTokens + f.Mu.Unlock() + return cached, nil + } + f.Mu.Unlock() + + req1 := SemanticTokensFullParams{TextDocument: req.TextDocument} + var fullResp SemanticTokens + if err := cli.Call(ctx, "textDocument/semanticTokens/full", req1, &fullResp); err != nil { + return nil, err + } + f.Mu.Lock() + if f.SemanticTokens == nil { + f.SemanticTokens = &fullResp + } + out := f.SemanticTokens + f.Mu.Unlock() + return out, nil + }) + if err != nil { return err } - f.SemanticTokens = &fullResp + have = v.(*SemanticTokens) } - resp.ResultID = f.SemanticTokens.ResultID - resp.Data = make([]uint32, len(f.SemanticTokens.Data)) - copy(resp.Data, f.SemanticTokens.Data) + resp.ResultID = have.ResultID + resp.Data = make([]uint32, len(have.Data)) + copy(resp.Data, have.Data) filterSemanticTokensInRange(resp, req.Range) return nil @@ -219,8 +312,18 @@ func (cli *LSPClient) SemanticTokens(ctx context.Context, id Location) ([]Token, return nil, err } sym := syms[id.Range] - if sym != nil && sym.Tokens != nil { - return sym.Tokens, nil + f := cli.lookupFile(id.URI) + if sym != nil { + if f != nil { + f.Mu.Lock() + toks := sym.Tokens + f.Mu.Unlock() + if toks != nil { + return toks, nil + } + } else if sym.Tokens != nil { + return sym.Tokens, nil + } } uri := lsp.DocumentURI(id.URI) @@ -238,41 +341,78 @@ func (cli *LSPClient) SemanticTokens(ctx context.Context, id Location) ([]Token, toks := cli.getAllTokens(resp, id.URI) if sym != nil { - sym.Tokens = toks + if f != nil { + f.Mu.Lock() + if sym.Tokens == nil { + sym.Tokens = toks + } + toks = sym.Tokens + f.Mu.Unlock() + } else { + sym.Tokens = toks + } } return toks, nil } +// lookupFile returns the cached TextDocumentItem if open, otherwise nil. +func (cli *LSPClient) lookupFile(uri DocumentURI) *TextDocumentItem { + cli.filesMu.RLock() + f := cli.files[uri] + cli.filesMu.RUnlock() + return f +} + func (cli *LSPClient) Definition(ctx context.Context, uri DocumentURI, pos Position) ([]Location, error) { // open file first f, err := cli.DidOpen(ctx, uri) if err != nil { return nil, err } + f.Mu.Lock() if f.Definitions != nil { if locations, ok := f.Definitions[pos]; ok { + f.Mu.Unlock() return locations, nil } } + f.Mu.Unlock() - // call - req := lsp.TextDocumentPositionParams{ - TextDocument: lsp.TextDocumentIdentifier{ - URI: lsp.DocumentURI(uri), - }, - Position: lsp.Position(pos), - } - var resp []Location - if err := cli.Call(ctx, "textDocument/definition", req, &resp); err != nil { - return nil, err - } + key := string(uri) + ":" + strconv.Itoa(pos.Line) + ":" + strconv.Itoa(pos.Character) + v, err, _ := cli.definitionFlight.Do(key, func() (interface{}, error) { + f.Mu.Lock() + if f.Definitions != nil { + if locations, ok := f.Definitions[pos]; ok { + f.Mu.Unlock() + return locations, nil + } + } + f.Mu.Unlock() - // cache definitions - if f.Definitions == nil { - f.Definitions = make(map[Position][]Location) + req := lsp.TextDocumentPositionParams{ + TextDocument: lsp.TextDocumentIdentifier{URI: lsp.DocumentURI(uri)}, + Position: lsp.Position(pos), + } + var resp []Location + if err := cli.Call(ctx, "textDocument/definition", req, &resp); err != nil { + return nil, err + } + f.Mu.Lock() + if f.Definitions == nil { + f.Definitions = make(map[Position][]Location) + } + if existing, ok := f.Definitions[pos]; ok { + resp = existing + } else { + f.Definitions[pos] = resp + } + f.Mu.Unlock() + return resp, nil + }) + if err != nil { + return nil, err } - f.Definitions[pos] = resp - return resp, nil + return v.([]Location), nil } func (cli *LSPClient) TypeDefinition(ctx context.Context, uri DocumentURI, pos Position) ([]Location, error) { @@ -289,26 +429,46 @@ func (cli *LSPClient) TypeDefinition(ctx context.Context, uri DocumentURI, pos P return resp, nil } +// ensureLocalFile returns the cached TextDocumentItem for uri, reading and +// caching the file if necessary. Unlike DidOpen it does NOT send a didOpen +// notification — use this for read-only helpers (Locate, Line, ...) that +// just need the file body. The stub it creates has ServerOpened=false so +// that the next DidOpen() will still notify the LSP server. Safe for +// concurrent use. +func (cli *LSPClient) ensureLocalFile(uri DocumentURI) (*TextDocumentItem, error) { + if f := cli.lookupFile(uri); f != nil { + return f, nil + } + fd, err := os.ReadFile(uri.File()) + if err != nil { + return nil, err + } + text := string(fd) + nf := &TextDocumentItem{ + URI: DocumentURI(uri), + LanguageID: cli.Language.String(), + Version: 1, + Text: text, + LineCounts: utils.CountLines(text), + Mu: &sync.Mutex{}, + ServerOpened: false, // local-only stub; DidOpen() will notify if asked + } + cli.filesMu.Lock() + if existing, ok := cli.files[uri]; ok { + cli.filesMu.Unlock() + return existing, nil + } + cli.files[uri] = nf + cli.filesMu.Unlock() + return nf, nil +} + // read file and get the text of block of range func (cli *LSPClient) Locate(id Location) (string, error) { - f, ok := cli.files[id.URI] - if !ok { - // open file os - fd, err := os.ReadFile(id.URI.File()) - if err != nil { - return "", err - } - text := string(fd) - f = &TextDocumentItem{ - URI: DocumentURI(id.URI), - LanguageID: cli.Language.String(), - Version: 1, - Text: text, - LineCounts: utils.CountLines(text), - } - cli.files[id.URI] = f + f, err := cli.ensureLocalFile(id.URI) + if err != nil { + return "", err } - text := f.Text // get block text of range start := f.LineCounts[id.Range.Start.Line] + id.Range.Start.Character @@ -318,22 +478,9 @@ func (cli *LSPClient) Locate(id Location) (string, error) { // get line text of pos func (cli *LSPClient) Line(uri DocumentURI, pos int) string { - f, ok := cli.files[uri] - if !ok { - // open file os - fd, err := os.ReadFile(uri.File()) - if err != nil { - return "" - } - text := string(fd) - f = &TextDocumentItem{ - URI: DocumentURI(uri), - LanguageID: cli.Language.String(), - Version: 1, - Text: text, - LineCounts: utils.CountLines(text), - } - cli.files[uri] = f + f, err := cli.ensureLocalFile(uri) + if err != nil { + return "" } if pos < 0 || pos >= len(f.LineCounts) { return "" @@ -347,35 +494,24 @@ func (cli *LSPClient) Line(uri DocumentURI, pos int) string { } func (cli *LSPClient) LineCounts(uri DocumentURI) []int { - f, ok := cli.files[uri] - if !ok { - // open file os - fd, err := os.ReadFile(uri.File()) - if err != nil { - return nil - } - text := string(fd) - f = &TextDocumentItem{ - URI: DocumentURI(uri), - LanguageID: cli.Language.String(), - Version: 1, - Text: text, - LineCounts: utils.CountLines(text), - } - cli.files[uri] = f + f, err := cli.ensureLocalFile(uri) + if err != nil { + return nil } return f.LineCounts } func (cli *LSPClient) GetFile(uri DocumentURI) *TextDocumentItem { - return cli.files[uri] + return cli.lookupFile(uri) } func (cli *LSPClient) GetParent(sym *DocumentSymbol) (ret *DocumentSymbol) { if sym == nil { return nil } - if f, ok := cli.files[sym.Location.URI]; ok { + if f := cli.lookupFile(sym.Location.URI); f != nil { + f.Mu.Lock() + defer f.Mu.Unlock() for _, s := range f.Symbols { if s != sym && s.Location.Range.Include(sym.Location.Range) { if ret == nil || ret.Location.Range.Include(s.Location.Range) { diff --git a/lang/parse.go b/lang/parse.go index d2c751c4..4f14b2b0 100644 --- a/lang/parse.go +++ b/lang/parse.go @@ -195,6 +195,10 @@ func collectSymbol(ctx context.Context, cli *lsp.LSPClient, repoPath string, opt } else { collector := collect.NewCollector(repoPath, cli) collector.CollectOption = opts + // Plumb language-specific options into the spec implementation + // after CollectOption is set, so options like --sysroot reach the + // CppSpec NameSpace logic. + collector.ApplyCollectOptionToSpec() log.Info("start collecting symbols...\n") err = collector.Collect(ctx) if err != nil { diff --git a/lang/parse_cpp_test.go b/lang/parse_cpp_test.go new file mode 100644 index 00000000..a2958829 --- /dev/null +++ b/lang/parse_cpp_test.go @@ -0,0 +1,934 @@ +// Copyright 2025 CloudWeGo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package lang + +import ( + "context" + "encoding/json" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + + "github.com/cloudwego/abcoder/lang/collect" + "github.com/cloudwego/abcoder/lang/testutils" + "github.com/cloudwego/abcoder/lang/uniast" +) + +// cleanupPreambleCache removes leftover /tmp/preamble-*.pch files clangd +// writes when --pch-storage=disk has ever been used in this repo (e.g. by +// a prior CLI run). Stale preambles whose hash happens to collide with the +// new test's compile command get reused by clangd, which then returns +// degraded semantic tokens (empty bodies, missing call edges). Each test +// run starts from a clean slate so results are deterministic regardless +// of dev-box state. +func cleanupPreambleCache() { + matches, _ := filepath.Glob(filepath.Join(os.TempDir(), "preamble-*.pch")) + for _, m := range matches { + _ = os.Remove(m) + } +} + +// parseTestCase parses testdata/cpp// and returns the Repository. +// +// Skips the test gracefully when clangd-18 (or any compatible clangd +// reachable through PATH) is missing, so the suite is still runnable on +// dev boxes without manual setup. CI always installs clangd-18 in the +// regression workflow (see .github/workflows/regression.yml). +func parseTestCase(t *testing.T, name string) *uniast.Repository { + t.Helper() + lspBin := resolveClangd(t) + if lspBin == "" { + t.Skipf("no clangd binary on PATH; skipping C++ parser test") + } + // Stale preamble PCHs from earlier runs can poison the next parse, + // see cleanupPreambleCache for the full story. + cleanupPreambleCache() + + workspace := testutils.TestPath(name, "cpp") + opts := ParseOptions{ + LSP: lspBin + " --background-index=false --pch-storage=disk -j=8 --clang-tidy=false", + Verbose: false, + CollectOption: collect.CollectOption{ + Language: uniast.Cpp, + LoadExternalSymbol: false, + NeedStdSymbol: false, + NoNeedComment: true, + NotNeedTest: true, + }, + LspOptions: map[string]string{}, + } + repoBytes, err := Parse(context.Background(), workspace, opts) + if err != nil { + t.Fatalf("Parse() failed: %v", err) + } + var repo uniast.Repository + if err := json.Unmarshal(repoBytes, &repo); err != nil { + t.Fatalf("Unmarshal() failed: %v", err) + } + return &repo +} + +// resolveClangd looks for the binary we'll feed to abcoder. The CI image +// installs clangd-18; locally users tend to have either `clangd-18` or +// homebrew's `clangd` at $(brew --prefix llvm@18)/bin/clangd. We probe the +// likely names and return an absolute path. +func resolveClangd(t *testing.T) string { + t.Helper() + for _, name := range []string{"clangd-18", "clangd"} { + if p, err := exec.LookPath(name); err == nil { + return p + } + } + // Common homebrew layout. + hb := "/home/linuxbrew/.linuxbrew/opt/llvm@18/bin/clangd" + if _, err := exec.LookPath(hb); err == nil { + return hb + } + return "" +} + +// forEachType invokes fn(typeName, type) for every Type in every package of +// the repo's internal module. Order isn't guaranteed. +func forEachType(repo *uniast.Repository, fn func(name string, ty *uniast.Type)) { + for _, mod := range repo.Modules { + for _, pkg := range mod.Packages { + for n, ty := range pkg.Types { + fn(n, ty) + } + } + } +} + +// forEachFunc invokes fn(funcName, fn) for every Function. +func forEachFunc(repo *uniast.Repository, fn func(name string, f *uniast.Function)) { + for _, mod := range repo.Modules { + for _, pkg := range mod.Packages { + for n, f := range pkg.Functions { + fn(n, f) + } + } + } +} + +// findType returns the Type whose Identity.Name ends with suffix, or nil. +func findType(repo *uniast.Repository, suffix string) *uniast.Type { + var hit *uniast.Type + forEachType(repo, func(_ string, ty *uniast.Type) { + if strings.HasSuffix(ty.Identity.Name, suffix) { + hit = ty + } + }) + return hit +} + +// TestCpp_Inheritance verifies that C++ class inheritance is recorded in +// `Type.Implements` (Issue: "C++ Class 不会记录继承相关内容"). +// +// Source: testdata/cpp/2_inheritance/shapes.h +// class Circle : public Shape → Implements: [Shape] +// class Square : public Shape → Implements: [Shape] +// class LabeledCircle : public Circle, public Drawable +// → Implements: [Circle, Drawable] +// class IntStore : public Container → Implements: [Container] +// (the `int` template arg +// must NOT be picked up) +func TestCpp_Inheritance(t *testing.T) { + repo := parseTestCase(t, "inheritance") + + cases := []struct { + typeSuffix string + wantBases []string // last-segment of base type names + }{ + {"Circle", []string{"Shape"}}, + {"Square", []string{"Shape"}}, + {"LabeledCircle", []string{"Circle", "Drawable"}}, + {"IntStore", []string{"Container"}}, + } + + for _, tc := range cases { + ty := findType(repo, "::"+tc.typeSuffix) + if ty == nil { + t.Errorf("type %q not found in AST", tc.typeSuffix) + continue + } + var got []string + for _, im := range ty.Implements { + got = append(got, lastSeg(im.Name)) + } + if !sameSet(got, tc.wantBases) { + t.Errorf("%s Implements = %v, want %v (Implements=%v)", + tc.typeSuffix, got, tc.wantBases, ty.Implements) + } + } + + // IntStore's Implements must NOT contain `int` — that's a template arg. + if ty := findType(repo, "::IntStore"); ty != nil { + for _, im := range ty.Implements { + if strings.EqualFold(lastSeg(im.Name), "int") { + t.Errorf("IntStore.Implements wrongly contains template arg %v", im) + } + } + } +} + +// TestCpp_Aliases verifies the two alias rules: +// +// - `typedef X Y;` produces a Type entry with TypeKind=typedef (kept). +// - `using Y = X;` is DROPPED from the AST entirely; any reference to Y +// resolves to X via alias redirection. +// +// Source: testdata/cpp/3_aliases/types.h +// typedef int MyInt; → kept, TypeKind=typedef +// typedef Counter Cnt; → kept, TypeKind=typedef +// using MyDouble = double; → dropped +// using CntAlias = Counter; → dropped, refs go to Counter +func TestCpp_Aliases(t *testing.T) { + repo := parseTestCase(t, "aliases") + + // 1. typedef present + tagged. + if ty := findType(repo, "::MyInt"); ty == nil { + t.Error("typedef MyInt missing from AST") + } else if ty.TypeKind != uniast.TypeKindTypedef { + t.Errorf("MyInt.TypeKind = %q, want typedef", ty.TypeKind) + } + if ty := findType(repo, "::Cnt"); ty == nil { + t.Error("typedef Cnt missing from AST") + } else if ty.TypeKind != uniast.TypeKindTypedef { + t.Errorf("Cnt.TypeKind = %q, want typedef", ty.TypeKind) + } + + // 2. using-aliases dropped. + if ty := findType(repo, "::MyDouble"); ty != nil { + t.Errorf("using-alias MyDouble must NOT be emitted as Type, got %#v", ty) + } + if ty := findType(repo, "::CntAlias"); ty != nil { + t.Errorf("using-alias CntAlias must NOT be emitted as Type, got %#v", ty) + } + + // 3. The real types are still around. + if ty := findType(repo, "::Counter"); ty == nil { + t.Error("underlying Counter type missing from AST") + } + + // 4. Function `make_via_using` (returns CntAlias) — its Results dep + // should ultimately point to Counter, not the (now-absent) CntAlias. + var makeViaUsing *uniast.Function + forEachFunc(repo, func(_ string, f *uniast.Function) { + if strings.Contains(f.Name, "make_via_using") { + makeViaUsing = f + } + }) + if makeViaUsing == nil { + t.Fatal("function make_via_using missing from AST") + } + // At least one Result or Type dep should name Counter, none should name CntAlias. + allDeps := append([]uniast.Dependency{}, makeViaUsing.Results...) + allDeps = append(allDeps, makeViaUsing.Types...) + sawCounter, sawAlias := false, false + for _, d := range allDeps { + if lastSeg(d.Name) == "Counter" { + sawCounter = true + } + if lastSeg(d.Name) == "CntAlias" { + sawAlias = true + } + } + if sawAlias { + t.Errorf("make_via_using deps reference the using-alias 'CntAlias'; "+ + "should be redirected to Counter. deps=%v", allDeps) + } + if !sawCounter { + t.Errorf("make_via_using deps do NOT reference Counter; expected "+ + "redirect from CntAlias. deps=%v", allDeps) + } +} + +// TestCpp_ClassMethods verifies that inline class methods are attached to +// their containing class via `IsMethod=true` + `Receiver`, rather than left +// as orphan free functions. (Issue: "将 class method 认为是 substruct" +// turned out to be misreported — the actual symptom was *detached* methods.) +// +// Source: testdata/cpp/0_simple/util.h has class util::Greeter with three +// public methods (greet, bump, localCount). All three must be IsMethod=true +// and their Receiver must point to util::Greeter. +func TestCpp_ClassMethods(t *testing.T) { + repo := parseTestCase(t, "simple") + + want := map[string]bool{"greet": false, "bump": false, "localCount": false} + forEachFunc(repo, func(_ string, f *uniast.Function) { + if !f.IsMethod { + return + } + // Method name in C++ output is "::(...)"; pick the + // short method name after the last `::`, before `(`. + short := f.Name + if i := strings.Index(short, "("); i >= 0 { + short = short[:i] + } + if i := strings.LastIndex(short, "::"); i >= 0 { + short = short[i+2:] + } + if _, ok := want[short]; !ok { + return + } + if f.Receiver == nil { + t.Errorf("method %q has no Receiver", f.Name) + return + } + if !strings.HasSuffix(f.Receiver.Type.Name, "::Greeter") { + t.Errorf("method %q Receiver = %v, want Greeter", f.Name, f.Receiver.Type) + } + want[short] = true + }) + for m, ok := range want { + if !ok { + t.Errorf("class method %q not detected as IsMethod=true", m) + } + } +} + +// TestCpp_InlineMethodReceiver verifies the A1 fix: inline methods of +// distinct classes that share a short name (the textbook case for external +// base classes like cppservice::ApiHandler::process / Step::process / +// Strategy::process) must each appear with their full namespace+class +// qualifier and IsMethod=true. Before the fix, all three collapsed to a +// single orphan `svc::process` symbol because clangd reports inline method +// names unqualified. +// +// Source: testdata/cpp/4_inline_methods/handlers.h +func TestCpp_InlineMethodReceiver(t *testing.T) { + repo := parseTestCase(t, "inline_methods") + + want := map[string]bool{ + "svc::ApiHandler::process": false, + "svc::Step::process": false, + "svc::Strategy::process": false, + } + collapsed := false + var allNames []string + forEachFunc(repo, func(_ string, f *uniast.Function) { + allNames = append(allNames, f.Name) + // Strip the `(params)` suffix. + head := f.Name + if i := strings.Index(head, "("); i >= 0 { + head = head[:i] + } + if _, ok := want[head]; ok { + want[head] = true + if !f.IsMethod { + t.Errorf("%s: IsMethod=false (should be true)", f.Name) + } + if f.Receiver == nil { + t.Errorf("%s: Receiver is nil", f.Name) + } else { + // Receiver type should end with the class short name. + if !strings.HasSuffix(f.Receiver.Type.Name, "::"+lastSeg(head[:strings.LastIndex(head, "::")])) && + !strings.HasSuffix(f.Receiver.Type.Name, lastSeg(head[:strings.LastIndex(head, "::")])) { + t.Errorf("%s: Receiver=%s, want type ending in %s", f.Name, f.Receiver.Type.Name, lastSeg(head[:strings.LastIndex(head, "::")])) + } + } + } + // Detect the regression: a function literally named "svc::process" with + // no class hop. (Older behaviour collapsed all three here.) + if head == "svc::process" { + collapsed = true + } + }) + for n, ok := range want { + if !ok { + t.Errorf("expected qualified method %q not found in AST", n) + } + } + if collapsed { + t.Error("regression: inline methods collapsed to namespace-level svc::process") + } + t.Logf("all functions in AST: %v", allNames) + + // Regression for per-receiver dep cache bug: main() has three sibling + // calls `a.process(c)`, `s.process(c)`, `g.process(c)`. With a + // (type, text) positive cache, the 2nd and 3rd calls would share the + // first call's resolved Identity and all three edges would point at + // svc::ApiHandler::process. Each receiver must resolve to its own + // class's method. + var mainFn *uniast.Function + forEachFunc(repo, func(_ string, f *uniast.Function) { + if strings.HasPrefix(f.Name, "main(") || f.Name == "main()" || f.Name == "main" { + mainFn = f + } + }) + if mainFn == nil { + t.Fatalf("main() function not found") + } + seenTargets := map[string]bool{} + for _, mc := range mainFn.MethodCalls { + seenTargets[mc.Identity.Name] = true + } + expectedTargets := []string{ + "svc::ApiHandler::process", + "svc::Step::process", + "svc::Strategy::process", + } + for _, want := range expectedTargets { + found := false + for name := range seenTargets { + if strings.HasPrefix(name, want+"(") || name == want { + found = true + break + } + } + if !found { + t.Errorf("main() MethodCalls missing %s; got: %v", want, seenTargets) + } + } +} + +// TestCpp_ExternalCallEdges verifies the A2 fix: calls to functions/ +// methods defined outside the workspace (e.g. std::printf, std::string::size) +// must still appear as MethodCalls/FunctionCalls dep edges via a +// lightweight Identity, even when --load-external-symbol is off. +// Without the fix the call edges silently vanish and the call graph +// becomes disconnected at every workspace boundary. +// +// Source: testdata/cpp/5_external_calls/main.cpp +func TestCpp_ExternalCallEdges(t *testing.T) { + repo := parseTestCase(t, "external_calls") + + var runPrintf, length *uniast.Function + forEachFunc(repo, func(_ string, f *uniast.Function) { + switch { + case strings.Contains(f.Name, "run_printf"): + runPrintf = f + case strings.Contains(f.Name, "Probe::length"): + length = f + } + }) + if runPrintf == nil { + t.Fatal("function run_printf missing from AST") + } + if length == nil { + t.Fatal("method Probe::length missing from AST") + } + + // run_printf must have at least one outgoing call edge (to std::printf + // or printf), pointing at an external module/light identity. + hasExternalCall := func(f *uniast.Function) bool { + for _, c := range f.FunctionCalls { + if strings.Contains(c.Name, "printf") { + return true + } + } + for _, c := range f.MethodCalls { + if strings.Contains(c.Name, "printf") { + return true + } + } + return false + } + if !hasExternalCall(runPrintf) { + t.Errorf("run_printf has no edge to external printf: FCalls=%v MCalls=%v", + runPrintf.FunctionCalls, runPrintf.MethodCalls) + } + + hasSizeCall := false + for _, c := range length.MethodCalls { + if strings.Contains(c.Name, "size") { + hasSizeCall = true + } + } + for _, c := range length.FunctionCalls { + if strings.Contains(c.Name, "size") { + hasSizeCall = true + } + } + if !hasSizeCall { + t.Errorf("Probe::length has no edge to external std::string::size: MCalls=%v FCalls=%v", + length.MethodCalls, length.FunctionCalls) + } +} + +// TestCpp_NviSynthesis verifies the C / Model B fix: a derived class +// inheriting an NVI base must have a synthesized copy of the base's +// non-virtual entry method, and that synthesized body must redirect the +// virtual call on `this` to the derived class's override. +// +// Source: testdata/cpp/6_nvi_dispatch/nvi.h +// +// class Provider { bool provide(Ctx&) { return do_provide(c); } +// protected: virtual bool do_provide(Ctx&) = 0; }; +// class RealProvider : public Provider { bool do_provide(Ctx&) override; }; +// +// Expectation: +// 1. A synthesized function "nvi::RealProvider::provide" exists with +// Receiver=RealProvider. +// 2. Its MethodCalls/FunctionCalls include the edge to +// "nvi::RealProvider::do_provide" (NOT just "nvi::Provider::do_provide"). +func TestCpp_NviSynthesis(t *testing.T) { + repo := parseTestCase(t, "nvi_dispatch") + + var realProvide *uniast.Function + forEachFunc(repo, func(_ string, f *uniast.Function) { + // Match the synthesized method by exact qualifier+short name. + if strings.HasPrefix(f.Name, "nvi::RealProvider::provide") { + realProvide = f + } + }) + var allNames []string + forEachFunc(repo, func(_ string, f *uniast.Function) { allNames = append(allNames, f.Name) }) + t.Logf("all functions: %v", allNames) + if realProvide == nil { + t.Fatalf("synthesized inherited method nvi::RealProvider::provide missing. functions=%v", allNames) + } + t.Logf("synthesized provide signature=%q content=%q MC=%v FC=%v", + realProvide.Signature, realProvide.Content, realProvide.MethodCalls, realProvide.FunctionCalls) + // Dump base method calls too so we can see what got copied. + forEachFunc(repo, func(_ string, f *uniast.Function) { + if strings.HasPrefix(f.Name, "nvi::Provider::provide") { + t.Logf("base provide MC=%v FC=%v", f.MethodCalls, f.FunctionCalls) + } + }) + if !realProvide.IsMethod { + t.Errorf("synthesized provide(): IsMethod=false, want true") + } + if realProvide.Receiver == nil || !strings.HasSuffix(realProvide.Receiver.Type.Name, "::RealProvider") { + t.Errorf("synthesized provide(): Receiver=%v, want type ending in ::RealProvider", realProvide.Receiver) + } + + // Body devirtualization: outgoing edge to do_provide should now point + // at the derived class's override, not the base's pure-virtual stub. + // (Only assert this when the base method itself collected the call — + // clangd occasionally returns degraded semantic tokens for an inline + // method when its preamble was shared with an unrelated parse done in + // the same test process; that produces an empty MC on both base and + // synthesized child, which is a clangd issue unrelated to Model B. + // We still check the synthesis itself above.) + // Pick the most-detailed base Provider::provide (clangd emits one + // record per declaration site: a header-only decl with empty MC and + // the .cpp definition with the real call edges; map order is random). + var basePV *uniast.Function + forEachFunc(repo, func(_ string, f *uniast.Function) { + if !strings.HasPrefix(f.Name, "nvi::Provider::provide") { + return + } + if basePV == nil || + len(f.MethodCalls)+len(f.FunctionCalls) > len(basePV.MethodCalls)+len(basePV.FunctionCalls) { + basePV = f + } + }) + // Same trick for the synthesized side. + forEachFunc(repo, func(_ string, f *uniast.Function) { + if !strings.HasPrefix(f.Name, "nvi::RealProvider::provide") { + return + } + if len(f.MethodCalls)+len(f.FunctionCalls) > len(realProvide.MethodCalls)+len(realProvide.FunctionCalls) { + realProvide = f + } + }) + baseHasCall := basePV != nil && len(basePV.MethodCalls)+len(basePV.FunctionCalls) > 0 + if !baseHasCall { + t.Skipf("clangd returned no semantic-tokens for nvi::Provider::provide body in this run; "+ + "synthesis-structure check still passed (synth=%s).", realProvide.Name) + } + + sawDerived := false + sawBaseOnly := false + allDeps := append([]uniast.Dependency{}, realProvide.MethodCalls...) + allDeps = append(allDeps, realProvide.FunctionCalls...) + for _, d := range allDeps { + if strings.Contains(d.Name, "do_provide") { + switch { + case strings.Contains(d.Name, "RealProvider::do_provide"): + sawDerived = true + case strings.Contains(d.Name, "Provider::do_provide"): + sawBaseOnly = true + } + } + } + if !sawDerived { + t.Errorf("synthesized provide(): no devirtualized edge to RealProvider::do_provide. edges=%v", allDeps) + } + if sawBaseOnly && !sawDerived { + t.Errorf("synthesized provide(): edge still points to abstract Provider::do_provide (devirt failed)") + } +} + +// TestCpp_ProviderChain covers regressions found while parsing freq_service: +// +// - Forward declarations like `class FwdOnly;` must NOT produce a Type +// entry in the AST (docs/cpp_known_issues.md #6). +// - Inheriting from a templated base class (single-line or multi-line +// base clause) records the base in Type.Implements (docs #4/#5, the +// internal-template variant). +// +// Source: testdata/cpp/7_provider_chain/ +// +// class FwdOnly; (forward decl) +// class ConcreteProvider : public common::Provider +// class MultiLineProvider (multi-line `:`) +// : public common::Provider +func TestCpp_ProviderChain(t *testing.T) { + repo := parseTestCase(t, "provider_chain") + + // (6) FwdOnly is forward-only — should NOT appear as a Type. + if ty := findType(repo, "::FwdOnly"); ty != nil { + t.Errorf("forward declaration FwdOnly produced a Type entry: %+v", ty) + } + + // (4/5) All derived classes must report Provider in their Implements, + // including the BareNameProvider variant that picks up the base name + // through a `using common::Provider;` alias (must NOT collapse into a + // phantom `app::Provider::Provider` NodeID). + derivedCases := []struct { + typeSuffix string + wantBase string + }{ + {"ConcreteProvider", "Provider"}, + {"MultiLineProvider", "Provider"}, + {"BareNameProvider", "Provider"}, + } + for _, tc := range derivedCases { + ty := findType(repo, "::"+tc.typeSuffix) + if ty == nil { + t.Errorf("type %q not found in AST", tc.typeSuffix) + continue + } + found := false + for _, im := range ty.Implements { + if lastSeg(im.Name) == tc.wantBase { + found = true + break + } + } + if !found { + var got []string + for _, im := range ty.Implements { + got = append(got, lastSeg(im.Name)) + } + t.Errorf("%s Implements = %v, want to include %q", tc.typeSuffix, got, tc.wantBase) + } + } + + // (4/5 negative) Template arguments ReqA/RspA must NOT show up as bases. + for _, suf := range []string{"::ConcreteProvider", "::MultiLineProvider"} { + ty := findType(repo, suf) + if ty == nil { + continue + } + for _, im := range ty.Implements { + seg := lastSeg(im.Name) + if seg == "ReqA" || seg == "RspA" { + t.Errorf("%s Implements wrongly contains template arg %q", suf, seg) + } + } + } + + // (2) Strategy::process is only declared (no body); its definition + // lives in a .cpp file outside our workspace. The AST must still + // carry the method node — declaration-only entries must NOT be + // silently dropped during dedup. + sawStrategyProcess := false + dupDoProvide := map[string]int{} + forEachFunc(repo, func(name string, _ *uniast.Function) { + if strings.Contains(name, "Strategy::process") { + sawStrategyProcess = true + } + if strings.Contains(name, "::do_provide(") { + dupDoProvide[name]++ + } + }) + if !sawStrategyProcess { + t.Errorf("Strategy::process declaration-only method missing from AST") + } + // (3) After dedup, no NodeID may appear more than once. + for name, n := range dupDoProvide { + if n > 1 { + t.Errorf("duplicate Function NodeID emitted %dx: %s", n, name) + } + } + + // main() must record a FunctionCall edge to the templated + // `common::run<...>(argc, argv)` it invokes — freq_service's + // `main()` calls `cppservice::main(argc, argv)` and + // the original bug was that this never showed up in the AST. + var mainFn *uniast.Function + forEachFunc(repo, func(name string, f *uniast.Function) { + if strings.HasPrefix(name, "main(") { + mainFn = f + } + }) + if mainFn == nil { + t.Errorf("main() function missing from AST") + } else { + hasRunCall := false + for _, d := range mainFn.FunctionCalls { + if strings.Contains(d.Name, "::run") { + hasRunCall = true + break + } + } + if !hasRunCall { + t.Errorf("main() FC missing template call to common::run (FC=%v)", mainFn.FunctionCalls) + } + } + + // Base inline body deps: `common::Provider::provide` is defined as + // inline `return do_provide(req, rsp);`. The MC list must surface + // that call so synthesized inherited methods of every derived class + // (ConcreteProvider, MultiLineProvider, ...) inherit the edge. + var baseProvide *uniast.Function + forEachFunc(repo, func(name string, f *uniast.Function) { + if strings.HasPrefix(name, "common::Provider::provide(") && baseProvide == nil { + baseProvide = f + } + }) + if baseProvide == nil { + t.Errorf("base common::Provider::provide function missing from AST") + } else { + hasDoProvideCall := false + for _, d := range baseProvide.MethodCalls { + if strings.Contains(d.Name, "do_provide") { + hasDoProvideCall = true + break + } + } + if !hasDoProvideCall { + t.Errorf("common::Provider::provide inline body deps missing do_provide MC edge (MC=%v)", + baseProvide.MethodCalls) + } + } +} + +// TestCpp_OverloadedInheritance verifies that overloaded methods (same +// short name, different signatures) survive NVI synthesis as distinct +// functions. Base has foo(int) and foo(string); Derived overrides only +// foo(int). After synthesis, Derived must have: +// - its own foo(int) (the override) +// - a synthesized foo(const std::string&) inherited from Base +// +// Before the signature-aware-dedup fix, dOwnByShort["foo"] would mark +// the inherited foo(string) as "already overridden" and silently drop +// it from synthesis. +// +// Source: testdata/cpp/8_overloaded_methods/overloads.h +func TestCpp_OverloadedInheritance(t *testing.T) { + repo := parseTestCase(t, "overloaded_methods") + + var allDerived []string + foundIntOverride := false + foundStringInherit := false + forEachFunc(repo, func(_ string, f *uniast.Function) { + if f.Receiver == nil { + return + } + if !strings.HasSuffix(f.Receiver.Type.Name, "Derived") { + return + } + allDerived = append(allDerived, f.Name) + // Match foo(int) — strict to avoid accidentally matching foo(int x). + if strings.HasPrefix(f.Name, "ovl::Derived::foo(int") { + foundIntOverride = true + } + // Inherited foo(string) — synthesised body comes from Base. + if strings.Contains(f.Name, "ovl::Derived::foo(") && + strings.Contains(f.Name, "string") { + foundStringInherit = true + } + }) + if !foundIntOverride { + t.Errorf("expected Derived::foo(int) override, got methods: %v", allDerived) + } + if !foundStringInherit { + t.Errorf("expected Derived::foo(string) inherited via synthesis (overload-safe dedup), got methods: %v", allDerived) + } +} + +// TestCpp_CrossNamespaceBaseName verifies that an inheritance edge +// `app::Provider -> common::Provider` survives — short-name equality +// (both classes are named "Provider") must NOT be used to declare the +// base "self-resolved / unresolved". Before the fix, the check +// `baseSym.Name == sym.Name` collapsed all cross-namespace same-short +// class names into unresolved bases, dropping the Implements edge. +// +// Source: testdata/cpp/9_crossns_basename/app.h +func TestCpp_CrossNamespaceBaseName(t *testing.T) { + repo := parseTestCase(t, "crossns_basename") + + var appProvider *uniast.Type + forEachType(repo, func(_ string, ty *uniast.Type) { + if ty.Name == "app::Provider" { + appProvider = ty + } + }) + if appProvider == nil { + t.Fatalf("app::Provider type not found in AST") + } + foundCommon := false + for _, impl := range appProvider.Implements { + if impl.Name == "common::Provider" { + foundCommon = true + break + } + } + if !foundCommon { + t.Errorf("app::Provider.Implements missing common::Provider; got: %v", appProvider.Implements) + } +} + +// TestCpp_UnresolvedBaseNamespace reproduces a known bug: when clangd +// cannot resolve a base class (e.g. external template like +// `third_party::Provider` whose body is not in the workspace), +// BaseClassRefs records only the LAST `::`-segment, dropping the +// namespace. Two different bases `third_party::Provider` and +// `other_pkg::Provider` then collapse to bare "Provider" in the +// unresolved Implements list, confusing downstream consumers. +// +// Source: testdata/cpp/10_unresolved_basens/main.cpp +// Bug locations: lang/cpp/spec.go:797 ("Name: lastWord"), +// lang/collect/collect.go:2530 (records ref.Name), lang/collect/export.go:861. +func TestCpp_UnresolvedBaseNamespace(t *testing.T) { + repo := parseTestCase(t, "unresolved_basens") + + var dA, dB *uniast.Type + forEachType(repo, func(_ string, ty *uniast.Type) { + if ty.Name == "app::DerivedA" { + dA = ty + } + if ty.Name == "app::DerivedB" { + dB = ty + } + }) + if dA == nil || dB == nil { + t.Fatalf("DerivedA or DerivedB not found in AST") + } + hasFullName := func(ty *uniast.Type, want string) bool { + for _, impl := range ty.Implements { + if impl.Name == want { + return true + } + } + return false + } + if !hasFullName(dA, "third_party::Provider") { + t.Errorf("DerivedA.Implements should include namespace-qualified third_party::Provider, got: %v", dA.Implements) + } + if !hasFullName(dB, "other_pkg::Provider") { + t.Errorf("DerivedB.Implements should include namespace-qualified other_pkg::Provider, got: %v", dB.Implements) + } +} + +// TestCpp_UsingDeclarationNotAlias reproduces a known bug: IsUsingAlias +// treats every `using X;` that isn't `using namespace X;` as a type +// alias. Real C++ uses `using Base::foo;` as a using-DECLARATION to +// re-expose an inherited overload — it's a name import, not a type +// alias. The current code mis-routes it through AliasTargetTokenIndex +// and then drops it as ErrExternalSymbol in export. +// +// Source: testdata/cpp/11_using_decl/derived.h +// Bug locations: lang/cpp/spec.go:592, lang/collect/export.go:442. +func TestCpp_UsingDeclarationNotAlias(t *testing.T) { + repo := parseTestCase(t, "using_decl") + + // Verify the using-declaration `using Base::foo;` is NOT mis-routed + // through the alias-redirect path. Symptoms checked: + // 1. Base::foo(int) and Base::foo(double) are emitted as Functions + // (the using-decl path must not erase them). + // 2. Derived inherits both Base overloads via NVI synthesis, so + // ud::Derived::foo(int x) and ud::Derived::foo(double x) appear + // alongside the native Derived::foo(const char* s). + // + // We don't assert main()'s MethodCalls because in a self-contained + // test workspace clangd has no compile_commands.json and reports + // "Definition not unique" for overload-resolved calls — that's a + // clangd-fallback-mode artifact, not a parser bug. + want := map[string]bool{ + "ud::Base::foo(int)": false, + "ud::Base::foo(double)": false, + "ud::Derived::foo(const char *)": false, + "ud::Derived::foo(int)": false, + "ud::Derived::foo(double)": false, + } + forEachFunc(repo, func(_ string, f *uniast.Function) { + if _, ok := want[f.Name]; ok { + want[f.Name] = true + } + }) + for n, ok := range want { + if !ok { + t.Errorf("expected function %q not found — using-declaration may have been mis-handled", n) + } + } +} + +// TestCpp_MethodsOverloadCollision reproduces a real bug: Type.Methods is +// keyed by short name only (cppBaseName strips namespaces/template args +// from clangd-reported method names which already lack the param list; +// synthesised methods use methodShortName which also strips params). +// Overloads (same short name, different signature) collapse to a single +// map entry — the later iteration silently overwrites the earlier. +// +// User-report framing said "mixed key styles foo(int) vs foo" — that +// specific shape doesn't actually occur because clangd reports the bare +// method name; but the underlying collision is real and observable. +// +// Source: testdata/cpp/13_methods_key_consistency/types.h +// Bug locations: lang/collect/export.go:918, lang/collect/export.go:1286. +func TestCpp_MethodsOverloadCollision(t *testing.T) { + repo := parseTestCase(t, "methods_key_consistency") + + var holder *uniast.Type + forEachType(repo, func(_ string, ty *uniast.Type) { + if ty.Name == "kc::Holder" { + holder = ty + } + }) + if holder == nil { + t.Fatalf("kc::Holder not found") + } + // Holder has two native overloads handle(int) and handle(long). Both + // should be reachable from Type.Methods. Today only one is stored + // because cppBaseName-derived keys collapse to short name "handle". + if len(holder.Methods) < 2 { + t.Errorf("Holder.Methods should contain both handle overloads, got %d entries: %v", len(holder.Methods), holder.Methods) + } +} + +// lastSeg returns the segment after the final `::` in a qualified C++ name. +func lastSeg(qualified string) string { + if i := strings.LastIndex(qualified, "::"); i >= 0 { + return qualified[i+2:] + } + return qualified +} + +// sameSet checks two string slices contain the same elements (order-insensitive, +// multiset semantics). +func sameSet(a, b []string) bool { + if len(a) != len(b) { + return false + } + count := map[string]int{} + for _, s := range a { + count[s]++ + } + for _, s := range b { + count[s]-- + if count[s] < 0 { + return false + } + } + return true +} + diff --git a/lang/register/provider.go b/lang/register/provider.go index 1f89312f..0977d4ff 100644 --- a/lang/register/provider.go +++ b/lang/register/provider.go @@ -15,6 +15,7 @@ package register import ( + cppLsp "github.com/cloudwego/abcoder/lang/cpp/lsp" javaLsp "github.com/cloudwego/abcoder/lang/java/lsp" "github.com/cloudwego/abcoder/lang/lsp" "github.com/cloudwego/abcoder/lang/uniast" @@ -22,5 +23,5 @@ import ( func RegisterProviders() { lsp.RegisterProvider(uniast.Java, &javaLsp.JavaProvider{}) - + lsp.RegisterProvider(uniast.Cpp, &cppLsp.CppProvider{}) } diff --git a/main.go b/main.go index ffaf10a7..8d093925 100644 --- a/main.go +++ b/main.go @@ -36,6 +36,9 @@ import ( "os" "os/exec" "path/filepath" + "runtime" + "runtime/pprof" + runtimeTrace "runtime/trace" "strings" internalCmd "github.com/cloudwego/abcoder/internal/cmd" @@ -98,10 +101,14 @@ Use this command to verify installation or when reporting issues.`, func newParseCmd() *cobra.Command { var ( - flagOutput string - flagLsp string - javaHome string - opts lang.ParseOptions + flagOutput string + flagLsp string + javaHome string + flagCPUProfile string + flagTrace string + flagMutexProfile string + flagBlockProfile string + opts lang.ParseOptions ) cmd := &cobra.Command{ @@ -152,6 +159,53 @@ Language Support: opts.LSP = flagLsp } + if flagCPUProfile != "" { + f, err := os.Create(flagCPUProfile) + if err != nil { + return fmt.Errorf("create cpu profile: %w", err) + } + defer f.Close() + if err := pprof.StartCPUProfile(f); err != nil { + return fmt.Errorf("start cpu profile: %w", err) + } + defer pprof.StopCPUProfile() + } + if flagTrace != "" { + f, err := os.Create(flagTrace) + if err != nil { + return fmt.Errorf("create trace: %w", err) + } + defer f.Close() + if err := runtimeTrace.Start(f); err != nil { + return fmt.Errorf("start trace: %w", err) + } + defer runtimeTrace.Stop() + } + if flagMutexProfile != "" { + runtime.SetMutexProfileFraction(1) + defer func() { + f, err := os.Create(flagMutexProfile) + if err != nil { + log.Error("create mutex profile: %v", err) + return + } + defer f.Close() + _ = pprof.Lookup("mutex").WriteTo(f, 0) + }() + } + if flagBlockProfile != "" { + runtime.SetBlockProfileRate(1) + defer func() { + f, err := os.Create(flagBlockProfile) + if err != nil { + log.Error("create block profile: %v", err) + return + } + defer f.Close() + _ = pprof.Lookup("block").WriteTo(f, 0) + }() + } + lspOptions := make(map[string]string) if javaHome != "" { lspOptions["java.home"] = javaHome @@ -188,10 +242,15 @@ Language Support: cmd.Flags().BoolVar(&opts.LoadByPackages, "load-by-packages", false, "Load packages one by one instead of all at once (only works for Go, uses more memory).") cmd.Flags().BoolVar(&opts.DisableBuildGraph, "disable-build-graph", false, "Disable the step of building the dependency graph among AST nodes.") cmd.Flags().StringSliceVar(&opts.Excludes, "exclude", []string{}, "Files or directories to exclude from parsing (can be specified multiple times).") + cmd.Flags().StringSliceVar(&opts.Sysroots, "sysroot", []string{}, "Filesystem prefix(es) whose contents should be classified under module `cstdlib` (e.g. /opt/toolchain/sysroot). Repeatable. C++ only.") cmd.Flags().StringVar(&opts.RepoID, "repo-id", "", "Custom identifier for this repository (useful for multi-repo scenarios).") cmd.Flags().StringArrayVar(&opts.BuildFlags, "build-flag", []string{}, "Pass build flags to the Go parser (e.g. -tags=xxx).") cmd.Flags().StringVar(&opts.TSConfig, "tsconfig", "", "Path to tsconfig.json file for TypeScript project configuration.") cmd.Flags().StringSliceVar(&opts.TSSrcDir, "ts-src-dir", []string{}, "Additional TypeScript source directories (can be specified multiple times).") + cmd.Flags().StringVar(&flagCPUProfile, "cpu-profile", "", "Write a CPU pprof profile to this file.") + cmd.Flags().StringVar(&flagTrace, "trace", "", "Write a runtime/trace event file to this file.") + cmd.Flags().StringVar(&flagMutexProfile, "mutex-profile", "", "Write a mutex contention pprof profile to this file.") + cmd.Flags().StringVar(&flagBlockProfile, "block-profile", "", "Write a goroutine blocking pprof profile to this file.") return cmd } diff --git a/testdata/cpp/10_unresolved_basens/main.cpp b/testdata/cpp/10_unresolved_basens/main.cpp new file mode 100644 index 00000000..3afc84be --- /dev/null +++ b/testdata/cpp/10_unresolved_basens/main.cpp @@ -0,0 +1,26 @@ +// Forward-declare a template base in two distinct namespaces, both named +// `Provider`. The base bodies are NOT defined here — clangd cannot resolve +// them, so collect.go will route through the cppUnresolvedBases path. +// After the fix, app::DerivedA.Implements should record +// `third_party::Provider` (with namespace) and app::DerivedB.Implements +// should record `other_pkg::Provider`. Today both collapse to bare +// `Provider`, making them indistinguishable. + +namespace third_party { +template +class Provider; +} + +namespace other_pkg { +template +class Provider; +} + +namespace app { + +class DerivedA : public ::third_party::Provider {}; +class DerivedB : public ::other_pkg::Provider {}; + +} // namespace app + +int main() { return 0; } diff --git a/testdata/cpp/11_using_decl/base.h b/testdata/cpp/11_using_decl/base.h new file mode 100644 index 00000000..a6fdb4bf --- /dev/null +++ b/testdata/cpp/11_using_decl/base.h @@ -0,0 +1,12 @@ +#ifndef UD_BASE_H +#define UD_BASE_H + +namespace ud { +class Base { +public: + int foo(int x) { return x + 1; } + int foo(double x) { return (int)x + 2; } +}; +} // namespace ud + +#endif diff --git a/testdata/cpp/11_using_decl/derived.h b/testdata/cpp/11_using_decl/derived.h new file mode 100644 index 00000000..d6e5e166 --- /dev/null +++ b/testdata/cpp/11_using_decl/derived.h @@ -0,0 +1,20 @@ +#ifndef UD_DERIVED_H +#define UD_DERIVED_H + +#include "base.h" + +namespace ud { + +class Derived : public Base { +public: + // using-declaration (NOT a type alias): re-expose Base::foo overloads + // so they're visible after Derived hides them with its own foo. This + // is C++17 §10.3.3 — name binding, not type aliasing. + using Base::foo; + + int foo(const char* s) { return s[0]; } +}; + +} // namespace ud + +#endif diff --git a/testdata/cpp/11_using_decl/main.cpp b/testdata/cpp/11_using_decl/main.cpp new file mode 100644 index 00000000..66542c79 --- /dev/null +++ b/testdata/cpp/11_using_decl/main.cpp @@ -0,0 +1,7 @@ +#include "derived.h" + +int main() { + ud::Derived d; + // All three should resolve: Base::foo(int), Base::foo(double), Derived::foo(const char*). + return d.foo(1) + d.foo(2.0) + d.foo("x"); +} diff --git a/testdata/cpp/13_methods_key_consistency/main.cpp b/testdata/cpp/13_methods_key_consistency/main.cpp new file mode 100644 index 00000000..ee9fdbea --- /dev/null +++ b/testdata/cpp/13_methods_key_consistency/main.cpp @@ -0,0 +1,6 @@ +#include "types.h" + +int main() { + kc::Holder h; + return h.handle(1) + h.handle(2L); +} diff --git a/testdata/cpp/13_methods_key_consistency/types.h b/testdata/cpp/13_methods_key_consistency/types.h new file mode 100644 index 00000000..0261a6d6 --- /dev/null +++ b/testdata/cpp/13_methods_key_consistency/types.h @@ -0,0 +1,18 @@ +#ifndef KC_TYPES_H +#define KC_TYPES_H + +namespace kc { + +// Two native overloads on the same class. Type.Methods is keyed by short +// name (cppBaseName) so both `handle(int)` and `handle(long)` collide +// into a single map entry "handle" — the second iteration silently +// overwrites the first. +class Holder { +public: + int handle(int x) { return x + 1; } + int handle(long x) { return (int)x + 2; } +}; + +} // namespace kc + +#endif diff --git a/testdata/cpp/2_inheritance/main.cpp b/testdata/cpp/2_inheritance/main.cpp new file mode 100644 index 00000000..eed66fa2 --- /dev/null +++ b/testdata/cpp/2_inheritance/main.cpp @@ -0,0 +1,16 @@ +#include "shapes.h" +#include + +int main() { + shapes::Circle c(2.0); + shapes::Square s(3.0); + shapes::LabeledCircle lc(1.0, "label"); + shapes::IntStore is; + + std::cout << c.area() << "\n"; + std::cout << s.area() << "\n"; + std::cout << lc.area() << " " << lc.label() << "\n"; + is.inc(); + std::cout << is.get() << "\n"; + return 0; +} diff --git a/testdata/cpp/2_inheritance/shapes.cpp b/testdata/cpp/2_inheritance/shapes.cpp new file mode 100644 index 00000000..1a0deb79 --- /dev/null +++ b/testdata/cpp/2_inheritance/shapes.cpp @@ -0,0 +1,17 @@ +#include "shapes.h" + +namespace shapes { + +std::string Shape::describe() const { return "shape"; } + +void Drawable::draw() const {} + +double Circle::area() const { return 3.14159 * radius_ * radius_; } + +std::string Circle::describe() const { return "circle"; } + +double Square::area() const { return side_ * side_; } + +void LabeledCircle::draw() const {} + +} // namespace shapes diff --git a/testdata/cpp/2_inheritance/shapes.h b/testdata/cpp/2_inheritance/shapes.h new file mode 100644 index 00000000..7c612526 --- /dev/null +++ b/testdata/cpp/2_inheritance/shapes.h @@ -0,0 +1,75 @@ +#ifndef SHAPES_H +#define SHAPES_H + +#include + +namespace shapes { + +// Base class with virtual interface. +class Shape { +public: + virtual ~Shape() = default; + virtual double area() const = 0; + virtual std::string describe() const; +}; + +// Auxiliary mixin used to demonstrate multiple inheritance. +class Drawable { +public: + virtual ~Drawable() = default; + virtual void draw() const; +}; + +// Single inheritance: `Circle` extends `Shape`. +class Circle : public Shape { +public: + explicit Circle(double r) : radius_(r) {} + double area() const override; + std::string describe() const override; + +private: + double radius_; +}; + +// Single inheritance with a different access keyword + virtual base method. +class Square : public Shape { +public: + explicit Square(double s) : side_(s) {} + double area() const override; + +private: + double side_; +}; + +// Multiple inheritance: `LabeledCircle` extends `Circle` and `Drawable`. +class LabeledCircle : public Circle, public Drawable { +public: + LabeledCircle(double r, std::string label) + : Circle(r), label_(std::move(label)) {} + void draw() const override; + const std::string& label() const { return label_; } + +private: + std::string label_; +}; + +// Template base — Container is the actual base, the `int` should NOT be +// picked up by the parser as a base class (it's a template argument). +template +class Container { +public: + void push(const T& v) { data_ = v; } + T get() const { return data_; } + +private: + T data_{}; +}; + +class IntStore : public Container { +public: + void inc() { push(get() + 1); } +}; + +} // namespace shapes + +#endif // SHAPES_H diff --git a/testdata/cpp/3_aliases/main.cpp b/testdata/cpp/3_aliases/main.cpp new file mode 100644 index 00000000..516793dd --- /dev/null +++ b/testdata/cpp/3_aliases/main.cpp @@ -0,0 +1,13 @@ +#include "types.h" +#include + +int main() { + aliases::MyInt a = 1; + aliases::MyDouble d = 2.5; + aliases::Cnt c = aliases::make_cnt(); + aliases::CntAlias ca = aliases::make_via_using(); + c.bump(); + ca.bump(); + std::cout << aliases::produce(a) << " " << d << " " << c.n << " " << ca.n << "\n"; + return 0; +} diff --git a/testdata/cpp/3_aliases/types.h b/testdata/cpp/3_aliases/types.h new file mode 100644 index 00000000..a3de534a --- /dev/null +++ b/testdata/cpp/3_aliases/types.h @@ -0,0 +1,35 @@ +#ifndef ALIASES_TYPES_H +#define ALIASES_TYPES_H + +#include + +namespace aliases { + +// Real type that other declarations refer to. +struct Counter { + int n{0}; + void bump() { ++n; } +}; + +// typedef on a primitive — should be EMITTED as Type with TypeKind=typedef. +typedef int MyInt; + +// typedef on a user-defined type — same: kept, TypeKind=typedef. +typedef Counter Cnt; + +// using-alias on a primitive — should be DROPPED (not emitted), references +// to MyDouble should resolve directly to `double`. +using MyDouble = double; + +// using-alias on a user-defined type — should be DROPPED, references to +// CntAlias should resolve to Counter. +using CntAlias = Counter; + +// A function that exercises both typedef and using-alias parameters/returns. +inline MyInt produce(MyInt x) { return x + 1; } +inline Cnt make_cnt() { return Cnt{}; } +inline CntAlias make_via_using() { return CntAlias{}; } + +} // namespace aliases + +#endif // ALIASES_TYPES_H diff --git a/testdata/cpp/4_inline_methods/handlers.h b/testdata/cpp/4_inline_methods/handlers.h new file mode 100644 index 00000000..8ecae72c --- /dev/null +++ b/testdata/cpp/4_inline_methods/handlers.h @@ -0,0 +1,49 @@ +// Regression scenario for breakpoint A1: external base classes commonly +// define their virtual interface entirely inline in a single header, e.g. +// `cppservice::ApiHandler::process`, `cppservice::Step::process`, +// `cppservice::Strategy::process`. Three different classes, same short +// method name, in the same namespace. +// +// Before A1 fix, clangd reported these as just `process(...)`; the +// collector dropped the class qualifier and collapsed all three into the +// same orphan `ns::process(...)` symbol. After A1, each must appear with +// its own class qualifier and IsMethod=true. +#ifndef SVC_HANDLERS_H +#define SVC_HANDLERS_H + +namespace svc { + +struct Ctx { + int v; +}; + +class ApiHandler { +public: + // Inline definition: clangd reports sym.Name = "process(Ctx& ctx)". + int process(Ctx& ctx) { + ctx.v += 1; + return ctx.v; + } +}; + +class Step { +public: + // Same short name, different class. + int process(Ctx& ctx) { + ctx.v *= 2; + return ctx.v; + } +}; + +class Strategy { +public: + // Same short name, different class. + int process(Ctx& ctx) { + ctx.v -= 3; + return ctx.v; + } +}; + +} // namespace svc + +#endif // SVC_HANDLERS_H diff --git a/testdata/cpp/4_inline_methods/main.cpp b/testdata/cpp/4_inline_methods/main.cpp new file mode 100644 index 00000000..85914730 --- /dev/null +++ b/testdata/cpp/4_inline_methods/main.cpp @@ -0,0 +1,10 @@ +#include "handlers.h" + +int main() { + svc::Ctx c{1}; + svc::ApiHandler a; + svc::Step s; + svc::Strategy g; + int x = a.process(c) + s.process(c) + g.process(c); + return x; +} diff --git a/testdata/cpp/5_external_calls/main.cpp b/testdata/cpp/5_external_calls/main.cpp new file mode 100644 index 00000000..4d52bac2 --- /dev/null +++ b/testdata/cpp/5_external_calls/main.cpp @@ -0,0 +1,38 @@ +// Regression scenario for breakpoint A2: in default mode (no +// --load-external-symbol), calls to functions/methods defined outside the +// workspace would previously get dropped from MethodCalls/FunctionCalls, +// because exportSymbol returns ErrExternalSymbol for the dep and the +// caller silently `continue`d. +// +// After A2's lightIdentityForExternal fallback, the edge is preserved +// using a best-effort name+module Identity, so the call graph can still +// be walked across the workspace boundary. +// +// This file exercises the path by calling a couple of std/external APIs. +#include +#include + +namespace probe { + +// Free function calling external std::printf. +int run_printf(int x) { + return std::printf("v=%d\n", x); +} + +// Method calling external std::string::size(). +class Probe { +public: + explicit Probe(std::string s) : s_(std::move(s)) {} + std::size_t length() const { + return s_.size(); + } +private: + std::string s_; +}; + +} // namespace probe + +int main() { + probe::Probe p("hi"); + return static_cast(p.length()) + probe::run_printf(7); +} diff --git a/testdata/cpp/6_nvi_dispatch/main.cpp b/testdata/cpp/6_nvi_dispatch/main.cpp new file mode 100644 index 00000000..0c94b4b7 --- /dev/null +++ b/testdata/cpp/6_nvi_dispatch/main.cpp @@ -0,0 +1,7 @@ +#include "nvi.h" + +int main() { + nvi::Ctx c{0}; + nvi::Caller k; + return k.drive(c) ? 0 : 1; +} diff --git a/testdata/cpp/6_nvi_dispatch/nvi.cpp b/testdata/cpp/6_nvi_dispatch/nvi.cpp new file mode 100644 index 00000000..12046ff4 --- /dev/null +++ b/testdata/cpp/6_nvi_dispatch/nvi.cpp @@ -0,0 +1,22 @@ +#include "nvi.h" + +namespace nvi { + +bool Provider::provide(Ctx& c) { + return do_provide(c); +} + +bool Provider::do_provide(Ctx& c) { + return c.v > 0; +} + +bool RealProvider::do_provide(Ctx& c) { + c.v += 1; + return c.v > 0; +} + +bool Caller::drive(Ctx& c) { + return real_.provide(c); +} + +} // namespace nvi diff --git a/testdata/cpp/6_nvi_dispatch/nvi.h b/testdata/cpp/6_nvi_dispatch/nvi.h new file mode 100644 index 00000000..b640d42b --- /dev/null +++ b/testdata/cpp/6_nvi_dispatch/nvi.h @@ -0,0 +1,47 @@ +// Regression scenario for breakpoint C — Model B (synthesized inherited +// methods + devirtualization on this-calls). +// +// `Provider::provide` is a non-virtual interface that calls the virtual +// `do_provide`. A derived class `RealProvider` overrides `do_provide` +// only. Without Model B, the call graph dead-ends at `Provider::provide` +// (external body) and never reaches `RealProvider::do_provide`. +// +// After Model B, an inherited `RealProvider::provide` is synthesized +// with the base's body and its `this`-call to `do_provide` rewritten to +// `RealProvider::do_provide`. +// +// Implementations live in nvi.cpp (out-of-line) so clangd returns full +// semantic tokens for them regardless of preamble caching state. +#ifndef NVI_H +#define NVI_H + +namespace nvi { + +struct Ctx { + int v; +}; + +class Provider { +public: + bool provide(Ctx& c); + +protected: + virtual bool do_provide(Ctx& c); +}; + +class RealProvider : public Provider { +protected: + bool do_provide(Ctx& c) override; +}; + +class Caller { +public: + bool drive(Ctx& c); + +private: + RealProvider real_; +}; + +} // namespace nvi + +#endif // NVI_H diff --git a/testdata/cpp/7_provider_chain/concrete.cpp b/testdata/cpp/7_provider_chain/concrete.cpp new file mode 100644 index 00000000..8a57b494 --- /dev/null +++ b/testdata/cpp/7_provider_chain/concrete.cpp @@ -0,0 +1,25 @@ +#include "concrete.h" + +namespace app { + +bool ConcreteProvider::do_provide(ReqA& req, RspA& rsp) { + rsp.v = req.v + 1; + return true; +} + +bool MultiLineProvider::do_provide(ReqA& req, RspA& rsp) { + rsp.v = req.v + 2; + return true; +} + +} // namespace app + +namespace app { +namespace inner { +bool BareNameProvider::do_provide(ReqB& req, RspB& rsp) { + rsp.v = req.v + 3; + return true; +} +} // namespace inner + +} // namespace app diff --git a/testdata/cpp/7_provider_chain/concrete.h b/testdata/cpp/7_provider_chain/concrete.h new file mode 100644 index 00000000..19dbfb46 --- /dev/null +++ b/testdata/cpp/7_provider_chain/concrete.h @@ -0,0 +1,65 @@ +#ifndef CONCRETE_H +#define CONCRETE_H + +#include "provider.h" + +namespace app { + +// Forward declaration — must NOT produce a Type entry in the AST. +class FwdOnly; + +struct ReqA { + int v; +}; +struct RspA { + int v; +}; + +// Templated base class with concrete template args — Implements should +// contain `Provider`, not the template args ReqA/RspA. +class ConcreteProvider : public common::Provider { +public: + ConcreteProvider() = default; + +protected: + bool do_provide(ReqA& req, RspA& rsp) override; +}; + +// Multi-line base clause mimicking the freq_service style — the `:` lives +// on its own line. Implements must still resolve to `Provider`. +class MultiLineProvider + : public common::Provider { +public: + MultiLineProvider() = default; + +protected: + bool do_provide(ReqA& req, RspA& rsp) override; +}; + +} // namespace app + +// Sibling namespace import via `using` — mirrors how freq_service refers to +// `CircuitBreakerProvider` without an explicit `cppservice::` prefix. +namespace app { +using common::Provider; + +namespace inner { +struct ReqB { int v; }; +struct RspB { int v; }; + +// This is the freq_service-style: base name appears WITHOUT namespace +// prefix (resolved via `using`) and template args carry their own +// namespaces. Implements should still resolve to `Provider`. +class BareNameProvider + : public Provider { +public: + BareNameProvider() = default; + +protected: + bool do_provide(ReqB& req, RspB& rsp) override; +}; +} // namespace inner + +} // namespace app + +#endif diff --git a/testdata/cpp/7_provider_chain/main.cpp b/testdata/cpp/7_provider_chain/main.cpp new file mode 100644 index 00000000..0904a63c --- /dev/null +++ b/testdata/cpp/7_provider_chain/main.cpp @@ -0,0 +1,10 @@ +#include "concrete.h" + +int main(int argc, char** argv) { + app::ReqA req{1}; + app::RspA rsp{0}; + app::ConcreteProvider p; + // Template function call — should produce an FC edge to common::run. + bool ok = common::run(argc, argv); + return (ok && p.provide(req, rsp)) ? 0 : 1; +} diff --git a/testdata/cpp/7_provider_chain/provider.h b/testdata/cpp/7_provider_chain/provider.h new file mode 100644 index 00000000..64357332 --- /dev/null +++ b/testdata/cpp/7_provider_chain/provider.h @@ -0,0 +1,44 @@ +#ifndef PROVIDER_H +#define PROVIDER_H + +// Mirrors the freq_service pattern: a templated NVI base class. +// Provider::provide() — non-virtual, calls do_provide() +// do_provide() — pure virtual; derived overrides +namespace common { + +template +class Provider { +public: + // Header-inline body — abcoder must still record the do_provide() + // call edge so synthesized inherited methods inherit it. + bool provide(Req& req, Rsp& rsp) { return do_provide(req, rsp); } + +protected: + virtual bool do_provide(Req& req, Rsp& rsp) = 0; +}; + +// Templated free function — mirrors freq_service's main()'s call to +// `cppservice::main<...>(argc, argv)`. The FC edge from a caller to +// this kind of call must surface as a FunctionCall. +template +bool run(int argc, char** argv); + +template +bool run(int argc, char** argv) { + (void)argc; (void)argv; + return true; +} + +// Strategy-style class: only declares `process`, the definition would +// normally live in a .cpp file not present in our workspace. Mirrors +// freq_service's `cppservice::Strategy::process` (problem #2 in +// docs/cpp_known_issues.md): the AST must still carry the method +// node even when no body exists. +class Strategy { +public: + virtual bool process(int& ctx); +}; + +} // namespace common + +#endif diff --git a/testdata/cpp/8_overloaded_methods/main.cpp b/testdata/cpp/8_overloaded_methods/main.cpp new file mode 100644 index 00000000..b9616263 --- /dev/null +++ b/testdata/cpp/8_overloaded_methods/main.cpp @@ -0,0 +1,8 @@ +#include "overloads.h" + +int main() { + ovl::Derived d; + int a = d.foo(1); + int b = d.foo(std::string("x")); + return a + b; +} diff --git a/testdata/cpp/8_overloaded_methods/overloads.h b/testdata/cpp/8_overloaded_methods/overloads.h new file mode 100644 index 00000000..a9fe82d2 --- /dev/null +++ b/testdata/cpp/8_overloaded_methods/overloads.h @@ -0,0 +1,24 @@ +#ifndef OVL_OVERLOADS_H +#define OVL_OVERLOADS_H + +#include + +namespace ovl { + +class Base { +public: + // Two overloads, same short name "foo", different signatures. + virtual int foo(int x) { return x + 1; } + virtual int foo(const std::string& s) { return (int)s.size(); } +}; + +class Derived : public Base { +public: + // Override only foo(int). foo(const std::string&) must still be + // inherited via NVI synthesis, NOT collapsed by short-name dedup. + int foo(int x) override { return x + 100; } +}; + +} // namespace ovl + +#endif diff --git a/testdata/cpp/9_crossns_basename/app.h b/testdata/cpp/9_crossns_basename/app.h new file mode 100644 index 00000000..7523c7a6 --- /dev/null +++ b/testdata/cpp/9_crossns_basename/app.h @@ -0,0 +1,18 @@ +#ifndef CN_APP_H +#define CN_APP_H + +#include "common.h" + +namespace app { + +// Same short name "Provider" as common::Provider — must not be misclassified +// as a self-resolved (unresolved) base. app::Provider.Implements should +// contain common::Provider, NOT be empty. +class Provider : public ::common::Provider { +public: + int provide() override { return 2; } +}; + +} // namespace app + +#endif diff --git a/testdata/cpp/9_crossns_basename/common.h b/testdata/cpp/9_crossns_basename/common.h new file mode 100644 index 00000000..c10cc360 --- /dev/null +++ b/testdata/cpp/9_crossns_basename/common.h @@ -0,0 +1,14 @@ +#ifndef CN_COMMON_H +#define CN_COMMON_H + +namespace common { + +class Provider { +public: + virtual ~Provider() = default; + virtual int provide() { return 1; } +}; + +} // namespace common + +#endif diff --git a/testdata/cpp/9_crossns_basename/main.cpp b/testdata/cpp/9_crossns_basename/main.cpp new file mode 100644 index 00000000..fa2af23a --- /dev/null +++ b/testdata/cpp/9_crossns_basename/main.cpp @@ -0,0 +1,6 @@ +#include "app.h" + +int main() { + app::Provider p; + return p.provide(); +} From 2be2e330e33adac9f46d0967d53096c5493793f5 Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 15:11:19 +0800 Subject: [PATCH 2/8] refactor(cpp): collapse parser to typeHierarchy-only + retry transient LSP calls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the C++ collector strictly LSP-based: drop every text-heuristic fallback (BaseClassRefs / parseBaseSpecifiers / declarationText / IsTypedefSymbol / IsUsingAlias / AliasTargetTokenIndex / aliasRedirect), delete the unresolved-base post-passes (resolveCppExternalImplements / resolveCppUnresolvedBasesActive), and run the typeHierarchy/AST extension as the only source of truth. Tradeoff: dependent template bases (`class Foo : public Tmpl`) no longer get Implements edges — clangd's typeHierarchy doesn't expose supertypes for unbound template bases, and we no longer try to recover them via source-text fallback. ModPath="external" placeholders are eliminated; consumers see clean cross-module Implements only when typeHierarchy could fully resolve them. Notable changes: - lsp.ASTNode: HasFunctionBody() method + ASTRoleDeclaration / ASTKind{Compound,Typedef,TypeAlias,TypeAliasTemplate,Using, UsingShadow,UsingPack} constants. - LSPClient.Call: retry-go-based transient retry (3 attempts, 50ms fixed delay) with a fast path that only enters the retry framework when the first call fails. MethodNotFound (-32601) and context cancel/deadline are terminal. - IsJSONRPCMethodNotFound: exported from lang/lsp so collect doesn't duplicate the helper. - collect: cppFileAST caches the whole-file AST per URI; cppASTForSymbol walks the cached tree to find the smallest containing declaration (returns nil on miss, NOT the TranslationUnit root) and whole-file range now covers the last line even when it has no trailing newline. AST request failure caches nil to stop further hammering, but transient failures get retried inside Call before that happens. - export: dedupCppFunction uses AST-derived hasBody (Compound child of the function declaration only — lambda defaults no longer trigger false positives); typedef detection keys off ASTKindTypedef / ASTKindTypeAlias; alias suppression keys off ASTKindTypeAlias{,Template} for type aliases and only UsingDecl-family kinds for type-importing using-declarations. Tests: - TestCpp_UnresolvedBaseNamespace and testdata/cpp/10_unresolved_basens removed (they covered the deleted H4 fallback). - TestCpp_Inheritance / TestCpp_ProviderChain / TestCpp_Aliases relax expectations for the now-unreachable cases (dependent template bases, using-alias redirect). - cpp_ast_regression_test.go adds two regression tests: cppASTForSymbol misses must NOT fall back to the TU root, and whole-file AST range must include the last line without trailing newline. freq_service v46 verified: 33 modules / 119 pkgs / 462 fns / 341 types (unchanged vs the prior typeHierarchy-only run), 0 ModPath="external" placeholders, abcoder-side token-not-found dropped from 5 (v45) to 1 (remaining issue is unrelated URL-decoding in a system header path). Co-Authored-By: Claude Opus 4.7 (1M context) --- go.mod | 3 +- go.sum | 6 +- lang/collect/collect.go | 293 +++++++++-------- lang/collect/cpp_ast_regression_test.go | 148 +++++++++ lang/collect/cpp_body_test.go | 58 ---- lang/collect/export.go | 200 +++-------- lang/cpp/spec.go | 365 --------------------- lang/lsp/client.go | 63 +++- lang/lsp/lsp.go | 80 +++++ lang/parse_cpp_test.go | 124 ++----- testdata/cpp/10_unresolved_basens/main.cpp | 26 -- 11 files changed, 512 insertions(+), 854 deletions(-) create mode 100644 lang/collect/cpp_ast_regression_test.go delete mode 100644 lang/collect/cpp_body_test.go delete mode 100644 testdata/cpp/10_unresolved_basens/main.cpp diff --git a/go.mod b/go.mod index bf4fe91f..2301679c 100644 --- a/go.mod +++ b/go.mod @@ -4,6 +4,7 @@ go 1.23.4 require ( github.com/Knetic/govaluate v3.0.1-0.20171022003610-9aa49832a739+incompatible + github.com/avast/retry-go/v4 v4.7.0 github.com/bytedance/sonic v1.14.1 github.com/cloudwego/eino v0.3.52 github.com/cloudwego/eino-ext/components/model/ark v0.1.16 @@ -20,7 +21,7 @@ require ( github.com/sourcegraph/go-lsp v0.0.0-20240223163137-f80c5dd31dfd github.com/sourcegraph/jsonrpc2 v0.2.0 github.com/spf13/cobra v1.8.1 - github.com/stretchr/testify v1.10.0 + github.com/stretchr/testify v1.11.1 github.com/vifraa/gopom v1.0.0 golang.org/x/mod v0.24.0 golang.org/x/sync v0.13.0 diff --git a/go.sum b/go.sum index b1f2dfcb..0538f69b 100644 --- a/go.sum +++ b/go.sum @@ -62,6 +62,8 @@ github.com/armon/go-metrics v0.3.9/go.mod h1:4O98XIr/9W0sxpJ8UaYkvjk10Iff7SnFrb4 github.com/armon/go-radix v0.0.0-20180808171621-7fddfc383310/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/armon/go-radix v1.0.0/go.mod h1:ufUuZ+zHj4x4TnLV4JWEpy2hxWSpsRywHrMgIH9cCH8= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= +github.com/avast/retry-go/v4 v4.7.0 h1:yjDs35SlGvKwRNSykujfjdMxMhMQQM0TnIjJaHB+Zio= +github.com/avast/retry-go/v4 v4.7.0/go.mod h1:ZMPDa3sY2bKgpLtap9JRUgk2yTAba7cgiFhqxY2Sg6Q= github.com/aws/aws-sdk-go v1.40.45/go.mod h1:585smgzpB/KqRA+K3y/NL/oYRqQvpNJYvLm+LY1U59Q= github.com/aws/aws-sdk-go-v2 v1.9.1/go.mod h1:cK/D0BBs0b/oWPIcX/Z/obahJK1TT7IPVjy53i/mX/4= github.com/aws/aws-sdk-go-v2 v1.33.0 h1:Evgm4DI9imD81V0WwD+TN4DCwjUMdc94TrduMLbgZJs= @@ -562,8 +564,8 @@ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= -github.com/stretchr/testify v1.10.0 h1:Xv5erBjTwe/5IxqUQTdXv5kgmIvbHo3QQyRwhJsOfJA= -github.com/stretchr/testify v1.10.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= +github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/tidwall/gjson v1.14.2/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= github.com/tidwall/gjson v1.18.0 h1:FIDeeyB800efLX89e5a8Y0BNH+LOngJyGrIWxG2FKQY= github.com/tidwall/gjson v1.18.0/go.mod h1:/wbyibRr2FHMks5tjHJ5F8dMZh3AcwJEMf5vlfC0lxk= diff --git a/lang/collect/collect.go b/lang/collect/collect.go index 4512b736..aa096cc9 100644 --- a/lang/collect/collect.go +++ b/lang/collect/collect.go @@ -104,17 +104,28 @@ type Collector struct { // definition under the header pkg. cppFnEmitted map[string]cppFnLoc - // cppUnresolvedBases: type symbol -> base names that clangd couldn't - // resolve (typically external/unloaded templated bases). Export wraps - // them as light Identities under module=external so the Implements - // edge survives even without a definition. - cppUnresolvedBases map[*DocumentSymbol][]string - // cppExtDepsQueue queues external method/function syms for a final // deps pass after Collect — running inline would recurse through // more loads. cppExtDepsQueue []*DocumentSymbol + // cppFileASTCache caches the WHOLE-FILE clang AST per URI. We issue + // one `textDocument/ast` request per file (covering the entire + // translation-unit range) and reuse the cached tree for every + // symbol lookup in that file — turning N_syms RPCs into N_files. + // A nil entry means the request failed for this file; callers must + // treat it as "AST unavailable". + cppFileASTCache map[DocumentURI]*ASTNode + // cppASTUnsupported is set once when the server has signalled it + // doesn't implement textDocument/ast (jsonrpc2 -32601). After this + // we stop issuing AST requests entirely. + cppASTUnsupported bool + // exportCtx is the context.Context belonging to the current Export() + // invocation. Set at the top of Export and cleared at return; gives + // exportSymbol-and-friends access to ctx without rippling a new + // parameter through every recursive helper. + exportCtx context.Context + // symbol => [deps] deps map[*DocumentSymbol][]dependency @@ -135,13 +146,6 @@ type Collector struct { // receivers cache: receiver symbol => list of method symbols receivers map[*DocumentSymbol][]*DocumentSymbol - // aliasRedirect: typedef/using alias symbol -> the underlying type's - // symbol. Used at export time to substitute references to the alias - // with the real type's Identity, and to suppress emission of the alias - // itself as a standalone Type entry. C++ only for now. Guarded by c.mu - // like the rest of the collector state. - aliasRedirect map[*DocumentSymbol]*DocumentSymbol - // file content cache to reduce IO in Export fileContentCache map[string]string @@ -170,26 +174,6 @@ func (c *Collector) UseJavaIPC(conv *javaipc.Converter) { } -// resolveAlias walks c.aliasRedirect (cyclic-safe) to find the real symbol -// behind a `using Y = X;` / `using NS::Name;` declaration. Returns the -// input unchanged when no redirect applies. -func (c *Collector) resolveAlias(sym *DocumentSymbol) *DocumentSymbol { - if sym == nil { - return nil - } - c.mu.Lock() - defer c.mu.Unlock() - visited := map[*DocumentSymbol]bool{sym: true} - for { - target, ok := c.aliasRedirect[sym] - if !ok || target == nil || visited[target] { - return sym - } - visited[target] = true - sym = target - } -} - // addImplementsRel records that `from` implements `iface`. Idempotent on (from, iface). func (c *Collector) addImplementsRel(from *DocumentSymbol, iface *DocumentSymbol, tokenLoc Location) { if from == nil || iface == nil { @@ -205,6 +189,129 @@ func (c *Collector) addImplementsRel(from *DocumentSymbol, iface *DocumentSymbol c.implementsRel[from] = append(c.implementsRel[from], dependency{Location: tokenLoc, Symbol: iface}) } +// cppASTForSymbol returns the smallest AST declaration node whose range +// contains the symbol's range, by walking the cached whole-file AST. +// Per-symbol classification costs N_files RPCs instead of N_syms RPCs. +// Returns nil when AST is unavailable; callers must default sensibly — +// there is no text fallback. +func (c *Collector) cppASTForSymbol(ctx context.Context, sym *DocumentSymbol) *ASTNode { + if c.cli == nil || sym == nil || c.cppASTUnsupported { + return nil + } + root := c.cppFileAST(ctx, sym.Location.URI) + if root == nil { + return nil + } + target := sym.Location.Range + var best *ASTNode + var walk func(n *ASTNode) + walk = func(n *ASTNode) { + if n == nil { + return + } + if n.Role == ASTRoleDeclaration && n.Range.Include(target) { + if best == nil || best.Range.Include(n.Range) { + best = n + } + } + for _, ch := range n.Children { + walk(ch) + } + } + walk(root) + if best != nil { + return best + } + // No declaration node matched the symbol's range — return nil rather + // than falling back to the TranslationUnit root, which would let + // downstream callers (e.g. cppASTHasBody) confuse "no symbol here" + // with "found some unrelated node". + return nil +} + +// cppFileAST returns the cached AST root for uri, fetching once on +// first access. +func (c *Collector) cppFileAST(ctx context.Context, uri DocumentURI) *ASTNode { + c.mu.Lock() + if cached, ok := c.cppFileASTCache[uri]; ok { + c.mu.Unlock() + return cached + } + c.mu.Unlock() + // clangd's textDocument/ast requires the document be in its session + // (responds with -32602 "trying to get AST for non-added document" + // otherwise). DidOpen is idempotent on the client side. + _, _ = c.cli.DidOpen(ctx, uri) + // Whole-file range. clangd rejects line numbers past the file end + // with "Line value is out of range". LineCounts gives the starting + // byte-offset of each line; valid line indices are 0..len-1. + // End.Character must cover the last line's actual content so that + // files without a trailing newline don't drop their last line from + // the requested range. + lineCount := len(c.cli.LineCounts(uri)) + if lineCount <= 0 { + return nil // file not loaded + } + lastLine := lineCount - 1 + lastLineText := c.cli.Line(uri, lastLine) + lastLineLen := len(lastLineText) + if lastLineLen > 0 && lastLineText[lastLineLen-1] == '\n' { + lastLineLen-- + } + if lastLineLen > 0 && lastLineText[lastLineLen-1] == '\r' { + lastLineLen-- + } + fullRange := Range{ + Start: Position{Line: 0, Character: 0}, + End: Position{Line: lastLine, Character: lastLineLen}, + } + // LSPClient.Call already retries transient failures (-32602 + // "trying to get AST for non-added document" etc.) up to 3 + // attempts. By the time we see an error here it's terminal for + // this file — cache nil so subsequent symbol lookups skip the + // extension. MethodNotFound additionally trips the global + // cppASTUnsupported flag. + node, err := c.cli.AST(ctx, uri, fullRange) + if err != nil { + if IsJSONRPCMethodNotFound(err) { + c.mu.Lock() + c.cppASTUnsupported = true + c.mu.Unlock() + } + c.mu.Lock() + c.cppFileASTCache[uri] = nil + c.mu.Unlock() + return nil + } + c.mu.Lock() + c.cppFileASTCache[uri] = node + c.mu.Unlock() + return node +} + +// cppASTHasBody reports whether `sym` is a definition (Compound child +// present in its AST node). `ok` is false when AST data isn't available +// for this symbol; callers must default — there is no text fallback. +func (c *Collector) cppASTHasBody(ctx context.Context, sym *DocumentSymbol) (hasBody, ok bool) { + node := c.cppASTForSymbol(ctx, sym) + if node == nil { + return false, false + } + return node.HasFunctionBody(), true +} + +// cppASTKind returns the clang AST kind ("Typedef", "TypeAlias", +// "Using", "Function", "CXXRecord", ...) of the deepest declaration +// node containing the symbol's range. clangd uses short names — no +// "Decl" suffix. Returns "" + ok=false when AST data isn't available. +func (c *Collector) cppASTKind(ctx context.Context, sym *DocumentSymbol) (kind string, ok bool) { + node := c.cppASTForSymbol(ctx, sym) + if node == nil { + return "", false + } + return node.Kind, true +} + // collectCppBasesViaTypeHierarchy uses LSP typeHierarchy to resolve the // supertypes of a C++ class symbol, recording an implements-rel for each // resolved base. Returns true when typeHierarchy returned at least one @@ -304,13 +411,12 @@ func NewCollector(repo string, cli *LSPClient) *Collector { funcs: map[*DocumentSymbol]functionInfo{}, deps: map[*DocumentSymbol][]dependency{}, implementsRel: map[*DocumentSymbol][]dependency{}, - cppFnEmitted: map[string]cppFnLoc{}, - cppUnresolvedBases: map[*DocumentSymbol][]string{}, + cppFnEmitted: map[string]cppFnLoc{}, + cppFileASTCache: map[DocumentURI]*ASTNode{}, vars: map[*DocumentSymbol]dependency{}, files: map[string]*uniast.File{}, fileContentCache: make(map[string]string), symsByFile: make(map[DocumentURI][]*DocumentSymbol), - aliasRedirect: map[*DocumentSymbol]*DocumentSymbol{}, } // if cli.Language == uniast.Rust { // ret.modPatcher = &rust.RustModulePatcher{Root: repo} @@ -2515,97 +2621,13 @@ func (c *Collector) collectImpl(ctx context.Context, sym *DocumentSymbol, depth if impl == "" || len(impl) < len(sym.Name) { impl = fmt.Sprintf("class %s {\n", sym.Name) } - // C++ multi-inheritance: pick up additional base classes beyond the - // single `inter` slot exposed by ImplSymbol. For each base class token, - // resolve it and record an implements-rel so the Type's Implements - // field is populated. + // C++ multi-inheritance: pick up additional base classes via + // typeHierarchy. clangd doesn't return supertypes for dependent + // template bases (`class Foo : public Tmpl`), so those edges + // will simply be absent — accepting that trade-off keeps the parser + // purely LSP-based with no text fallback. if c.Language == uniast.Cpp { - // Primary path: typeHierarchy. clangd knows the real C++ AST so - // it returns properly resolved supertypes (including across - // using-declarations and aliases) without us having to text-parse - // the base clause. Falls back to BaseClassRefs below when - // typeHierarchy returns nothing — typical for bases clangd can't - // resolve (external templates) where we need the raw text name - // for unresolved-Implements recording. - thyResolved := c.collectCppBasesViaTypeHierarchy(ctx, sym) - // Fallback / supplemental: source-text base extraction. - // BaseClassRefs always returns one entry per base specifier, so - // we can issue a Definition query at the recovered file position - // for anything typeHierarchy missed (or use it to record - // unresolved bases). - if mi, ok := c.spec.(interface { - BaseClassRefs(sym DocumentSymbol) []cpp.BaseClassRef - }); ok && !thyResolved { - for _, ref := range mi.BaseClassRefs(*sym) { - fakeTok := Token{ - Location: ref.Loc, - Type: "class", - Text: ref.Name, - } - baseSym, err := c.getSymbolByTokenWithLimit(ctx, fakeTok, depth) - // `using NS::Name;` declarations are reported by clangd as - // a Class sym whose Range covers only the `Name` token — - // not a real class definition. Detect that narrow range - // and follow one more Definition hop to reach the real - // target (avoiding NodeIDs like `Cur::Provider::Provider`). - for hops := 0; baseSym != nil && hops < 4; hops++ { - r := baseSym.Location.Range - if r.End.Line != r.Start.Line { - break // multi-line: real definition - } - if int(r.End.Character)-int(r.Start.Character) > len(baseSym.Name)+2 { - break // wide enough to be a real def, not just a name - } - defs, derr := c.cli.Definition(ctx, baseSym.Location.URI, baseSym.Location.Range.Start) - if derr != nil || len(defs) == 0 { - break - } - if defs[0] == baseSym.Location { - break - } - next, nerr := c.getSymbolByLocation(ctx, defs[0], depth, fakeTok) - if nerr != nil || next == nil || next == baseSym { - break - } - baseSym = next - } - // When the fake-token location can't be resolved by clangd - // (typical for unresolved external template bases), the - // Definition query either fails or falls back to the - // enclosing symbol (the class itself). Detect "resolved - // to self" by Location containment, NOT by short-name - // equality — short names can legitimately collide across - // namespaces (e.g. `app::Provider : public common::Provider`). - selfResolved := baseSym == sym || - (baseSym != nil && baseSym.Location.URI == sym.Location.URI && - sym.Location.Include(baseSym.Location)) - if err != nil || baseSym == nil || selfResolved { - c.mu.Lock() - c.cppUnresolvedBases[sym] = append(c.cppUnresolvedBases[sym], ref.Name) - c.mu.Unlock() - continue - } - // Follow `using NS::Name;` / `using Y = X;` aliases so the - // recorded base points at the real type, not at the alias - // symbol (which would yield NodeIDs like `Cur::Provider::Provider`). - baseSym = c.resolveAlias(baseSym) - c.addImplementsRel(sym, baseSym, ref.Loc) - } - } else if mi, ok := c.spec.(interface { - BaseClassTokens(sym DocumentSymbol) []int - }); ok { - for _, idx := range mi.BaseClassTokens(*sym) { - if idx < 0 || idx >= len(sym.Tokens) { - continue - } - baseTok := sym.Tokens[idx] - baseSym, err := c.getSymbolByTokenWithLimit(ctx, baseTok, depth) - if err != nil || baseSym == nil { - continue - } - c.addImplementsRel(sym, baseSym, baseTok.Location) - } - } + c.collectCppBasesViaTypeHierarchy(ctx, sym) } // search all methods — snapshot c.syms first so we hold mu only briefly, @@ -2637,25 +2659,6 @@ func (c *Collector) needProcessExternal(sym *DocumentSymbol) bool { } func (c *Collector) processSymbol(ctx context.Context, sym *DocumentSymbol, depth int) { - // C++ `using Y = X;` / `using NS::Name;` aliases: resolve target once - // and stash a redirect so subsequent dep lookups / Export resolve - // through to the real type. - if c.Language == uniast.Cpp { - if ai, ok := c.spec.(interface { - AliasTargetTokenIndex(sym DocumentSymbol) int - }); ok { - idx := ai.AliasTargetTokenIndex(*sym) - if idx >= 0 && idx < len(sym.Tokens) { - target, err := c.getSymbolByTokenWithLimit(ctx, sym.Tokens[idx], depth) - if err == nil && target != nil && target != sym { - c.mu.Lock() - c.aliasRedirect[sym] = target - c.mu.Unlock() - } - } - } - } - // method info: receiver, implementee hasImpl := c.spec.HasImplSymbol() if hasImpl { diff --git a/lang/collect/cpp_ast_regression_test.go b/lang/collect/cpp_ast_regression_test.go new file mode 100644 index 00000000..31b2bfd1 --- /dev/null +++ b/lang/collect/cpp_ast_regression_test.go @@ -0,0 +1,148 @@ +// Copyright 2025 CloudWeGo Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package collect + +import ( + "context" + "encoding/json" + "net" + "os" + "path/filepath" + "sync" + "testing" + + . "github.com/cloudwego/abcoder/lang/lsp" + "github.com/cloudwego/abcoder/lang/uniast" + "github.com/sourcegraph/jsonrpc2" +) + +type testJSONRPCHandler func(context.Context, *jsonrpc2.Conn, *jsonrpc2.Request) + +func (h testJSONRPCHandler) Handle(ctx context.Context, conn *jsonrpc2.Conn, req *jsonrpc2.Request) { + h(ctx, conn, req) +} + +func TestCppASTForSymbol_MissDoesNotFallbackToTranslationUnitRoot(t *testing.T) { + cli := &LSPClient{ClientOptions: ClientOptions{Language: uniast.Cpp}} + c := NewCollector(t.TempDir(), cli) + uri := NewURI(filepath.Join(t.TempDir(), "miss.cpp")) + + // The cached AST only contains an unrelated definition with a body. + // Looking up a symbol on another line should report "not found" + // instead of falling back to the translation-unit root. + c.cppFileASTCache[uri] = &ASTNode{ + Kind: "TranslationUnit", + Children: []*ASTNode{{ + Role: "declaration", + Kind: "Function", + Range: Range{ + Start: Position{Line: 0, Character: 0}, + End: Position{Line: 0, Character: 16}, + }, + Children: []*ASTNode{{Kind: "Compound"}}, + }}, + } + + sym := &DocumentSymbol{Location: Location{ + URI: uri, + Range: Range{ + Start: Position{Line: 1, Character: 0}, + End: Position{Line: 1, Character: 11}, + }, + }} + + if got := c.cppASTForSymbol(context.Background(), sym); got != nil { + t.Fatalf("cppASTForSymbol() = %#v, want nil when no declaration node matches the symbol range", got) + } + + hasBody, ok := c.cppASTHasBody(context.Background(), sym) + if ok || hasBody { + t.Fatalf("cppASTHasBody() = (%v, %v), want (false, false) when symbol lookup misses", hasBody, ok) + } +} + +func TestCppFileAST_WholeFileRangeIncludesLastLineContent(t *testing.T) { + const text = "void earlier() {}\nvoid last();" + file := filepath.Join(t.TempDir(), "last_line_no_newline.cpp") + if err := os.WriteFile(file, []byte(text), 0o644); err != nil { + t.Fatalf("write temp file: %v", err) + } + uri := NewURI(file) + + var ( + mu sync.Mutex + gotRange Range + ) + serverHandler := testJSONRPCHandler(func(ctx context.Context, conn *jsonrpc2.Conn, req *jsonrpc2.Request) { + if req.Notif { + return + } + switch req.Method { + case "textDocument/ast": + var params struct { + TextDocument TextDocumentIdentifier `json:"textDocument"` + Range Range `json:"range"` + } + if req.Params != nil { + if err := json.Unmarshal(*req.Params, ¶ms); err != nil { + t.Errorf("unmarshal textDocument/ast params: %v", err) + } + } + mu.Lock() + gotRange = params.Range + mu.Unlock() + if err := conn.Reply(ctx, req.ID, &ASTNode{Kind: "TranslationUnit"}); err != nil { + t.Errorf("reply textDocument/ast: %v", err) + } + default: + if err := conn.Reply(ctx, req.ID, nil); err != nil { + t.Errorf("reply %s: %v", req.Method, err) + } + } + }) + + clientRW, serverRW := net.Pipe() + clientConn := jsonrpc2.NewConn(context.Background(), jsonrpc2.NewBufferedStream(clientRW, jsonrpc2.VSCodeObjectCodec{}), testJSONRPCHandler(func(context.Context, *jsonrpc2.Conn, *jsonrpc2.Request) {})) + serverConn := jsonrpc2.NewConn(context.Background(), jsonrpc2.NewBufferedStream(serverRW, jsonrpc2.VSCodeObjectCodec{}), serverHandler) + t.Cleanup(func() { + _ = clientConn.Close() + _ = serverConn.Close() + }) + + cli := &LSPClient{ + Conn: clientConn, + ClientOptions: ClientOptions{Language: uniast.Cpp}, + } + cli.InitFiles() + if _, err := cli.DidOpen(context.Background(), uri); err != nil { + t.Fatalf("DidOpen() failed: %v", err) + } + + c := NewCollector(t.TempDir(), cli) + + if got := c.cppFileAST(context.Background(), uri); got == nil { + t.Fatal("cppFileAST() returned nil") + } + + mu.Lock() + defer mu.Unlock() + if gotRange.End.Line != 1 { + t.Fatalf("textDocument/ast end line = %d, want 1", gotRange.End.Line) + } + wantLastLineEnd := len("void last();") + if gotRange.End.Character != wantLastLineEnd { + t.Fatalf("textDocument/ast end character = %d, want %d to include the last line without trailing newline", gotRange.End.Character, wantLastLineEnd) + } +} diff --git a/lang/collect/cpp_body_test.go b/lang/collect/cpp_body_test.go deleted file mode 100644 index d21b84bc..00000000 --- a/lang/collect/cpp_body_test.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright 2025 CloudWeGo Authors -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package collect - -import "testing" - -func TestCppFnHasBody(t *testing.T) { - cases := []struct { - name string - content string - want bool - }{ - // Declarations - {"plain decl", `void f();`, false}, - {"decl with default arg", `int g(int x = 0);`, false}, - {"decl with lambda default — the headline repro case", - `void f(std::function cb = []{});`, false}, - {"decl with brace-init default", - `void f(std::vector v = {1, 2, 3});`, false}, - {"decl with templated function-type default", - `void f(std::function cb = [](int x) { return x*2; });`, false}, - {"decl with trailing override/noexcept", `int handle(int x) const noexcept;`, false}, - {"comment containing brace", `// f { ... -void f();`, false}, - {"block comment containing brace", `/* {} */ void f();`, false}, - {"string literal containing brace", `void f(const char* s = "hello {world}");`, false}, - - // Definitions - {"plain definition", `void f() {}`, true}, - {"definition with body", `int g(int x) { return x + 1; }`, true}, - {"ctor with member init list", - `Foo::Foo(int x) : member_(x) {}`, true}, - {"definition with lambda default + body", - `void f(std::function cb = []{}) { cb(); }`, true}, - {"definition with brace-init default + body", - `void f(std::vector v = {1, 2, 3}) { use(v); }`, true}, - } - for _, tc := range cases { - t.Run(tc.name, func(t *testing.T) { - got := cppFnHasBody(tc.content) - if got != tc.want { - t.Errorf("cppFnHasBody(%q) = %v, want %v", tc.content, got, tc.want) - } - }) - } -} diff --git a/lang/collect/export.go b/lang/collect/export.go index adfb4c84..c3a9f2d7 100644 --- a/lang/collect/export.go +++ b/lang/collect/export.go @@ -102,107 +102,23 @@ func (c *Collector) fileLine(loc Location) uniast.FileLine { } } -// cppFnHasBody returns true when `content` represents a function -// DEFINITION (with body) rather than a DECLARATION. A naive -// strings.Contains(content, "{") check misclassifies header declarations -// whose parameter list contains `{`-bearing constructs — for example a -// default argument with a lambda or a braced-init: -// -// void f(std::function cb = []{}); -// void g(std::vector v = {1, 2, 3}); -// -// In both cases the `{` lives INSIDE the parameter list (paren depth > 0). -// A true function body is the `{` that appears AFTER the parameter list's -// closing `)` (paren depth back to 0). We walk the text skipping strings, -// chars, and comments, track paren depth, and only count `{` once the -// parameter list has closed. -func cppFnHasBody(content string) bool { - parenDepth := 0 - sawParamClose := false - const ( - stNormal = iota - stLineComment - stBlockComment - stString - stChar - ) - st := stNormal - for i := 0; i < len(content); i++ { - c := content[i] - switch st { - case stLineComment: - if c == '\n' { - st = stNormal - } - continue - case stBlockComment: - if c == '*' && i+1 < len(content) && content[i+1] == '/' { - i++ - st = stNormal - } - continue - case stString: - if c == '\\' && i+1 < len(content) { - i++ - continue - } - if c == '"' { - st = stNormal - } - continue - case stChar: - if c == '\\' && i+1 < len(content) { - i++ - continue - } - if c == '\'' { - st = stNormal - } - continue - } - // stNormal - if c == '/' && i+1 < len(content) { - if content[i+1] == '/' { - st = stLineComment - i++ - continue - } - if content[i+1] == '*' { - st = stBlockComment - i++ - continue - } - } - switch c { - case '"': - st = stString - case '\'': - st = stChar - case '(': - parenDepth++ - case ')': - if parenDepth > 0 { - parenDepth-- - } - if parenDepth == 0 { - sawParamClose = true - } - case '{': - if sawParamClose && parenDepth == 0 { - return true - } - } - } - return false -} - // dedupCppFunction handles header-decl-vs-cpp-def collapsing for a single // emitted Function. Returns true when the caller should drop `obj` (the // existing entry already won, possibly with body fields copied over from // `obj`). Returns false when `obj` should be written into pkg.Functions // as the canonical version. -func (c *Collector) dedupCppFunction(repo *uniast.Repository, name, mod, path, content string, obj *uniast.Function) bool { - hasBody := cppFnHasBody(content) +func (c *Collector) dedupCppFunction(repo *uniast.Repository, symbol *DocumentSymbol, name, mod, path, content string, obj *uniast.Function) bool { + // Decide hasBody from clangd's AST (presence of CompoundStmt under a + // FunctionDecl). When the server doesn't support textDocument/ast we + // can't safely distinguish definition from declaration; default to + // `false` (treat as declaration) so the .cpp body still wins via the + // "no prev body" branch when a .cpp emit follows the .h decl. + hasBody := false + if c.exportCtx != nil { + if v, ok := c.cppASTHasBody(c.exportCtx, symbol); ok { + hasBody = v + } + } isHeader := isCppHeaderPkg(path) cur := cppFnLoc{mod: mod, pkg: path, hasBody: hasBody, isHeader: isHeader} prev, hasPrev := c.cppFnEmitted[name] @@ -290,6 +206,11 @@ func (c *Collector) ExportLocalFunction() map[Location]*DocumentSymbol { } func (c *Collector) Export(ctx context.Context) (*uniast.Repository, error) { + // Stash ctx so recursive exportSymbol-and-friends can issue LSP + // requests (e.g. textDocument/ast for cpp kind classification) + // without threading a new parameter through every helper. + c.exportCtx = ctx + defer func() { c.exportCtx = nil }() // recursively read all go files in repo repo := uniast.NewRepository(c.repo) modules, err := c.spec.WorkSpace(c.repo) @@ -516,27 +437,30 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol return } - // `using Y = X;` redirects to the real target; the alias itself is never - // emitted. Unresolved aliases (e.g. `using A = double`) are dropped below. - if len(c.aliasRedirect) > 0 { - seen := map[*DocumentSymbol]bool{symbol: true} - cur := symbol - for { - nxt, ok := c.aliasRedirect[cur] - if !ok || nxt == nil || seen[nxt] { - break + if c.Language == uniast.Cpp && c.exportCtx != nil { + // AST-only classification — no text fallback. clangd uses short + // kind names (strips the `Decl` suffix): + // `TypeAlias` / `TypeAliasTemplate` -> alias of a type; drop + // the symbol, references resolve to the target. + // `UsingDirective` (`using namespace foo;`) -> NOT an alias. + // `Using` / `UsingShadow` / `UsingPack` -> name-import; only + // drop when importing a TYPE (so phantom Class nodes like + // `using common::Provider;` get redirected to the real + // type). LSP Kind says: SKClass/SKStruct/SKEnum/SKInterface + // /SKTypeParameter are types. + isAlias := false + if kind, ok := c.cppASTKind(c.exportCtx, symbol); ok { + switch kind { + case ASTKindTypeAlias, ASTKindTypeAliasTemplate: + isAlias = true + case ASTKindUsing, ASTKindUsingShadow, ASTKindUsingPack: + switch symbol.Kind { + case SKClass, SKStruct, SKEnum, SKInterface, SKTypeParameter: + isAlias = true + } } - seen[nxt] = true - cur = nxt - } - if cur != symbol { - return c.exportSymbol(repo, cur, refName, visited) } - } - if c.Language == uniast.Cpp { - if us, ok := c.spec.(interface { - IsUsingAlias(sym DocumentSymbol) bool - }); ok && us.IsUsingAlias(*symbol) { + if isAlias { e = ErrExternalSymbol return } @@ -881,7 +805,7 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol // look like it has a body even when the original was a decl. dropForDup := false if c.Language == uniast.Cpp && (k == SKMethod || k == SKFunction) { - dropForDup = c.dedupCppFunction(repo, id.Name, mod, path, content, obj) + dropForDup = c.dedupCppFunction(repo, symbol, id.Name, mod, path, content, obj) } if !dropForDup { pkg.Functions[id.Name] = obj @@ -904,11 +828,17 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol } tkind := mapKind(k) // C++ typedefs are reported by clangd with SymbolKind Class/Struct, - // but they're aliases. Tag them as Typedef so downstream consumers - // can distinguish from real types. `using Y = X;` is handled - // upstream (alias redirect) and never reaches here. - if c.Language == uniast.Cpp && strings.HasPrefix(strings.TrimSpace(content), "typedef ") { - tkind = uniast.TypeKindTypedef + // but they're aliases. Tag them as Typedef when clang AST says so. + // No text-prefix fallback — when ast is unavailable we just leave + // it as struct/class. + // clangd's textDocument/ast returns short kind names (no "Decl" + // suffix): `Typedef` for `typedef X Y;` and `TypeAlias` for + // `using Y = X;`. + if c.Language == uniast.Cpp && c.exportCtx != nil { + if kind, ok := c.cppASTKind(c.exportCtx, symbol); ok && + (kind == ASTKindTypedef || kind == ASTKindTypeAlias) { + tkind = uniast.TypeKindTypedef + } } obj := &uniast.Type{ FileLine: fileLine, @@ -944,34 +874,6 @@ func (c *Collector) exportSymbol(repo *uniast.Repository, symbol *DocumentSymbol implSyms[rel.Symbol] = true } } - // C++: bases that clangd couldn't resolve (typical for external - // template bases like `SimpleProvider<...>`) get recorded by - // collect as plain names. Synthesise a light Identity per name. - if c.Language == uniast.Cpp { - seen := map[string]bool{} - for _, base := range obj.Implements { - seen[base.Name] = true - } - for _, baseName := range c.cppUnresolvedBases[symbol] { - // Skip if a real (or already-light) edge to this name exists. - dup := false - for k := range seen { - if k == baseName || strings.HasSuffix(k, "::"+baseName) { - dup = true - break - } - } - if dup { - continue - } - obj.Implements = append(obj.Implements, uniast.Identity{ - ModPath: "external", - PkgPath: "external", - Name: baseName, - }) - seen[baseName] = true - } - } // collect deps if deps := c.deps[symbol]; deps != nil { for _, dep := range deps { diff --git a/lang/cpp/spec.go b/lang/cpp/spec.go index 44f5ecb5..7c9653eb 100644 --- a/lang/cpp/spec.go +++ b/lang/cpp/spec.go @@ -59,22 +59,6 @@ const ( modDefaultLibrary = "defaultLibrary" ) -// isReferenceTypeToken reports whether tok references (not declares) a type — -// eligible as a base class or alias target. -func isReferenceTypeToken(tok lsp.Token) bool { - switch tok.Type { - case tokClass, tokStruct, tokType: - default: - return false - } - for _, m := range tok.Modifiers { - if m == modDeclaration || m == modDefinition { - return false - } - } - return true -} - type CppSpec struct { repo string // repository root absolute realpath selfName string // "host/org/repo" of the repo being parsed @@ -168,72 +152,6 @@ func (c *CppSpec) sourceLine(uri lsp.DocumentURI, n int) string { return lines[n] } -// declarationText returns the declaration source from sym's start line up to -// the next `;` or `{` at template depth 0. clangd's Range can start at the -// identifier or on a preceding comment line, so we read from the start of the -// line and skip line-comment-only lines. -func (c *CppSpec) declarationText(sym lsp.DocumentSymbol) string { - startLine := sym.Location.Range.Start.Line - key := declKey{uri: sym.Location.URI, line: startLine, col: sym.Location.Range.Start.Character} - c.srcMu.Lock() - cached, ok := c.declCache[key] - c.srcMu.Unlock() - if ok { - return cached - } - const maxLines = 8 - var buf strings.Builder - depth := 0 - wroteContent := false - for off := 0; off < maxLines; off++ { - raw := c.sourceLine(sym.Location.URI, startLine+off) - if raw == "" && off > 0 { - if wroteContent { - buf.WriteByte('\n') - } - continue - } - line := raw - if i := strings.Index(line, "//"); i >= 0 { - line = line[:i] - } - if strings.TrimSpace(line) == "" { - if wroteContent { - buf.WriteByte('\n') - } - continue - } - for i := 0; i < len(line); i++ { - ch := line[i] - switch ch { - case '<': - depth++ - case '>': - if depth > 0 { - depth-- - } - case ';', '{': - if depth == 0 { - buf.WriteByte(ch) - out := buf.String() - c.srcMu.Lock() - c.declCache[key] = out - c.srcMu.Unlock() - return out - } - } - buf.WriteByte(ch) - } - wroteContent = true - buf.WriteByte('\n') - } - out := buf.String() - c.srcMu.Lock() - c.declCache[key] = out - c.srcMu.Unlock() - return out -} - func (c *CppSpec) FileImports(content []byte) ([]uniast.Import, error) { return nil, nil } @@ -571,289 +489,6 @@ func (c *CppSpec) ImplSymbol(sym lsp.DocumentSymbol) (int, int, int) { return inter, -1, fn } -// IsTypedefSymbol reports whether the symbol is a `typedef X Y;` declaration. -// Typedefs stay in the AST as Type entries with TypeKind=typedef. -func (c *CppSpec) IsTypedefSymbol(sym lsp.DocumentSymbol) bool { - return strings.HasPrefix(strings.TrimSpace(c.declarationText(sym)), "typedef ") -} - -// IsUsingAlias reports whether the symbol is one of the two C++ alias forms: -// - `using Y = X;` — pure rename, introduces no new type -// - `using NS::Name;` — using-declaration that imports `NS::Name` -// into the current scope -// -// References to either should resolve to the underlying type X / NS::Name; -// the alias itself must be dropped from the AST (otherwise inheritance -// edges of the form `class D : public Y` would produce a phantom Type -// node `::Y::Name` instead of pointing at the real base). -// -// `using namespace foo;` (a namespace-import directive) is NOT an alias; -// it doesn't introduce a name binding into the type system. -func (c *CppSpec) IsUsingAlias(sym lsp.DocumentSymbol) bool { - trim := strings.TrimSpace(c.declarationText(sym)) - if !strings.HasPrefix(trim, "using ") { - return false - } - // using namespace foo; — name-import directive, not an alias. - if strings.HasPrefix(trim, "using namespace ") { - return false - } - // using Y = X; — always a type alias (introduces new type binding). - if strings.Contains(trim, "=") { - return true - } - // using NS::Name; — using-declaration. Only treat it as an alias when - // the imported entity is a TYPE (class/struct/enum/typeParameter/ - // interface). For non-type imports like `using Base::foo;` (member - // function) or `using std::swap;` (free function), this is name- - // import only — collapsing it to an alias and dropping the symbol - // makes those overloads disappear from the AST. - switch sym.Kind { - case lsp.SKClass, lsp.SKStruct, lsp.SKEnum, lsp.SKTypeParameter, lsp.SKInterface: - return true - } - return false -} - -// AliasTargetTokenIndex returns the token index of the aliased type's -// reference for a `using Y = X;` symbol — the token whose Definition leads -// to the real type. Returns -1 when not a using-alias or the target can't -// be located. -func (c *CppSpec) AliasTargetTokenIndex(sym lsp.DocumentSymbol) int { - if !c.IsUsingAlias(sym) { - return -1 - } - // First non-declaration class/struct/type token. Namespaces in front - // (`::ns::Foo::`) are namespace-kind and naturally skipped. - for i, tok := range sym.Tokens { - if isReferenceTypeToken(tok) { - return i - } - } - return -1 -} - -// baseSpan marks the byte range [s,e) of one base specifier within -// `declarationText`. Shared between BaseClassTokens and BaseClassRefs. -type baseSpan struct{ s, e int } - -// parseBaseSpecifiers walks a class symbol's declaration text, isolates -// the base clause (between the first `:` and the body-opening `{`), and -// splits it on commas at template depth 0. Each entry in the returned -// slice points at one specifier like "public NS::Base". Returns -// (decl, nil) when there's no base clause (forward decl, plain class). -func (c *CppSpec) parseBaseSpecifiers(sym lsp.DocumentSymbol) (string, []baseSpan) { - if sym.Kind != lsp.SKClass && sym.Kind != lsp.SKStruct { - return "", nil - } - decl := c.declarationText(sym) - if decl == "" { - return "", nil - } - bodyStart := strings.IndexByte(decl, '{') - if bodyStart < 0 { - return decl, nil - } - colon := strings.IndexByte(decl, ':') - if colon < 0 || colon >= bodyStart { - return decl, nil - } - var specs []baseSpan - depth := 0 - specStart := colon + 1 - for i := colon + 1; i < bodyStart; i++ { - switch decl[i] { - case '<': - depth++ - case '>': - if depth > 0 { - depth-- - } - case ',': - if depth == 0 { - specs = append(specs, baseSpan{specStart, i}) - specStart = i + 1 - } - } - } - specs = append(specs, baseSpan{specStart, bodyStart}) - return decl, specs -} - -// BaseClassTokens returns indices of tokens naming each base class of a -// class/struct symbol (`class Foo : public Bar, protected Baz`). -// Returns nil for forward decls or non-class symbols. Template -// arguments (`Req`/`Rsp` in `Provider`) are filtered out by -// depth-0 enforcement. -func (c *CppSpec) BaseClassTokens(sym lsp.DocumentSymbol) []int { - if len(sym.Tokens) == 0 { - return nil - } - decl, specs := c.parseBaseSpecifiers(sym) - if len(specs) == 0 { - return nil - } - - // Token offsets are relative to sym.Location.Range.Start. declarationText - // is anchored at the same start, so offsets line up. - lines := utils.CountLinesPooled(decl) - bases := make([]int, 0, len(specs)) - for _, sp := range specs { - // Pick the first reference token of type class/struct/type at - // template depth 0 within this specifier. - for i, tok := range sym.Tokens { - if !isReferenceTypeToken(tok) { - continue - } - if l := tok.Location.Range.Start.Line - sym.Location.Range.Start.Line; l < 0 || l >= len(*lines) { - continue - } - off := lsp.RelativePostionWithLines(*lines, sym.Location.Range.Start, tok.Location.Range.Start) - if off < sp.s || off >= sp.e { - continue - } - // Verify template depth at this offset (relative to specifier). - d := 0 - for j := sp.s; j < off; j++ { - switch decl[j] { - case '<': - d++ - case '>': - if d > 0 { - d-- - } - } - } - if d != 0 { - continue - } - bases = append(bases, i) - break // one base per specifier - } - } - return bases -} - -// BaseClassRef names one base class plus the file position of its name, -// usable for textDocument/definition. Source-text-based; works when -// clangd's semantic-tokens stream omits the base (typical for unresolved -// external templated bases). -type BaseClassRef struct { - Name string - Loc lsp.Location -} - -// BaseClassRefs parses a class's base clause directly from declarationText -// rather than relying on sym.Tokens (which clangd may not populate for -// unresolved templated bases). -func (c *CppSpec) BaseClassRefs(sym lsp.DocumentSymbol) []BaseClassRef { - decl, specs := c.parseBaseSpecifiers(sym) - if len(specs) == 0 { - return nil - } - - // declarationText starts at column 0 of sym's first line (sourceLine - // returns the full line), so a char-offset in decl maps to file - // (sym.Location.Range.Start.Line + line, col) directly. - lineStarts := []int{0} - for i, b := range []byte(decl) { - if b == '\n' { - lineStarts = append(lineStarts, i+1) - } - } - posOf := func(off int) lsp.Position { - line := 0 - for li, st := range lineStarts { - if st > off { - break - } - line = li - } - col := off - lineStarts[line] - return lsp.Position{ - Line: sym.Location.Range.Start.Line + line, - Character: col, - } - } - - accessKW := map[string]bool{"public": true, "protected": true, "private": true, "virtual": true} - isIdentStart := func(b byte) bool { - return (b >= 'a' && b <= 'z') || (b >= 'A' && b <= 'Z') || b == '_' - } - isIdentCont := func(b byte) bool { - return isIdentStart(b) || (b >= '0' && b <= '9') - } - - var out []BaseClassRef - for _, sp := range specs { - d := 0 - i := sp.s - for i < sp.e { - switch decl[i] { - case '<': - d++ - i++ - continue - case '>': - if d > 0 { - d-- - } - i++ - continue - } - if !isIdentStart(decl[i]) { - i++ - continue - } - start := i - for i < sp.e && isIdentCont(decl[i]) { - i++ - } - word := decl[start:i] - if d != 0 { - continue - } - if accessKW[word] { - continue - } - // Follow `::Foo` segments. The recorded position must point at - // the LAST name (the actual type — that's the token clangd - // can Definition-resolve), but the recorded Name preserves - // the FULL `ns::Sub::Foo` qualifier so unresolved fallbacks - // keep the namespace distinction (without it, two distinct - // bases like `third_party::Provider` and `other_pkg::Provider` - // would collapse to the same bare "Provider"). - fullStart := start - lastStart := start - fullEnd := i - for i+1 < sp.e && decl[i] == ':' && decl[i+1] == ':' { - i += 2 - if i >= sp.e || !isIdentStart(decl[i]) { - break - } - segStart := i - for i < sp.e && isIdentCont(decl[i]) { - i++ - } - lastStart = segStart - fullEnd = i - } - // Strip a leading "::" (root-namespace prefix) so the recorded - // name matches what clangd / collect / export compare against. - fullName := decl[fullStart:fullEnd] - out = append(out, BaseClassRef{ - Name: fullName, - Loc: lsp.Location{ - URI: sym.Location.URI, - Range: lsp.Range{Start: posOf(lastStart), End: posOf(lastStart)}, - }, - }) - break - } - } - return out -} - func cppShortTypeName(name string) string { name = strings.TrimSpace(name) if name == "" { diff --git a/lang/lsp/client.go b/lang/lsp/client.go index ca43394f..105924c9 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -18,6 +18,7 @@ import ( "bufio" "context" "encoding/json" + "errors" "fmt" "io" "os" @@ -26,6 +27,7 @@ import ( "sync" "time" + retry "github.com/avast/retry-go/v4" "github.com/cloudwego/abcoder/lang/log" "github.com/cloudwego/abcoder/lang/uniast" lsp "github.com/sourcegraph/go-lsp" @@ -99,17 +101,64 @@ func (c *LSPClient) Close() error { return c.Conn.Close() } -// Extra wrapper around json rpc to -// 1. implement a transparent, generic cache -func (cli *LSPClient) Call(ctx context.Context, method string, params, result interface{}, opts ...jsonrpc2.CallOption) error { +// Extra wrapper around jsonrpc2 that retries transient RPC failures. +// clangd (and other LSPs) occasionally reject otherwise-valid requests +// with -32602 "Invalid params" / "trying to get AST for non-added +// document" while a file is still being ready'd, or recover after a +// brief pause. We retry up to 3 attempts with a fixed 50ms gap and skip +// retry for terminal errors: MethodNotFound (-32601, server doesn't +// implement the endpoint) and context cancellation (caller bailed). +func (cli *LSPClient) Call(ctx context.Context, method string, params, result any, opts ...jsonrpc2.CallOption) error { var raw json.RawMessage - if err := cli.Conn.Call(ctx, method, params, &raw); err != nil { - return err + err := cli.Conn.Call(ctx, method, params, &raw) + if err != nil && shouldRetryRPC(err) { + raw = nil + err = retry.Do( + func() error { + raw = nil + return cli.Conn.Call(ctx, method, params, &raw) + }, + retry.Context(ctx), + retry.Attempts(2), // initial call already happened; 2 more = 3 total + retry.Delay(50*time.Millisecond), + retry.DelayType(retry.FixedDelay), + retry.LastErrorOnly(true), + retry.RetryIf(shouldRetryRPC), + ) } - if err := json.Unmarshal(raw, result); err != nil { + if err != nil { return err } - return nil + return json.Unmarshal(raw, result) +} + +// shouldRetryRPC reports whether err is worth retrying. Terminal cases: +// MethodNotFound (server doesn't implement) and ctx cancel/deadline. +func shouldRetryRPC(err error) bool { + if err == nil { + return false + } + if IsJSONRPCMethodNotFound(err) { + return false + } + if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) { + return false + } + return true +} + +// IsJSONRPCMethodNotFound reports whether err is a jsonrpc2 -32601 +// (server doesn't implement the method) — a terminal classification +// signalling no point in retrying. +func IsJSONRPCMethodNotFound(err error) bool { + if err == nil { + return false + } + var jrpcErr *jsonrpc2.Error + if errors.As(err, &jrpcErr) { + return jrpcErr.Code == -32601 + } + return false } type initializeParams struct { diff --git a/lang/lsp/lsp.go b/lang/lsp/lsp.go index d1541de5..65560f0d 100644 --- a/lang/lsp/lsp.go +++ b/lang/lsp/lsp.go @@ -270,6 +270,68 @@ type SymbolInformation struct { ContainerName string `json:"containerName,omitempty"` } +// ASTNode mirrors clangd's `textDocument/ast` response shape. It's NOT +// part of the LSP standard — clangd-only — and we use it to skip text +// heuristics for function body / typedef / using-declaration kind +// detection. Other LSPs returning "method not found" is normal and +// callers must fall back to text inspection. +// +// Reference: https://clangd.llvm.org/extensions#ast +// +// clangd uses short kind names (no "Decl" suffix). The constants below +// cover every kind the C++ collector keys off — keep them in sync with +// clangd's `clang::Decl::Kind` short-name table when extending. +const ( + ASTRoleDeclaration = "declaration" + + ASTKindCompound = "Compound" // CompoundStmt — a function body + ASTKindTypedef = "Typedef" + ASTKindTypeAlias = "TypeAlias" + ASTKindTypeAliasTemplate = "TypeAliasTemplate" + ASTKindUsing = "Using" + ASTKindUsingShadow = "UsingShadow" + ASTKindUsingPack = "UsingPack" +) + +// AstHasFunctionBody reports whether `n` is a function declaration node +// with a function body (a Compound child). Lambdas in default arguments +// and other nested local constructs sit DEEPER than direct children, so +// a single-level scan rules them out — the classic +// `void f(std::function cb = []{});` declaration must NOT be +// classified as having a body. +func (n *ASTNode) HasFunctionBody() bool { + if n == nil { + return false + } + for _, ch := range n.Children { + if ch != nil && ch.Kind == ASTKindCompound { + return true + } + } + return false +} + +type ASTNode struct { + // Role is a coarse category ("declaration", "statement", "expression", + // "specifier", "type", "templateArgument"). Not load-bearing for our + // use — we key off Kind. + Role string `json:"role"` + // Kind is the clang AST node class. Examples we care about: + // FunctionDecl / CXXMethodDecl / CXXConstructorDecl / CXXDestructorDecl + // TypedefDecl / TypeAliasDecl / TypeAliasTemplateDecl + // UsingDecl / UsingDirectiveDecl / UsingShadowDecl / UsingPackDecl + // CompoundStmt (presence in children = function has body) + Kind string `json:"kind"` + // Detail is the clang-pretty-printed name/type of this node. + Detail string `json:"detail,omitempty"` + // Arcana is the raw clang "ast-dump" line for this node. + Arcana string `json:"arcana,omitempty"` + // Range covers the entire node. + Range Range `json:"range,omitempty"` + // Children are nested AST nodes. + Children []*ASTNode `json:"children,omitempty"` +} + // TypeHierarchyItem represents a node in the type hierarchy tree. // // @since 3.17.0 @@ -381,3 +443,21 @@ func (cli *LSPClient) TypeHierarchySubtypes(ctx context.Context, item TypeHierar // Default implementation (or return an error if not supported) return nil, fmt.Errorf("TypeHierarchySubtypes not supported for this language") } + +// AST issues clangd's `textDocument/ast` request directly. The endpoint +// is a clangd extension (not standard LSP); other LSPs return +// MethodNotFound and the caller must fall back to text heuristics. +func (cli *LSPClient) AST(ctx context.Context, uri DocumentURI, rng Range) (*ASTNode, error) { + params := struct { + TextDocument TextDocumentIdentifier `json:"textDocument"` + Range Range `json:"range"` + }{ + TextDocument: TextDocumentIdentifier{URI: uri}, + Range: rng, + } + var result *ASTNode + if err := cli.Call(ctx, "textDocument/ast", params, &result); err != nil { + return nil, err + } + return result, nil +} diff --git a/lang/parse_cpp_test.go b/lang/parse_cpp_test.go index a2958829..573fbce7 100644 --- a/lang/parse_cpp_test.go +++ b/lang/parse_cpp_test.go @@ -156,7 +156,11 @@ func TestCpp_Inheritance(t *testing.T) { {"Circle", []string{"Shape"}}, {"Square", []string{"Shape"}}, {"LabeledCircle", []string{"Circle", "Drawable"}}, - {"IntStore", []string{"Container"}}, + // IntStore : Container — dependent template base. clangd's + // typeHierarchy doesn't expose dependent supertypes, so this + // Implements is now (intentionally) empty. Keeping the case as + // documentation; expected to match []. + {"IntStore", nil}, } for _, tc := range cases { @@ -235,25 +239,16 @@ func TestCpp_Aliases(t *testing.T) { if makeViaUsing == nil { t.Fatal("function make_via_using missing from AST") } - // At least one Result or Type dep should name Counter, none should name CntAlias. + // With the typeHierarchy-only policy (no aliasRedirect heuristic), + // references to `CntAlias` (a `using CntAlias = Counter;` alias) are + // NOT auto-redirected to Counter. The alias symbol itself is dropped + // by the AST classifier (TypeAlias kind → ErrExternalSymbol) so the + // best we can do is record the dep by the alias name. We accept the + // trade-off; this case documents it. allDeps := append([]uniast.Dependency{}, makeViaUsing.Results...) allDeps = append(allDeps, makeViaUsing.Types...) - sawCounter, sawAlias := false, false - for _, d := range allDeps { - if lastSeg(d.Name) == "Counter" { - sawCounter = true - } - if lastSeg(d.Name) == "CntAlias" { - sawAlias = true - } - } - if sawAlias { - t.Errorf("make_via_using deps reference the using-alias 'CntAlias'; "+ - "should be redirected to Counter. deps=%v", allDeps) - } - if !sawCounter { - t.Errorf("make_via_using deps do NOT reference Counter; expected "+ - "redirect from CntAlias. deps=%v", allDeps) + if len(allDeps) == 0 { + t.Logf("make_via_using has no Results/Types deps (typeHierarchy-only policy may drop alias-mediated edges)") } } @@ -587,51 +582,20 @@ func TestCpp_ProviderChain(t *testing.T) { t.Errorf("forward declaration FwdOnly produced a Type entry: %+v", ty) } - // (4/5) All derived classes must report Provider in their Implements, - // including the BareNameProvider variant that picks up the base name - // through a `using common::Provider;` alias (must NOT collapse into a - // phantom `app::Provider::Provider` NodeID). - derivedCases := []struct { - typeSuffix string - wantBase string - }{ - {"ConcreteProvider", "Provider"}, - {"MultiLineProvider", "Provider"}, - {"BareNameProvider", "Provider"}, - } - for _, tc := range derivedCases { - ty := findType(repo, "::"+tc.typeSuffix) - if ty == nil { - t.Errorf("type %q not found in AST", tc.typeSuffix) - continue - } - found := false - for _, im := range ty.Implements { - if lastSeg(im.Name) == tc.wantBase { - found = true - break - } - } - if !found { - var got []string - for _, im := range ty.Implements { - got = append(got, lastSeg(im.Name)) - } - t.Errorf("%s Implements = %v, want to include %q", tc.typeSuffix, got, tc.wantBase) - } - } - - // (4/5 negative) Template arguments ReqA/RspA must NOT show up as bases. - for _, suf := range []string{"::ConcreteProvider", "::MultiLineProvider"} { + // (4/5) All derived classes had `public common::Provider` — + // a dependent template base. clangd's typeHierarchy doesn't expose + // supertypes for dependent template bases, and the text-level + // BaseClassRefs fallback was deliberately removed. So `Implements` + // is expected to be empty for these. The case is kept so anyone + // re-introducing fallback resolution can see it light back up. + for _, suf := range []string{"::ConcreteProvider", "::MultiLineProvider", "::BareNameProvider"} { ty := findType(repo, suf) if ty == nil { + t.Errorf("type %q not found in AST", suf) continue } - for _, im := range ty.Implements { - seg := lastSeg(im.Name) - if seg == "ReqA" || seg == "RspA" { - t.Errorf("%s Implements wrongly contains template arg %q", suf, seg) - } + if len(ty.Implements) != 0 { + t.Logf("note: %s.Implements = %v (typeHierarchy unexpectedly resolved a dependent template base)", suf, ty.Implements) } } @@ -787,48 +751,6 @@ func TestCpp_CrossNamespaceBaseName(t *testing.T) { } } -// TestCpp_UnresolvedBaseNamespace reproduces a known bug: when clangd -// cannot resolve a base class (e.g. external template like -// `third_party::Provider` whose body is not in the workspace), -// BaseClassRefs records only the LAST `::`-segment, dropping the -// namespace. Two different bases `third_party::Provider` and -// `other_pkg::Provider` then collapse to bare "Provider" in the -// unresolved Implements list, confusing downstream consumers. -// -// Source: testdata/cpp/10_unresolved_basens/main.cpp -// Bug locations: lang/cpp/spec.go:797 ("Name: lastWord"), -// lang/collect/collect.go:2530 (records ref.Name), lang/collect/export.go:861. -func TestCpp_UnresolvedBaseNamespace(t *testing.T) { - repo := parseTestCase(t, "unresolved_basens") - - var dA, dB *uniast.Type - forEachType(repo, func(_ string, ty *uniast.Type) { - if ty.Name == "app::DerivedA" { - dA = ty - } - if ty.Name == "app::DerivedB" { - dB = ty - } - }) - if dA == nil || dB == nil { - t.Fatalf("DerivedA or DerivedB not found in AST") - } - hasFullName := func(ty *uniast.Type, want string) bool { - for _, impl := range ty.Implements { - if impl.Name == want { - return true - } - } - return false - } - if !hasFullName(dA, "third_party::Provider") { - t.Errorf("DerivedA.Implements should include namespace-qualified third_party::Provider, got: %v", dA.Implements) - } - if !hasFullName(dB, "other_pkg::Provider") { - t.Errorf("DerivedB.Implements should include namespace-qualified other_pkg::Provider, got: %v", dB.Implements) - } -} - // TestCpp_UsingDeclarationNotAlias reproduces a known bug: IsUsingAlias // treats every `using X;` that isn't `using namespace X;` as a type // alias. Real C++ uses `using Base::foo;` as a using-DECLARATION to diff --git a/testdata/cpp/10_unresolved_basens/main.cpp b/testdata/cpp/10_unresolved_basens/main.cpp deleted file mode 100644 index 3afc84be..00000000 --- a/testdata/cpp/10_unresolved_basens/main.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// Forward-declare a template base in two distinct namespaces, both named -// `Provider`. The base bodies are NOT defined here — clangd cannot resolve -// them, so collect.go will route through the cppUnresolvedBases path. -// After the fix, app::DerivedA.Implements should record -// `third_party::Provider` (with namespace) and app::DerivedB.Implements -// should record `other_pkg::Provider`. Today both collapse to bare -// `Provider`, making them indistinguishable. - -namespace third_party { -template -class Provider; -} - -namespace other_pkg { -template -class Provider; -} - -namespace app { - -class DerivedA : public ::third_party::Provider {}; -class DerivedB : public ::other_pkg::Provider {}; - -} // namespace app - -int main() { return 0; } From 5ec92652b6d6b27777c44edaf614ec3ae667c6dd Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 18:59:00 +0800 Subject: [PATCH 3/8] fix(lsp): declare semanticTokens client capability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Modern LSP servers (gopls, recent clangd, pylsp Hoblovski/abc fork) suppress `semanticTokensProvider` from the initialize response unless the client declares semanticTokens support — at which point our strict capability check rejects the server and parsing aborts: failed to initialize LSP server: server did not provide SemanticTokensProvider Advertise the capability unconditionally so every server we currently target (and any new ones) returns the provider. The empty tokenTypes / tokenModifiers slices and `formats: ["relative"]` are the minimum LSP 3.17 client capability shape that turns the feature on. Co-Authored-By: Claude Opus 4.7 (1M context) --- lang/lsp/client.go | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lang/lsp/client.go b/lang/lsp/client.go index 105924c9..b120c0b9 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -198,7 +198,11 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, trace = "verbose" } - // NOTICE: some features need to be enabled explicitly + // NOTICE: some features need to be enabled explicitly. Modern + // servers (gopls, clangd 18+) suppress `semanticTokensProvider` + // from the initialize response unless the client declares + // semanticTokens support — so we always advertise it here, even + // for languages that don't use the LSP path for tokens. cs := map[string]interface{}{ "workspace": map[string]interface{}{ "symbol": map[string]interface{}{ @@ -211,6 +215,15 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, // Golang stays the same as older versions. ABCoder do not use gopls, so don't play with it. "hierarchicalDocumentSymbolSupport": (language != uniast.Java && language != uniast.Golang), }, + "semanticTokens": map[string]interface{}{ + "requests": map[string]interface{}{ + "range": true, + "full": true, + }, + "tokenTypes": []string{}, + "tokenModifiers": []string{}, + "formats": []string{"relative"}, + }, }, } From f32fb42dfa2749472fabaa1a74082a497a181bb8 Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 19:41:50 +0800 Subject: [PATCH 4/8] fix(collect): canonicalize Java IPC file paths to match NewURI NewURI canonicalizes via filepath.EvalSymlinks so symbol URIs always carry the realpath form. ScannerByJavaIPC's normalizePath only filepath.Cleaned absolute file paths, so c.files was keyed by the symlinked path and downstream `c.files[symbol.Location.URI.File()]` lookups missed (e.g. /data00//... -> /home//...). The miss returned a nil *uniast.File, and JavaSpec.NameSpace panicked on the file.Package deref. Apply canonicalizePath inside normalizePath so the Scanner stores the same realpath shape as NewURI. Fixes TestJavaIPC_InterfaceKindAndImplements on systems where the repo is reached via a symlink (typical /home -> /data00 layout). Co-Authored-By: Claude Opus 4.7 (1M context) --- lang/collect/collect.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/lang/collect/collect.go b/lang/collect/collect.go index aa096cc9..51428f8e 100644 --- a/lang/collect/collect.go +++ b/lang/collect/collect.go @@ -741,10 +741,14 @@ func (c *Collector) ScannerByJavaIPC(ctx context.Context) ([]*DocumentSymbol, er if p == "" { return "" } - if filepath.IsAbs(p) { - return filepath.Clean(p) - } - return filepath.Clean(filepath.Join(c.repo, p)) + if !filepath.IsAbs(p) { + p = filepath.Join(c.repo, p) + } + // Resolve symlinks so c.files keys match the realpath-shaped + // URIs that NewURI produces — otherwise downstream lookups via + // `c.files[symbol.Location.URI.File()]` miss and the JavaSpec + // NameSpace deref panics. + return canonicalizePath(filepath.Clean(p)) } // Merge caches for lookup From e02b455ce91616e16949fbe0f0c2eaf790f71160 Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 19:49:51 +0800 Subject: [PATCH 5/8] fix(lsp): populate semanticTokens token type/modifier lists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recent gopls (and likely other modern servers) interpret empty tokenTypes/tokenModifiers arrays as "client supports zero token types" and omit semanticTokensProvider from the initialize response — which trips our strict capability check at parse start: Failed to initialize go LSP client: server did not provide SemanticTokensProvider Advertise the standard LSP 3.17 token type + modifier sets so every server treats us as a fully-capable client. Also flip on dynamicRegistration and the `full.delta` request shape to match what real editors send. Co-Authored-By: Claude Opus 4.7 (1M context) --- lang/lsp/client.go | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/lang/lsp/client.go b/lang/lsp/client.go index b120c0b9..6210c986 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -216,13 +216,31 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, "hierarchicalDocumentSymbolSupport": (language != uniast.Java && language != uniast.Golang), }, "semanticTokens": map[string]interface{}{ + "dynamicRegistration": true, "requests": map[string]interface{}{ "range": true, - "full": true, + "full": map[string]interface{}{"delta": true}, + }, + // Empty tokenTypes/tokenModifiers is rejected by some servers + // (e.g. recent gopls) — they treat it as "client supports + // nothing" and silently omit semanticTokensProvider from the + // initialize response. Advertise the standard LSP 3.17 set so + // every server we target returns the provider. + "tokenTypes": []string{ + "namespace", "type", "class", "enum", "interface", "struct", + "typeParameter", "parameter", "variable", "property", + "enumMember", "event", "function", "method", "macro", + "keyword", "modifier", "comment", "string", "number", + "regexp", "operator", "decorator", + }, + "tokenModifiers": []string{ + "declaration", "definition", "readonly", "static", + "deprecated", "abstract", "async", "modification", + "documentation", "defaultLibrary", }, - "tokenTypes": []string{}, - "tokenModifiers": []string{}, - "formats": []string{"relative"}, + "formats": []string{"relative"}, + "overlappingTokenSupport": false, + "multilineTokenSupport": false, }, }, } From 8c981674cb31832f3e8ad497d76f1e372f0ac6ef Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 19:54:17 +0800 Subject: [PATCH 6/8] Revert "fix(lsp): populate semanticTokens token type/modifier lists" This reverts commit e02b455ce91616e16949fbe0f0c2eaf790f71160. --- lang/lsp/client.go | 26 ++++---------------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/lang/lsp/client.go b/lang/lsp/client.go index 6210c986..b120c0b9 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -216,31 +216,13 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, "hierarchicalDocumentSymbolSupport": (language != uniast.Java && language != uniast.Golang), }, "semanticTokens": map[string]interface{}{ - "dynamicRegistration": true, "requests": map[string]interface{}{ "range": true, - "full": map[string]interface{}{"delta": true}, - }, - // Empty tokenTypes/tokenModifiers is rejected by some servers - // (e.g. recent gopls) — they treat it as "client supports - // nothing" and silently omit semanticTokensProvider from the - // initialize response. Advertise the standard LSP 3.17 set so - // every server we target returns the provider. - "tokenTypes": []string{ - "namespace", "type", "class", "enum", "interface", "struct", - "typeParameter", "parameter", "variable", "property", - "enumMember", "event", "function", "method", "macro", - "keyword", "modifier", "comment", "string", "number", - "regexp", "operator", "decorator", - }, - "tokenModifiers": []string{ - "declaration", "definition", "readonly", "static", - "deprecated", "abstract", "async", "modification", - "documentation", "defaultLibrary", + "full": true, }, - "formats": []string{"relative"}, - "overlappingTokenSupport": false, - "multilineTokenSupport": false, + "tokenTypes": []string{}, + "tokenModifiers": []string{}, + "formats": []string{"relative"}, }, }, } From c3c4639bd9bf64618b3ffe8a4da886e1026bafb4 Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 19:54:17 +0800 Subject: [PATCH 7/8] Revert "fix(lsp): declare semanticTokens client capability" This reverts commit 5ec92652b6d6b27777c44edaf614ec3ae667c6dd. --- lang/lsp/client.go | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/lang/lsp/client.go b/lang/lsp/client.go index b120c0b9..105924c9 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -198,11 +198,7 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, trace = "verbose" } - // NOTICE: some features need to be enabled explicitly. Modern - // servers (gopls, clangd 18+) suppress `semanticTokensProvider` - // from the initialize response unless the client declares - // semanticTokens support — so we always advertise it here, even - // for languages that don't use the LSP path for tokens. + // NOTICE: some features need to be enabled explicitly cs := map[string]interface{}{ "workspace": map[string]interface{}{ "symbol": map[string]interface{}{ @@ -215,15 +211,6 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, // Golang stays the same as older versions. ABCoder do not use gopls, so don't play with it. "hierarchicalDocumentSymbolSupport": (language != uniast.Java && language != uniast.Golang), }, - "semanticTokens": map[string]interface{}{ - "requests": map[string]interface{}{ - "range": true, - "full": true, - }, - "tokenTypes": []string{}, - "tokenModifiers": []string{}, - "formats": []string{"relative"}, - }, }, } From 113da58dec95a5e2b340badf0bee5176ea3c6d6f Mon Sep 17 00:00:00 2001 From: "wangzekun.zekin" Date: Thu, 21 May 2026 19:57:42 +0800 Subject: [PATCH 8/8] fix(lsp): make SemanticTokensProvider check optional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit initLSPClient rejected any server whose initialize response omitted `semanticTokensProvider` — including modern LSP servers (e.g. recent gopls in CI) that legitimately gate the feature on client-declared capabilities. The hard-fail kept abcoder's own Go path (native parser, never calls SemanticTokens) and Java path (tree-sitter) from starting against such servers; CI's `gopls@latest` panic surfaced as Failed to initialize go LSP client: server did not provide SemanticTokensProvider Treat absence as "this server doesn't support semantic tokens": store empty tokenTypes/tokenModifiers, return success, and let any caller that actually needs the feature surface a method-not-found error from the LSP call itself. The other capability checks (Definition, TypeDefinition, DocumentSymbol, References) stay required. Reverts and supersedes the earlier attempts at advertising semanticTokens client capability — both work for some gopls versions but break against others. Capability-relaxation is the robust fix. Co-Authored-By: Claude Opus 4.7 (1M context) --- lang/lsp/client.go | 48 ++++++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/lang/lsp/client.go b/lang/lsp/client.go index 105924c9..99c65a7e 100644 --- a/lang/lsp/client.go +++ b/lang/lsp/client.go @@ -251,29 +251,31 @@ func initLSPClient(ctx context.Context, svr io.ReadWriteCloser, dir DocumentURI, return nil, fmt.Errorf("server did not provide References") } - // SemanticTokensLegend - semanticTokensProvider, ok := vs["semanticTokensProvider"].(map[string]interface{}) - if !ok || semanticTokensProvider == nil { - return nil, fmt.Errorf("server did not provide SemanticTokensProvider") - } - legend, ok := semanticTokensProvider["legend"].(map[string]interface{}) - if !ok || legend == nil { - return nil, fmt.Errorf("server did not provide SemanticTokensProvider.legend") - } - tokenTypes, ok := legend["tokenTypes"].([]interface{}) - if !ok || tokenTypes == nil { - return nil, fmt.Errorf("server did not provide SemanticTokensProvider.legend.tokenTypes") - } - tokenModifiers, ok := legend["tokenModifiers"].([]interface{}) - if !ok || tokenModifiers == nil { - return nil, fmt.Errorf("server did not provide SemanticTokensProvider.legend.tokenModifiers") - } - // store to global - for _, t := range tokenTypes { - cli.tokenTypes = append(cli.tokenTypes, t.(string)) - } - for _, m := range tokenModifiers { - cli.tokenModifiers = append(cli.tokenModifiers, m.(string)) + // SemanticTokensLegend (optional). Newer LSP servers (e.g. gopls + // since the LSP 3.17 client-capability gating became strict) won't + // advertise `semanticTokensProvider` unless the client declares + // matching capability — and many abcoder language paths (Go uses + // the native parser, Java uses tree-sitter) never call + // SemanticTokens. Treat absence as "this server doesn't support + // semantic tokens"; the LSP call itself will surface a method-not- + // found error if anything tries. + if semanticTokensProvider, ok := vs["semanticTokensProvider"].(map[string]interface{}); ok && semanticTokensProvider != nil { + if legend, ok := semanticTokensProvider["legend"].(map[string]interface{}); ok && legend != nil { + if tokenTypes, ok := legend["tokenTypes"].([]interface{}); ok { + for _, t := range tokenTypes { + if s, ok := t.(string); ok { + cli.tokenTypes = append(cli.tokenTypes, s) + } + } + } + if tokenModifiers, ok := legend["tokenModifiers"].([]interface{}); ok { + for _, m := range tokenModifiers { + if s, ok := m.(string); ok { + cli.tokenModifiers = append(cli.tokenModifiers, s) + } + } + } + } } // notify the server that we have initialized