From 265c0c03228c6123190b424b300d95e5ef2f546a Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 14:42:56 -0600 Subject: [PATCH 1/8] feat(native): port Erlang extractor to Rust Adds tree-sitter-erlang dependency and native extractor matching the WASM-side behavior for Erlang symbol, import, and call extraction. Part of #1071 --- Cargo.lock | 11 + crates/codegraph-core/Cargo.toml | 1 + .../codegraph-core/src/extractors/erlang.rs | 396 ++++++++++++++++++ .../codegraph-core/src/extractors/helpers.rs | 10 + crates/codegraph-core/src/extractors/mod.rs | 4 + crates/codegraph-core/src/file_collector.rs | 1 + crates/codegraph-core/src/parser_registry.rs | 12 +- src/ast-analysis/rules/index.ts | 7 + src/domain/parser.ts | 2 + .../native-drop-classification.test.ts | 3 +- 10 files changed, 442 insertions(+), 5 deletions(-) create mode 100644 crates/codegraph-core/src/extractors/erlang.rs diff --git a/Cargo.lock b/Cargo.lock index 413504b0d..908504594 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -86,6 +86,7 @@ dependencies = [ "tree-sitter-cpp", "tree-sitter-dart", "tree-sitter-elixir", + "tree-sitter-erlang", "tree-sitter-go", "tree-sitter-haskell", "tree-sitter-hcl", @@ -789,6 +790,16 @@ dependencies = [ "tree-sitter-language", ] +[[package]] +name = "tree-sitter-erlang" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ceb442e728225d601db0661f60d64e166bd9b9a55c587f71d4e4f27378717cce" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "tree-sitter-go" version = "0.23.4" diff --git a/crates/codegraph-core/Cargo.toml b/crates/codegraph-core/Cargo.toml index df4361e17..0a8ed7316 100644 --- a/crates/codegraph-core/Cargo.toml +++ b/crates/codegraph-core/Cargo.toml @@ -35,6 +35,7 @@ tree-sitter-dart = "0.0.4" tree-sitter-zig = "1" tree-sitter-haskell = "0.23" tree-sitter-ocaml = "0.24" +tree-sitter-erlang = "0.16" rayon = "1" ignore = "0.4" globset = "0.4" diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs new file mode 100644 index 000000000..ca81e76b5 --- /dev/null +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -0,0 +1,396 @@ +use tree_sitter::{Node, Tree}; +use crate::types::*; +use super::helpers::*; +use super::SymbolExtractor; + +pub struct ErlangExtractor; + +impl SymbolExtractor for ErlangExtractor { + fn extract(&self, tree: &Tree, source: &[u8], file_path: &str) -> FileSymbols { + let mut symbols = FileSymbols::new(file_path.to_string()); + walk_tree(&tree.root_node(), source, &mut symbols, match_erlang_node); + walk_ast_nodes_with_config(&tree.root_node(), source, &mut symbols.ast_nodes, &ERLANG_AST_CONFIG); + symbols + } +} + +fn match_erlang_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _depth: usize) { + match node.kind() { + "module_attribute" => handle_module_attr(node, source, symbols), + "record_decl" => handle_record_decl(node, source, symbols), + "type_alias" | "opaque" => handle_type_alias(node, source, symbols), + "fun_decl" => handle_fun_decl(node, source, symbols), + "function_clause" => { + // Only handle if not inside fun_decl (fun_decl handles its own clauses) + if node.parent().map(|p| p.kind()) != Some("fun_decl") { + handle_function_clause(node, source, symbols); + } + } + "pp_define" => handle_define(node, source, symbols), + "pp_include" | "pp_include_lib" => handle_include(node, source, symbols), + "import_attribute" => handle_import_attr(node, source, symbols), + "call" => handle_call(node, source, symbols), + _ => {} + } +} + +fn handle_module_attr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // module_attribute: - module ( atom ) . + let name_node = match find_child(node, "atom") { + Some(n) => n, + None => return, + }; + + symbols.definitions.push(Definition { + name: node_text(&name_node, source).to_string(), + kind: "module".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_record_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let name_node = match find_child(node, "atom") { + Some(n) => n, + None => return, + }; + + let mut children: Vec = Vec::new(); + for i in 0..node.child_count() { + let child = match node.child(i) { + Some(c) => c, + None => continue, + }; + if child.kind() == "record_field" || child.kind() == "typed_record_field" { + if let Some(field_name) = find_child(&child, "atom") { + children.push(child_def( + node_text(&field_name, source).to_string(), + "property", + start_line(&child), + )); + } + } + } + + symbols.definitions.push(Definition { + name: node_text(&name_node, source).to_string(), + kind: "record".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: opt_children(children), + }); +} + +fn handle_type_alias(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // type_alias: -type name(...) :: ty. + // Name is typically wrapped in a `type_name` node containing an `atom`. + let name_text = find_child(node, "atom") + .map(|a| node_text(&a, source).to_string()) + .or_else(|| { + find_child(node, "type_name") + .and_then(|tn| find_child(&tn, "atom").map(|a| node_text(&a, source).to_string())) + }); + let name = match name_text { + Some(n) => n, + None => return, + }; + + symbols.definitions.push(Definition { + name, + kind: "type".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_fun_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // fun_decl contains one or more function_clause children + dots. + // Extract from the first function_clause. + let clause = match find_child(node, "function_clause") { + Some(c) => c, + None => return, + }; + handle_function_clause(&clause, source, symbols); +} + +fn handle_function_clause(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // function_clause: atom expr_args clause_body + let name_node = match find_child(node, "atom") { + Some(n) => n, + None => return, + }; + let name = node_text(&name_node, source).to_string(); + + // Don't duplicate if we already have this function + if symbols + .definitions + .iter() + .any(|d| d.name == name && d.kind == "function") + { + return; + } + + let params = extract_params(node, source); + + // End line spans the full fun_decl when this clause is wrapped in one + let end_node = match node.parent() { + Some(p) if p.kind() == "fun_decl" => p, + _ => *node, + }; + + symbols.definitions.push(Definition { + name, + kind: "function".to_string(), + line: start_line(node), + end_line: Some(end_line(&end_node)), + decorators: None, + complexity: None, + cfg: None, + children: opt_children(params), + }); +} + +fn extract_params(clause_node: &Node, source: &[u8]) -> Vec { + let mut params = Vec::new(); + let args_node = match find_child(clause_node, "expr_args") { + Some(n) => n, + None => return params, + }; + for i in 0..args_node.child_count() { + let child = match args_node.child(i) { + Some(c) => c, + None => continue, + }; + if child.kind() == "var" || child.kind() == "atom" { + params.push(child_def( + node_text(&child, source).to_string(), + "parameter", + start_line(&child), + )); + } + } + params +} + +fn handle_define(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // pp_define: -define(NAME, value). Name may be in `var`, `atom`, or `macro_lhs`. + let name = if let Some(v) = find_child(node, "var") { + node_text(&v, source).to_string() + } else if let Some(a) = find_child(node, "atom") { + node_text(&a, source).to_string() + } else if let Some(lhs) = find_child(node, "macro_lhs") { + find_child(&lhs, "var") + .map(|v| node_text(&v, source).to_string()) + .unwrap_or_else(|| node_text(&lhs, source).to_string()) + } else { + return; + }; + + symbols.definitions.push(Definition { + name, + kind: "variable".to_string(), + line: start_line(node), + end_line: Some(end_line(node)), + decorators: None, + complexity: None, + cfg: None, + children: None, + }); +} + +fn handle_include(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let str_node = match find_child(node, "string") { + Some(n) => n, + None => return, + }; + + let raw = node_text(&str_node, source); + let source_path = raw.trim_matches('"').to_string(); + symbols.imports.push(Import::new( + source_path, + vec!["include".to_string()], + start_line(node), + )); +} + +fn handle_import_attr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + let module_node = match find_child(node, "atom") { + Some(n) => n, + None => return, + }; + + let mut names: Vec = Vec::new(); + for i in 0..node.child_count() { + let child = match node.child(i) { + Some(c) => c, + None => continue, + }; + if child.kind() == "fa" { + if let Some(fn_name) = find_child(&child, "atom") { + names.push(node_text(&fn_name, source).to_string()); + } + } + } + + let module_text = node_text(&module_node, source).to_string(); + if names.is_empty() { + names.push(module_text.clone()); + } + + symbols.imports.push(Import::new( + module_text, + names, + start_line(node), + )); +} + +fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { + // call: first child is function ref (atom for plain, may be wrapped in `remote` + // in newer grammars). Mirrors the JS extractor's behavior so both engines emit + // the same set of calls. + let func_node = match node.child(0) { + Some(n) => n, + None => return, + }; + + match func_node.kind() { + "atom" | "identifier" => { + symbols.calls.push(Call { + name: node_text(&func_node, source).to_string(), + line: start_line(node), + dynamic: None, + receiver: None, + }); + } + "remote" => { + // Legacy grammar shape: `call > remote(atom, atom)`. Newer WhatsApp + // grammars invert this to `remote > call(atom, expr_args)`, in which + // case the inner `call` is visited as a plain call above. + let mut atoms: Vec = Vec::new(); + for i in 0..func_node.child_count() { + if let Some(child) = func_node.child(i) { + if child.kind() == "atom" || child.kind() == "var" { + atoms.push(node_text(&child, source).to_string()); + } + } + } + if atoms.len() >= 2 { + let name = atoms.last().cloned().unwrap_or_default(); + let receiver = atoms[..atoms.len() - 1].join(":"); + symbols.calls.push(Call { + name, + line: start_line(node), + dynamic: None, + receiver: Some(receiver), + }); + } else if atoms.len() == 1 { + symbols.calls.push(Call { + name: atoms.into_iter().next().unwrap_or_default(), + line: start_line(node), + dynamic: None, + receiver: None, + }); + } + } + _ => {} + } +} + +#[cfg(test)] +mod tests { + use super::*; + use tree_sitter::Parser; + + fn parse_erlang(code: &str) -> FileSymbols { + let mut parser = Parser::new(); + parser + .set_language(&tree_sitter_erlang::LANGUAGE.into()) + .unwrap(); + let tree = parser.parse(code.as_bytes(), None).unwrap(); + ErlangExtractor.extract(&tree, code.as_bytes(), "test.erl") + } + + #[test] + fn extracts_module_declaration() { + let s = parse_erlang("-module(mymodule)."); + let m = s + .definitions + .iter() + .find(|d| d.name == "mymodule") + .expect("module def"); + assert_eq!(m.kind, "module"); + } + + #[test] + fn extracts_function_definition() { + let s = parse_erlang("greet(Name) ->\n io:format(\"Hello ~s~n\", [Name]).\n"); + let f = s + .definitions + .iter() + .find(|d| d.kind == "function") + .expect("function def"); + assert_eq!(f.name, "greet"); + } + + #[test] + fn extracts_record_definition() { + let s = parse_erlang("-record(person, {name, age}).\n"); + let r = s + .definitions + .iter() + .find(|d| d.name == "person") + .expect("record def"); + assert_eq!(r.kind, "record"); + let children = r.children.as_ref().expect("record fields"); + let field_names: Vec<&str> = children.iter().map(|c| c.name.as_str()).collect(); + assert!(field_names.contains(&"name")); + assert!(field_names.contains(&"age")); + } + + #[test] + fn extracts_import_attribute() { + let s = parse_erlang("-import(lists, [map/2, filter/2]).\n"); + assert!(!s.imports.is_empty(), "expected at least one import"); + let imp = &s.imports[0]; + assert_eq!(imp.source, "lists"); + assert!(imp.names.contains(&"map".to_string())); + assert!(imp.names.contains(&"filter".to_string())); + } + + #[test] + fn extracts_function_calls() { + let s = parse_erlang("start() ->\n io:format(\"Hello~n\").\n"); + assert!(!s.calls.is_empty(), "expected at least one call"); + } + + #[test] + fn extracts_include_directive() { + let s = parse_erlang("-include(\"foo.hrl\").\n"); + assert!(s.imports.iter().any(|i| i.source == "foo.hrl")); + } + + #[test] + fn deduplicates_multi_clause_function() { + // Multiple clauses for the same function produce one definition only. + let s = parse_erlang( + "fact(0) -> 1;\nfact(N) when N > 0 -> N * fact(N - 1).\n", + ); + let fact_defs: Vec<&Definition> = s + .definitions + .iter() + .filter(|d| d.name == "fact" && d.kind == "function") + .collect(); + assert_eq!(fact_defs.len(), 1, "expected single function def for multi-clause"); + } +} diff --git a/crates/codegraph-core/src/extractors/helpers.rs b/crates/codegraph-core/src/extractors/helpers.rs index b02531896..768d6aafd 100644 --- a/crates/codegraph-core/src/extractors/helpers.rs +++ b/crates/codegraph-core/src/extractors/helpers.rs @@ -360,6 +360,16 @@ pub const OCAML_AST_CONFIG: LangAstConfig = LangAstConfig { string_prefixes: &[], }; +pub const ERLANG_AST_CONFIG: LangAstConfig = LangAstConfig { + new_types: &[], + throw_types: &[], + await_types: &[], + string_types: &["string"], + regex_types: &[], + quote_chars: &['"'], + string_prefixes: &[], +}; + // ── Generic AST node walker ────────────────────────────────────────────────── /// Node types that represent identifiers across languages. diff --git a/crates/codegraph-core/src/extractors/mod.rs b/crates/codegraph-core/src/extractors/mod.rs index 642f29f98..7a6e4375e 100644 --- a/crates/codegraph-core/src/extractors/mod.rs +++ b/crates/codegraph-core/src/extractors/mod.rs @@ -4,6 +4,7 @@ pub mod cpp; pub mod csharp; pub mod dart; pub mod elixir; +pub mod erlang; pub mod go; pub mod haskell; pub mod hcl; @@ -126,5 +127,8 @@ pub fn extract_symbols_with_opts( LanguageKind::Ocaml | LanguageKind::OcamlInterface => { ocaml::OcamlExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) } + LanguageKind::Erlang => { + erlang::ErlangExtractor.extract_with_opts(tree, source, file_path, include_ast_nodes) + } } } diff --git a/crates/codegraph-core/src/file_collector.rs b/crates/codegraph-core/src/file_collector.rs index 0cb157814..19f335fcb 100644 --- a/crates/codegraph-core/src/file_collector.rs +++ b/crates/codegraph-core/src/file_collector.rs @@ -36,6 +36,7 @@ const SUPPORTED_EXTENSIONS: &[&str] = &[ "js", "jsx", "mjs", "cjs", "ts", "tsx", "d.ts", "py", "pyi", "go", "rs", "java", "cs", "rb", "rake", "gemspec", "php", "phtml", "tf", "hcl", "c", "h", "cpp", "cc", "cxx", "hpp", "kt", "kts", "swift", "scala", "sh", "bash", "ex", "exs", "lua", "dart", "zig", "hs", "ml", "mli", + "erl", "hrl", ]; /// Returns whether `path` has an extension the Rust file_collector would accept. diff --git a/crates/codegraph-core/src/parser_registry.rs b/crates/codegraph-core/src/parser_registry.rs index c87957f29..3881ced85 100644 --- a/crates/codegraph-core/src/parser_registry.rs +++ b/crates/codegraph-core/src/parser_registry.rs @@ -27,6 +27,7 @@ pub enum LanguageKind { Haskell, Ocaml, OcamlInterface, + Erlang, } impl LanguageKind { @@ -58,6 +59,7 @@ impl LanguageKind { Self::Haskell => "haskell", Self::Ocaml => "ocaml", Self::OcamlInterface => "ocaml-interface", + Self::Erlang => "erlang", } } @@ -97,6 +99,7 @@ impl LanguageKind { "hs" => Some(Self::Haskell), "ml" => Some(Self::Ocaml), "mli" => Some(Self::OcamlInterface), + "erl" | "hrl" => Some(Self::Erlang), _ => None, } } @@ -129,6 +132,7 @@ impl LanguageKind { "haskell" => Some(Self::Haskell), "ocaml" => Some(Self::Ocaml), "ocaml-interface" => Some(Self::OcamlInterface), + "erlang" => Some(Self::Erlang), _ => None, } } @@ -160,6 +164,7 @@ impl LanguageKind { Self::Haskell => tree_sitter_haskell::LANGUAGE.into(), Self::Ocaml => tree_sitter_ocaml::LANGUAGE_OCAML.into(), Self::OcamlInterface => tree_sitter_ocaml::LANGUAGE_OCAML_INTERFACE.into(), + Self::Erlang => tree_sitter_erlang::LANGUAGE.into(), } } @@ -175,7 +180,7 @@ impl LanguageKind { &[ JavaScript, TypeScript, Tsx, Python, Go, Rust, Java, CSharp, Ruby, Php, Hcl, C, Cpp, Kotlin, Swift, Scala, Bash, Elixir, Lua, Dart, Zig, Haskell, Ocaml, - OcamlInterface, + OcamlInterface, Erlang, ] } } @@ -244,14 +249,15 @@ mod tests { | LanguageKind::Zig | LanguageKind::Haskell | LanguageKind::Ocaml - | LanguageKind::OcamlInterface => (), + | LanguageKind::OcamlInterface + | LanguageKind::Erlang => (), }; // IMPORTANT: this constant must equal the number of arms in the match // above AND the length of the slice returned by `LanguageKind::all()`. // Because both checks require the same manual update, they reinforce // each other: a developer who updates the match is reminded to also // update `all()` and this count. - const EXPECTED_LEN: usize = 24; + const EXPECTED_LEN: usize = 25; assert_eq!( LanguageKind::all().len(), EXPECTED_LEN, diff --git a/src/ast-analysis/rules/index.ts b/src/ast-analysis/rules/index.ts index 653cbd59b..a7a1e5d75 100644 --- a/src/ast-analysis/rules/index.ts +++ b/src/ast-analysis/rules/index.ts @@ -153,6 +153,10 @@ const OCAML_AST_TYPES: Record = { string: 'string', }; +const ERLANG_AST_TYPES: Record = { + string: 'string', +}; + export const AST_TYPE_MAPS: Map> = new Map([ ['javascript', JS_AST_TYPES], ['typescript', JS_AST_TYPES], @@ -177,6 +181,7 @@ export const AST_TYPE_MAPS: Map> = new Map([ ['haskell', HASKELL_AST_TYPES], ['ocaml', OCAML_AST_TYPES], ['ocaml-interface', OCAML_AST_TYPES], + ['erlang', ERLANG_AST_TYPES], ]); // ─── Per-language string-extraction config ─────────────────────────────── @@ -211,6 +216,7 @@ const DART_STRING_CONFIG: AstStringConfig = { quoteChars: '\'"', stringPrefixes: const ZIG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; const HASKELL_STRING_CONFIG: AstStringConfig = { quoteChars: '"\'', stringPrefixes: '' }; const OCAML_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; +const ERLANG_STRING_CONFIG: AstStringConfig = { quoteChars: '"', stringPrefixes: '' }; export const AST_STRING_CONFIGS: Map = new Map([ ['javascript', JS_STRING_CONFIG], @@ -236,6 +242,7 @@ export const AST_STRING_CONFIGS: Map = new Map([ ['haskell', HASKELL_STRING_CONFIG], ['ocaml', OCAML_STRING_CONFIG], ['ocaml-interface', OCAML_STRING_CONFIG], + ['erlang', ERLANG_STRING_CONFIG], ]); // ─── Per-language "stop-after-collect" kinds ───────────────────────────── diff --git a/src/domain/parser.ts b/src/domain/parser.ts index f1c7dd809..e2b767282 100644 --- a/src/domain/parser.ts +++ b/src/domain/parser.ts @@ -471,6 +471,8 @@ export const NATIVE_SUPPORTED_EXTENSIONS: ReadonlySet = new Set([ '.hs', '.ml', '.mli', + '.erl', + '.hrl', ]); /** diff --git a/tests/parsers/native-drop-classification.test.ts b/tests/parsers/native-drop-classification.test.ts index 24aee1d53..8a0dfbaa9 100644 --- a/tests/parsers/native-drop-classification.test.ts +++ b/tests/parsers/native-drop-classification.test.ts @@ -20,14 +20,13 @@ describe('classifyNativeDrops', () => { 'src/c.clj', 'src/d.jl', 'src/e.R', - 'src/f.erl', 'src/g.sol', 'src/h.cu', 'src/i.groovy', 'src/j.v', 'src/k.m', ]); - expect(totals['unsupported-by-native']).toBe(11); + expect(totals['unsupported-by-native']).toBe(10); expect(totals['native-extractor-failure']).toBe(0); expect(byReason['unsupported-by-native'].get('.fs')).toEqual(['src/a.fs']); expect(byReason['unsupported-by-native'].get('.gleam')).toEqual(['src/b.gleam']); From 4c3c13f3d21b548cd66ca113c42e69f8f9a9b9ce Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 23:33:08 -0600 Subject: [PATCH 2/8] fix(extractors): preserve Erlang arity-based overloads and patterns (#1103) - Dedupe Erlang function defs by (name, arity) so foo/1 and foo/2 are both kept - Count every argument pattern (tuple, list, binary) as one parameter via named children, using placeholder labels for complex patterns - Prefer the named 'name'/'args' fields for module attributes and clause args, falling back to the previous atom/expr_args lookups - Add Rust and TS tests covering multi-arity overloads and complex pattern args --- .../codegraph-core/src/extractors/erlang.rs | 94 +++++++++++++++---- src/extractors/erlang.ts | 44 ++++++--- tests/parsers/erlang.test.ts | 19 ++++ 3 files changed, 123 insertions(+), 34 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index ca81e76b5..8528dd7b7 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -36,7 +36,13 @@ fn match_erlang_node(node: &Node, source: &[u8], symbols: &mut FileSymbols, _dep fn handle_module_attr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // module_attribute: - module ( atom ) . - let name_node = match find_child(node, "atom") { + // Prefer the named `name` field exposed by tree-sitter-erlang so we don't + // accidentally pick up the `module` keyword if a future grammar exposes it + // as a named `atom` child. + let name_node = match node + .child_by_field_name("name") + .or_else(|| find_child(node, "atom")) + { Some(n) => n, None => return, }; @@ -126,23 +132,28 @@ fn handle_fun_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { fn handle_function_clause(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // function_clause: atom expr_args clause_body - let name_node = match find_child(node, "atom") { + let name_node = match node + .child_by_field_name("name") + .or_else(|| find_child(node, "atom")) + { Some(n) => n, None => return, }; let name = node_text(&name_node, source).to_string(); - // Don't duplicate if we already have this function - if symbols - .definitions - .iter() - .any(|d| d.name == name && d.kind == "function") - { + let params = extract_params(node, source); + let arity = params.len(); + + // Don't duplicate if we already have this function at the same arity. + // Erlang overloads by arity, so `foo/1` and `foo/2` are distinct definitions. + if symbols.definitions.iter().any(|d| { + d.name == name + && d.kind == "function" + && d.children.as_ref().map_or(0, |c| c.len()) == arity + }) { return; } - let params = extract_params(node, source); - // End line spans the full fun_decl when this clause is wrapped in one let end_node = match node.parent() { Some(p) if p.kind() == "fun_decl" => p, @@ -163,22 +174,29 @@ fn handle_function_clause(node: &Node, source: &[u8], symbols: &mut FileSymbols) fn extract_params(clause_node: &Node, source: &[u8]) -> Vec { let mut params = Vec::new(); - let args_node = match find_child(clause_node, "expr_args") { + let args_node = match clause_node + .child_by_field_name("args") + .or_else(|| find_child(clause_node, "expr_args")) + { Some(n) => n, None => return params, }; - for i in 0..args_node.child_count() { - let child = match args_node.child(i) { + // Iterate named children so every argument pattern counts as one parameter, + // independent of whether it is a bare `var`/`atom` or a complex destructuring + // pattern (tuple, list, binary, etc.). Punctuation tokens are anonymous and + // therefore excluded automatically. + for i in 0..args_node.named_child_count() { + let child = match args_node.named_child(i) { Some(c) => c, None => continue, }; - if child.kind() == "var" || child.kind() == "atom" { - params.push(child_def( - node_text(&child, source).to_string(), - "parameter", - start_line(&child), - )); - } + let label = if child.kind() == "var" || child.kind() == "atom" { + node_text(&child, source).to_string() + } else { + // Placeholder for complex patterns so arity is preserved. + format!("_{}", i) + }; + params.push(child_def(label, "parameter", start_line(&child))); } params } @@ -393,4 +411,40 @@ mod tests { .collect(); assert_eq!(fact_defs.len(), 1, "expected single function def for multi-clause"); } + + #[test] + fn keeps_distinct_arities_for_same_name() { + // Erlang overloads by arity: foo/1 and foo/2 are distinct definitions + // and must not be collapsed by name-only deduplication. + let s = parse_erlang( + "foo(X) -> X.\nfoo(X, Y) -> X + Y.\nfoo(X, Y, Z) -> X + Y + Z.\n", + ); + let foo_defs: Vec<&Definition> = s + .definitions + .iter() + .filter(|d| d.name == "foo" && d.kind == "function") + .collect(); + assert_eq!(foo_defs.len(), 3, "expected one def per arity"); + let mut arities: Vec = foo_defs + .iter() + .map(|d| d.children.as_ref().map_or(0, |c| c.len())) + .collect(); + arities.sort(); + assert_eq!(arities, vec![1, 2, 3]); + } + + #[test] + fn counts_complex_pattern_arguments_as_parameters() { + // Tuple, list and binary pattern arguments must still count toward arity. + let s = parse_erlang( + "handle({ok, X}, [H | T]) -> {X, H, T}.\n", + ); + let f = s + .definitions + .iter() + .find(|d| d.name == "handle" && d.kind == "function") + .expect("function def"); + let params = f.children.as_ref().expect("params"); + assert_eq!(params.len(), 2, "expected one parameter per pattern"); + } } diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index a78b29493..a556ae57d 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -70,7 +70,10 @@ function walkErlangNode(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleModuleAttr(node: TreeSitterNode, ctx: ExtractorOutput): void { // module_attribute: - module ( atom ) . - const nameNode = findChild(node, 'atom'); + // Prefer the named `name` field exposed by tree-sitter-erlang so we don't + // accidentally pick up the `module` keyword if a future grammar exposes it + // as a named `atom` child. + const nameNode = node.childForFieldName('name') ?? findChild(node, 'atom'); if (!nameNode) return; ctx.definitions.push({ @@ -134,13 +137,22 @@ function handleFunDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleFunctionClause(node: TreeSitterNode, ctx: ExtractorOutput): void { // function_clause: atom expr_args clause_body - const nameNode = findChild(node, 'atom'); + const nameNode = node.childForFieldName('name') ?? findChild(node, 'atom'); if (!nameNode) return; - // Don't duplicate if we already have this function - if (ctx.definitions.some((d) => d.name === nameNode.text && d.kind === 'function')) return; - const params = extractErlangParams(node); + const arity = params.length; + + // Don't duplicate if we already have this function at the same arity. + // Erlang overloads by arity, so `foo/1` and `foo/2` are distinct definitions. + if ( + ctx.definitions.some( + (d) => + d.name === nameNode.text && d.kind === 'function' && (d.children?.length ?? 0) === arity, + ) + ) { + return; + } ctx.definitions.push({ name: nameNode.text, @@ -154,18 +166,22 @@ function handleFunctionClause(node: TreeSitterNode, ctx: ExtractorOutput): void function extractErlangParams(clauseNode: TreeSitterNode): SubDeclaration[] { const params: SubDeclaration[] = []; - const argsNode = findChild(clauseNode, 'expr_args'); + const argsNode = clauseNode.childForFieldName('args') ?? findChild(clauseNode, 'expr_args'); if (!argsNode) return params; - for (let i = 0; i < argsNode.childCount; i++) { - const child = argsNode.child(i); + // Iterate named children so every argument pattern counts as one parameter, + // independent of whether it is a bare `var`/`atom` or a complex destructuring + // pattern (tuple, list, binary, etc.). Punctuation tokens are anonymous and + // therefore excluded automatically. + for (let i = 0; i < argsNode.namedChildCount; i++) { + const child = argsNode.namedChild(i); if (!child) continue; - if (child.type === 'var') { - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 }); - } - if (child.type === 'atom') { - params.push({ name: child.text, kind: 'parameter', line: child.startPosition.row + 1 }); - } + const label = + child.type === 'var' || child.type === 'atom' + ? child.text + : // Placeholder for complex patterns so arity is preserved. + `_${i}`; + params.push({ name: label, kind: 'parameter', line: child.startPosition.row + 1 }); } return params; } diff --git a/tests/parsers/erlang.test.ts b/tests/parsers/erlang.test.ts index cde7e4220..c59a2bbf0 100644 --- a/tests/parsers/erlang.test.ts +++ b/tests/parsers/erlang.test.ts @@ -43,4 +43,23 @@ describe('Erlang parser', () => { io:format("Hello~n").`); expect(symbols.calls.length).toBeGreaterThanOrEqual(1); }); + + it('keeps distinct arities for the same function name', () => { + // Erlang overloads by arity; foo/1 and foo/2 are distinct definitions. + const symbols = parseErlang(`foo(X) -> X. +foo(X, Y) -> X + Y. +foo(X, Y, Z) -> X + Y + Z.`); + const fooDefs = symbols.definitions.filter((d) => d.name === 'foo' && d.kind === 'function'); + expect(fooDefs).toHaveLength(3); + const arities = fooDefs.map((d) => d.children?.length ?? 0).sort(); + expect(arities).toEqual([1, 2, 3]); + }); + + it('counts complex pattern arguments as parameters', () => { + // Tuple, list, and binary pattern arguments must still count toward arity. + const symbols = parseErlang(`handle({ok, X}, [H | T]) -> {X, H, T}.`); + const f = symbols.definitions.find((d) => d.name === 'handle' && d.kind === 'function'); + expect(f).toBeDefined(); + expect(f?.children?.length).toBe(2); + }); }); From 13c489c367fd6a206adcfbb6f6c9cb71e3e7a309 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Mon, 11 May 2026 23:54:25 -0600 Subject: [PATCH 3/8] fix(extractors): align Erlang record_decl and TS type-alias with name-field fallback (#1103) - Rust handle_record_decl now prefers child_by_field_name("name") before falling back to find_child(atom), matching the other Erlang handlers and avoiding accidental keyword pickup if the grammar exposes 'record' as a named atom. - TypeScript handleTypeAlias now mirrors the Rust type_name->atom fallback so the two engines agree when the grammar wraps the alias name in a type_name node. --- crates/codegraph-core/src/extractors/erlang.rs | 8 +++++++- src/extractors/erlang.ts | 11 ++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index 8528dd7b7..fbed612f2 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -60,7 +60,13 @@ fn handle_module_attr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { } fn handle_record_decl(node: &Node, source: &[u8], symbols: &mut FileSymbols) { - let name_node = match find_child(node, "atom") { + // Mirror the defensive pattern used by `handle_module_attr` and + // `handle_function_clause`: prefer the named field if the grammar exposes + // it, otherwise fall back to the first `atom` child. + let name_node = match node + .child_by_field_name("name") + .or_else(|| find_child(node, "atom")) + { Some(n) => n, None => return, }; diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index a556ae57d..e11c21fd0 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -115,7 +115,16 @@ function handleRecordDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleTypeAlias(node: TreeSitterNode, ctx: ExtractorOutput): void { - const nameNode = findChild(node, 'atom'); + // type_alias: -type name(...) :: ty. + // Name is typically wrapped in a `type_name` node containing an `atom`. + // Mirrors the Rust `handle_type_alias` fallback so the two engines agree + // even when the grammar nests the name inside `type_name`. + const directAtom = findChild(node, 'atom'); + const wrappedAtom = + !directAtom && findChild(node, 'type_name') != null + ? findChild(findChild(node, 'type_name') as TreeSitterNode, 'atom') + : null; + const nameNode = directAtom ?? wrappedAtom; if (!nameNode) return; ctx.definitions.push({ From 06169e97f7a3438b64993829e27262cc88c485c4 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 03:47:54 -0600 Subject: [PATCH 4/8] fix(extractors): harden erlang handle_call against grammar drift (#1103) --- crates/codegraph-core/src/extractors/erlang.rs | 10 ++++++---- src/extractors/erlang.ts | 14 ++++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index fbed612f2..bbba7fdfb 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -280,10 +280,12 @@ fn handle_import_attr(node: &Node, source: &[u8], symbols: &mut FileSymbols) { } fn handle_call(node: &Node, source: &[u8], symbols: &mut FileSymbols) { - // call: first child is function ref (atom for plain, may be wrapped in `remote` - // in newer grammars). Mirrors the JS extractor's behavior so both engines emit - // the same set of calls. - let func_node = match node.child(0) { + // call: first named child is function ref (atom for plain, may be wrapped in + // `remote` in newer grammars). Using `named_child(0)` instead of `child(0)` + // skips anonymous tokens (punctuation, keywords) so a future grammar revision + // that inserts a leading anonymous node won't silently drop the call. Mirrors + // the JS extractor's behavior so both engines emit the same set of calls. + let func_node = match node.named_child(0) { Some(n) => n, None => return, }; diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index e11c21fd0..01d8b00ca 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -120,10 +120,8 @@ function handleTypeAlias(node: TreeSitterNode, ctx: ExtractorOutput): void { // Mirrors the Rust `handle_type_alias` fallback so the two engines agree // even when the grammar nests the name inside `type_name`. const directAtom = findChild(node, 'atom'); - const wrappedAtom = - !directAtom && findChild(node, 'type_name') != null - ? findChild(findChild(node, 'type_name') as TreeSitterNode, 'atom') - : null; + const typeNameNode = !directAtom ? findChild(node, 'type_name') : null; + const wrappedAtom = typeNameNode ? findChild(typeNameNode, 'atom') : null; const nameNode = directAtom ?? wrappedAtom; if (!nameNode) return; @@ -249,8 +247,12 @@ function handleImportAttr(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { - // call: first child is function ref (atom or remote), then expr_args - const funcNode = node.child(0); + // call: first named child is function ref (atom or remote), then expr_args. + // Using `namedChild(0)` rather than `child(0)` skips anonymous tokens + // (punctuation, keywords) so a future grammar revision that inserts a + // leading anonymous node won't silently drop the call. Mirrors the Rust + // `handle_call` so both engines emit the same set of calls. + const funcNode = node.namedChild(0); if (!funcNode) return; if (funcNode.type === 'atom' || funcNode.type === 'identifier') { From 4c97a2fcaa7a8dbe12dc0409bec8a47eaf515e37 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 04:25:32 -0600 Subject: [PATCH 5/8] test(extractors): cover erlang type-alias, opaque, and define handlers (#1103) --- .../codegraph-core/src/extractors/erlang.rs | 39 +++++++++++++++++++ tests/parsers/erlang.test.ts | 25 ++++++++++++ 2 files changed, 64 insertions(+) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index bbba7fdfb..e079b4374 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -455,4 +455,43 @@ mod tests { let params = f.children.as_ref().expect("params"); assert_eq!(params.len(), 2, "expected one parameter per pattern"); } + + #[test] + fn extracts_type_alias() { + // -type creates a type definition; name is wrapped in `type_name(atom)` + // in the current grammar but may be a direct atom in older grammars — + // the extractor handles both shapes via the fallback in `handle_type_alias`. + let s = parse_erlang("-type id() :: integer().\n"); + let t = s + .definitions + .iter() + .find(|d| d.name == "id" && d.kind == "type") + .expect("type def"); + assert_eq!(t.kind, "type"); + } + + #[test] + fn extracts_opaque_type() { + // -opaque uses the same `type_alias` node shape and must produce a + // type definition keyed on the alias name. + let s = parse_erlang("-opaque handle() :: reference().\n"); + let t = s + .definitions + .iter() + .find(|d| d.name == "handle" && d.kind == "type") + .expect("opaque type def"); + assert_eq!(t.kind, "type"); + } + + #[test] + fn extracts_macro_define() { + // -define produces a variable-kind definition keyed on the macro name. + let s = parse_erlang("-define(MAX_SIZE, 1024).\n"); + let m = s + .definitions + .iter() + .find(|d| d.name == "MAX_SIZE") + .expect("define def"); + assert_eq!(m.kind, "variable"); + } } diff --git a/tests/parsers/erlang.test.ts b/tests/parsers/erlang.test.ts index c59a2bbf0..cc913eceb 100644 --- a/tests/parsers/erlang.test.ts +++ b/tests/parsers/erlang.test.ts @@ -62,4 +62,29 @@ foo(X, Y, Z) -> X + Y + Z.`); expect(f).toBeDefined(); expect(f?.children?.length).toBe(2); }); + + it('extracts -type aliases', () => { + // Type-alias names are wrapped in a `type_name` node containing an atom in + // the current grammar; the extractor handles both the wrapped form and a + // direct atom fallback. + const symbols = parseErlang(`-type id() :: integer().`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'id', kind: 'type' }), + ); + }); + + it('extracts -opaque types', () => { + // -opaque uses the same `type_alias` node shape and must produce a type def. + const symbols = parseErlang(`-opaque handle() :: reference().`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'handle', kind: 'type' }), + ); + }); + + it('extracts -define macros as variables', () => { + const symbols = parseErlang(`-define(MAX_SIZE, 1024).`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'MAX_SIZE', kind: 'variable' }), + ); + }); }); From 16d6b33c2507def8be974f12cbbe12fde55baa35 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 04:35:41 -0600 Subject: [PATCH 6/8] fix(extractors): prefer atom over var in erlang macro_lhs lookup (#1103) --- .../codegraph-core/src/extractors/erlang.rs | 44 ++++++++++++++++++- src/extractors/erlang.ts | 9 +++- tests/parsers/erlang.test.ts | 21 +++++++++ 3 files changed, 71 insertions(+), 3 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index e079b4374..1e43d25d8 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -209,13 +209,21 @@ fn extract_params(clause_node: &Node, source: &[u8]) -> Vec { fn handle_define(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // pp_define: -define(NAME, value). Name may be in `var`, `atom`, or `macro_lhs`. + // For parametric macros, the grammar wraps the name in a `macro_lhs(name, args)` + // node. Inside `macro_lhs` the name comes first, followed by `(`, the argument + // `var` children, and `)`. We must therefore try `atom` (lowercase macros, + // e.g. `-define(foo(X), X+1)`) before `var` (uppercase macros, e.g. + // `-define(FOO(X), X+1)`) — otherwise `find_child(.., "var")` skips the + // leading atom and lands on the first argument variable, mislabeling the + // definition with the argument name instead of the macro name. let name = if let Some(v) = find_child(node, "var") { node_text(&v, source).to_string() } else if let Some(a) = find_child(node, "atom") { node_text(&a, source).to_string() } else if let Some(lhs) = find_child(node, "macro_lhs") { - find_child(&lhs, "var") - .map(|v| node_text(&v, source).to_string()) + find_child(&lhs, "atom") + .or_else(|| find_child(&lhs, "var")) + .map(|n| node_text(&n, source).to_string()) .unwrap_or_else(|| node_text(&lhs, source).to_string()) } else { return; @@ -494,4 +502,36 @@ mod tests { .expect("define def"); assert_eq!(m.kind, "variable"); } + + #[test] + fn extracts_uppercase_parametric_macro_name() { + // Parametric macros wrap the name in `macro_lhs(atom_or_var, ...)`. + // For uppercase names the leading child is a `var` and the macro name + // must come from that var, not from any argument-position var. + let s = parse_erlang("-define(FOO(X), X + 1).\n"); + let m = s + .definitions + .iter() + .find(|d| d.name == "FOO") + .expect("FOO define def"); + assert_eq!(m.kind, "variable"); + } + + #[test] + fn extracts_lowercase_parametric_macro_name() { + // For lowercase parametric macros the macro_lhs children are + // `atom("foo"), '(', var("X"), ')'`. Without preferring the atom we + // would mislabel the definition with the first argument's variable + // name ("X") instead of the macro name ("foo"). + let s = parse_erlang("-define(foo(X), X + 1).\n"); + let m = s + .definitions + .iter() + .find(|d| d.name == "foo") + .expect("foo define def"); + assert_eq!(m.kind, "variable"); + // Make sure we did NOT mistakenly record the argument variable as the + // macro name. + assert!(s.definitions.iter().all(|d| d.name != "X")); + } } diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index 01d8b00ca..6be170178 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -195,13 +195,20 @@ function extractErlangParams(clauseNode: TreeSitterNode): SubDeclaration[] { function handleDefine(node: TreeSitterNode, ctx: ExtractorOutput): void { // pp_define: -define(NAME, value). + // For parametric macros, the grammar wraps the name in a `macro_lhs(name, args)` + // node. Inside `macro_lhs` the name comes first, followed by `(`, the argument + // `var` children, and `)`. We must therefore try `atom` (lowercase macros, + // e.g. `-define(foo(X), X+1)`) before `var` (uppercase macros, e.g. + // `-define(FOO(X), X+1)`) — otherwise `findChild(.., 'var')` skips the leading + // atom and lands on the first argument variable, mislabeling the definition. + // Mirrors the Rust `handle_define` so both engines agree. const nameNode = findChild(node, 'var') || findChild(node, 'atom') || findChild(node, 'macro_lhs'); if (!nameNode) return; const name = nameNode.type === 'macro_lhs' - ? (findChild(nameNode, 'var')?.text ?? nameNode.text) + ? (findChild(nameNode, 'atom')?.text ?? findChild(nameNode, 'var')?.text ?? nameNode.text) : nameNode.text; ctx.definitions.push({ diff --git a/tests/parsers/erlang.test.ts b/tests/parsers/erlang.test.ts index cc913eceb..d443e40cb 100644 --- a/tests/parsers/erlang.test.ts +++ b/tests/parsers/erlang.test.ts @@ -87,4 +87,25 @@ foo(X, Y, Z) -> X + Y + Z.`); expect.objectContaining({ name: 'MAX_SIZE', kind: 'variable' }), ); }); + + it('extracts uppercase parametric macro names', () => { + // Parametric macros wrap the name in `macro_lhs(name, args)`; the leading + // child is the name (var for uppercase). + const symbols = parseErlang(`-define(FOO(X), X + 1).`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'FOO', kind: 'variable' }), + ); + }); + + it('extracts lowercase parametric macro names without mislabeling on argument vars', () => { + // For lowercase parametric macros, macro_lhs children are + // `atom("foo"), '(', var("X"), ')'`. The macro name must come from the + // atom, not from `findChild(.., 'var')` which would land on the argument. + const symbols = parseErlang(`-define(foo(X), X + 1).`); + expect(symbols.definitions).toContainEqual( + expect.objectContaining({ name: 'foo', kind: 'variable' }), + ); + // Argument variable must not be recorded as the macro name. + expect(symbols.definitions.some((d) => d.name === 'X')).toBe(false); + }); }); From 0a86034fb5f5bf6282506eb308599cbdf0aea83b Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 04:37:51 -0600 Subject: [PATCH 7/8] fix(extractors): distinguish erlang include vs include_lib (#1103) --- .../codegraph-core/src/extractors/erlang.rs | 34 +++++++++++++++++-- src/extractors/erlang.ts | 8 ++++- tests/parsers/erlang.test.ts | 14 ++++++++ 3 files changed, 53 insertions(+), 3 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index 1e43d25d8..52bd24322 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -249,9 +249,18 @@ fn handle_include(node: &Node, source: &[u8], symbols: &mut FileSymbols) { let raw = node_text(&str_node, source); let source_path = raw.trim_matches('"').to_string(); + // Preserve the distinction between local includes (`-include("foo.hrl")`) + // and OTP library includes (`-include_lib("kernel/include/file.hrl")`) so + // downstream consumers can apply the correct path-resolution strategy + // (local: relative to the source file; lib: relative to an OTP app root). + let kind = if node.kind() == "pp_include_lib" { + "include_lib" + } else { + "include" + }; symbols.imports.push(Import::new( source_path, - vec!["include".to_string()], + vec![kind.to_string()], start_line(node), )); } @@ -410,8 +419,29 @@ mod tests { #[test] fn extracts_include_directive() { + // Local includes carry kind "include" so downstream consumers resolve + // them relative to the source file. let s = parse_erlang("-include(\"foo.hrl\").\n"); - assert!(s.imports.iter().any(|i| i.source == "foo.hrl")); + let imp = s + .imports + .iter() + .find(|i| i.source == "foo.hrl") + .expect("include import"); + assert_eq!(imp.names, vec!["include".to_string()]); + } + + #[test] + fn extracts_include_lib_directive() { + // OTP library includes carry kind "include_lib" so downstream consumers + // resolve them against application code paths rather than the source + // file's directory. + let s = parse_erlang("-include_lib(\"kernel/include/file.hrl\").\n"); + let imp = s + .imports + .iter() + .find(|i| i.source == "kernel/include/file.hrl") + .expect("include_lib import"); + assert_eq!(imp.names, vec!["include_lib".to_string()]); } #[test] diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index 6be170178..d9f497403 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -224,9 +224,15 @@ function handleInclude(node: TreeSitterNode, ctx: ExtractorOutput): void { if (!strNode) return; const source = strNode.text.replace(/^"|"$/g, ''); + // Preserve the distinction between local includes (`-include("foo.hrl")`) + // and OTP library includes (`-include_lib("kernel/include/file.hrl")`) so + // downstream consumers can apply the correct path-resolution strategy + // (local: relative to the source file; lib: relative to an OTP app root). + // Mirrors the Rust `handle_include` so both engines agree. + const kind = node.type === 'pp_include_lib' ? 'include_lib' : 'include'; ctx.imports.push({ source, - names: ['include'], + names: [kind], line: node.startPosition.row + 1, }); } diff --git a/tests/parsers/erlang.test.ts b/tests/parsers/erlang.test.ts index d443e40cb..6147d767b 100644 --- a/tests/parsers/erlang.test.ts +++ b/tests/parsers/erlang.test.ts @@ -108,4 +108,18 @@ foo(X, Y, Z) -> X + Y + Z.`); // Argument variable must not be recorded as the macro name. expect(symbols.definitions.some((d) => d.name === 'X')).toBe(false); }); + + it('records -include with kind "include" so consumers resolve locally', () => { + const symbols = parseErlang(`-include("foo.hrl").`); + const imp = symbols.imports.find((i) => i.source === 'foo.hrl'); + expect(imp).toBeDefined(); + expect(imp?.names).toEqual(['include']); + }); + + it('records -include_lib with kind "include_lib" so consumers resolve against OTP paths', () => { + const symbols = parseErlang(`-include_lib("kernel/include/file.hrl").`); + const imp = symbols.imports.find((i) => i.source === 'kernel/include/file.hrl'); + expect(imp).toBeDefined(); + expect(imp?.names).toEqual(['include_lib']); + }); }); From 1e9971e12b062078fcfd285798fd2799986ee097 Mon Sep 17 00:00:00 2001 From: carlos-alm Date: Wed, 13 May 2026 05:34:18 -0600 Subject: [PATCH 8/8] fix(extractors): mirror Rust handle_record_decl field-name lookup in JS (#1103) --- .../codegraph-core/src/extractors/erlang.rs | 20 ++++++++++++------- src/extractors/erlang.ts | 5 ++++- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/crates/codegraph-core/src/extractors/erlang.rs b/crates/codegraph-core/src/extractors/erlang.rs index 52bd24322..d9f62929a 100644 --- a/crates/codegraph-core/src/extractors/erlang.rs +++ b/crates/codegraph-core/src/extractors/erlang.rs @@ -209,13 +209,19 @@ fn extract_params(clause_node: &Node, source: &[u8]) -> Vec { fn handle_define(node: &Node, source: &[u8], symbols: &mut FileSymbols) { // pp_define: -define(NAME, value). Name may be in `var`, `atom`, or `macro_lhs`. - // For parametric macros, the grammar wraps the name in a `macro_lhs(name, args)` - // node. Inside `macro_lhs` the name comes first, followed by `(`, the argument - // `var` children, and `)`. We must therefore try `atom` (lowercase macros, - // e.g. `-define(foo(X), X+1)`) before `var` (uppercase macros, e.g. - // `-define(FOO(X), X+1)`) — otherwise `find_child(.., "var")` skips the - // leading atom and lands on the first argument variable, mislabeling the - // definition with the argument name instead of the macro name. + // For non-parametric macros the grammar exposes the name directly as either + // a `var` (uppercase, e.g. `-define(FOO, 1)`) or an `atom` (lowercase, + // e.g. `-define(foo, 1)`) child of `pp_define`. We check `var` first + // because uppercase macros are the common case. + // + // For parametric macros the grammar wraps the name in a + // `macro_lhs(name, args)` node. Inside `macro_lhs` the name comes first, + // followed by `(`, the argument `var` children, and `)`. We must therefore + // try `atom` (lowercase, e.g. `-define(foo(X), X+1)`) before `var` + // (uppercase, e.g. `-define(FOO(X), X+1)`) inside `macro_lhs` — + // otherwise `find_child(.., "var")` skips the leading atom and lands on + // the first argument variable, mislabeling the definition with the + // argument name instead of the macro name. let name = if let Some(v) = find_child(node, "var") { node_text(&v, source).to_string() } else if let Some(a) = find_child(node, "atom") { diff --git a/src/extractors/erlang.ts b/src/extractors/erlang.ts index d9f497403..e1a973e46 100644 --- a/src/extractors/erlang.ts +++ b/src/extractors/erlang.ts @@ -86,7 +86,10 @@ function handleModuleAttr(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleRecordDecl(node: TreeSitterNode, ctx: ExtractorOutput): void { // record_decl: - record ( atom , { record_field, ... } ) . - const nameNode = findChild(node, 'atom'); + // Prefer the named `name` field exposed by tree-sitter-erlang; fall back to + // the first atom child for grammar versions that don't expose it. Mirrors + // the Rust `handle_record_decl` defensive pattern. + const nameNode = node.childForFieldName('name') ?? findChild(node, 'atom'); if (!nameNode) return; const children: SubDeclaration[] = [];