diff --git a/crates/codegraph-core/src/extractors/julia.rs b/crates/codegraph-core/src/extractors/julia.rs index 0f0e7e59..61acb77a 100644 --- a/crates/codegraph-core/src/extractors/julia.rs +++ b/crates/codegraph-core/src/extractors/julia.rs @@ -77,6 +77,14 @@ fn handle_module_def(node: &Node, source: &[u8], symbols: &mut FileSymbols) -> O /// `function_definition` / `macro_definition` in a `signature` node whose /// first child is the `call_expression` — `find_child` only inspects direct /// children, so we unwrap one level explicitly. +/// +/// Grammar assumption: every `function_definition` / `macro_definition` emits +/// a `signature` child in the current tree-sitter-julia grammar. The fallback +/// to `find_child(node, "call_expression")` exists only as a defensive measure +/// for grammar drift — if it ever fires on a real definition, that fallback +/// would silently match the first body call_expression and mis-record the +/// function name. Callers must therefore treat a missing `signature` as a +/// parser/grammar mismatch worth investigating, not as a routine code path. fn signature_call<'a>(node: &Node<'a>) -> Option> { if let Some(sig) = find_child(node, "signature") { return find_child(&sig, "call_expression"); @@ -297,9 +305,17 @@ fn handle_abstract_def(node: &Node, source: &[u8], symbols: &mut FileSymbols) { /// /// Handles plain identifiers, `Name <: Super` binary expressions, and /// parameterized forms like `Name{T}` / `Name{T} <: Super{T,1}` by recursing -/// into common wrapper kinds (binary expressions, parametrized identifiers, -/// type-parameter lists). Returns `None` when no identifier can be located — +/// into wrapper kinds the Julia grammar actually emits for type heads +/// (binary expressions, parametrized type expressions, parameterized +/// identifiers). Returns `None` when no identifier can be located — /// callers should skip emitting a definition in that case. +/// +/// Note: `type_parameter_list` / `type_argument_list` are intentionally +/// excluded — Julia's grammar uses `curly_expression` for `{T}` constructs, +/// not those node kinds. Including them would risk recursing into a +/// type-parameter list and returning a type variable (e.g. `T`) instead of +/// the struct name if `find_base_name` were ever called on a node lacking a +/// direct `identifier` child. fn find_base_name<'a>(node: &Node<'a>) -> Option> { // The node itself may already be the identifier (e.g. when called on a // direct side of a binary_expression like `Point <: AbstractPoint`). @@ -317,9 +333,7 @@ fn find_base_name<'a>(node: &Node<'a>) -> Option> { match child.kind() { "binary_expression" | "parametrized_type_expression" - | "parameterized_identifier" - | "type_parameter_list" - | "type_argument_list" => { + | "parameterized_identifier" => { if let Some(found) = find_base_name(&child) { return Some(found); } diff --git a/src/extractors/julia.ts b/src/extractors/julia.ts index 9a0681c9..7667ec95 100644 --- a/src/extractors/julia.ts +++ b/src/extractors/julia.ts @@ -83,17 +83,49 @@ function handleModuleDef(node: TreeSitterNode, ctx: ExtractorOutput): string | n return nameNode.text; } +function qualifyName(base: string, currentModule: string | null): string { + // For qualified names (`function Base.show ... end` inside `module Foo`, + // or short-form `Foo.bar(x, y) = x + y` inside `module Outer`), the LHS + // is a `scoped_identifier` already containing the qualifier — skip the + // module prefix to avoid producing `Foo.Base.show` / `Outer.Foo.bar`. + if (currentModule && !base.includes('.')) return `${currentModule}.${base}`; + return base; +} + +/** + * Extract the call_expression from a function/macro definition's signature. + * + * tree-sitter-julia wraps the signature in a `signature` node whose direct + * children include the `call_expression` for the function name and parameters. + * `findChild` only inspects direct children, so we unwrap one level explicitly. + * Without this step, `findChild(node, 'call_expression')` on a + * `function_definition` would match the *body's* first call_expression + * (e.g. `println(...)` inside the body) instead of the signature. + * + * Grammar assumption: every `function_definition` / `macro_definition` emits a + * `signature` child in the current tree-sitter-julia grammar. The fallback to + * `findChild(node, 'call_expression')` exists only as a defensive measure for + * grammar drift — if it ever fires on a real definition, that fallback would + * silently match the first body call_expression and mis-record the function + * name. Callers must therefore treat a missing `signature` as a parser/grammar + * mismatch worth investigating, not as a routine code path. + */ +function signatureCall(node: TreeSitterNode): TreeSitterNode | null { + const sig = findChild(node, 'signature'); + if (sig) return findChild(sig, 'call_expression'); + return findChild(node, 'call_expression'); +} + function handleFunctionDef( node: TreeSitterNode, ctx: ExtractorOutput, currentModule: string | null, ): void { - // function_definition may have a call_expression child as the signature - const callSig = findChild(node, 'call_expression'); + const callSig = signatureCall(node); if (callSig) { const funcNameNode = callSig.child(0); if (funcNameNode) { - const name = currentModule ? `${currentModule}.${funcNameNode.text}` : funcNameNode.text; + const name = qualifyName(funcNameNode.text, currentModule); const params = extractJuliaParams(callSig); ctx.definitions.push({ name, @@ -110,9 +142,8 @@ function handleFunctionDef( const nameNode = node.childForFieldName('name') || findChild(node, 'identifier'); if (!nameNode) return; - const name = currentModule ? `${currentModule}.${nameNode.text}` : nameNode.text; ctx.definitions.push({ - name, + name: qualifyName(nameNode.text, currentModule), kind: 'function', line: node.startPosition.row + 1, endLine: nodeEndLine(node), @@ -133,11 +164,10 @@ function handleAssignment( const funcNameNode = lhs.child(0); if (!funcNameNode) return; - const name = currentModule ? `${currentModule}.${funcNameNode.text}` : funcNameNode.text; const params = extractJuliaParams(lhs); ctx.definitions.push({ - name, + name: qualifyName(funcNameNode.text, currentModule), kind: 'function', line: node.startPosition.row + 1, endLine: nodeEndLine(node), @@ -146,16 +176,74 @@ function handleAssignment( } } +/** + * Locate the base-name identifier within a `type_head` node. + * + * Handles plain identifiers, `Name <: Super` binary expressions, and + * parameterized forms like `Name{T}` / `Name{T} <: Super{T,1}` by recursing + * into wrapper kinds the Julia grammar actually emits for type heads + * (binary expressions, parametrized type expressions, parameterized + * identifiers). Returns `null` when no identifier can be located — callers + * should skip emitting a definition in that case. + * + * Note: `type_parameter_list` / `type_argument_list` are intentionally + * excluded — Julia's grammar uses `curly_expression` for `{T}` constructs, + * not those node kinds. Including them would risk recursing into a + * type-parameter list and returning a type variable (e.g. `T`) instead of + * the struct name if `findBaseName` were ever called on a node lacking a + * direct `identifier` child. + */ +const TYPE_HEAD_WRAPPERS: ReadonlySet = new Set([ + 'binary_expression', + 'parametrized_type_expression', + 'parameterized_identifier', +]); + +function findBaseName(node: TreeSitterNode): TreeSitterNode | null { + if (node.type === 'identifier') return node; + const direct = findChild(node, 'identifier'); + if (direct) return direct; + for (let i = 0; i < node.childCount; i++) { + const child = node.child(i); + if (!child) continue; + if (TYPE_HEAD_WRAPPERS.has(child.type)) { + const found = findBaseName(child); + if (found) return found; + } + } + return null; +} + function handleStructDef(node: TreeSitterNode, ctx: ExtractorOutput): void { // struct_definition: struct type_head fields... end + // type_head wraps the name and optional supertype. The name may be a + // bare `identifier`, a parameterized form (e.g. `Vec{T}`), or either + // of those nested inside a `binary_expression` (`Name <: Super`). const typeHead = findChild(node, 'type_head'); - const nameNode = typeHead - ? (findChild(typeHead, 'identifier') ?? typeHead) - : findChild(node, 'identifier'); + if (!typeHead) return; + + let nameNode: TreeSitterNode | null; + let supertypeNode: TreeSitterNode | null = null; + + const binary = findChild(typeHead, 'binary_expression'); + if (binary) { + // Walk into each side of the binary expression to find the base-name + // identifier — handles parameterized forms like `Vec{T} <: AbstractArray{T,1}`. + const sides: TreeSitterNode[] = []; + for (let i = 0; i < binary.childCount; i++) { + const c = binary.child(i); + if (c && c.type !== 'operator') sides.push(c); + } + nameNode = sides[0] ? findBaseName(sides[0]) : null; + supertypeNode = sides[1] ? findBaseName(sides[1]) : null; + } else { + nameNode = findBaseName(typeHead); + } + if (!nameNode) return; + const structName = nameNode.text; const children: SubDeclaration[] = []; - // Fields are typed_expression children of struct_definition for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; @@ -168,33 +256,24 @@ function handleStructDef(node: TreeSitterNode, ctx: ExtractorOutput): void { line: child.startPosition.row + 1, }); } - } - // Plain identifier fields (no type annotation) - if (child.type === 'identifier' && child !== nameNode && typeHead && child !== typeHead) { + } else if (child.type === 'identifier') { + // Plain identifier fields (no type annotation) appear as direct + // identifier children of struct_definition. The type_head is a + // separate node so there is nothing to filter out here. children.push({ name: child.text, kind: 'property', line: child.startPosition.row + 1 }); } } - // Check for supertype in type_head (Point <: AbstractPoint) - if (typeHead) { - const subtypeExpr = findChild(typeHead, 'subtype_expression'); - if (subtypeExpr) { - // Find the supertype identifier - for (let i = 0; i < subtypeExpr.childCount; i++) { - const child = subtypeExpr.child(i); - if (child?.type === 'identifier' && i > 0) { - ctx.classes.push({ - name: nameNode.text, - extends: child.text, - line: node.startPosition.row + 1, - }); - } - } - } + if (supertypeNode) { + ctx.classes.push({ + name: structName, + extends: supertypeNode.text, + line: node.startPosition.row + 1, + }); } ctx.definitions.push({ - name: nameNode.text, + name: structName, kind: 'struct', line: node.startPosition.row + 1, endLine: nodeEndLine(node), @@ -203,7 +282,14 @@ function handleStructDef(node: TreeSitterNode, ctx: ExtractorOutput): void { } function handleAbstractDef(node: TreeSitterNode, ctx: ExtractorOutput): void { - const nameNode = node.childForFieldName('name') || findChild(node, 'identifier'); + // abstract_definition: `abstract type` type_head `end` + // The identifier is nested inside `type_head` — possibly wrapped in a + // `Name <: Super` binary_expression or a `Name{T,...}` parameterized form. + // Mirror handleStructDef and skip rather than emit a garbled name when no + // base identifier can be located. + const typeHead = findChild(node, 'type_head'); + if (!typeHead) return; + const nameNode = findBaseName(typeHead); if (!nameNode) return; ctx.definitions.push({ @@ -219,10 +305,17 @@ function handleMacroDef( ctx: ExtractorOutput, currentModule: string | null, ): void { - const nameNode = node.childForFieldName('name') || findChild(node, 'identifier'); + // macro_definition: `macro` signature/call_expression body `end`. + // The name lives in the same shape as a function signature — unwrap via + // signatureCall so we don't pick up an identifier from the body (e.g. + // `macro mymac(x) x end` would otherwise resolve to `@x`). + const callSig = signatureCall(node); + const nameNode = + callSig?.child(0) ?? node.childForFieldName('name') ?? findChild(node, 'identifier'); if (!nameNode) return; - const name = currentModule ? `${currentModule}.@${nameNode.text}` : `@${nameNode.text}`; + const base = nameNode.text; + const name = currentModule ? `${currentModule}.@${base}` : `@${base}`; ctx.definitions.push({ name, kind: 'function', @@ -232,19 +325,40 @@ function handleMacroDef( } function handleImport(node: TreeSitterNode, ctx: ExtractorOutput): void { + // tree-sitter-julia shapes: + // `using LinearAlgebra` → using_statement [ using, identifier ] + // `import Foo.Bar` → import_statement [ import, scoped_identifier ] + // `import Base: show` → import_statement [ import, selected_import[Base, show] ] + // `import Foo.Bar: baz` → import_statement [ import, selected_import[scoped_identifier, baz] ] const names: string[] = []; let source = ''; for (let i = 0; i < node.childCount; i++) { const child = node.child(i); if (!child) continue; - if ( - child.type === 'identifier' || - child.type === 'scoped_identifier' || - child.type === 'selected_import' - ) { - if (!source) source = child.text; - names.push(child.text.split('.').pop() || child.text); + if (child.type === 'identifier' || child.type === 'scoped_identifier') { + const txt = child.text; + if (!source) source = txt; + names.push(txt.split('.').pop() || txt); + } else if (child.type === 'selected_import') { + // First identifier-bearing node is the source module; the rest are + // imported names. The module may itself be a `scoped_identifier` + // (e.g. `import Foo.Bar: baz`) — handle it alongside bare + // `identifier` and use the trailing segment as the display name, + // mirroring the outer loop. + let first = true; + for (let j = 0; j < child.childCount; j++) { + const part = child.child(j); + if (!part) continue; + if (part.type !== 'identifier' && part.type !== 'scoped_identifier') continue; + const txt = part.text; + if (first) { + if (!source) source = txt; + first = false; + } else { + names.push(txt.split('.').pop() || txt); + } + } } } @@ -260,8 +374,15 @@ function handleImport(node: TreeSitterNode, ctx: ExtractorOutput): void { function handleCall(node: TreeSitterNode, ctx: ExtractorOutput): void { // Don't record if parent is assignment LHS (that's a function definition) if (node.parent?.type === 'assignment' && node === node.parent.child(0)) return; - // Don't record if parent is function_definition (that's a signature) - if (node.parent?.type === 'function_definition') return; + // Skip when this call is the signature of a function/macro definition. + // tree-sitter-julia wraps the signature in a `signature` node whose parent + // is `function_definition` or `macro_definition`. Body calls (e.g. + // `println(name)` inside `function greet ... end`) appear as descendants of + // the body, not as direct children of `signature`, so they are unaffected. + if (node.parent?.type === 'signature') { + const grand = node.parent.parent; + if (grand?.type === 'function_definition' || grand?.type === 'macro_definition') return; + } const funcNode = node.child(0); if (!funcNode) return; diff --git a/tests/parsers/julia.test.ts b/tests/parsers/julia.test.ts index b1fc3bb3..dd85280f 100644 --- a/tests/parsers/julia.test.ts +++ b/tests/parsers/julia.test.ts @@ -48,6 +48,12 @@ end`); const symbols = parseJulia(`using LinearAlgebra import Base: show`); expect(symbols.imports.length).toBeGreaterThanOrEqual(1); + // `import Base: show` must record `source: 'Base'` and `names: ['show']`, + // not the previously-broken `source: 'Base: show', names: ['Base: show']`. + const selected = symbols.imports.find((imp) => imp.source === 'Base'); + expect(selected).toBeDefined(); + expect(selected?.names).toContain('show'); + expect(selected?.names).not.toContain('Base'); }); it('extracts function calls', () => { @@ -55,4 +61,107 @@ import Base: show`); push!(arr, 1)`); expect(symbols.calls.length).toBeGreaterThanOrEqual(1); }); + + it('extracts parameterized struct base name', () => { + // Parameterized struct names (e.g. `Vec{T}`) must record the base + // identifier — not be silently dropped or include type-parameter text. + const symbols = parseJulia(`struct Vec{T} <: AbstractArray{T,1} + data::Vector{T} +end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('Vec'); + expect(names.every((n) => !n.includes('{') && !n.includes('<'))).toBe(true); + expect(symbols.classes).toHaveLength(1); + expect(symbols.classes[0]).toMatchObject({ name: 'Vec', extends: 'AbstractArray' }); + }); + + it('extracts non-parameterized struct inheritance', () => { + // Simple `struct Name <: Super` must still record both the definition + // and the `extends` relationship — the grammar wraps it in a + // `binary_expression` just like the parameterized form. + const symbols = parseJulia(`struct Point <: AbstractPoint + x::Float64 + y::Float64 +end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('Point'); + expect(symbols.classes).toHaveLength(1); + expect(symbols.classes[0]).toMatchObject({ name: 'Point', extends: 'AbstractPoint' }); + }); + + it('qualified short-form method does not double-prefix', () => { + // `Foo.bar(x, y) = x + y` inside `module Outer` must record `Foo.bar`, + // not `Outer.Foo.bar` — the scoped_identifier already carries the qualifier. + const symbols = parseJulia(`module Outer + Foo.bar(x, y) = x + y +end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('Foo.bar'); + expect(names).not.toContain('Outer.Foo.bar'); + }); + + it('qualified function def does not double-prefix', () => { + // `function Base.show(io, x) ... end` inside `module Foo` must record + // `Base.show`, not `Foo.Base.show`. + const symbols = parseJulia(`module Foo + function Base.show(io, x) + println(io, x) + end +end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('Base.show'); + expect(names).not.toContain('Foo.Base.show'); + }); + + it('extracts abstract type', () => { + const symbols = parseJulia(`abstract type AbstractShape end`); + const abs = symbols.definitions.find((d) => d.name === 'AbstractShape'); + expect(abs).toBeDefined(); + expect(abs).toMatchObject({ kind: 'type' }); + }); + + it('extracts parameterized abstract type base name', () => { + // Parameterized generics with a supertype must record only the base + // identifier — never the raw `Name{T} <: Super{T,1}` text. + const symbols = parseJulia(`abstract type AbstractVector{T} <: AbstractArray{T,1} end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('AbstractVector'); + expect(names.every((n) => !n.includes('{') && !n.includes('<'))).toBe(true); + }); + + it('extracts macro definitions with correct name', () => { + // `findChild(node, 'identifier')` would resolve to the body's `x` here, + // recording the macro as `@x` instead of `@mymac`. + const symbols = parseJulia(`macro mymac(x) + x +end`); + const names = symbols.definitions.map((d) => d.name); + expect(names).toContain('@mymac'); + expect(names).not.toContain('@x'); + }); + + it('does not record function signature as call', () => { + // The signature's `call_expression` lives inside a `signature` node — a + // naive `parent.type === 'function_definition'` guard misses it and + // records `greet` as both a definition and a call. + const symbols = parseJulia(`function greet(name) + println(name) +end`); + const callNames = symbols.calls.map((c) => c.name); + expect(callNames).not.toContain('greet'); + expect(callNames).toContain('println'); + }); + + it('selected_import handles qualified module', () => { + // `import Foo.Bar: baz` — module is a scoped_identifier. The import + // must record `Foo.Bar` as the source and `baz` as the imported name, + // not the malformed `source="baz", names=["baz"]`. + const symbols = parseJulia(`import LinearAlgebra.BLAS: gemm`); + expect(symbols.imports).toHaveLength(1); + expect(symbols.imports[0]).toMatchObject({ + source: 'LinearAlgebra.BLAS', + }); + expect(symbols.imports[0].names).toContain('gemm'); + expect(symbols.imports[0].names).not.toContain('LinearAlgebra.BLAS'); + }); });