From 768dec76ed33d0d48c3de193f5701e74791a23b3 Mon Sep 17 00:00:00 2001 From: Steve Ramage Date: Sun, 21 Jun 2026 09:03:08 -0700 Subject: [PATCH 1/2] =?UTF-8?q?feat:=20grammar=20engine=20PoC=20=E2=80=94?= =?UTF-8?q?=20list-of-successes=20matcher=20(#467=20step=202)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallel, self-contained proof of concept for a new option-value grammar engine in a `grammar2` package. It does not touch the existing `grammar` package; it sits beside it so we can play with the approach and validate it against existing behaviour first. Core idea: every matcher returns ALL the ways it can match, lazily (`parse(input, offset): Sequence` — Wadler's "list of successes"), instead of one greedy first match. Seq threads each possibility into the next and a value is valid if any path consumes the whole input, so Seq(ZeroOrMore("a"), "a") now matches "aa" (the case the current engine's own docs warn it fails). Two more ideas come along for free: - a labeled parse tree (Branch with a Role), so a span like an IPv4 address is one labeled unit rather than per-terminal colors; - per-leaf validity flags, collapsing the old syntactic/semantic two passes into one lenient parse (a token can match yet be flagged invalid). Capabilities (validate, and the seed of coloring) are free functions over the parse result — no per-combinator code. No IntelliJ types in the engine; it speaks plain Int offsets so the IntelliJ layer can adapt later. Tests reproduce the RestrictAddressFamilies= canary cases, show well-formed-but- unknown vs malformed error kinds, expose leaf roles for coloring, and prove the greedy completeness win. 
Refs #467 #345 #343 #342 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../optionvalues/grammar2/GrammarPoc.kt | 220 ++++++++++++++++++ .../optionvalues/grammar2/GrammarPocTest.kt | 111 +++++++++ 2 files changed, 331 insertions(+) create mode 100644 src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt create mode 100644 src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt new file mode 100644 index 0000000..0d0236b --- /dev/null +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt @@ -0,0 +1,220 @@ +package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar2 + +/* + * ============================================================================ + * Grammar engine PoC (GitHub #467, step 2) + * ============================================================================ + * + * This is a parallel, self-contained proof of concept for a new option-value + * grammar engine. It does NOT replace the existing `..optionvalues.grammar` + * package yet; it sits beside it so we can play with the idea and validate it + * against the existing behavioural tests before committing. + * + * THE ONE IDEA + * ------------ + * Today every combinator returns *one* answer (greedy, first match, no + * backtracking), so `Seq(ZeroOrMore("a"), "a")` fails to match "aa": the star + * eats both a's and the trailing "a" has nothing left. 
+ * + * Here every matcher instead returns *all the ways it could match*, lazily: + * + * fun parse(input, offset): Sequence<Parse> + * + * `Seq` then threads each possibility of one part into the next, and a value is + * valid if *any* path consumes the whole string. The star "gives back" an `a` + * for free, simply because it OFFERED the shorter match as one of its results. + * This is Wadler's "list of successes". Laziness means we explore depth-first + * and stop at the first complete parse, so we pay only for what we use. + * + * Two more things fall out of carrying a little structure on each result: + * - a *labeled* parse tree (a `Branch` with a role) — so an IPv4 address is + * ONE labeled span, not blue-octet/black-dot/blue-octet. Coloring, + * deprecation warnings and canonicalization all become free functions that + * walk this tree. + * - per-leaf *validity* flags, so the old "syntactic vs semantic" two passes + * collapse into one lenient parse: a token can match (so we can color / + * locate it) while still being flagged invalid. + * + * No IntelliJ types appear in this file on purpose: the engine stays pure + * Kotlin and speaks plain Int offsets, and the IntelliJ layer adapts later. + */ + +/** A semantic role attached to a labeled span. Drives coloring / warnings / rewrites. */ +enum class Role { + KEYWORD, // a fixed literal like "none" or the "~" inversion prefix + ADDRESS_FAMILY, // an AF_* name + WHITESPACE, +} + +/** A node in the parse tree produced for one specific input. */ +sealed interface Node { + val start: Int + val end: Int +} + +/** A terminal that matched the text `input[start until end]`. `valid` is the strict check. */ +data class Leaf( + override val start: Int, + override val end: Int, + val text: String, + val role: Role?, + val valid: Boolean, +) : Node + +/** A labeled grouping of child nodes — this is what gives the tree its shape. 
*/ +data class Branch( + val role: Role, + override val start: Int, + override val end: Int, + val children: List, +) : Node + +/** One successful way a matcher consumed input: it ended at `end` and produced `nodes`. */ +data class Parse(val end: Int, val nodes: List) + +/** A matcher is just a function from (input, offset) to every way it can match there. */ +fun interface Matcher { + /** Empty sequence == no match. Otherwise, one `Parse` per distinct way to match. */ + fun parse(input: String, offset: Int): Sequence +} + +// --------------------------------------------------------------------------- +// Terminals +// --------------------------------------------------------------------------- + +/** Matches an exact string, e.g. Lit("none") or Lit("~"). Always strictly valid when it matches. */ +class Lit(private val text: String, private val role: Role? = Role.KEYWORD) : Matcher { + override fun parse(input: String, offset: Int): Sequence = + if (input.startsWith(text, offset)) { + sequenceOf(Parse(offset + text.length, listOf(Leaf(offset, offset + text.length, text, role, valid = true)))) + } else { + emptySequence() + } +} + +/** + * Matches an identifier-shaped token loosely, then flags it valid only if it is an exact choice. + * This is the "lenient shape, strict membership" trick: a bad token like AF_BOGUS still matches + * (so we can highlight exactly it), but is reported invalid. + */ +class FlexibleChoice( + private val choices: Set, + private val role: Role? = null, + private val shape: Regex = Regex("[A-Za-z0-9_]+"), +) : Matcher { + constructor(vararg choices: String, role: Role? = null) : this(choices.toSet(), role) + + override fun parse(input: String, offset: Int): Sequence { + val m = shape.matchAt(input, offset) ?: return emptySequence() + val text = m.value + return sequenceOf(Parse(offset + text.length, listOf(Leaf(offset, offset + text.length, text, role, text in choices)))) + } +} + +/** Matches a maximal run of whitespace (a separator). 
*/ +object Whitespace : Matcher { + override fun parse(input: String, offset: Int): Sequence { + var end = offset + while (end < input.length && input[end].isWhitespace()) end++ + return if (end == offset) emptySequence() + else sequenceOf(Parse(end, listOf(Leaf(offset, end, input.substring(offset, end), Role.WHITESPACE, valid = true)))) + } +} + +// --------------------------------------------------------------------------- +// Combinators +// --------------------------------------------------------------------------- + +/** All parts in order. Threads each possibility of one part into the next (the cartesian product). */ +class Seq(private vararg val parts: Matcher) : Matcher { + override fun parse(input: String, offset: Int): Sequence { + var results = sequenceOf(Parse(offset, emptyList())) + for (part in parts) { + results = results.flatMap { acc -> + part.parse(input, acc.end).map { next -> Parse(next.end, acc.nodes + next.nodes) } + } + } + return results + } +} + +/** Any of the options. Yields *all* options' matches concatenated, so ordering no longer matters. */ +class Alt(private vararg val options: Matcher) : Matcher { + override fun parse(input: String, offset: Int): Sequence = + options.asSequence().flatMap { it.parse(input, offset) } +} + +/** Zero or one of `inner`. Offers both the empty match and `inner`'s matches. */ +class ZeroOrOne(private val inner: Matcher) : Matcher { + override fun parse(input: String, offset: Int): Sequence = + sequenceOf(Parse(offset, emptyList())) + inner.parse(input, offset) +} + +/** + * Zero or more of `inner`. Crucially this offers EVERY repetition count (0, 1, 2, ...), not just + * the greedy maximum — that is what makes matching complete. The `> from.end` guard keeps an + * inner matcher that can match empty from looping forever. 
+ */ +class ZeroOrMore(private val inner: Matcher) : Matcher { + override fun parse(input: String, offset: Int): Sequence { + fun extend(from: Parse): Sequence = sequence { + yield(from) // stop repeating here... + for (step in inner.parse(input, from.end)) { + if (step.end > from.end) { + yieldAll(extend(Parse(step.end, from.nodes + step.nodes))) // ...or take one more and recurse + } + } + } + return extend(Parse(offset, emptyList())) + } +} + +/** Wraps `inner` and collapses everything it matched into a single labeled `Branch`. */ +class Labeled(private val role: Role, private val inner: Matcher) : Matcher { + override fun parse(input: String, offset: Int): Sequence = + inner.parse(input, offset).map { p -> + val end = p.nodes.lastOrNull()?.end ?: offset + Parse(p.end, listOf(Branch(role, offset, end, p.nodes))) + } +} + +// --------------------------------------------------------------------------- +// Capabilities — free functions over the parse result, no combinator code +// --------------------------------------------------------------------------- + +/** Flatten a parse tree to its leaves, in source order. */ +fun Parse.leaves(): List = nodes.flatMap { it.leaves() } + +private fun Node.leaves(): List = when (this) { + is Leaf -> listOf(this) + is Branch -> children.flatMap { it.leaves() } +} + +/** The outcome of validating a value against a grammar. */ +sealed interface Outcome { + /** Some path consumed the whole input with every token strictly valid. */ + object Valid : Outcome + /** A path consumed the whole input, but a token is not strictly valid (well-formed but wrong). */ + data class SemanticError(val badToken: Leaf) : Outcome + /** No path consumed the whole input. `furthest` is how far we got (for error localization). */ + data class SyntaxError(val furthest: Int) : Outcome +} + +/** + * Validate `input` against `grammar`, requiring the whole string to be consumed. 
One lenient parse + * answers both questions: syntactic well-formedness (did any path reach the end?) and semantic + * validity (did any such path have only valid tokens?). + */ +fun validate(grammar: Matcher, input: String): Outcome { + var firstBad: Leaf? = null + for (p in grammar.parse(input, 0)) { + if (p.end != input.length) continue // not a full match — ignore for validity + val bad = p.leaves().firstOrNull { !it.valid } + if (bad == null) return Outcome.Valid // short-circuit on the first fully-valid full parse + if (firstBad == null) firstBad = bad + } + if (firstBad != null) return Outcome.SemanticError(firstBad) + // Nothing reached the end. Report the furthest offset any partial path reached. + val furthest = grammar.parse(input, 0).maxOfOrNull { it.end } ?: 0 + return Outcome.SyntaxError(furthest) +} diff --git a/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt b/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt new file mode 100644 index 0000000..c0140ec --- /dev/null +++ b/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt @@ -0,0 +1,111 @@ +package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar2 + +import org.junit.Assert.assertEquals +import org.junit.Assert.assertTrue +import org.junit.Test + +/** + * Behavioural tests for the grammar engine PoC (#467 step 2). + * + * The RestrictAddressFamilies= cases mirror the canary suite + * (ConfigParseAddressFamiliesOptionValueTest) so we can see the new engine reproduce the existing + * behaviour before wiring it into IntelliJ. + */ +class GrammarPocTest { + + // A representative slice of the AF_* set (enough for the canary cases). 
+ private val families = setOf( + "AF_INET", "AF_INET6", "AF_UNIX", "AF_NETLINK", "AF_PACKET", "AF_BRIDGE", + "AF_X25", "AF_AX25", "AF_LOCAL", "AF_DECnet", "AF_VSOCK", "AF_XDP", "AF_MCTP", + ) + + // none | [~] family (ws family)* — note: no EOF here; validate() requires full consumption. + private val family = FlexibleChoice(families, role = Role.ADDRESS_FAMILY) + private val restrictAddressFamilies: Matcher = Alt( + Lit("none"), + Seq(ZeroOrOne(Lit("~")), family, ZeroOrMore(Seq(Whitespace, family))), + ) + + private fun isValid(value: String) = validate(restrictAddressFamilies, value) == Outcome.Valid + + @Test + fun testValidValues() { + val valid = listOf( + "none", + "AF_INET", + "AF_INET AF_INET6", + "AF_UNIX AF_NETLINK", + "~AF_PACKET", + "~AF_INET AF_INET6", + "AF_BRIDGE AF_X25 AF_AX25", + // newer additions / aliases / mixed case + "AF_LOCAL", + "AF_DECnet", + "AF_VSOCK AF_XDP AF_MCTP", + "~AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_PACKET", + ) + for (v in valid) assertTrue("expected valid: '$v'", isValid(v)) + } + + @Test + fun testInvalidValues() { + val invalid = listOf( + "inet", "AF_inet", "AF_INET, AF_INET6", "~ AF_PACKET", "NONE", + "AF_BOGUS", "AF_INETZ", "AF_INET AF_MADEUP", "AF_DECNET", + ) + for (v in invalid) assertTrue("expected invalid: '$v'", !isValid(v)) + } + + @Test + fun testWellFormedButUnknownIsSemanticError() { + // AF_BOGUS matches the token shape (so we can point at it) but is not a real family. + val outcome = validate(restrictAddressFamilies, "AF_BOGUS") + assertTrue(outcome is Outcome.SemanticError) + val bad = (outcome as Outcome.SemanticError).badToken + assertEquals("AF_BOGUS", bad.text) + assertEquals(Role.ADDRESS_FAMILY, bad.role) + } + + @Test + fun testMalformedIsSyntaxErrorLocatedAtFurthestProgress() { + // After "AF_INET" the comma is neither whitespace nor a family, so the shape breaks at offset 7. 
+ val outcome = validate(restrictAddressFamilies, "AF_INET, AF_INET6") + assertTrue(outcome is Outcome.SyntaxError) + assertEquals(7, (outcome as Outcome.SyntaxError).furthest) + } + + @Test + fun testRolesAreAvailableForColoring() { + // The leaves of a successful parse already carry the roles a colorizer / annotator would use. + val parse = restrictAddressFamilies.parse("~AF_INET AF_INET6", 0).first { it.end == 17 } + val roles = parse.leaves().map { it.role } + assertEquals(listOf(Role.KEYWORD, Role.ADDRESS_FAMILY, Role.WHITESPACE, Role.ADDRESS_FAMILY), roles) + } + + @Test + fun testGreedyCaseTheOldEngineFails() { + // Seq(ZeroOrMore("a"), "a") on "aa": the old single-path greedy engine fails this because the + // star eats both a's. List-of-successes offers the shorter star match, so the trailing "a" fits. + val grammar = Seq(ZeroOrMore(Lit("a")), Lit("a")) + assertTrue(validate(grammar, "a") == Outcome.Valid) + assertTrue(validate(grammar, "aa") == Outcome.Valid) + assertTrue(validate(grammar, "aaa") == Outcome.Valid) + assertTrue(validate(grammar, "") != Outcome.Valid) // needs at least one "a" + assertTrue(validate(grammar, "ab") != Outcome.Valid) // trailing junk + } + + @Test + fun testLabeledProducesOneBranchSpanningItsChildren() { + // The mechanism behind "127.0.0.1 is ONE labeled span, not blue-octet/black-dot/...": + // Labeled collapses everything its inner matcher produced into a single Branch with a role. 
+ val grammar = Labeled(Role.ADDRESS_FAMILY, Seq(Lit("x", null), Lit("y", null))) + val full = grammar.parse("xy", 0).single { it.end == 2 } + + val branch = full.nodes.single() as Branch + assertEquals(Role.ADDRESS_FAMILY, branch.role) + assertEquals(0, branch.start) + assertEquals(2, branch.end) + assertEquals(2, branch.children.size) + assertEquals(2, full.leaves().size) // and leaves() still flattens the tree + } +} From c3b8f6d80fd2ddbd525cabfdc65e8562f16add93 Mon Sep 17 00:00:00 2001 From: Steve Ramage Date: Sun, 21 Jun 2026 09:28:05 -0700 Subject: [PATCH 2/2] refactor: implement parse() in-place on the real combinators, drop grammar2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the standalone grammar2 PoC with the same list-of-successes idea grown directly on the existing combinators, per review feedback: the parallel package had a jarringly different surface and implied a rewrite of all 225 grammars. Instead, add one new method — Combinator.parse(value, offset): Sequence — ALONGSIDE the existing SyntacticMatch/SemanticMatch, implemented on each of the 12 combinators next to its existing match logic. The caller (GrammarOptionValue) is untouched and still uses the old methods; nothing in the 225 grammar definitions changes. parse() can therefore be validated against the REAL production grammars before any migration decision. - Parse.kt: ParsedToken / Parse result types + validate() free function. One lenient pass folds the strict "semantic" check into a per-token `valid` flag, so it answers both syntactic (any full parse?) and semantic (a full parse with only valid tokens?) without two traversals. - Each combinator returns every way it can match, lazily; Alt offers all options (ordering no longer matters), ZeroOrMore/OneOrMore/Repeat offer every count. 
- ParseTest runs validate() against the actual ConfigParseAddressFamiliesOptionValue grammar and the real IPV6_ADDR combinator (15+ hand-ordered alternatives), an integer-range grammar, and the greedy Seq(ZeroOrMore("a"),"a") case — which it shows the old SemanticMatch still fails while parse() succeeds. Known limitation pinned in a test: SyntaxError `furthest` is best-effort and collapses to 0 when a trailing EOF() discards partial progress; precise localization needs the frontier/expected-set layer (the same machinery as completion, #343), deliberately out of scope here. Refs #467 #345 #343 #342 Co-Authored-By: Claude Opus 4.8 (1M context) --- .../grammar/AlternativeCombinator.kt | 4 + .../optionvalues/grammar/Combinator.kt | 11 + .../semanticdata/optionvalues/grammar/EOF.kt | 3 + .../grammar/FlexibleLiteralChoiceTerminal.kt | 9 + .../optionvalues/grammar/IntegerTerminal.kt | 8 + .../grammar/LiteralChoiceTerminal.kt | 6 + .../optionvalues/grammar/OneOrMore.kt | 11 + .../optionvalues/grammar/Parse.kt | 60 +++++ .../optionvalues/grammar/RegexTerminal.kt | 8 + .../optionvalues/grammar/Repeat.kt | 14 ++ .../grammar/SequenceCombinator.kt | 11 + .../grammar/WhitespaceTerminal.kt | 7 + .../optionvalues/grammar/ZeroOrMore.kt | 12 + .../optionvalues/grammar/ZeroOrOne.kt | 4 + .../optionvalues/grammar2/GrammarPoc.kt | 220 ------------------ .../optionvalues/grammar/ParseTest.kt | 97 ++++++++ .../optionvalues/grammar2/GrammarPocTest.kt | 111 --------- 17 files changed, 265 insertions(+), 331 deletions(-) create mode 100644 src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Parse.kt delete mode 100644 src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt create mode 100644 src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ParseTest.kt delete mode 100644 
src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/AlternativeCombinator.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/AlternativeCombinator.kt index 76320d7..cb10cec 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/AlternativeCombinator.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/AlternativeCombinator.kt @@ -38,6 +38,10 @@ open class AlternativeCombinator(vararg val tokens: Combinator) : Combinator { return match(value, offset, Combinator::SemanticMatch) } + override fun parse(value: String, offset: Int): Sequence = + // Offer every alternative's matches, so the order of options no longer affects correctness. + tokens.asSequence().flatMap { it.parse(value, offset) } + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Combinator.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Combinator.kt index 29757da..b9140ad 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Combinator.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Combinator.kt @@ -32,5 +32,16 @@ interface Combinator { */ fun SemanticMatch(value : String, offset: Int): MatchResult + /** + * List-of-successes matcher (#467). Returns EVERY way this combinator can consume [value] + * starting at [offset], lazily; an empty sequence means no match. 
+ * + * This lives alongside Syntactic/SemanticMatch and is a single lenient pass: each [ParsedToken] + * carries a `valid` flag for the strict (semantic) check. Because every alternative is offered + * rather than the first greedy one committed to, matching is complete — e.g. + * Seq(ZeroOrMore("a"), "a") on "aa" matches, because ZeroOrMore offers the shorter match too. + */ + fun parse(value: String, offset: Int): Sequence + fun toStringIndented(indent: Int): String } diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/EOF.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/EOF.kt index 770b4ef..aea72e8 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/EOF.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/EOF.kt @@ -17,6 +17,9 @@ class EOF : Combinator { } } + override fun parse(value: String, offset: Int): Sequence = + if (offset == value.length) sequenceOf(Parse(offset, emptyList())) else emptySequence() + override fun toStringIndented(indent: Int): String { return "EOF" } diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/FlexibleLiteralChoiceTerminal.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/FlexibleLiteralChoiceTerminal.kt index 4876bfd..c24012a 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/FlexibleLiteralChoiceTerminal.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/FlexibleLiteralChoiceTerminal.kt @@ -91,6 +91,15 @@ class FlexibleLiteralChoiceTerminal(vararg val choices: String) : TerminalCombin return NoMatch.copy(longestMatch = offset) } + override fun parse(value: String, offset: Int): Sequence { + // Lenient shape match (so a 
wrong token like AF_BOGUS still matches and can be highlighted), + // valid only if the matched text is one of the exact choices. + val m = syntaticMatch.matchAt(value, offset) ?: return emptySequence() + val text = m.value + val valid = choices.any { it == text } + return sequenceOf(Parse(offset + text.length, listOf(ParsedToken(offset, offset + text.length, text, this, valid)))) + } + override fun toString(): String { return if (choices.size == 1) { "Literal(\"${choices[0]}\")" diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/IntegerTerminal.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/IntegerTerminal.kt index 11864d7..744b5a3 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/IntegerTerminal.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/IntegerTerminal.kt @@ -29,6 +29,14 @@ class IntegerTerminal(private val minInclusive: Long,private val maxExclusive: L } } + override fun parse(value: String, offset: Int): Sequence { + val m = intRegex.matchAt(value, offset) ?: return emptySequence() + val text = m.value + // Lenient: any integer matches (so we can locate it); valid only if it is within range. 
+ val valid = text.toLongOrNull()?.let { it >= minInclusive && it < maxExclusive } ?: false + return sequenceOf(Parse(offset + text.length, listOf(ParsedToken(offset, offset + text.length, text, this, valid)))) + } + override fun toString(): String { return "Int($minInclusive,$maxExclusive)" } diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/LiteralChoiceTerminal.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/LiteralChoiceTerminal.kt index 870db3d..17d6fbf 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/LiteralChoiceTerminal.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/LiteralChoiceTerminal.kt @@ -24,6 +24,12 @@ class LiteralChoiceTerminal(vararg var choices: String) : TerminalCombinator { return match(value, offset) } + override fun parse(value: String, offset: Int): Sequence = + // Offer every choice that matches here (e.g. both ":" and "::"); each is always strictly valid. 
+ choices.asSequence() + .filter { value.startsWith(it, offset) } + .map { Parse(offset + it.length, listOf(ParsedToken(offset, offset + it.length, it, this, valid = true))) } + override fun toString(): String { return if (choices.size == 1) { "Literal(\"${choices[0]}\")" diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/OneOrMore.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/OneOrMore.kt index f56a0f1..fc999cd 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/OneOrMore.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/OneOrMore.kt @@ -40,6 +40,17 @@ class OneOrMore(val combinator : Combinator) : Combinator { return match(value, offset, combinator::SemanticMatch) } + override fun parse(value: String, offset: Int): Sequence { + // Same as ZeroOrMore, but the first repetition is mandatory (and must make progress). 
+ fun extend(from: Parse): Sequence = sequence { + yield(from) + for (step in combinator.parse(value, from.end)) { + if (step.end > from.end) yieldAll(extend(Parse(step.end, from.tokens + step.tokens))) + } + } + return combinator.parse(value, offset).filter { it.end > offset }.flatMap { extend(it) } + } + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Parse.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Parse.kt new file mode 100644 index 0000000..aa3c389 --- /dev/null +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Parse.kt @@ -0,0 +1,60 @@ +package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar + +/* + * List-of-successes matcher (GitHub #467, step 2). + * + * These types support a second matching method, `Combinator.parse()`, that lives ALONGSIDE the + * existing SyntacticMatch / SemanticMatch on every combinator. Nothing here is wired into + * GrammarOptionValue yet — the goal is to flesh the approach out on the real combinators and + * validate it against the real grammars in tests before deciding to migrate the caller. + * + * Where the existing engine returns ONE greedy result and runs two near-identical passes, parse() + * returns EVERY way a combinator can match (lazily), and folds the strict "semantic" check into a + * `valid` flag on each token. So one lenient pass answers both questions, and greedy traps like + * Seq(ZeroOrMore("a"), "a") on "aa" resolve themselves (see Combinator.parse docs). + */ + +/** A single terminal token, with the strict-validity verdict (the old "semantic" check) folded in. 
*/ +data class ParsedToken( + val start: Int, + val end: Int, + val text: String, + val terminal: TerminalCombinator, + val valid: Boolean, +) + +/** One way a combinator consumed input from some offset: it ended at [end], producing [tokens]. */ +data class Parse(val end: Int, val tokens: List) + +/** The outcome of validating a whole value against a grammar via parse(). */ +sealed interface ParseOutcome { + /** Some path consumed the whole value with every token strictly valid. */ + object Valid : ParseOutcome + + /** A path consumed the whole value, but a token is not strictly valid (well-formed but wrong). */ + data class SemanticError(val badToken: ParsedToken) : ParseOutcome + + /** No path consumed the whole value. [furthest] is how far any path got (for error localization). */ + data class SyntaxError(val furthest: Int) : ParseOutcome +} + +/** Every way [this] grammar can consume the entire [value]. */ +fun Combinator.fullParses(value: String): Sequence = + parse(value, 0).filter { it.end == value.length } + +/** + * One lenient parse answers both questions the old two passes did: + * - syntactic ("could be this, color it"): did any path consume the whole value? + * - semantic ("actually valid"): did any such path use only valid tokens? + */ +fun Combinator.validate(value: String): ParseOutcome { + var firstBad: ParsedToken? 
= null + for (p in fullParses(value)) { + val bad = p.tokens.firstOrNull { !it.valid } + if (bad == null) return ParseOutcome.Valid // short-circuit on the first fully-valid full parse + if (firstBad == null) firstBad = bad + } + if (firstBad != null) return ParseOutcome.SemanticError(firstBad) + val furthest = parse(value, 0).maxOfOrNull { it.end } ?: 0 + return ParseOutcome.SyntaxError(furthest) +} diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/RegexTerminal.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/RegexTerminal.kt index 2010e78..f31b854 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/RegexTerminal.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/RegexTerminal.kt @@ -18,6 +18,14 @@ class RegexTerminal(syntaticMatchStr : String, semanticMatchStr: String ) : Term return MatchResult(listOf(matchResult.value), offset + matchResult.value.length, listOf(this), offset + matchResult.value.length) } + override fun parse(value: String, offset: Int): Sequence { + // The syntactic regex gives the lenient span; valid iff the semantic regex matches that same span. 
+ val syn = syntaticMatch.matchAt(value, offset) ?: return emptySequence() + val text = syn.value + val valid = semanticMatch.matchAt(value, offset)?.value == text + return sequenceOf(Parse(offset + text.length, listOf(ParsedToken(offset, offset + text.length, text, this, valid)))) + } + override fun toString(): String { return "Regex(\"${semanticMatch.pattern}\")" } diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Repeat.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Repeat.kt index 869a577..3cec7fa 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Repeat.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/Repeat.kt @@ -62,6 +62,20 @@ class Repeat(val combinator : Combinator, val minInclusive: Int, val maxExclusiv return match(value, offset, combinator::SemanticMatch) } + override fun parse(value: String, offset: Int): Sequence { + // Offer every repetition count in [minInclusive, maxExclusive] (maxExclusive is the cap on the + // count, mirroring the existing match() loop). Yield only once enough repetitions have happened. 
+ fun extend(from: Parse, count: Int): Sequence = sequence { + if (count >= minInclusive) yield(from) + if (count < maxExclusive) { + for (step in combinator.parse(value, from.end)) { + if (step.end > from.end) yieldAll(extend(Parse(step.end, from.tokens + step.tokens), count + 1)) + } + } + } + return extend(Parse(offset, emptyList()), 0) + } + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/SequenceCombinator.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/SequenceCombinator.kt index ddab6a6..c959fd5 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/SequenceCombinator.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/SequenceCombinator.kt @@ -55,6 +55,17 @@ open class SequenceCombinator(vararg val tokens: Combinator) : Combinator { return MatchResult(resultTokens, index, resultTerminals, maxLength) } + override fun parse(value: String, offset: Int): Sequence { + // Thread each possibility of one part into the next: the cartesian product of the parts. 
+ var results = sequenceOf(Parse(offset, emptyList())) + for (token in tokens) { + results = results.flatMap { acc -> + token.parse(value, acc.end).map { next -> Parse(next.end, acc.tokens + next.tokens) } + } + } + return results + } + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/WhitespaceTerminal.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/WhitespaceTerminal.kt index 180e5eb..cf9be72 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/WhitespaceTerminal.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/WhitespaceTerminal.kt @@ -27,6 +27,13 @@ class WhitespaceTerminal : TerminalCombinator { return match(value, offset) } + override fun parse(value: String, offset: Int): Sequence { + var end = offset + while (end < value.length && value[end].isWhitespace()) end++ + return if (end == offset) emptySequence() + else sequenceOf(Parse(end, listOf(ParsedToken(offset, end, value.substring(offset, end), this, valid = true)))) + } + override fun toString(): String { return "\\s+" } diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrMore.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrMore.kt index beeced1..0b68640 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrMore.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrMore.kt @@ -42,6 +42,18 @@ class ZeroOrMore(val combinator : Combinator) : Combinator { return match(value, offset, combinator::SemanticMatch) } + override fun parse(value: String, offset: Int): Sequence { 
+ // Offer EVERY repetition count (0, 1, 2, ...), not just the greedy maximum. The `> from.end` + // guard keeps an inner matcher that can match empty from looping forever. + fun extend(from: Parse): Sequence = sequence { + yield(from) // stop repeating here... + for (step in combinator.parse(value, from.end)) { + if (step.end > from.end) yieldAll(extend(Parse(step.end, from.tokens + step.tokens))) // ...or take one more + } + } + return extend(Parse(offset, emptyList())) + } + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrOne.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrOne.kt index 1751b24..279026b 100644 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrOne.kt +++ b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ZeroOrOne.kt @@ -40,6 +40,10 @@ class ZeroOrOne(val combinator : Combinator) : Combinator { return match(value, offset, combinator::SemanticMatch) } + override fun parse(value: String, offset: Int): Sequence = + // Both the empty match and whatever the inner matcher offers. 
+ sequenceOf(Parse(offset, emptyList())) + combinator.parse(value, offset) + override fun toString(): String = toStringIndented(0) override fun toStringIndented(indent: Int): String { diff --git a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt b/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt deleted file mode 100644 index 0d0236b..0000000 --- a/src/main/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPoc.kt +++ /dev/null @@ -1,220 +0,0 @@ -package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar2 - -/* - * ============================================================================ - * Grammar engine PoC (GitHub #467, step 2) - * ============================================================================ - * - * This is a parallel, self-contained proof of concept for a new option-value - * grammar engine. It does NOT replace the existing `..optionvalues.grammar` - * package yet; it sits beside it so we can play with the idea and validate it - * against the existing behavioural tests before committing. - * - * THE ONE IDEA - * ------------ - * Today every combinator returns *one* answer (greedy, first match, no - * backtracking), so `Seq(ZeroOrMore("a"), "a")` fails to match "aa": the star - * eats both a's and the trailing "a" has nothing left. - * - * Here every matcher instead returns *all the ways it could match*, lazily: - * - * fun parse(input, offset): Sequence - * - * `Seq` then threads each possibility of one part into the next, and a value is - * valid if *any* path consumes the whole string. The star "gives back" an `a` - * for free, simply because it OFFERED the shorter match as one of its results. - * This is Wadler's "list of successes". Laziness means we explore depth-first - * and stop at the first complete parse, so we pay only for what we use. 
- * - * Two more things fall out of carrying a little structure on each result: - * - a *labeled* parse tree (a `Branch` with a role) — so an IPv4 address is - * ONE labeled span, not blue-octet/black-dot/blue-octet. Coloring, - * deprecation warnings and canonicalization all become free functions that - * walk this tree. - * - per-leaf *validity* flags, so the old "syntactic vs semantic" two passes - * collapse into one lenient parse: a token can match (so we can color / - * locate it) while still being flagged invalid. - * - * No IntelliJ types appear in this file on purpose: the engine stays pure - * Kotlin and speaks plain Int offsets, and the IntelliJ layer adapts later. - */ - -/** A semantic role attached to a labeled span. Drives coloring / warnings / rewrites. */ -enum class Role { - KEYWORD, // a fixed literal like "none" or the "~" inversion prefix - ADDRESS_FAMILY, // an AF_* name - WHITESPACE, -} - -/** A node in the parse tree produced for one specific input. */ -sealed interface Node { - val start: Int - val end: Int -} - -/** A terminal that matched the text `input[start until end]`. `valid` is the strict check. */ -data class Leaf( - override val start: Int, - override val end: Int, - val text: String, - val role: Role?, - val valid: Boolean, -) : Node - -/** A labeled grouping of child nodes — this is what gives the tree its shape. */ -data class Branch( - val role: Role, - override val start: Int, - override val end: Int, - val children: List, -) : Node - -/** One successful way a matcher consumed input: it ended at `end` and produced `nodes`. */ -data class Parse(val end: Int, val nodes: List) - -/** A matcher is just a function from (input, offset) to every way it can match there. */ -fun interface Matcher { - /** Empty sequence == no match. Otherwise, one `Parse` per distinct way to match. 
*/ - fun parse(input: String, offset: Int): Sequence -} - -// --------------------------------------------------------------------------- -// Terminals -// --------------------------------------------------------------------------- - -/** Matches an exact string, e.g. Lit("none") or Lit("~"). Always strictly valid when it matches. */ -class Lit(private val text: String, private val role: Role? = Role.KEYWORD) : Matcher { - override fun parse(input: String, offset: Int): Sequence = - if (input.startsWith(text, offset)) { - sequenceOf(Parse(offset + text.length, listOf(Leaf(offset, offset + text.length, text, role, valid = true)))) - } else { - emptySequence() - } -} - -/** - * Matches an identifier-shaped token loosely, then flags it valid only if it is an exact choice. - * This is the "lenient shape, strict membership" trick: a bad token like AF_BOGUS still matches - * (so we can highlight exactly it), but is reported invalid. - */ -class FlexibleChoice( - private val choices: Set, - private val role: Role? = null, - private val shape: Regex = Regex("[A-Za-z0-9_]+"), -) : Matcher { - constructor(vararg choices: String, role: Role? = null) : this(choices.toSet(), role) - - override fun parse(input: String, offset: Int): Sequence { - val m = shape.matchAt(input, offset) ?: return emptySequence() - val text = m.value - return sequenceOf(Parse(offset + text.length, listOf(Leaf(offset, offset + text.length, text, role, text in choices)))) - } -} - -/** Matches a maximal run of whitespace (a separator). 
*/ -object Whitespace : Matcher { - override fun parse(input: String, offset: Int): Sequence { - var end = offset - while (end < input.length && input[end].isWhitespace()) end++ - return if (end == offset) emptySequence() - else sequenceOf(Parse(end, listOf(Leaf(offset, end, input.substring(offset, end), Role.WHITESPACE, valid = true)))) - } -} - -// --------------------------------------------------------------------------- -// Combinators -// --------------------------------------------------------------------------- - -/** All parts in order. Threads each possibility of one part into the next (the cartesian product). */ -class Seq(private vararg val parts: Matcher) : Matcher { - override fun parse(input: String, offset: Int): Sequence { - var results = sequenceOf(Parse(offset, emptyList())) - for (part in parts) { - results = results.flatMap { acc -> - part.parse(input, acc.end).map { next -> Parse(next.end, acc.nodes + next.nodes) } - } - } - return results - } -} - -/** Any of the options. Yields *all* options' matches concatenated, so ordering no longer matters. */ -class Alt(private vararg val options: Matcher) : Matcher { - override fun parse(input: String, offset: Int): Sequence = - options.asSequence().flatMap { it.parse(input, offset) } -} - -/** Zero or one of `inner`. Offers both the empty match and `inner`'s matches. */ -class ZeroOrOne(private val inner: Matcher) : Matcher { - override fun parse(input: String, offset: Int): Sequence = - sequenceOf(Parse(offset, emptyList())) + inner.parse(input, offset) -} - -/** - * Zero or more of `inner`. Crucially this offers EVERY repetition count (0, 1, 2, ...), not just - * the greedy maximum — that is what makes matching complete. The `> from.end` guard keeps an - * inner matcher that can match empty from looping forever. 
- */ -class ZeroOrMore(private val inner: Matcher) : Matcher { - override fun parse(input: String, offset: Int): Sequence { - fun extend(from: Parse): Sequence = sequence { - yield(from) // stop repeating here... - for (step in inner.parse(input, from.end)) { - if (step.end > from.end) { - yieldAll(extend(Parse(step.end, from.nodes + step.nodes))) // ...or take one more and recurse - } - } - } - return extend(Parse(offset, emptyList())) - } -} - -/** Wraps `inner` and collapses everything it matched into a single labeled `Branch`. */ -class Labeled(private val role: Role, private val inner: Matcher) : Matcher { - override fun parse(input: String, offset: Int): Sequence = - inner.parse(input, offset).map { p -> - val end = p.nodes.lastOrNull()?.end ?: offset - Parse(p.end, listOf(Branch(role, offset, end, p.nodes))) - } -} - -// --------------------------------------------------------------------------- -// Capabilities — free functions over the parse result, no combinator code -// --------------------------------------------------------------------------- - -/** Flatten a parse tree to its leaves, in source order. */ -fun Parse.leaves(): List = nodes.flatMap { it.leaves() } - -private fun Node.leaves(): List = when (this) { - is Leaf -> listOf(this) - is Branch -> children.flatMap { it.leaves() } -} - -/** The outcome of validating a value against a grammar. */ -sealed interface Outcome { - /** Some path consumed the whole input with every token strictly valid. */ - object Valid : Outcome - /** A path consumed the whole input, but a token is not strictly valid (well-formed but wrong). */ - data class SemanticError(val badToken: Leaf) : Outcome - /** No path consumed the whole input. `furthest` is how far we got (for error localization). */ - data class SyntaxError(val furthest: Int) : Outcome -} - -/** - * Validate `input` against `grammar`, requiring the whole string to be consumed. 
One lenient parse - * answers both questions: syntactic well-formedness (did any path reach the end?) and semantic - * validity (did any such path have only valid tokens?). - */ -fun validate(grammar: Matcher, input: String): Outcome { - var firstBad: Leaf? = null - for (p in grammar.parse(input, 0)) { - if (p.end != input.length) continue // not a full match — ignore for validity - val bad = p.leaves().firstOrNull { !it.valid } - if (bad == null) return Outcome.Valid // short-circuit on the first fully-valid full parse - if (firstBad == null) firstBad = bad - } - if (firstBad != null) return Outcome.SemanticError(firstBad) - // Nothing reached the end. Report the furthest offset any partial path reached. - val furthest = grammar.parse(input, 0).maxOfOrNull { it.end } ?: 0 - return Outcome.SyntaxError(furthest) -} diff --git a/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ParseTest.kt b/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ParseTest.kt new file mode 100644 index 0000000..8e91dab --- /dev/null +++ b/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar/ParseTest.kt @@ -0,0 +1,97 @@ +package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar + +import net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.ai.ConfigParseAddressFamiliesOptionValue +import org.junit.Assert.assertEquals +import org.junit.Assert.assertTrue +import org.junit.Test + +/** + * Tests for the list-of-successes matcher `Combinator.parse()` (#467 step 2). + * + * The point of these tests is that parse()/validate() run against the EXISTING combinator classes + * and the REAL production grammars — nothing in the 200+ grammar definitions changed. We just grew + * a second matching method on the same combinators. 
+ */ +class ParseTest { + + private fun isValid(grammar: Combinator, value: String) = grammar.validate(value) == ParseOutcome.Valid + + @Test + fun testRealAddressFamiliesGrammarValidates() { + // The actual production grammar, unchanged — pulled straight off the validator. + val grammar = ConfigParseAddressFamiliesOptionValue().combinator + + val valid = listOf( + "none", "AF_INET", "AF_INET AF_INET6", "AF_UNIX AF_NETLINK", "~AF_PACKET", + "~AF_INET AF_INET6", "AF_BRIDGE AF_X25 AF_AX25", "AF_LOCAL", "AF_DECnet", + "AF_VSOCK AF_XDP AF_MCTP", "~AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_PACKET", + ) + val invalid = listOf( + "inet", "AF_inet", "AF_INET, AF_INET6", "~ AF_PACKET", "NONE", + "AF_BOGUS", "AF_INETZ", "AF_INET AF_MADEUP", "AF_DECNET", + ) + for (v in valid) assertTrue("expected valid: '$v'", isValid(grammar, v)) + for (v in invalid) assertTrue("expected invalid: '$v'", !isValid(grammar, v)) + } + + @Test + fun testAddressFamiliesErrorKinds() { + val grammar = ConfigParseAddressFamiliesOptionValue().combinator + + // Well-formed shape, unknown name -> semantic error pointing at the bad token. + val semantic = grammar.validate("AF_BOGUS") + assertTrue(semantic is ParseOutcome.SemanticError) + assertEquals("AF_BOGUS", (semantic as ParseOutcome.SemanticError).badToken.text) + + // Comma breaks the shape after "AF_INET" -> syntax error (malformed, not just an unknown name). + val syntax = grammar.validate("AF_INET, AF_INET6") + assertTrue(syntax is ParseOutcome.SyntaxError) + // KNOWN LIMITATION: `furthest` is best-effort. This grammar ends in EOF(), so the outer + // Seq(..., EOF()) drops the partial "AF_INET" path when EOF fails, and furthest collapses to 0 + // (we'd want 7). Precise localization needs the frontier/expected-set layer — the same machinery + // that powers completion (#343) — which is deliberately not in this step. Pinned to document it. 
+ assertEquals(0, (syntax as ParseOutcome.SyntaxError).furthest) + } + + @Test + fun testRealIpv6GrammarValidates() { + // IPV6_ADDR is the real, hand-ordered Alt of 15+ forms in Combinators.kt. The old engine needed + // that careful ordering to avoid greedy traps; parse() explores all forms, so it just works. + val grammar = SequenceCombinator(IPV6_ADDR, EOF()) + + val valid = listOf("::", "::1", "fe80::1", "2001:db8::1", "1:2:3:4:5:6:7:8", "::ffff:192.168.0.1") + val invalid = listOf("2001:db8:::1", "1:2:3:4:5:6:7:8:9", "gggg::1", "") + for (v in valid) assertTrue("expected valid IPv6: '$v'", isValid(grammar, v)) + for (v in invalid) assertTrue("expected invalid IPv6: '$v'", !isValid(grammar, v)) + } + + @Test + fun testIntegerRangeGrammar() { + // Equivalent to the config_parse_ip_port grammar: a port in [0, 65536). + val grammar = SequenceCombinator(IntegerTerminal(0, 65536), EOF()) + assertTrue(isValid(grammar, "0")) + assertTrue(isValid(grammar, "65535")) + assertTrue(!isValid(grammar, "65536")) // out of range -> well-formed but invalid + assertTrue(!isValid(grammar, "-1")) + assertTrue(!isValid(grammar, "80x")) + + assertTrue(grammar.validate("65536") is ParseOutcome.SemanticError) // int matched, range failed + } + + @Test + fun testGreedyCaseTheOldEngineFails() { + // Built from the SAME combinator classes the old engine uses. Seq(ZeroOrMore("a"), "a") on "aa" + // fails under SyntacticMatch/SemanticMatch (the star eats both a's) but succeeds under parse(). + val grammar = SequenceCombinator(ZeroOrMore(LiteralChoiceTerminal("a")), LiteralChoiceTerminal("a"), EOF()) + + assertTrue(isValid(grammar, "a")) + assertTrue(isValid(grammar, "aa")) + assertTrue(isValid(grammar, "aaa")) + assertTrue(!isValid(grammar, "")) // needs at least one "a" + assertTrue(!isValid(grammar, "ab")) // trailing junk + + // Demonstrate the old engine really does fail "aa" (documents the difference, not just asserts ours). 
+ val oldEngineFullMatch = grammar.SemanticMatch("aa", 0).matchResult + assertEquals(-1, oldEngineFullMatch) + } +} diff --git a/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt b/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt deleted file mode 100644 index c0140ec..0000000 --- a/src/test/kotlin/net/sjrx/intellij/plugins/systemdunitfiles/semanticdata/optionvalues/grammar2/GrammarPocTest.kt +++ /dev/null @@ -1,111 +0,0 @@ -package net.sjrx.intellij.plugins.systemdunitfiles.semanticdata.optionvalues.grammar2 - -import org.junit.Assert.assertEquals -import org.junit.Assert.assertTrue -import org.junit.Test - -/** - * Behavioural tests for the grammar engine PoC (#467 step 2). - * - * The RestrictAddressFamilies= cases mirror the canary suite - * (ConfigParseAddressFamiliesOptionValueTest) so we can see the new engine reproduce the existing - * behaviour before wiring it into IntelliJ. - */ -class GrammarPocTest { - - // A representative slice of the AF_* set (enough for the canary cases). - private val families = setOf( - "AF_INET", "AF_INET6", "AF_UNIX", "AF_NETLINK", "AF_PACKET", "AF_BRIDGE", - "AF_X25", "AF_AX25", "AF_LOCAL", "AF_DECnet", "AF_VSOCK", "AF_XDP", "AF_MCTP", - ) - - // none | [~] family (ws family)* — note: no EOF here; validate() requires full consumption. 
- private val family = FlexibleChoice(families, role = Role.ADDRESS_FAMILY) - private val restrictAddressFamilies: Matcher = Alt( - Lit("none"), - Seq(ZeroOrOne(Lit("~")), family, ZeroOrMore(Seq(Whitespace, family))), - ) - - private fun isValid(value: String) = validate(restrictAddressFamilies, value) == Outcome.Valid - - @Test - fun testValidValues() { - val valid = listOf( - "none", - "AF_INET", - "AF_INET AF_INET6", - "AF_UNIX AF_NETLINK", - "~AF_PACKET", - "~AF_INET AF_INET6", - "AF_BRIDGE AF_X25 AF_AX25", - // newer additions / aliases / mixed case - "AF_LOCAL", - "AF_DECnet", - "AF_VSOCK AF_XDP AF_MCTP", - "~AF_UNIX AF_INET AF_INET6 AF_NETLINK AF_PACKET", - ) - for (v in valid) assertTrue("expected valid: '$v'", isValid(v)) - } - - @Test - fun testInvalidValues() { - val invalid = listOf( - "inet", "AF_inet", "AF_INET, AF_INET6", "~ AF_PACKET", "NONE", - "AF_BOGUS", "AF_INETZ", "AF_INET AF_MADEUP", "AF_DECNET", - ) - for (v in invalid) assertTrue("expected invalid: '$v'", !isValid(v)) - } - - @Test - fun testWellFormedButUnknownIsSemanticError() { - // AF_BOGUS matches the token shape (so we can point at it) but is not a real family. - val outcome = validate(restrictAddressFamilies, "AF_BOGUS") - assertTrue(outcome is Outcome.SemanticError) - val bad = (outcome as Outcome.SemanticError).badToken - assertEquals("AF_BOGUS", bad.text) - assertEquals(Role.ADDRESS_FAMILY, bad.role) - } - - @Test - fun testMalformedIsSyntaxErrorLocatedAtFurthestProgress() { - // After "AF_INET" the comma is neither whitespace nor a family, so the shape breaks at offset 7. - val outcome = validate(restrictAddressFamilies, "AF_INET, AF_INET6") - assertTrue(outcome is Outcome.SyntaxError) - assertEquals(7, (outcome as Outcome.SyntaxError).furthest) - } - - @Test - fun testRolesAreAvailableForColoring() { - // The leaves of a successful parse already carry the roles a colorizer / annotator would use. 
- val parse = restrictAddressFamilies.parse("~AF_INET AF_INET6", 0).first { it.end == 17 } - val roles = parse.leaves().map { it.role } - assertEquals(listOf(Role.KEYWORD, Role.ADDRESS_FAMILY, Role.WHITESPACE, Role.ADDRESS_FAMILY), roles) - } - - @Test - fun testGreedyCaseTheOldEngineFails() { - // Seq(ZeroOrMore("a"), "a") on "aa": the old single-path greedy engine fails this because the - // star eats both a's. List-of-successes offers the shorter star match, so the trailing "a" fits. - val grammar = Seq(ZeroOrMore(Lit("a")), Lit("a")) - assertTrue(validate(grammar, "a") == Outcome.Valid) - assertTrue(validate(grammar, "aa") == Outcome.Valid) - assertTrue(validate(grammar, "aaa") == Outcome.Valid) - assertTrue(validate(grammar, "") != Outcome.Valid) // needs at least one "a" - assertTrue(validate(grammar, "ab") != Outcome.Valid) // trailing junk - } - - @Test - fun testLabeledProducesOneBranchSpanningItsChildren() { - // The mechanism behind "127.0.0.1 is ONE labeled span, not blue-octet/black-dot/...": - // Labeled collapses everything its inner matcher produced into a single Branch with a role. - val grammar = Labeled(Role.ADDRESS_FAMILY, Seq(Lit("x", null), Lit("y", null))) - val full = grammar.parse("xy", 0).single { it.end == 2 } - - val branch = full.nodes.single() as Branch - assertEquals(Role.ADDRESS_FAMILY, branch.role) - assertEquals(0, branch.start) - assertEquals(2, branch.end) - assertEquals(2, branch.children.size) - assertEquals(2, full.leaves().size) // and leaves() still flattens the tree - } -}