johnsoncodehk · johnsoncodehk · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026 · Jun 20, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -51,6 +51,64 @@ jobs:
       - name: Test
         run: npm run check
 
+  # Engine-parity BREADTH guard. The `test` job already runs the three parity gates
+  # (emit-parser-verify / emit-reject-messages / emit-lexer-verify) on the self-contained
+  # in-repo corpus — that is the standing mechanism that forces a gen-parser change to
+  # propagate to emit-parser. This job adds the full external TS corpus for breadth, so a
+  # divergence on some construct the in-repo corpus does not exercise still gets caught.
+  # Gated on parser/grammar changes (like the treesitter job) so it doesn't clone the
+  # corpus on doc-only pushes; schedule / workflow_dispatch force the full run.
+  emit-parity:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v5
+        with:
+          fetch-depth: 0   # need history to diff against the base for the path gate below
+
+      - name: Did the parser/grammar inputs change?
+        id: changed
+        run: |
+          if [ "${{ github.event_name }}" != "push" ] && [ "${{ github.event_name }}" != "pull_request" ]; then
+            echo "value=true" >> "$GITHUB_OUTPUT"; echo "forced full run (${{ github.event_name }})"; exit 0
+          fi
+          if [ "${{ github.event_name }}" = "pull_request" ]; then base="${{ github.event.pull_request.base.sha }}"; else base="${{ github.event.before }}"; fi
+          if [ -z "$base" ] || ! git cat-file -e "$base^{commit}" 2>/dev/null; then
+            echo "value=true" >> "$GITHUB_OUTPUT"; echo "no usable base — running the gate"; exit 0
+          fi
+          if git diff --name-only "$base" HEAD | grep -qE '^src/|^[^/]+\.ts$|^test/emit-'; then
+            echo "value=true" >> "$GITHUB_OUTPUT"; echo "parser/grammar changed — running the breadth gate"
+          else
+            echo "value=false" >> "$GITHUB_OUTPUT"; echo "no parser/grammar change — skipping the corpus clone"
+          fi
+
+      - uses: actions/setup-node@v4
+        if: steps.changed.outputs.value == 'true'
+        with:
+          node-version: 24
+      - if: steps.changed.outputs.value == 'true'
+        run: npm ci
+
+      # Pinned-SHA, shallow, sparse clone of the TS conformance corpus to the fixed path the
+      # parity gates auto-detect (same pin + technique as the readme-bench workflow).
+      - name: Clone the pinned TS corpus
+        if: steps.changed.outputs.value == 'true'
+        run: |
+          set -euo pipefail
+          rm -rf /tmp/ts-repo; mkdir -p /tmp/ts-repo
+          git -C /tmp/ts-repo init -q
+          git -C /tmp/ts-repo remote add origin https://github.com/microsoft/TypeScript
+          git -C /tmp/ts-repo config core.sparseCheckout true
+          printf 'tests/cases/\n' > /tmp/ts-repo/.git/info/sparse-checkout
+          git -C /tmp/ts-repo fetch -q --depth 1 --filter=blob:none origin 6fbce89821d93a5b761581d9ac540455f38e9acb
+          git -C /tmp/ts-repo checkout -q FETCH_HEAD
+
+      - name: Engine-parity over the full corpus
+        if: steps.changed.outputs.value == 'true'
+        run: |
+          node test/emit-parser-verify.ts all
+          node test/emit-reject-messages.ts
+          node test/emit-lexer-verify.ts
+
   # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR
   # parser from the same grammar, beating the official hand-written one). Build its
   # wasm and gate the accuracy so the 95.9% is verified, not just claimed. The

diff --git a/TOTAL-PARSING.md b/TOTAL-PARSING.md
@@ -228,5 +228,10 @@ first-error agreement 57.5%.
   determinism on an invalid corpus, a char-by-char typing session, and
   exact-match diagnostic pins (synthesis quality must not silently regress to
   absorption).
-- `test/emit-parser-verify.ts` / `test/emit-lexer-verify.ts` — emitted runtime
-  ≡ interpreter on the corpus, token streams and error messages included.
+- `test/emit-parser-verify.ts` / `test/emit-reject-messages.ts` /
+  `test/emit-lexer-verify.ts` — the emitted runtime ≡ the interpreter (CST,
+  token streams, and reject messages). They run on a self-contained in-repo corpus
+  (`test/emit-corpus.ts`: curated snippets + the repo's own sources), so they are
+  part of `npm run check` on every machine — the mechanism that forces a
+  gen-parser change to propagate to emit-parser. The CI `emit-parity` job adds the
+  full external TS corpus for breadth.
diff --git a/src/emit-lexer.ts b/src/emit-lexer.ts
@@ -28,6 +28,23 @@ export interface LexerSymtab {
 
 const J = (v: unknown) => JSON.stringify(v);
 
+// Builds the resync-retract statement, prefixed with `indent`. It is emitted at two points in the
+// relex loop (mid-loop and the post-loop EOF check); one producer keeps them from drifting (#45 B3).
+const resyncRetractLine = (indent: string): string =>
+  `${indent}if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`;
+
+// Source of the emitted helper `lxNonAsciiWs(cc)`: the non-ASCII members of JS \s (the /u-free
+// set), baked as a charCode test so a non-whitespace cc>127 (e.g. a Unicode identifier char)
+// skips the LX_WS regex entirely. The regex `/\s+/y` matches at pos iff the lead char is \s, and
+// ASCII \s is handled by the char loop, so `cc>127 && lxNonAsciiWs(cc)` is EXACTLY "the regex
+// would match here" → byte-identical, minus the wasted exec on the non-whitespace case (#45 B4).
+const NON_ASCII_WS_FN =
+  `function lxNonAsciiWs(cc) { return cc === 0xa0 || cc === 0x1680 || (cc >= 0x2000 && cc <= 0x200a) || cc === 0x2028 || cc === 0x2029 || cc === 0x202f || cc === 0x205f || cc === 0x3000 || cc === 0xfeff; }`;
+// Builds the non-ASCII whitespace fallback for the emitted char-code variable `v`, at `indent`.
+// Two sites use it (after an ASCII run, and as the lead char); `cont` appends the lead-char `continue`.
+const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string =>
+  `${indent}if (${v} > 127 && lxNonAsciiWs(${v})) { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;${cont ? ' continue;' : ''} } }`;
+
 export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   // Out of scope: the markup / indentation / newline state machines.
   if (grammar.markup || grammar.indent || grammar.newline) return null;
@@ -103,6 +120,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   emit(`// ── Emitted lexer (emit-lexer.ts): specialized tokenize for this grammar ──`);
   for (const m of matchers) emit(`const ${m.re} = new RegExp(${J(`(?:${m.pattern})`)}, ${J(m.flags)});`);
   emit(`const LX_WS = /\\s+/y;`);
+  emit(NON_ASCII_WS_FN);
   emit(`// window-truncation retry: a matcher failing at the WINDOW edge is not a lex`);
   emit(`// error — the caller re-materializes a larger window (truncation cannot fake a`);
   emit(`// resync: suffix-zone equality makes a cut token's END mismatch the old one)`);
@@ -248,6 +266,13 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   emit(`  lexCore(source, 0, -1, 0, -1, 0, 0);`);
   emit(`  return tokN;`);
   emit(`}`);
+  // Verification of the WINDOWED path (issue #45 B2): emit-lexer-verify only exercises a FULL
+  // lex (emit ≡ createLexer), and gen-lexer has no windowed counterpart to diff against — but the
+  // windowed re-lex IS independently checked at the tree level. incremental-verify / exhaustive-
+  // edits compare an edited parse (whose tokens come from this windowed re-lex) to a FRESH FULL
+  // parse of the same text, byte-identical: a wrong windowed token would change the tree (or its
+  // newlineBefore/commentBefore-driven shape) and fail there. So the oracle is the fresh full
+  // parse, applied transitively through the parser.
   emit(`// The lexer core, parameterized for WINDOWED re-lexing: start at startPos with`);
   emit(`// the previous token's (k, t) as the regex-context seed (-1 = none / file start)`);
   emit(`// and EMPTY template/paren stacks (the caller restarts only at depth-0 safe`);
@@ -359,7 +384,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   emit(`    // resync retracts the duplicated token push — and any lexer diagnostics
     // emitted FOR it (the old stream's persisted entry survives via the shift;
     // keeping the window's copy too double-reports the same character)`);
-  emit(`    if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`);
+  emit(resyncRetractLine('    '));
   emit(`    const cc = source.charCodeAt(pos);`);
   emit(`    // whitespace: ASCII \\s run by char loop; a non-ASCII candidate falls back to the regex`);
   emit(`    if (cc === 32 || (cc >= 9 && cc <= 13)) {`);
@@ -369,18 +394,10 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   emit(`        pos++;`);
   emit(`        wc = source.charCodeAt(pos);`);
   emit(`      } while (wc === 32 || (wc >= 9 && wc <= 13));`);
-  emit(`      if (wc > 127) {`);
-  emit(`        LX_WS.lastIndex = pos;`);
-  emit(`        const m = LX_WS.exec(source);`);
-  emit(`        if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length; }`);
-  emit(`      }`);
+  emit(`${nonAsciiWsConsume('wc', false, '      ')}`);
   emit(`      continue;`);
   emit(`    }`);
-  emit(`    if (cc > 127) {`);
-  emit(`      LX_WS.lastIndex = pos;`);
-  emit(`      const m = LX_WS.exec(source);`);
-  emit(`      if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length; continue; }`);
-  emit(`    }`);
+  emit(`${nonAsciiWsConsume('cc', true, '    ')}`);
   if (templateToken) {
     const tplCloseT = kwFirstCcs.has(tplInterpClose.charCodeAt(0)) ? 'lexKwT(source, startPos, r.end)' : '0';
     const tplOpenT = kwFirstCcs.has(tplOpen.charCodeAt(0)) ? 'lexKwT(source, startPos, r.end)' : '0';
@@ -610,7 +627,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
   emit(`    }`);
   emit(`    throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`);
   emit(`  }`);
-  emit(`  if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`);
+  emit(resyncRetractLine('  '));
   emit(`  return hasMore ? -2 : -1;`);
   emit(`}`);
   emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`);