Skip to content
Merged
58 changes: 58 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,64 @@ jobs:
- name: Test
run: npm run check

# Engine-parity BREADTH guard. The `test` job already runs the three parity gates
# (emit-parser-verify / emit-reject-messages / emit-lexer-verify) on the self-contained
# in-repo corpus (no external checkout needed) — that is the standing mechanism that forces a gen-parser change to
# propagate to emit-parser. This job adds the full external TS corpus for breadth, so a
# divergence on some construct the in-repo corpus does not exercise still gets caught.
# Gated on parser/grammar changes (like the treesitter job) so it doesn't clone the
# corpus on doc-only pushes; schedule / workflow_dispatch force the full run.
emit-parity:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v5
with:
fetch-depth: 0 # need history to diff against the base for the path gate below

# Path gate. Writes `value=true|false` to this step's output, which every later step checks:
#   - event is neither push nor pull_request        -> true (forced full run)
#   - base SHA is empty or not a commit in this clone -> true (fail open: run the gate)
#   - diff base..HEAD touches src/, a root-level *.ts file, or test/emit-* -> true
#   - otherwise                                       -> false (all later steps are skipped)
# The base is the PR's base SHA for pull_request events, else the push event's `before` SHA.
- name: Did the parser/grammar inputs change?
id: changed
run: |
if [ "${{ github.event_name }}" != "push" ] && [ "${{ github.event_name }}" != "pull_request" ]; then
echo "value=true" >> "$GITHUB_OUTPUT"; echo "forced full run (${{ github.event_name }})"; exit 0
fi
if [ "${{ github.event_name }}" = "pull_request" ]; then base="${{ github.event.pull_request.base.sha }}"; else base="${{ github.event.before }}"; fi
if [ -z "$base" ] || ! git cat-file -e "$base^{commit}" 2>/dev/null; then
echo "value=true" >> "$GITHUB_OUTPUT"; echo "no usable base — running the gate"; exit 0
fi
if git diff --name-only "$base" HEAD | grep -qE '^src/|^[^/]+\.ts$|^test/emit-'; then
echo "value=true" >> "$GITHUB_OUTPUT"; echo "parser/grammar changed — running the breadth gate"
else
echo "value=false" >> "$GITHUB_OUTPUT"; echo "no parser/grammar change — skipping the corpus clone"
fi

# Toolchain and dependencies are installed only when the path gate above reported `true`.
- uses: actions/setup-node@v4
if: steps.changed.outputs.value == 'true'
with:
node-version: 24
- if: steps.changed.outputs.value == 'true'
run: npm ci

# Pinned-SHA, shallow, sparse clone of the TS conformance corpus to the fixed path the
# parity gates auto-detect (same pin + technique as the readme-bench workflow).
# Only `tests/cases/` is listed in the sparse-checkout file; the clone lives in /tmp/ts-repo.
- name: Clone the pinned TS corpus
if: steps.changed.outputs.value == 'true'
run: |
set -euo pipefail
rm -rf /tmp/ts-repo; mkdir -p /tmp/ts-repo
git -C /tmp/ts-repo init -q
git -C /tmp/ts-repo remote add origin https://github.com/microsoft/TypeScript
git -C /tmp/ts-repo config core.sparseCheckout true
printf 'tests/cases/\n' > /tmp/ts-repo/.git/info/sparse-checkout
git -C /tmp/ts-repo fetch -q --depth 1 --filter=blob:none origin 6fbce89821d93a5b761581d9ac540455f38e9acb
git -C /tmp/ts-repo checkout -q FETCH_HEAD

# Runs the three parity scripts in sequence; only emit-parser-verify is passed the `all` argument.
# NOTE(review): `all` presumably selects the full external corpus — confirm against the script.
- name: Engine-parity over the full corpus
if: steps.changed.outputs.value == 'true'
run: |
node test/emit-parser-verify.ts all
node test/emit-reject-messages.ts
node test/emit-lexer-verify.ts

# The derived tree-sitter highlighter is the strongest thesis proof (a real GLR
# parser from the same grammar, beating the official hand-written one). Build its
# wasm and gate the accuracy so the 95.9% is verified, not just claimed. The
Expand Down
9 changes: 7 additions & 2 deletions TOTAL-PARSING.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,5 +228,10 @@ first-error agreement 57.5%.
determinism on an invalid corpus, a char-by-char typing session, and
exact-match diagnostic pins (synthesis quality must not silently regress to
absorption).
- `test/emit-parser-verify.ts` / `test/emit-lexer-verify.ts` — emitted runtime
≡ interpreter on the corpus, token streams and error messages included.
- `test/emit-parser-verify.ts` / `test/emit-reject-messages.ts` /
`test/emit-lexer-verify.ts` — the emitted runtime ≡ the interpreter (CST,
token streams, and reject messages). They run on a self-contained in-repo corpus
(`test/emit-corpus.ts`: curated snippets + the repo's own sources) that needs no external checkout, so they are
part of `npm run check` on every machine — the mechanism that forces a
gen-parser change to propagate to emit-parser. The CI `emit-parity` job adds the
full external TS corpus for breadth.
41 changes: 29 additions & 12 deletions src/emit-lexer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,23 @@ export interface LexerSymtab {

const J = (v: unknown) => JSON.stringify(v);

// Single producer for the resync-retract statement. The relex loop emits this exact line at
// two points (mid-loop and at the post-loop EOF check); rendering both from here keeps the
// two copies from drifting (#45 B3). `indent` is prepended verbatim to the emitted line.
const resyncRetractLine = (indent: string): string => {
  // Emitted text: pop docLex entries whose offset is at/after tkOff[tokN], never going below lexDiagBase.
  const dropWindowDiags =
    'while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--;';
  return indent + 'if (wndHit >= 0) { tokN--; ' + dropWindowDiags + ' return wndHit; }';
};

// Source text of `lxNonAsciiWs(cc)`: a charCode test for the non-ASCII members of JS \s (the
// /u-free set). The emitted lexer uses it so that a non-whitespace cc>127 (e.g. a Unicode
// identifier char) never reaches the LX_WS regex. `/\s+/y` matches at pos iff the lead char
// is \s, and ASCII \s is consumed by the char loop, so `cc>127 && lxNonAsciiWs(cc)` is EXACTLY
// "the regex would match here" → byte-identical output, minus the wasted exec on the common
// non-whitespace case (#45 B4).
const NON_ASCII_WS_FN = [
  'function lxNonAsciiWs(cc) { return ',
  [
    'cc === 0xa0',
    'cc === 0x1680',
    '(cc >= 0x2000 && cc <= 0x200a)',
    'cc === 0x2028',
    'cc === 0x2029',
    'cc === 0x202f',
    'cc === 0x205f',
    'cc === 0x3000',
    'cc === 0xfeff',
  ].join(' || '),
  '; }',
].join('');
// Renders the non-ASCII whitespace fallback for the two emit sites that need it: after an
// ASCII whitespace run, and when the lead char itself is the candidate. `v` names the emitted
// charCode variable to test, `indent` is prepended verbatim, and `cont` appends the `continue`
// that only the lead-char site needs.
const nonAsciiWsConsume = (v: string, cont: boolean, indent: string): string => {
  const guard = `if (${v} > 127 && lxNonAsciiWs(${v}))`;
  // Emitted text: record a pending newline if the matched run contains one, then skip past the run.
  const advance = "if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length;";
  const tail = cont ? ' continue;' : '';
  return `${indent}${guard} { LX_WS.lastIndex = pos; const m = LX_WS.exec(source); if (m !== null) { ${advance}${tail} } }`;
};

export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
// Out of scope: the markup / indentation / newline state machines.
if (grammar.markup || grammar.indent || grammar.newline) return null;
Expand Down Expand Up @@ -103,6 +120,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
emit(`// ── Emitted lexer (emit-lexer.ts): specialized tokenize for this grammar ──`);
for (const m of matchers) emit(`const ${m.re} = new RegExp(${J(`(?:${m.pattern})`)}, ${J(m.flags)});`);
emit(`const LX_WS = /\\s+/y;`);
emit(NON_ASCII_WS_FN);
emit(`// window-truncation retry: a matcher failing at the WINDOW edge is not a lex`);
emit(`// error — the caller re-materializes a larger window (truncation cannot fake a`);
emit(`// resync: suffix-zone equality makes a cut token's END mismatch the old one)`);
Expand Down Expand Up @@ -248,6 +266,13 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
emit(` lexCore(source, 0, -1, 0, -1, 0, 0);`);
emit(` return tokN;`);
emit(`}`);
// Verification of the WINDOWED path (issue #45 B2): emit-lexer-verify only exercises a FULL
// lex (emit ≡ createLexer), and gen-lexer has no windowed counterpart to diff against — but the
// windowed re-lex IS independently checked at the tree level. incremental-verify / exhaustive-
// edits compare an edited parse (whose tokens come from this windowed re-lex) to a FRESH FULL
// parse of the same text, byte-identical: a wrong windowed token would change the tree (or its
// newlineBefore/commentBefore-driven shape) and fail there. So the oracle is the fresh full
// parse, applied transitively through the parser.
emit(`// The lexer core, parameterized for WINDOWED re-lexing: start at startPos with`);
emit(`// the previous token's (k, t) as the regex-context seed (-1 = none / file start)`);
emit(`// and EMPTY template/paren stacks (the caller restarts only at depth-0 safe`);
Expand Down Expand Up @@ -359,7 +384,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
emit(` // resync retracts the duplicated token push — and any lexer diagnostics
// emitted FOR it (the old stream's persisted entry survives via the shift;
// keeping the window's copy too double-reports the same character)`);
emit(` if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`);
emit(resyncRetractLine(' '));
emit(` const cc = source.charCodeAt(pos);`);
emit(` // whitespace: ASCII \\s run by char loop; a non-ASCII candidate falls back to the regex`);
emit(` if (cc === 32 || (cc >= 9 && cc <= 13)) {`);
Expand All @@ -369,18 +394,10 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
emit(` pos++;`);
emit(` wc = source.charCodeAt(pos);`);
emit(` } while (wc === 32 || (wc >= 9 && wc <= 13));`);
emit(` if (wc > 127) {`);
emit(` LX_WS.lastIndex = pos;`);
emit(` const m = LX_WS.exec(source);`);
emit(` if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length; }`);
emit(` }`);
emit(`${nonAsciiWsConsume('wc', false, ' ')}`);
emit(` continue;`);
emit(` }`);
emit(` if (cc > 127) {`);
emit(` LX_WS.lastIndex = pos;`);
emit(` const m = LX_WS.exec(source);`);
emit(` if (m !== null) { if (m[0].includes('\\n')) pendingNl = true; pos += m[0].length; continue; }`);
emit(` }`);
emit(`${nonAsciiWsConsume('cc', true, ' ')}`);
if (templateToken) {
const tplCloseT = kwFirstCcs.has(tplInterpClose.charCodeAt(0)) ? 'lexKwT(source, startPos, r.end)' : '0';
const tplOpenT = kwFirstCcs.has(tplOpen.charCodeAt(0)) ? 'lexKwT(source, startPos, r.end)' : '0';
Expand Down Expand Up @@ -610,7 +627,7 @@ export function emitLexer(grammar: CstGrammar, st: LexerSymtab): string | null {
emit(` }`);
emit(` throw new Error("Unexpected character at offset " + pos + ": '" + source[pos] + "'");`);
emit(` }`);
emit(` if (wndHit >= 0) { tokN--; while (docLex.length > lexDiagBase && docLex[docLex.length - 1].offset >= tkOff[tokN]) docLex.length--; return wndHit; }`);
emit(resyncRetractLine(' '));
emit(` return hasMore ? -2 : -1;`);
emit(`}`);
emit(`// Windowed-relex restart anchor: the last token B ending at/before the damage`);
Expand Down
Loading
Loading