diff --git a/src/main/java/org/javacc/java/JavaArrayHelper.java b/src/main/java/org/javacc/java/JavaArrayHelper.java new file mode 100644 index 0000000..61a6aa5 --- /dev/null +++ b/src/main/java/org/javacc/java/JavaArrayHelper.java @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2025, JavaCC contributors. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the names of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. 
+ */ +package org.javacc.java; + +import java.util.List; + +/** + * Utility for emitting large Java static array declarations that would otherwise + * exceed the JVM's 64 KB bytecode limit on the {@code <clinit>} method. + * + * <p>Instead of generating inline array initializers that all compile into + * {@code <clinit>}, this helper generates dedicated {@code <name>_init()} methods. + * For very large arrays the init body is further split across multiple chunk + * methods so that no single method exceeds the bytecode limit. + * + * <p>For 2D arrays ({@code int[][]}), this helper can flatten them into a + * contiguous 1D representation with offset/length arrays, eliminating both + * the {@code <clinit>} problem and the per-row object header overhead. + */ +final class JavaArrayHelper { + + /** + * Maximum number of elements emitted per init chunk method. + * Each {@code r[i]=val;} compiles to roughly 8 bytes of bytecode, so 4000 + * elements is about 32 KB, well within the 64 KB limit. + */ + private static final int CHUNK_SIZE = 4000; + + /** Elements per line in the generated source for readability. */ + private static final int ELEMENTS_PER_LINE = 16; + + private JavaArrayHelper() {} + + // ---------------------------------------------------------------- + // int[] + // ---------------------------------------------------------------- + + /** + * Emits a {@code static final int[]} field backed by init method(s). + * + * @param jcb the code builder + * @param indent leading whitespace (e.g. {@code " "} for a class member) + * @param vis visibility keyword ({@code "private"}, {@code "public"}, etc.)
+ * @param name field name + * @param data the array data + */ + static void emitIntArray( + final JavaCodeBuilder jcb, final String indent, + final String vis, final String name, final int[] data) { + + jcb.println(indent + vis + " static final int[] " + name + " = " + name + "_init();"); + jcb.println(); + + if (data.length <= CHUNK_SIZE) { + jcb.println(indent + "private static int[] " + name + "_init() {"); + jcb.print(indent + " return new int[] {"); + appendIntElements(jcb, indent + " ", data, 0, data.length); + jcb.println(indent + " };"); + jcb.println(indent + "}"); + } else { + final int chunks = (data.length + CHUNK_SIZE - 1) / CHUNK_SIZE; + jcb.println(indent + "private static int[] " + name + "_init() {"); + jcb.println(indent + " final int[] r = new int[" + data.length + "];"); + for (int c = 0; c < chunks; c++) { + jcb.println(indent + " " + name + "_init_" + c + "(r);"); + } + jcb.println(indent + " return r;"); + jcb.println(indent + "}"); + jcb.println(); + for (int c = 0; c < chunks; c++) { + final int start = c * CHUNK_SIZE; + final int end = Math.min(start + CHUNK_SIZE, data.length); + jcb.println(indent + "private static void " + name + "_init_" + c + "(final int[] r) {"); + for (int i = start; i < end; i++) { + if ((i - start) % ELEMENTS_PER_LINE == 0) { + if (i > start) { + jcb.println(); + } + jcb.print(indent + " "); + } + jcb.print("r[" + i + "]=" + data[i] + "; "); + } + jcb.println(); + jcb.println(indent + "}"); + if (c < chunks - 1) { + jcb.println(); + } + } + } + jcb.println(); + } + + // ---------------------------------------------------------------- + // long[] + // ---------------------------------------------------------------- + + /** + * Emits a {@code static final long[]} field backed by init method(s). + * + *

<p>Each {@code long} literal compiles to ~12 bytes of bytecode + * ({@code ldc2_w} + {@code lastore}), so the chunk size is halved + * compared to {@code int[]} to stay within the 64 KB limit.</p>

+ * + * @param jcb the code builder + * @param indent leading whitespace + * @param vis visibility keyword + * @param name field name + * @param data the array data + */ + static void emitLongArray( + final JavaCodeBuilder jcb, final String indent, + final String vis, final String name, final long[] data) { + + // long literals are larger in bytecode, so use a smaller chunk + final int longChunkSize = CHUNK_SIZE / 2; + + jcb.println(indent + vis + " static final long[] " + name + " = " + name + "_init();"); + jcb.println(); + + if (data.length <= longChunkSize) { + jcb.println(indent + "private static long[] " + name + "_init() {"); + jcb.print(indent + " return new long[] {"); + jcb.println(); + for (int i = 0; i < data.length; i++) { + if (i % 8 == 0) { + if (i > 0) { + jcb.println(); + } + jcb.print(indent + " "); + } + jcb.print(data[i] + "L"); + if (i < data.length - 1) { + jcb.print(", "); + } + } + jcb.println(); + jcb.println(indent + " };"); + jcb.println(indent + "}"); + } else { + final int chunks = (data.length + longChunkSize - 1) / longChunkSize; + jcb.println(indent + "private static long[] " + name + "_init() {"); + jcb.println(indent + " final long[] r = new long[" + data.length + "];"); + for (int c = 0; c < chunks; c++) { + jcb.println(indent + " " + name + "_init_" + c + "(r);"); + } + jcb.println(indent + " return r;"); + jcb.println(indent + "}"); + jcb.println(); + for (int c = 0; c < chunks; c++) { + final int start = c * longChunkSize; + final int end = Math.min(start + longChunkSize, data.length); + jcb.println(indent + "private static void " + name + "_init_" + c + "(final long[] r) {"); + for (int i = start; i < end; i++) { + if ((i - start) % 8 == 0) { + if (i > start) { + jcb.println(); + } + jcb.print(indent + " "); + } + jcb.print("r[" + i + "]=" + data[i] + "L; "); + } + jcb.println(); + jcb.println(indent + "}"); + if (c < chunks - 1) { + jcb.println(); + } + } + } + jcb.println(); + } + + // 
---------------------------------------------------------------- + // String[] + // ---------------------------------------------------------------- + + /** + * Emits a {@code static final String[]} field backed by init method(s). + * + * @param data array of already-quoted string values (e.g. {@code "\"hello\""}); + * {@code null} entries are emitted as the Java literal {@code null} + */ + static void emitStringArray( + final JavaCodeBuilder jcb, final String indent, + final String vis, final String name, final String[] data) { + + jcb.println(indent + vis + " static final String[] " + name + " = " + name + "_init();"); + jcb.println(); + + if (data.length <= CHUNK_SIZE) { + jcb.println(indent + "private static String[] " + name + "_init() {"); + jcb.println(indent + " return new String[] {"); + for (int i = 0; i < data.length; i++) { + jcb.print(indent + " " + (data[i] == null ? "null" : data[i])); + if (i < data.length - 1) { + jcb.println(","); + } else { + jcb.println(); + } + } + jcb.println(indent + " };"); + jcb.println(indent + "}"); + } else { + final int chunks = (data.length + CHUNK_SIZE - 1) / CHUNK_SIZE; + jcb.println(indent + "private static String[] " + name + "_init() {"); + jcb.println(indent + " final String[] r = new String[" + data.length + "];"); + for (int c = 0; c < chunks; c++) { + jcb.println(indent + " " + name + "_init_" + c + "(r);"); + } + jcb.println(indent + " return r;"); + jcb.println(indent + "}"); + jcb.println(); + for (int c = 0; c < chunks; c++) { + final int start = c * CHUNK_SIZE; + final int end = Math.min(start + CHUNK_SIZE, data.length); + jcb.println(indent + "private static void " + name + "_init_" + c + "(final String[] r) {"); + for (int i = start; i < end; i++) { + jcb.println(indent + " r[" + i + "] = " + + (data[i] == null ? 
"null" : data[i]) + ";"); + } + jcb.println(indent + "}"); + if (c < chunks - 1) { + jcb.println(); + } + } + } + jcb.println(); + } + + // ---------------------------------------------------------------- + // int[][] → flattened 1D representation + // ---------------------------------------------------------------- + + /** + * Result of flattening a 2D {@code int[][]} array into a contiguous 1D representation. + */ + static final class FlatIntArray2D { + + final int[] data; + final int[] offsets; + final int[] lengths; + final int rows; + + FlatIntArray2D(final int[] data, final int[] offsets, final int[] lengths, final int rows) { + this.data = data; + this.offsets = offsets; + this.lengths = lengths; + this.rows = rows; + } + } + + /** + * Flattens a list of {@code int[]} rows into a contiguous representation. + * Null or empty rows get {@code length == 0} and a valid offset. + * + * @param rows list of rows; null entries are treated as empty + * @return the flat representation + */ + static FlatIntArray2D flatten(final List rows) { + int totalSize = 0; + for (final int[] row : rows) { + if (row != null) { + totalSize += row.length; + } + } + // Ensure at least 1 element so data array is never zero-length + final int[] data = new int[Math.max(totalSize, 1)]; + final int[] offsets = new int[rows.size()]; + final int[] lengths = new int[rows.size()]; + int pos = 0; + for (int i = 0; i < rows.size(); i++) { + offsets[i] = pos; + final int[] row = rows.get(i); + if (row != null && row.length > 0) { + lengths[i] = row.length; + System.arraycopy(row, 0, data, pos, row.length); + pos += row.length; + } + } + return new FlatIntArray2D(data, offsets, lengths, rows.size()); + } + + /** + * Emits a flattened 2D array as three 1D arrays ({@code _data}, {@code _offsets}, + * {@code _lengths}) plus accessor methods. + * + *

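<p>Generated accessors: + * <ul> + * <li>{@code <name>(row, col)} returns the element at the given row and column, read from {@code <name>_data} at {@code <name>_offsets[row] + col}</li> + * <li>{@code <name>_length(row)} returns the number of elements in the row, read from {@code <name>_lengths}</li> + * <li>{@code <name>_row(row)} returns a newly allocated {@code int[]} copy of the row</li> + * </ul>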
+ * + * @param jcb the code builder + * @param indent leading whitespace + * @param vis visibility for the underlying arrays + * @param name base name (e.g. {@code "jjnextStateSet"}) + * @param flat the flattened data from {@link #flatten} + */ + static void emitFlatIntArray2D( + final JavaCodeBuilder jcb, final String indent, + final String vis, final String name, final FlatIntArray2D flat) { + + emitIntArray(jcb, indent, vis, name + "_data", flat.data); + emitIntArray(jcb, indent, vis, name + "_offsets", flat.offsets); + emitIntArray(jcb, indent, vis, name + "_lengths", flat.lengths); + + // Accessor: element by row+col + jcb.println(indent + "private static int " + name + "(final int row, final int col) {"); + jcb.println(indent + " return " + name + "_data[" + name + "_offsets[row] + col];"); + jcb.println(indent + "}"); + jcb.println(); + + // Accessor: row length + jcb.println(indent + "private static int " + name + "_length(final int row) {"); + jcb.println(indent + " return " + name + "_lengths[row];"); + jcb.println(indent + "}"); + jcb.println(); + + // Accessor: get row as array (for for-each compatibility in template) + jcb.println(indent + "private static int[] " + name + "_row(final int row) {"); + jcb.println(indent + " final int off = " + name + "_offsets[row];"); + jcb.println(indent + " final int len = " + name + "_lengths[row];"); + jcb.println(indent + " final int[] r = new int[len];"); + jcb.println(indent + " System.arraycopy(" + name + "_data, off, r, 0, len);"); + jcb.println(indent + " return r;"); + jcb.println(indent + "}"); + jcb.println(); + } + + // ---------------------------------------------------------------- + // Private helpers + // ---------------------------------------------------------------- + + private static void appendIntElements( + final JavaCodeBuilder jcb, final String indent, + final int[] data, final int from, final int to) { + jcb.println(); + for (int i = from; i < to; i++) { + if ((i - from) % ELEMENTS_PER_LINE == 
0) { + if (i > from) { + jcb.println(); + } + jcb.print(indent); + } + jcb.print(data[i]); + if (i < to - 1) { + jcb.print(", "); + } + } + jcb.println(); + } +} diff --git a/src/main/java/org/javacc/java/ParserCodeGenerator.java b/src/main/java/org/javacc/java/ParserCodeGenerator.java index 40d4c25..3372b7f 100644 --- a/src/main/java/org/javacc/java/ParserCodeGenerator.java +++ b/src/main/java/org/javacc/java/ParserCodeGenerator.java @@ -97,6 +97,26 @@ class ParserCodeGenerator implements org.javacc.parser.ParserCodeGenerator { private GenericCodeBuilder cb; + /* + * Safety thresholds to prevent generated methods from exceeding + * the JVM's 64KB bytecode limit per method. + * + * Each int literal in an array init compiles to ~7 bytes of bytecode. + * Each switch case compiles to ~25 bytes. + * Each nested if-else in a Choice chain compiles to ~30 bytes. + * + * With a 64KB (65535 byte) limit, these thresholds provide ~2x safety margin. + */ + + /** Max switch cases in jj_rescan_token before splitting into sub-methods. */ + private static final int RESCAN_CHUNK_SIZE = 500; + + /** Max array elements per jj_la1_init method before splitting into sub-methods. */ + private static final int LA1_INIT_CHUNK_SIZE = 4000; + + /** Max Choice alternatives in a jj_3R method before splitting into sub-methods. */ + private static final int CHOICE_CHAIN_THRESHOLD = 1000; + /** * To be set to true to add debug comment tags in the generated code (to ease linking it with this * generator), false otherwise (which should be the normal case). 
@@ -265,6 +285,7 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData + context.globals().maskindex + "];"); final int tokenMaskSize = ((context.globals().tokenCount - 1) / 32) + 1; + final int maskCount = context.globals().maskVals.size(); for (int i = 0; i < tokenMaskSize; i++) { cb.println(" private static int[] jj_la1_" + i + ";"); } @@ -275,15 +296,43 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" jj_la1_init_" + i + "();"); } cb.println(" }"); + for (int i = 0; i < tokenMaskSize; i++) { cb.println(); - cb.println(" private static void jj_la1_init_" + i + "() {"); - cb.print(" jj_la1_" + i + " = new int[] {"); - for (final int[] tokenMask : context.globals().maskVals) { - cb.print("0x" + Integer.toHexString(tokenMask[i]) + ", "); + if (maskCount <= LA1_INIT_CHUNK_SIZE) { + // Small enough — original single-method behavior + cb.println(" private static void jj_la1_init_" + i + "() {"); + cb.print(" jj_la1_" + i + " = new int[] {"); + for (final int[] tokenMask : context.globals().maskVals) { + cb.print("0x" + Integer.toHexString(tokenMask[i]) + ", "); + } + cb.println("};"); + cb.println(" }"); + } else { + // Split: allocate array in main method, fill in chunk sub-methods + final int numChunks = (maskCount + LA1_INIT_CHUNK_SIZE - 1) / LA1_INIT_CHUNK_SIZE; + cb.println(" private static void jj_la1_init_" + i + "() {"); + cb.println(" jj_la1_" + i + " = new int[" + maskCount + "];"); + for (int chunk = 0; chunk < numChunks; chunk++) { + cb.println(" jj_la1_init_" + i + "_chunk_" + chunk + "(jj_la1_" + i + ");"); + } + cb.println(" }"); + + // Generate chunk sub-methods + final List maskVals = context.globals().maskVals; + for (int chunk = 0; chunk < numChunks; chunk++) { + final int lo = chunk * LA1_INIT_CHUNK_SIZE; + final int hi = Math.min(lo + LA1_INIT_CHUNK_SIZE, maskCount); + cb.println(); + cb.println(" private static void jj_la1_init_" + i + "_chunk_" + chunk + + "(final int[] 
a) {"); + for (int j = lo; j < hi; j++) { + cb.println(" a[" + j + "] = 0x" + + Integer.toHexString(maskVals.get(j)[i]) + ";"); + } + cb.println(" }"); + } } - cb.println("};"); - cb.println(" }"); } } if ((context.globals().jj2index != 0) && Options.getErrorReporting()) { @@ -874,7 +923,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" }"); cb.println(); - cb.println(" private static final boolean DBG_EXP = false;"); cb.println(); if (context.globals().jj2index != 0) { @@ -909,18 +957,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.print(", final String loc"); } cb.println(") {"); - cb.println(" if (DBG_EXP) System.out.println(\"scan1: kind = \" + kind +"); - if (Options.getErrorReporting()) { - cb.println(" \", loc = \" + loc +"); - } - cb.println(" \", jj_la = \" + jj_la +"); - cb.println(" \", jj_scanpos = \" + jj_scanpos + \" \" + jj_scanpos.hashCode() +"); - if (Options.getErrorReporting()) { - cb.println(" \", jj_lastpos = \" + jj_lastpos + \" \" + jj_lastpos.hashCode() +"); - cb.println(" \", jj_rescan = \" + jj_rescan);"); - } else { - cb.println(" \", jj_lastpos = \" + jj_lastpos + \" \" + jj_lastpos.hashCode());"); - } cb.println(" if (jj_scanpos == jj_lastpos) {"); cb.println(" jj_la--;"); cb.println(" if (jj_scanpos.next == null) {"); @@ -951,22 +987,12 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData } else if (Options.getDebugLookahead()) { cb.println(" trace_scan(jj_scanpos, kind);"); } - cb.println(" if (DBG_EXP) System.out.println(\"scan2: kind = \" + kind +"); - cb.println(" \", jj_scanpos.kind = \" + jj_scanpos.kind +"); - cb.println(" \", jj_la = \" + jj_la +"); - cb.println(" \", jj_scanpos = \" + jj_scanpos + \" \" + jj_scanpos.hashCode() +"); - cb.println(" \", jj_lastpos = \" + jj_lastpos + \" \" + jj_lastpos.hashCode());"); cb.println(" if (jj_scanpos.kind != kind) {"); cb.println(" return LA_Scan_Token_Failure;"); 
cb.println(" }"); cb.println(" if (jj_la == 0 && jj_scanpos == jj_lastpos) {"); cb.println(" throw jj_ls;"); cb.println(" }"); - cb.println(" if (DBG_EXP) System.out.println(\"scan3: kind = \" + kind +"); - cb.println(" \", jj_scanpos.kind = \" + jj_scanpos.kind +"); - cb.println(" \", jj_la = \" + jj_la +"); - cb.println(" \", jj_scanpos = \" + jj_scanpos + \" \" + jj_scanpos.hashCode() +"); - cb.println(" \", jj_lastpos = \" + jj_lastpos + \" \" + jj_lastpos.hashCode());"); cb.println(" return LA_Scan_Token_Success;"); cb.println(" }"); cb.println(); @@ -1043,7 +1069,7 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" private " + pStatic + "int jj_kind = -1;"); cb.println(" private " + pStatic + "int[] jj_expentry;"); cb.println(" private " + pStatic + "String[] jj_expentry_loc;"); - cb.println(" private static final int MAX_NB_POS = 100;"); + cb.println(" private static final int MAX_NB_POS = 512;"); if (context.globals().jj2index != 0) { cb.println(" private " + pStatic + "int[] jj_lasttokens = new int[MAX_NB_POS];"); cb.println( @@ -1054,25 +1080,9 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData /* jj_add_error_token(int kind, int pos) */ cb.println( " private " + pStatic + "void jj_add_error_token(int kind, int pos, String loc) {"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_add_error_token: kind = \" + kind +"); - cb.println( - " \", pos = \" + pos + \", loc = \" + loc + \", jj_endpos = \" + jj_endpos);"); cb.println(" if (pos >= MAX_NB_POS) {"); cb.println(" return;"); cb.println(" }"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_lasttokens = \" + java.util.Arrays.toString(jj_lasttokens));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_lasttokens_loc = \" + java.util.Arrays.toString(jj_lasttokens_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_expentry = \" + java.util.Arrays.toString(jj_expentry));"); - 
cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_expentry_loc = \" + java.util.Arrays.toString(jj_expentry_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_expentries = \" + java.util.Arrays.deepToString(jj_expentries.toArray()));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet1: jj_expentries_loc = \" + java.util.Arrays.deepToString(jj_expentries_loc.toArray()));"); cb.println(" if (pos == jj_endpos + 1) {"); cb.println(" jj_lasttokens[jj_endpos] = kind;"); cb.println(" jj_lasttokens_loc[jj_endpos] = loc;"); @@ -1106,18 +1116,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" jj_endpos = pos;"); cb.println(" }"); cb.println(" }"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_lasttokens = \" + java.util.Arrays.toString(jj_lasttokens));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_lasttokens_loc = \" + java.util.Arrays.toString(jj_lasttokens_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_expentry = \" + java.util.Arrays.toString(jj_expentry));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_expentry_loc = \" + java.util.Arrays.toString(jj_expentry_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_expentries = \" + java.util.Arrays.deepToString(jj_expentries.toArray()));"); - cb.println( - " if (DBG_EXP) System.out.println(\"aet2: jj_expentries_loc = \" + java.util.Arrays.deepToString(jj_expentries_loc.toArray()));"); cb.println(" }"); } cb.println(); @@ -1126,20 +1124,8 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" /** Generate a ParseException. 
*/"); cb.println( " public " + pStatic + "ParseException generateParseException(final String loc) {"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_la1 = \" + java.util.Arrays.toString(jj_la1));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_la1_loc = \" + java.util.Arrays.toString(jj_la1_loc));"); cb.println(" jj_expentries.clear();"); cb.println(" jj_expentries_loc.clear();"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_expentry = \" + java.util.Arrays.toString(jj_expentry));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_expentry_loc = \" + java.util.Arrays.toString(jj_expentry_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_expentries = \" + java.util.Arrays.deepToString(jj_expentries.toArray()));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe1: jj_expentries_loc = \" + java.util.Arrays.deepToString(jj_expentries_loc.toArray()));"); cb.println( " final boolean[] la1tokens = new boolean[" + context.globals().tokenCount @@ -1163,10 +1149,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" }"); cb.println(" }"); cb.println(" }"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe2: la1tokens = \" + java.util.Arrays.toString(la1tokens));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe2: la1tokens_loc = \" + java.util.Arrays.toString(la1tokens_loc));"); cb.println(" for (int k = 0; k < " + context.globals().tokenCount + "; k++) {"); cb.println(" if (la1tokens[k]) {"); cb.println(" jj_expentry = new int[1];"); @@ -1177,14 +1159,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" jj_expentries_loc.add(jj_expentry_loc);"); cb.println(" }"); cb.println(" }"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe3: jj_expentry = \" + java.util.Arrays.toString(jj_expentry));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe3: jj_expentry_loc = \" + 
java.util.Arrays.toString(jj_expentry_loc));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe3: jj_expentries = \" + java.util.Arrays.deepToString(jj_expentries.toArray()));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe3: jj_expentries_loc = \" + java.util.Arrays.deepToString(jj_expentries_loc.toArray()));"); if (context.globals().jj2index != 0) { cb.println(" jj_endpos = 0;"); cb.println(" jj_rescan_token();"); @@ -1200,10 +1174,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" exptokseqloc[x] = jj_expentries_loc.get(x);"); // } cb.println(" }"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe4: exptokseq = \" + java.util.Arrays.deepToString(exptokseq));"); - cb.println( - " if (DBG_EXP) System.out.println(\"gpe4: exptokseqloc = \" + java.util.Arrays.deepToString(exptokseqloc));"); if (isJavaModernMode) { cb.println( " return new ParseException(token, exptokseq, exptokseqloc, tokenImage, loc, "); @@ -1255,7 +1225,6 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData + JavaTemplates.getTokenMgrErrorClass() + ".addEscapes(t.image) + \"\\\"\";"); cb.println(" }"); - cb.println(" if (DBG_EXP) s += \" / \" + t.hashCode();"); cb.println(" s += \">\";"); cb.println(" return s;"); cb.println(" }"); @@ -1442,26 +1411,47 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData } // end else if (Options.getDebugLookahead()) if ((context.globals().jj2index != 0) && Options.getErrorReporting()) { + final int jj2cnt = context.globals().jj2index; cb.println( " /** When and for reporting error, rescans tokens (rerun phase 3 routines), to build ad-hoc info. 
*/"); cb.println(" private " + pStatic + "void jj_rescan_token() {"); cb.println(" jj_rescan = true;"); - cb.println(" for (int i = 0; i < " + context.globals().jj2index + "; i++) {"); + cb.println(" for (int i = 0; i < " + jj2cnt + "; i++) {"); cb.println(" try {"); cb.println(" JJCalls p = jj_2_rtns[i];"); cb.println(" do {"); cb.println(" if (p.gen > jj_gen) {"); cb.println(" jj_la = p.arg;"); cb.println(" jj_lastpos = jj_scanpos = p.first;"); - cb.println(" switch (i) {"); - for (int i = 0; i < context.globals().jj2index; i++) { - cb.println(" case " + i + ":"); - cb.println(" jj_3_" + (i + 1) + "();"); + + if (jj2cnt <= RESCAN_CHUNK_SIZE) { + // Small enough for a single switch — original behavior + cb.println(" switch (i) {"); + for (int i = 0; i < jj2cnt; i++) { + cb.println(" case " + i + ":"); + cb.println(" jj_3_" + (i + 1) + "();"); + cb.println(" break;"); + } + cb.println(" default:"); cb.println(" break;"); + cb.println(" }"); + } else { + // Split into chunk sub-methods to stay under 64KB bytecode per method + final int numChunks = (jj2cnt + RESCAN_CHUNK_SIZE - 1) / RESCAN_CHUNK_SIZE; + for (int chunk = 0; chunk < numChunks; chunk++) { + final int lo = chunk * RESCAN_CHUNK_SIZE; + if (chunk == 0) { + cb.println(" if (i < " + (lo + RESCAN_CHUNK_SIZE) + ") {"); + } else if (chunk == numChunks - 1) { + cb.println(" } else {"); + } else { + cb.println(" } else if (i < " + (lo + RESCAN_CHUNK_SIZE) + ") {"); + } + cb.println(" jj_rescan_token_chunk_" + chunk + "(i);"); + } + cb.println(" }"); } - cb.println(" default:"); - cb.println(" break;"); - cb.println(" }"); + cb.println(" }"); cb.println(" p = p.next;"); cb.println(" } while (p != null);"); @@ -1473,6 +1463,27 @@ public void generateCode(final CodeGeneratorSettings settings, final ParserData cb.println(" }"); cb.println(); + // Generate chunk sub-methods if splitting was needed + if (jj2cnt > RESCAN_CHUNK_SIZE) { + final int numChunks = (jj2cnt + RESCAN_CHUNK_SIZE - 1) / RESCAN_CHUNK_SIZE; + for 
(int chunk = 0; chunk < numChunks; chunk++) { + final int lo = chunk * RESCAN_CHUNK_SIZE; + final int hi = Math.min(lo + RESCAN_CHUNK_SIZE, jj2cnt); + cb.println(" private " + pStatic + "void jj_rescan_token_chunk_" + chunk + "(final int i) {"); + cb.println(" switch (i) {"); + for (int i = lo; i < hi; i++) { + cb.println(" case " + i + ":"); + cb.println(" jj_3_" + (i + 1) + "();"); + cb.println(" break;"); + } + cb.println(" default:"); + cb.println(" break;"); + cb.println(" }"); + cb.println(" }"); + cb.println(); + } + } + cb.println(" private " + pStatic + "void jj_save(final int index, final int xla) {"); cb.println(" JJCalls p = jj_2_rtns[index];"); cb.println(" while (p.gen > jj_gen) {"); @@ -1603,6 +1614,18 @@ private void build() { enumeration.hasMoreElements(); ) { buildPhase3Routine(enumeration.nextElement(), false, ""); } + + // Emit deferred continuation methods for split Choice chains (Fix 3: code-too-large prevention) + for (final DeferredChoiceChunk chunk : deferredChoiceChunks) { + cb.println(" private " + JavaUtil.getStatic() + "boolean jj_3" + chunk.methodName + "() {"); + cb.println(" " + getTypeForToken() + " xsp;"); + emitChoiceChunkFlat(chunk.choice, chunk.lo, chunk.hi, + chunk.totalChoices, "", chunk.nextPartName); + cb.println(" return LA_Phase3_Success;"); + cb.println(" }"); + cb.println(); + } + deferredChoiceChunks.clear(); } /* @@ -2457,6 +2480,10 @@ private void buildPhase2Routine(final Lookahead la) { cb.println(" try {"); NormalProduction prod = null; + final String saveCall = Options.getErrorReporting() + ? 
" jj_save(" + (Integer.parseInt(internalNames.get(e).substring(1)) - 1) + ", xla);" + : null; + if (Options.getDebugLookahead()) { // parent null for a top level lookahead expansion, // need to go through the lookahead itself (with mod in grammar) @@ -2473,11 +2500,24 @@ private void buildPhase2Routine(final Lookahead la) { + " LOOKAHEAD (\" + xla + \"/\" + jj_la + \") " + fmtAt(e, prod) + "\");"); + if (saveCall != null) { + cb.println(" if (rc) {"); + cb.println(saveCall); + cb.println(" }"); + } cb.println(" return (!rc);"); } else { // no DebugLookahead - cb.println(" return (!jj_3" + internalNames.get(e) + "()" + ret_suffix + ");"); + if (saveCall != null) { + cb.println(" final boolean _la_failed = jj_3" + internalNames.get(e) + "()" + ret_suffix + ";"); + cb.println(" if (_la_failed) {"); + cb.println(saveCall); + cb.println(" }"); + cb.println(" return !_la_failed;"); + } else { + cb.println(" return (!jj_3" + internalNames.get(e) + "()" + ret_suffix + ");"); + } } cb.println(" } catch (LookaheadSuccess ls) {"); @@ -2488,11 +2528,6 @@ private void buildPhase2Routine(final Lookahead la) { + "\");"); } cb.println(" return LA_Phase2_Success;"); - if (Options.getErrorReporting()) { - cb.println(" } finally {"); - cb.println( - " jj_save(" + (Integer.parseInt(internalNames.get(e).substring(1)) - 1) + ", xla);"); - } cb.println(" }"); cb.println(" }"); cb.println(); @@ -2716,64 +2751,74 @@ private void buildPhase3Routine( } else if (e instanceof Choice) { final Choice e_nrw = (Choice) e; final int nbChoices = e_nrw.getChoices().size(); - if (nbChoices != 1) { - if (!xsp_declared) { - xsp_declared = true; - cb.println(ind + " " + getTypeForToken() + " xsp;"); - } - cb.println(ind + " xsp = jj_scanpos;"); - } - for (int i = 0; i < nbChoices; i++) { - String dec = ""; - for (int k = 0; k < i; k++) { - dec += " "; - } - final Sequence nested_seq = (Sequence) e_nrw.getChoices().get(i); - final Lookahead la = (Lookahead) nested_seq.units.get(0); - if 
(la.getActionTokens().size() != 0) { - // We have semantic lookahead that must be evaluated. - context.globals().lookaheadNeeded = true; - cb.println(dec + ind + " jj_lookingAhead = true;"); - cb.print(dec + ind + " jj_semLA = "); - cb.printTokenSetup(la.getActionTokens().get(0)); - for (final Iterator it = la.getActionTokens().iterator(); it.hasNext(); ) { - t = it.next(); - cb.printToken(t); + + if (nbChoices > CHOICE_CHAIN_THRESHOLD && !recursive_call) { + // Large Choice: split into continuation sub-methods to avoid 64KB bytecode limit. + // Each chunk handles up to CHOICE_CHAIN_THRESHOLD alternatives. + // The last alternative in each chunk (except the final chunk) delegates + // to the next continuation method instead of returning failure. + buildPhase3ChoiceSplit(e, e_nrw, nbChoices, inf, ind); + } else { + // Original behavior for small-to-medium choices + if (nbChoices != 1) { + if (!xsp_declared) { + xsp_declared = true; + cb.println(ind + " " + getTypeForToken() + " xsp;"); } - // gcb.printTrailingComments(t); - cb.println(";"); - cb.println(dec + ind + " jj_lookingAhead = false;"); - } - cb.print(dec + ind + " if ("); - if (DCT) { - cb.print("/*bp3r-ch1*/ "); + cb.println(ind + " xsp = jj_scanpos;"); } - if (la.getActionTokens().size() != 0) { - cb.print("!jj_semLA || "); + for (int i = 0; i < nbChoices; i++) { + String dec = ""; + for (int k = 0; k < i; k++) { + dec += " "; + } + final Sequence nested_seq = (Sequence) e_nrw.getChoices().get(i); + final Lookahead la = (Lookahead) nested_seq.units.get(0); + if (la.getActionTokens().size() != 0) { + // We have semantic lookahead that must be evaluated. 
+ context.globals().lookaheadNeeded = true; + cb.println(dec + ind + " jj_lookingAhead = true;"); + cb.print(dec + ind + " jj_semLA = "); + cb.printTokenSetup(la.getActionTokens().get(0)); + for (final Iterator it = la.getActionTokens().iterator(); it.hasNext(); ) { + t = it.next(); + cb.printToken(t); + } + // gcb.printTrailingComments(t); + cb.println(";"); + cb.println(dec + ind + " jj_lookingAhead = false;"); + } + cb.print(dec + ind + " if ("); + if (DCT) { + cb.print("/*bp3r-ch1*/ "); + } + if (la.getActionTokens().size() != 0) { + cb.print("!jj_semLA || "); + } + cb.print(genjj_3Call(nested_seq)); + cb.println(") {"); + if (i != (nbChoices - 1)) { + cb.println(dec + ind + " jj_scanpos = xsp;"); + } else { + cb.println(dec + ind + " " + genReturn(true, i, ind)); + cb.print(dec + ind + " }"); + if (DCT) { + cb.print(" /*bp3r-ch2*/"); + } + cb.println(); + } } - cb.print(genjj_3Call(nested_seq)); - cb.println(") {"); - if (i != (nbChoices - 1)) { - cb.println(dec + ind + " jj_scanpos = xsp;"); - } else { - cb.println(dec + ind + " " + genReturn(true, i, ind)); - cb.print(dec + ind + " }"); + for (int i = nbChoices; i > 1; i--) { + for (int k = i - 1; k > 1; k--) { + cb.print(" "); + } + cb.print(ind + " }"); if (DCT) { - cb.print(" /*bp3r-ch2*/"); + cb.print(" /*bp3r-ch3*/"); } cb.println(); } } - for (int i = nbChoices; i > 1; i--) { - for (int k = i - 1; k > 1; k--) { - cb.print(" "); - } - cb.print(ind + " }"); - if (DCT) { - cb.print(" /*bp3r-ch3*/"); - } - cb.println(); - } } else if (e instanceof Sequence) { final Sequence e_nrw = (Sequence) e; @@ -2890,6 +2935,150 @@ private void buildPhase3Routine( } } + /** + * Generates a split Choice chain for phase 3 scanning when the number of alternatives + * exceeds {@link #CHOICE_CHAIN_THRESHOLD}. + * + *

<p>Instead of one deeply-nested if-else chain that may exceed 64KB of bytecode, + * this generates a flat try-each-and-return pattern split across continuation + * sub-methods:

+   * <pre>
+   *   // Main method: tries alternatives 0..999
+   *   Token xsp;
+   *   xsp = jj_scanpos;
+   *   if (!(call(alt0))) return false;  // success
+   *   jj_scanpos = xsp;
+   *   if (!(call(alt1))) return false;
+   *   ...
+   *   jj_scanpos = xsp;
+   *   return jj_3R_Foo_part1();  // delegate to continuation
+   *
+   *   // Continuation method: tries alternatives 1000..1999
+   *   private boolean jj_3R_Foo_part1() { ... }
+   * </pre>
+ */ + private void buildPhase3ChoiceSplit(final Expansion e, final Choice choice, + final int nbChoices, final Phase3Data inf, final String ind) { + // Compute the method name base from the expansion's internal name + final String baseName = internalNames.get(e); + final int numChunks = (nbChoices + CHOICE_CHAIN_THRESHOLD - 1) / CHOICE_CHAIN_THRESHOLD; + + if (!xsp_declared) { + xsp_declared = true; + cb.println(ind + " " + getTypeForToken() + " xsp;"); + } + + // Generate flat alternatives for the first chunk (inside the current method body) + emitChoiceChunkFlat(choice, 0, Math.min(CHOICE_CHAIN_THRESHOLD, nbChoices), nbChoices, ind, + numChunks > 1 ? baseName + "_part1" : null); + + // Generate continuation sub-methods for remaining chunks + for (int chunk = 1; chunk < numChunks; chunk++) { + final int lo = chunk * CHOICE_CHAIN_THRESHOLD; + final int hi = Math.min(lo + CHOICE_CHAIN_THRESHOLD, nbChoices); + final String nextPart = (chunk < numChunks - 1) + ? baseName + "_part" + (chunk + 1) + : null; + + // These continuation methods are emitted AFTER the enclosing method finishes, + // so we buffer them and emit after buildPhase3Routine completes. + // We use a simpler approach: emit them as deferred code via the cb directly, + // since buildPhase3Routine for non-recursive calls will close the method afterward. + deferredChoiceChunks.add(new DeferredChoiceChunk( + baseName + "_part" + chunk, choice, lo, hi, nbChoices, nextPart)); + } + } + + /** Buffer for deferred continuation methods generated by choice splitting. */ + private final List deferredChoiceChunks = new ArrayList<>(); + + /** Data for a deferred continuation method. 
*/ + private static class DeferredChoiceChunk { + final String methodName; + final Choice choice; + final int lo, hi, totalChoices; + final String nextPartName; // null if this is the last chunk + + DeferredChoiceChunk(String methodName, Choice choice, + int lo, int hi, int totalChoices, String nextPartName) { + this.methodName = methodName; + this.choice = choice; + this.lo = lo; + this.hi = hi; + this.totalChoices = totalChoices; + this.nextPartName = nextPartName; + } + } + + /** + * Emits a flat sequence of alternatives (no deep nesting) for a Choice chunk. + * Each alternative is: if (!(call(alt_i))) return false; jj_scanpos = xsp; + * The last alternative either returns failure or delegates to a continuation method. + * + * @param choice the Choice expansion + * @param lo first alternative index (inclusive) + * @param hi last alternative index (exclusive) + * @param total total number of alternatives in the full Choice + * @param ind indentation + * @param nextMethod name of continuation method, or null for the last chunk + */ + private void emitChoiceChunkFlat(final Choice choice, final int lo, final int hi, + final int total, final String ind, final String nextMethod) { + Token t = null; + cb.println(ind + " xsp = jj_scanpos;"); + for (int i = lo; i < hi; i++) { + final Sequence nested_seq = (Sequence) choice.getChoices().get(i); + final Lookahead la = (Lookahead) nested_seq.units.get(0); + + if (la.getActionTokens().size() != 0) { + context.globals().lookaheadNeeded = true; + cb.println(ind + " jj_lookingAhead = true;"); + cb.print(ind + " jj_semLA = "); + cb.printTokenSetup(la.getActionTokens().get(0)); + for (final Iterator it = la.getActionTokens().iterator(); it.hasNext(); ) { + t = it.next(); + cb.printToken(t); + } + cb.println(";"); + cb.println(ind + " jj_lookingAhead = false;"); + } + + if (i == hi - 1 && nextMethod == null) { + // Last alternative of the last chunk: return failure if it fails + cb.print(ind + " if ("); + if 
(la.getActionTokens().size() != 0) { + cb.print("!jj_semLA || "); + } + cb.print(genjj_3Call(nested_seq)); + cb.println(") {"); + cb.println(ind + " return LA_Phase3_Failure;"); + cb.println(ind + " }"); + } else if (i == hi - 1 && nextMethod != null) { + // Last alternative of a non-final chunk: delegate to continuation + cb.print(ind + " if ("); + if (la.getActionTokens().size() != 0) { + cb.print("!jj_semLA || "); + } + cb.print(genjj_3Call(nested_seq)); + cb.println(") {"); + cb.println(ind + " jj_scanpos = xsp;"); + cb.println(ind + " return jj_3" + nextMethod + "();"); + cb.println(ind + " }"); + } else { + // Normal alternative: try it, reset on failure, continue + cb.print(ind + " if (!("); + if (la.getActionTokens().size() != 0) { + cb.print("!jj_semLA || "); + } + cb.print(genjj_3Call(nested_seq)); + cb.println(")) {"); + cb.println(ind + " return LA_Phase3_Success;"); + cb.println(ind + " }"); + cb.println(ind + " jj_scanpos = xsp;"); + } + } + } + private String genjj_3Call(final Expansion e) { if (internalNames.containsKey(e) && internalNames.get(e).startsWith("jj_scan_token")) { return "LA_Scan_Token_Failure == " + internalNames.get(e) + (DCT ? " /*g3c-s*/" : ""); diff --git a/src/main/java/org/javacc/java/TokenManagerCodeGenerator.java b/src/main/java/org/javacc/java/TokenManagerCodeGenerator.java index fc34774..50f4fec 100644 --- a/src/main/java/org/javacc/java/TokenManagerCodeGenerator.java +++ b/src/main/java/org/javacc/java/TokenManagerCodeGenerator.java @@ -38,7 +38,6 @@ import java.util.Map; import org.javacc.parser.CodeGeneratorSettings; import org.javacc.parser.Context; -import org.javacc.parser.JavaCCGlobals; import org.javacc.parser.JavaCCParserConstants; import org.javacc.parser.Options; import org.javacc.parser.Token; @@ -49,619 +48,532 @@ /** Class that implements a table driven code generator for the token manager in Java. 
*/ class TokenManagerCodeGenerator implements org.javacc.parser.TokenManagerCodeGenerator { - private static final String tokenManagerTemplate = "/templates/java/TokenManagerDriver.template"; - - private final Context context; - private JavaCodeBuilder jcb; - private static final String EOL = System.getProperty("line.separator"); - - TokenManagerCodeGenerator(final Context context) { - this.context = context; - } - - @Override - public void generateCode( - final CodeGeneratorSettings settings, final TokenizerData tokenizerData) { - - settings.putAll(Options.getOptions()); - - settings.put("maxOrdinal", tokenizerData.allMatches.size()); - settings.put("maxLexStates", tokenizerData.lexStateNames.length); - settings.put("nfaSize", tokenizerData.nfa.size()); - settings.put("charsVectorSize", ((Character.MAX_VALUE >> 6) + 1)); - settings.put("stateSetSize", tokenizerData.nfa.size()); - settings.put("parserName", tokenizerData.parserName); - settings.put("maxLongs", (tokenizerData.allMatches.size() / 64) + 1); - settings.put("parserName", tokenizerData.parserName); - settings.put("charStreamName", Options.getCharStreamName()); - settings.put("defaultLexState", tokenizerData.lexStateNames[tokenizerData.defaultLexState]); - settings.put("decls", tokenizerData.decls); - settings.put("generatedStates", tokenizerData.nfa.size()); - settings.put("initMatch", tokenizerData.initialMatchForLexState); - - final String tmSuperClass = (String) settings.get(Options.UO__TOKEN_MANAGER_SUPER_CLASS); - settings.put( - "tmSuperClass", - ((tmSuperClass == null) || tmSuperClass.equals("")) ? 
"" : "extends " + tmSuperClass); - settings.put("noDfa", Options.getNoDfa()); - - try { - - final File file = - new File(Options.getOutputDirectory(), tokenizerData.parserName + "TokenManager.java"); - jcb = JavaCodeBuilder.of(context, settings).setFile(file); - jcb.setPackageName(JavaUtil.parsePackage(context)); - - if (context.globals().cu_to_insertion_point_1.size() != 0) { - List tokens = null; - final Object firstToken = context.globals().cu_to_insertion_point_1.get(0); - jcb.printTokenSetup((Token) firstToken); - for (final Token t : context.globals().cu_to_insertion_point_1) { - if (t.kind == JavaCCParserConstants.IMPORT) { - tokens = new ArrayList<>(); - } else if ((tokens != null) && (t.kind == JavaCCParserConstants.SEMICOLON)) { - jcb.println("import", String.join("", tokens), ";"); - tokens = null; - } else if (tokens != null) { - tokens.add(CodeBuilder.toString(t)); - } - } - jcb.println(); - } + private static final String tokenManagerTemplate = "/templates/java/TokenManagerDriver.template"; - jcb.println("/* Beginning of code from " + tokenManagerTemplate + " */"); - jcb.println(); - jcb.printTemplate(tokenManagerTemplate); - jcb.println(); - jcb.println("/* End of code from " + tokenManagerTemplate + " */"); - jcb.println(); + private final Context context; + private JavaCodeBuilder jcb; + private static final String EOL = System.getProperty("line.separator"); - jcb.println(" /* Match info. */"); - jcb.println(); - dumpMatchInfo(jcb, tokenizerData); + TokenManagerCodeGenerator(final Context context) { + this.context = context; + } - if (!Options.getNoDfa()) { - jcb.println(" /* DFA tables. */"); - jcb.println(); - dumpDfaTables(jcb, tokenizerData); - } - - jcb.println(" /* NFA tables. 
*/"); - jcb.println(); - dumpNfaTables(jcb, tokenizerData); - - jcb.println(" static {"); - if (!Options.getNoDfa()) { - jcb.println(" InitStartAndSize();"); - } - jcb.println(" initJjChars();"); - jcb.println(" }"); - jcb.println(); - jcb.println("}"); - - } catch (final IOException ioe) { - ioe.printStackTrace(); - assert (false); + @Override + public void generateCode(final CodeGeneratorSettings settings, final TokenizerData tokenizerData) { + settings.putAll(Options.getOptions()); + + settings.put("maxOrdinal", tokenizerData.allMatches.size()); + settings.put("maxLexStates", tokenizerData.lexStateNames.length); + settings.put("nfaSize", tokenizerData.nfa.size()); + settings.put("charsVectorSize", ((Character.MAX_VALUE >> 6) + 1)); + settings.put("stateSetSize", tokenizerData.nfa.size()); + settings.put("parserName", tokenizerData.parserName); + settings.put("maxLongs", (tokenizerData.allMatches.size() / 64) + 1); + settings.put("parserName", tokenizerData.parserName); + settings.put("charStreamName", Options.getCharStreamName()); + settings.put("defaultLexState", tokenizerData.lexStateNames[tokenizerData.defaultLexState]); + settings.put("decls", tokenizerData.decls); + settings.put("generatedStates", tokenizerData.nfa.size()); + settings.put("initMatch", tokenizerData.initialMatchForLexState); + + final String tmSuperClass = (String) settings.get(Options.UO__TOKEN_MANAGER_SUPER_CLASS); + settings.put( + "tmSuperClass", + ((tmSuperClass == null) || tmSuperClass.equals("")) ? 
"" : "extends " + tmSuperClass + ); + settings.put("noDfa", Options.getNoDfa()); + + try { + final File file = new File(Options.getOutputDirectory(), tokenizerData.parserName + "TokenManager.java"); + jcb = JavaCodeBuilder.of(context, settings).setFile(file); + jcb.setPackageName(JavaUtil.parsePackage(context)); + + if (context.globals().cu_to_insertion_point_1.size() != 0) { + List tokens = null; + final Object firstToken = context.globals().cu_to_insertion_point_1.get(0); + jcb.printTokenSetup((Token) firstToken); + for (final Token t : context.globals().cu_to_insertion_point_1) { + if (t.kind == JavaCCParserConstants.IMPORT) { + tokens = new ArrayList<>(); + } else if ((tokens != null) && (t.kind == JavaCCParserConstants.SEMICOLON)) { + jcb.println("import", String.join("", tokens), ";"); + tokens = null; + } else if (tokens != null) { + tokens.add(CodeBuilder.toString(t)); + } + } + jcb.println(); + } + + jcb.println("/* Beginning of code from " + tokenManagerTemplate + " */"); + jcb.println(); + jcb.printTemplate(tokenManagerTemplate); + jcb.println(); + jcb.println("/* End of code from " + tokenManagerTemplate + " */"); + jcb.println(); + + jcb.println(" /* Match info. */"); + jcb.println(); + dumpMatchInfo(jcb, tokenizerData); + + if (!Options.getNoDfa()) { + jcb.println(" /* DFA tables. */"); + jcb.println(); + dumpDfaTables(jcb, tokenizerData); + } + + jcb.println(" /* NFA tables. 
*/"); + jcb.println(); + dumpNfaTables(jcb, tokenizerData); + + jcb.println(" static {"); + if (!Options.getNoDfa()) { + // ssKeys/ssValues are static final arrays; no init needed + } + jcb.println(" initJjChars();"); + jcb.println(" }"); + jcb.println(); + jcb.println("}"); + } catch (final IOException ioe) { + ioe.printStackTrace(); + assert (false); + } } - } - @Override - public void finish(final CodeGeneratorSettings settings, final TokenizerData tokenizerData) { + @Override + public void finish(final CodeGeneratorSettings settings, final TokenizerData tokenizerData) { + if (!Options.getBuildTokenManager()) { + return; + } - if (!Options.getBuildTokenManager()) { - return; + try { + jcb.close(); + } catch (final IOException ioe) { + ioe.printStackTrace(); + } } - try { - jcb.close(); - } catch (final IOException ioe) { - ioe.printStackTrace(); - } - } - - private static void dumpDfaTables(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { - - /* stringLiterals. */ - jcb.println(" // pairs of comment line & data line; format of comment line: // k, s"); - jcb.println( - " // k: key of map of lists of literals starting by char 'c', indexed by ((LexicalState << 16 | (int) c)"); - jcb.println(" // s: each string of the literals list for key k"); - jcb.println( - " // data line: len, ign_case (t/f=1/0), charAt(0..len-1), UCcharAt(0..len) if ic=true, kind, nfaStartState"); - jcb.println(" private static final int[] stringLiterals = {"); - int i = 0; - final Map startAndSize = new HashMap<>(); - for (final int key : tokenizerData.literalSequence.keySet()) { - final int[] arr = new int[2]; - final List l = tokenizerData.literalSequence.get(key); - final List kinds = tokenizerData.literalKinds.get(key); - arr[0] = i; - arr[1] = l.size(); - int j = 0; - if (i > 0) { - jcb.println(","); - } - for (final String s : l) { - if (j > 0) { - jcb.println(", "); + private static void dumpDfaTables(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { + /* 
stringLiterals — collect into int[], emit via init method to avoid overflow. */ + // Pre-calculate total size to avoid boxing overhead from List. + int totalSize = 0; + for (final int key : tokenizerData.literalSequence.keySet()) { + final List l = tokenizerData.literalSequence.get(key); + final List kinds = tokenizerData.literalKinds.get(key); + int j = 0; + for (final String s : l) { + final int kind = kinds.get(j); + final boolean ignoreCase = tokenizerData.ignoreCaseKinds.contains(kind); + // length + ignoreCase flag + chars + (uppercased chars if ignoreCase) + kind + nfaStartState + totalSize += 2 + s.length() + (ignoreCase ? s.length() : 0) + 2; + j++; + } } - final int kind = kinds.get(j); - final boolean ignoreCase = tokenizerData.ignoreCaseKinds.contains(kind); - jcb.println(" // " + key + ", \"" + JavaCCGlobals.add_escapes(s) + "\""); - jcb.print(" "); - jcb.print(s.length()); - jcb.print(", "); - jcb.print(ignoreCase ? 1 : 0); - for (int k = 0; k < s.length(); k++) { - jcb.print(", "); - jcb.print((int) s.charAt(k)); - i++; + final int[] slData = new int[totalSize]; + int slPos = 0; + final Map startAndSize = new HashMap<>(tokenizerData.literalSequence.size() * 4 / 3 + 1); + for (final int key : tokenizerData.literalSequence.keySet()) { + final int[] arr = new int[2]; + final List l = tokenizerData.literalSequence.get(key); + final List kinds = tokenizerData.literalKinds.get(key); + arr[0] = slPos; + arr[1] = l.size(); + int j = 0; + for (final String s : l) { + final int kind = kinds.get(j); + final boolean ignoreCase = tokenizerData.ignoreCaseKinds.contains(kind); + slData[slPos++] = s.length(); + slData[slPos++] = ignoreCase ? 
1 : 0; + for (int k = 0; k < s.length(); k++) { + slData[slPos++] = s.charAt(k); + } + if (ignoreCase) { + final String upper = s.toUpperCase(); + for (int k = 0; k < upper.length(); k++) { + slData[slPos++] = upper.charAt(k); + } + } + slData[slPos++] = kind; + slData[slPos++] = tokenizerData.kindToNfaStartState.get(kind); + j++; + } + startAndSize.put(key, arr); } - if (ignoreCase) { - for (int k = 0; k < s.length(); k++) { - jcb.print(", "); - jcb.print((int) s.toUpperCase().charAt(k)); - i++; - } + JavaArrayHelper.emitIntArray(jcb, " ", "private", "stringLiterals", slData); + + /* startAndSize — sorted parallel arrays for binary-search lookup (no autoboxing). */ + final List sortedKeys = new ArrayList<>(startAndSize.keySet()); + java.util.Collections.sort(sortedKeys); + final int[] ssKeysArr = new int[sortedKeys.size()]; + final int[] ssValuesArr = new int[sortedKeys.size() * 2]; + for (int idx = 0; idx < sortedKeys.size(); idx++) { + ssKeysArr[idx] = sortedKeys.get(idx); + final int[] v = startAndSize.get(sortedKeys.get(idx)); + ssValuesArr[idx * 2] = v[0]; + ssValuesArr[idx * 2 + 1] = v[1]; } - jcb.print(", " + kind); - jcb.print(", " + tokenizerData.kindToNfaStartState.get(kind)); - i += 4; - j++; - } - startAndSize.put(key, arr); - } - jcb.println(); - jcb.println(" };"); - jcb.println(); - - /* startAndSize. */ - jcb.println(" private static final java.util.Map startAndSize ="); - jcb.println(" new java.util.HashMap();"); - jcb.println(); - - /* InitStartAndSize. 
*/ - jcb.println(" // format of \"startAndSize.put(k, new int[] {ix, sz})\":"); - jcb.println( - " // k: key of map of lists of literals starting by char 'c', indexed by ((LexicalState << 16 | (int) c)"); - jcb.println(" // ix: index (list's start in stringLiterals)"); - jcb.println(" // sz: list's size"); - jcb.println(" private static void InitStartAndSize() {"); - for (final int key : tokenizerData.literalSequence.keySet()) { - final int[] arr = startAndSize.get(key); - jcb.println(" startAndSize.put(" + key + ", new int[] {" + arr[0] + ", " + arr[1] + "});"); - } - jcb.println(" }"); - jcb.println(); - } - - private static void dumpNfaTables(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { - - /* canMatchAnyChar. */ - jcb.print(" private static final int[] canMatchAnyChar = {"); - int v = 0; - for (int i = 0; i < tokenizerData.wildcardKind.size(); i++) { - if (v++ > 0) { - jcb.print(", "); - } else { + JavaArrayHelper.emitIntArray(jcb, " ", "private", "ssKeys", ssKeysArr); + JavaArrayHelper.emitIntArray(jcb, " ", "private", "ssValues", ssValuesArr); jcb.println(); - jcb.print(" "); - } - jcb.print(tokenizerData.wildcardKind.get(i)); } - if (!tokenizerData.wildcardKind.isEmpty()) { - jcb.println(); - jcb.println(" };"); - } else { - jcb.println("};"); - } - jcb.println(); - - /* jjInitStates. */ - jcb.print(" private static final int[] jjInitStates = {"); - v = 0; - for (final int i : tokenizerData.initialStates.keySet()) { - if (v++ > 0) { - jcb.print(", "); - } else { + + private static void dumpNfaTables(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { + /* canMatchAnyChar — emit via init method for clinit safety. */ + { + final int[] arr = new int[tokenizerData.wildcardKind.size()]; + for (int i = 0; i < arr.length; i++) { + arr[i] = tokenizerData.wildcardKind.get(i); + } + JavaArrayHelper.emitIntArray(jcb, " ", "private", "canMatchAnyChar", arr); + } + + /* jjInitStates — emit via init method. 
*/ + { + final int[] keys = tokenizerData.initialStates.keySet().stream() + .mapToInt(Integer::intValue).toArray(); + final int[] arr = new int[keys.length]; + for (int i = 0; i < keys.length; i++) { + arr[i] = tokenizerData.initialStates.get(keys[i]); + } + JavaArrayHelper.emitIntArray(jcb, " ", "private", "jjInitStates", arr); + } + + /* jjInitialMatchForLexState — emit via init method. */ + JavaArrayHelper.emitIntArray(jcb, " ", "private", "jjInitialMatchForLexState", + tokenizerData.initialMatchForLexState); + + // We do the following for Java so that the generated code is reasonable + // size and can be compiled. May not be needed for other languages. + + /* EMPTY_CHAR_DATA. */ + jcb.println(" private static final long[] EMPTY_CHAR_DATA = new long[] {};"); jcb.println(); - jcb.print(" "); - } - jcb.print(tokenizerData.initialStates.get(i)); - } - if (!tokenizerData.initialStates.isEmpty()) { - jcb.println(); - jcb.println(" };"); - } else { - jcb.println("};"); - } - jcb.println(); - - /* jjInitialMatchForLexState. */ - jcb.print(" private static final int[] jjInitialMatchForLexState = {"); - v = 0; - for (int i = 0; i < tokenizerData.lexStateNames.length; i++) { - if (v++ > 0) { - jcb.print(", "); - } else { + + /* CharDataConsts. */ + jcb.println(" private static final class CharDataConsts {"); jcb.println(); - jcb.print(" "); - } - jcb.print(tokenizerData.initialMatchForLexState[i]); - } - jcb.println("};"); - jcb.println(); - - // We do the following for Java so that the generated code is reasonable - // size and can be compiled. May not be needed for other languages. - - /* EMPTY_CHAR_DATA. */ - jcb.println(" private static final long[] EMPTY_CHAR_DATA = new long[] {};"); - jcb.println(); - - /* CharDataConsts. */ - jcb.println(" private static final class CharDataConsts {"); - jcb.println(); - - /* jjCharData into a buffer. 
*/ - final Map nfa = tokenizerData.nfa; - final Map charDataVars = new HashMap(); - final Map charDataCdbs = new HashMap(); - - final StringBuilder sb = new StringBuilder(64 + 18 * nfa.size()); - sb.append(" private static final long[][] jjCharData = {"); - final String charDataVarPrefix = "CHAR_DATA"; - final StringBuilder charDataBuilder = new StringBuilder(64); - for (int i = 0; i < nfa.size(); i++) { - if (i > 0) { - sb.append(',').append(EOL); - } else { - sb.append(EOL); - } - charDataBuilder.setLength(0); - // We have a lot of similar states. So factor them so we don't get "Code too large" errors. - final TokenizerData.NfaState tmp = nfa.get(i); - if (tmp == null) { - // sb.append(" EMPTY_CHAR_DATA").append(EOL); - sb.append(" /* ").append(i).append(" */ EMPTY_CHAR_DATA").append(EOL); - } else { - charDataBuilder.append("new long[] {"); - final BitSet bits = new BitSet(); - for (final char c : tmp.characters) { - bits.set(c); - } - final long[] longs = bits.toLongArray(); - for (int k = 0; k < longs.length; k++) { - int rep = 1; - while (((k + rep) < longs.length) && (longs[k + rep] == longs[k])) { - rep++; - } - if (k > 0) { - charDataBuilder.append(", "); - } - charDataBuilder.append(rep).append(", "); - charDataBuilder.append(Long.toString(longs[k])).append("L"); - k += rep - 1; + + /* jjCharData into a buffer. */ + final Map nfa = tokenizerData.nfa; + final Map charDataVars = new HashMap<>(); + final Map charDataCdbs = new HashMap<>(); + + final StringBuilder sb = new StringBuilder(64 + 18 * nfa.size()); + sb.append(" private static final long[][] jjCharData = {"); + final String charDataVarPrefix = "CHAR_DATA"; + final StringBuilder charDataBuilder = new StringBuilder(64); + for (int i = 0; i < nfa.size(); i++) { + if (i > 0) { + sb.append(',').append(EOL); + } else { + sb.append(EOL); + } + charDataBuilder.setLength(0); + // We have a lot of similar states. So factor them so we don't get "Code too large" errors. 
+ final TokenizerData.NfaState tmp = nfa.get(i); + if (tmp == null) { + // sb.append(" EMPTY_CHAR_DATA").append(EOL); + sb.append(" /* ").append(i).append(" */ EMPTY_CHAR_DATA").append(EOL); + } else { + charDataBuilder.append("new long[] {"); + final BitSet bits = new BitSet(); + for (final char c : tmp.characters) { + bits.set(c); + } + final long[] longs = bits.toLongArray(); + for (int k = 0; k < longs.length; k++) { + int rep = 1; + while (((k + rep) < longs.length) && (longs[k + rep] == longs[k])) { + rep++; + } + if (k > 0) { + charDataBuilder.append(", "); + } + charDataBuilder.append(rep).append(", "); + charDataBuilder.append(Long.toString(longs[k])).append("L"); + k += rep - 1; + } + charDataBuilder.append("}"); + final String cdb = charDataBuilder.toString(); + String var = charDataVars.get(cdb); + if (var == null) { + var = charDataVarPrefix + (charDataVars.size() + 1); + charDataVars.put(cdb, var); + charDataCdbs.put(var, cdb); + } + // sb.append(" " + var); + sb.append(" /* ").append(i).append(" */ ").append(var); + } } - charDataBuilder.append("}"); - final String cdb = charDataBuilder.toString(); - String var = charDataVars.get(cdb); - if (var == null) { - var = charDataVarPrefix + (charDataVars.size() + 1); - charDataVars.put(cdb, var); - charDataCdbs.put(var, cdb); + if (!nfa.isEmpty()) { + sb.append(EOL).append(" };").append(EOL); + } else { + sb.append(" };").append(EOL); } - // sb.append(" " + var); - sb.append(" /* ").append(i).append(" */ ").append(var); - } - } - if (!nfa.isEmpty()) { - sb.append(EOL).append(" };").append(EOL); - } else { - sb.append(" };").append(EOL); - } - // in order, for easier comparison with C# & C++ - for (int k = 1; k <= charDataCdbs.size(); k++) { - final String key = charDataVarPrefix + Integer.toString(k); - jcb.println(" private static final long[] " + key + " = " + charDataCdbs.get(key) + ";"); - } - jcb.println(); - - // now print jjCharData buffer - jcb.println(sb); - - /* end class CharDataConsts. 
*/ - jcb.println(" }"); - jcb.println(); - - /* EMPTY_STATE_SET. */ - jcb.println(" private static final int[] EMPTY_STATE_SET = new int[] {};"); - jcb.println(); - - /* jjcompositeState. */ - jcb.print(" private static final int[][] jjcompositeState = {"); - for (int i = 0; i < nfa.size(); i++) { - final TokenizerData.NfaState tmp = nfa.get(i); - if (i > 0) { - jcb.println(","); - } else { - jcb.println(); - } - if (tmp == null || tmp.compositeStates.isEmpty()) { - // jcb.print(" EMPTY_STATE_SET"); - jcb.print(" /* " + i + " */ EMPTY_STATE_SET"); - } else { - // jcb.print(" new int[] { "); - jcb.print(" /* " + i + " */ new int[] { "); - int k = 0; - for (final int st : tmp.compositeStates) { - if (k++ > 0) { - jcb.print(", "); - } - jcb.print(st); + // Emit CHAR_DATA_N arrays via init methods to prevent CharDataConsts. overflow. + // Each long literal compiles to ~12 bytes of bytecode; large Unicode character classes + // (like CHAR_DATA27/28/31 in full-Unicode grammars) can have 500+ RLE pairs, and with + // 70+ unique arrays the combined clinit easily exceeds 64KB. + final int CHAR_DATA_INLINE_LIMIT = 100; // elements; above this → init method + for (int k = 1; k <= charDataCdbs.size(); k++) { + final String key = charDataVarPrefix + Integer.toString(k); + final String initExpr = charDataCdbs.get(key); // e.g. 
"new long[] {1, 4294977024L}" + // Count elements to decide inline vs init method + final int commaCount = initExpr.length() - initExpr.replace(",", "").length(); + final int elemCount = commaCount + 1; // rough count of array elements + if (elemCount <= CHAR_DATA_INLINE_LIMIT) { + // Small array — keep inline (original behavior) + jcb.println(" private static final long[] " + key + " = " + initExpr + ";"); + } else { + // Large array — wrap in init method to keep out of + jcb.println(" private static final long[] " + key + " = " + key + "_init();"); + jcb.println(" private static long[] " + key + "_init() {"); + jcb.println(" return " + initExpr + ";"); + jcb.println(" }"); + } } - jcb.print(" }"); - } - } - if (!nfa.isEmpty()) { - jcb.println(); - jcb.println(" };"); - } else { - jcb.println("};"); - } - jcb.println(); - - /* jjmatchKinds. */ - jcb.print(" private static final int[] jjmatchKinds = {"); - for (int i = 0; i < nfa.size(); i++) { - final TokenizerData.NfaState tmp = nfa.get(i); - if (i > 0) { - jcb.println(","); - } else { - jcb.println(); - } - // jcb.print(" "); - jcb.print(" /* " + i + " */ "); - // TODO(sreeni) : Fix this mess. - jcb.print(tmp == null ? Integer.MAX_VALUE : tmp.kind); - } - if (!nfa.isEmpty()) { - jcb.println(); - jcb.println(" };"); - } else { - jcb.println("};"); - } - jcb.println(); - - /* jjnextStateSet. */ - jcb.print(" private static final int[][] jjnextStateSet = {"); - for (int i = 0; i < nfa.size(); i++) { - final TokenizerData.NfaState tmp = nfa.get(i); - if (i > 0) { - jcb.println(","); - } else { jcb.println(); - } - if (tmp == null || tmp.nextStates.isEmpty()) { - // jcb.print(" EMPTY_STATE_SET"); - jcb.print(" /* " + i + " */ EMPTY_STATE_SET"); - } else { - int k = 0; - // jcb.print(" new int[] { "); - jcb.print(" /* " + i + " */ new int[] { "); - for (final int s : tmp.nextStates) { - if (k++ > 0) { - jcb.print(", "); - } - jcb.print(s); + + // Emit jjCharData reference array via init method when large. 
+ // The StringBuilder 'sb' contains the full "private static final long[][] jjCharData = { ... };" + // declaration. For large NFA state counts (641+ states), this reference array alone + // contributes ~5KB+ to . Wrap in init method for safety. + if (nfa.size() > 500) { + // Replace inline declaration with init method + String jjCharDataDecl = sb.toString(); + // Change "private static final long[][] jjCharData = {" to return statement + jjCharDataDecl = jjCharDataDecl.replace( + "private static final long[][] jjCharData = {", + "private static final long[][] jjCharData = jjCharData_init();" + + EOL + " private static long[][] jjCharData_init() {" + + EOL + " return new long[][] {"); + // Close the init method after the array + jjCharDataDecl = jjCharDataDecl.replace("};", "};" + EOL + " }"); + jcb.println(jjCharDataDecl); + } else { + // Small enough — original inline behavior + jcb.println(sb); } - jcb.print(" }"); - } - } - if (!nfa.isEmpty()) { - jcb.println(); - jcb.println(" };"); - } else { - jcb.println("};"); - } - jcb.println(); - } - - private static void dumpMatchInfo(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { - final Map allMatches = tokenizerData.allMatches; - - // A bit ugly. - - final BitSet toSkip = new BitSet(allMatches.size()); - final BitSet toSpecial = new BitSet(allMatches.size()); - final BitSet toMore = new BitSet(allMatches.size()); - final BitSet toToken = new BitSet(allMatches.size()); - final int[] newStates = new int[allMatches.size()]; - toSkip.set(allMatches.size() + 1, true); - toToken.set(allMatches.size() + 1, true); - toMore.set(allMatches.size() + 1, true); - toSpecial.set(allMatches.size() + 1, true); - - /* jjstrLiteralImages. 
*/ - jcb.println(" public static final String[] jjstrLiteralImages = {"); - int k = 0; - for (int i = 0; i < allMatches.size(); i++) { - final TokenizerData.MatchInfo matchInfo = allMatches.get(i); - switch (matchInfo.matchType) { - case SKIP: - toSkip.set(i); - break; - case SPECIAL_TOKEN: - toSpecial.set(i); - break; - case MORE: - toMore.set(i); - break; - case TOKEN: - toToken.set(i); - break; - } - newStates[i] = matchInfo.newLexState; - final String image = matchInfo.image; - if (k++ > 0) { - jcb.println(","); - } - if (image != null) { - jcb.print(" \""); - for (int j = 0; j < image.length(); j++) { - final int cj = image.charAt(j); - switch (cj) { - case '\b': - jcb.print("\\b"); - continue; - case '\t': - jcb.print("\\t"); - continue; - case '\n': - jcb.print("\\n"); - continue; - case '\f': - jcb.print("\\f"); - continue; - case '\r': - jcb.print("\\r"); - continue; - case '\"': - jcb.print("\\\""); - continue; - case '\'': - jcb.print("\\\'"); - continue; - case '\\': - jcb.print("\\\\"); - continue; - default: - if (cj <= 0xff) { - if (cj < 0x20 || (cj > 0x7e)) { - jcb.print("0x" + Integer.toHexString(cj)); - } else { - jcb.print(image.charAt(j)); + + /* end class CharDataConsts. */ + jcb.println(" }"); + jcb.println(); + + /* jjcompositeState and jjnextStateSet are now flattened; EMPTY_STATE_SET no longer needed. */ + + /* jjcompositeState — flatten int[][] to 1D for clinit safety and cache efficiency. 
*/ + final List compositeRows = new ArrayList<>(nfa.size()); + for (int i = 0; i < nfa.size(); i++) { + final TokenizerData.NfaState tmp = nfa.get(i); + if (tmp == null || tmp.compositeStates.isEmpty()) { + compositeRows.add(null); + } else { + final int[] row = new int[tmp.compositeStates.size()]; + int k = 0; + for (final int st : tmp.compositeStates) { + row[k++] = st; } - } else { - String hexVal = Integer.toHexString(image.charAt(j)); - if (hexVal.length() == 3) { - hexVal = "0" + hexVal; + compositeRows.add(row); + } + } + final JavaArrayHelper.FlatIntArray2D flatComposite = JavaArrayHelper.flatten(compositeRows); + JavaArrayHelper.emitFlatIntArray2D(jcb, " ", "private", "jjcompositeState", flatComposite); + + /* jjmatchKinds — emit via init method. */ + final int[] matchKindsArr = new int[nfa.size()]; + for (int i = 0; i < nfa.size(); i++) { + final TokenizerData.NfaState tmp = nfa.get(i); + matchKindsArr[i] = (tmp == null) ? Integer.MAX_VALUE : tmp.kind; + } + JavaArrayHelper.emitIntArray(jcb, " ", "private", "jjmatchKinds", matchKindsArr); + + /* jjnextStateSet — flatten int[][] to 1D for clinit safety and cache efficiency. */ + final List nextStateRows = new ArrayList<>(nfa.size()); + for (int i = 0; i < nfa.size(); i++) { + final TokenizerData.NfaState tmp = nfa.get(i); + if (tmp == null || tmp.nextStates.isEmpty()) { + nextStateRows.add(null); + } else { + final int[] row = new int[tmp.nextStates.size()]; + int k = 0; + for (final int s : tmp.nextStates) { + row[k++] = s; } - jcb.print("\\u" + hexVal); - } - continue; - } + nextStateRows.add(row); + } } - jcb.print("\""); - } else { - jcb.print(" null"); - } + final JavaArrayHelper.FlatIntArray2D flatNextState = JavaArrayHelper.flatten(nextStateRows); + JavaArrayHelper.emitFlatIntArray2D(jcb, " ", "private", "jjnextStateSet", flatNextState); } - jcb.println(); - jcb.println(" };"); - jcb.println(); - - /* Bit masks. 
*/ - generateBitVector(jcb, "jjtoToken", toToken); - jcb.println(); - generateBitVector(jcb, "jjtoSkip", toSkip); - jcb.println(); - generateBitVector(jcb, "jjtoSpecial", toSpecial); - jcb.println(); - generateBitVector(jcb, "jjtoMore", toMore); - jcb.println(); - - /* jjnewLexState. */ - jcb.println(" private static final int[] jjnewLexState = {"); - for (int i = 0; i < newStates.length; i++) { - if (i > 0) { - jcb.print(", "); - } else { - jcb.print(" "); - } - // codeGenerator.genCode("0x" + Integer.toHexString(newStates[i])); - jcb.print(Integer.toString(newStates[i])); + + private static void dumpMatchInfo(final JavaCodeBuilder jcb, final TokenizerData tokenizerData) { + final Map allMatches = tokenizerData.allMatches; + + // A bit ugly. + + final BitSet toSkip = new BitSet(allMatches.size()); + final BitSet toSpecial = new BitSet(allMatches.size()); + final BitSet toMore = new BitSet(allMatches.size()); + final BitSet toToken = new BitSet(allMatches.size()); + final int[] newStates = new int[allMatches.size()]; + toSkip.set(allMatches.size() + 1, true); + toToken.set(allMatches.size() + 1, true); + toMore.set(allMatches.size() + 1, true); + toSpecial.set(allMatches.size() + 1, true); + + /* jjstrLiteralImages — collect into String[], emit via init method. 
*/ + final String[] literalImages = new String[allMatches.size()]; + final StringBuilder imgBuilder = new StringBuilder(32); + for (int i = 0; i < allMatches.size(); i++) { + final TokenizerData.MatchInfo matchInfo = allMatches.get(i); + switch (matchInfo.matchType) { + case SKIP: + toSkip.set(i); + break; + case SPECIAL_TOKEN: + toSpecial.set(i); + break; + case MORE: + toMore.set(i); + break; + case TOKEN: + toToken.set(i); + break; + } + newStates[i] = matchInfo.newLexState; + final String image = matchInfo.image; + if (image != null) { + imgBuilder.setLength(0); + imgBuilder.append('"'); + for (int j = 0; j < image.length(); j++) { + final int cj = image.charAt(j); + switch (cj) { + case '\b': + imgBuilder.append("\\b"); + continue; + case '\t': + imgBuilder.append("\\t"); + continue; + case '\n': + imgBuilder.append("\\n"); + continue; + case '\f': + imgBuilder.append("\\f"); + continue; + case '\r': + imgBuilder.append("\\r"); + continue; + case '\"': + imgBuilder.append("\\\""); + continue; + case '\'': + imgBuilder.append("\\\'"); + continue; + case '\\': + imgBuilder.append("\\\\"); + continue; + default: + if (cj <= 0xff) { + if (cj < 0x20 || (cj > 0x7e)) { + imgBuilder.append("0x").append(Integer.toHexString(cj)); + } else { + imgBuilder.append(image.charAt(j)); + } + } else { + String hexVal = Integer.toHexString(image.charAt(j)); + if (hexVal.length() == 3) { + hexVal = "0" + hexVal; + } + imgBuilder.append("\\u").append(hexVal); + } + continue; + } + } + imgBuilder.append("\""); + literalImages[i] = imgBuilder.toString(); + } + // null entries stay null in the array + } + JavaArrayHelper.emitStringArray(jcb, " ", "public", "jjstrLiteralImages", literalImages); + + /* Bit masks. 
*/ + generateBitVector(jcb, "jjtoToken", toToken); + jcb.println(); + generateBitVector(jcb, "jjtoSkip", toSkip); + jcb.println(); + generateBitVector(jcb, "jjtoSpecial", toSpecial); + jcb.println(); + generateBitVector(jcb, "jjtoMore", toMore); + jcb.println(); + + /* jjnewLexState — emit via init method. */ + JavaArrayHelper.emitIntArray(jcb, " ", "private", "jjnewLexState", newStates); + + // Action functions. + + final String staticString = Options.getStatic() ? " static " : " "; + + // Token actions. + jcb.println(staticString + "void TokenLexicalActions(Token matchedToken) {"); + jcb.println(" // TOKEN lexical actions"); + // dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.TOKEN, "matchedToken.kind"); + dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.TOKEN, "jjmatchedKind"); + jcb.println(" }"); + jcb.println(); + + // Skip actions. + // TODO(sreeni) : Streamline this mess. + jcb.println(staticString + "void SkipLexicalActions(Token matchedToken) {"); + jcb.println(" // SKIP lexical actions"); + dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.SKIP, "jjmatchedKind"); + // jcb.println(" // SPECIAL_TOKEN lexical actions"); + // dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.SPECIAL_TOKEN, + // "jjmatchedKind"); + jcb.println(" }"); + jcb.println(); + + // More actions. + jcb.println(staticString + "void MoreLexicalActions() {"); + // jcb.println(" jjimageLen += (lengthOfMatch = jjmatchedPos + 1);"); + jcb.println(" // MORE lexical actions"); + dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.MORE, "jjmatchedKind"); + jcb.println(" }"); + jcb.println(); } - jcb.println(); - jcb.println(" };"); - jcb.println(); - - // Action functions. - - final String staticString = Options.getStatic() ? " static " : " "; - - // Token actions. 
- jcb.println(staticString + "void TokenLexicalActions(Token matchedToken) {"); - jcb.println(" // TOKEN lexical actions"); - // dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.TOKEN, "matchedToken.kind"); - dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.TOKEN, "jjmatchedKind"); - jcb.println(" }"); - jcb.println(); - - // Skip actions. - // TODO(sreeni) : Streamline this mess. - jcb.println(staticString + "void SkipLexicalActions(Token matchedToken) {"); - jcb.println(" // SKIP lexical actions"); - dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.SKIP, "jjmatchedKind"); - // jcb.println(" // SPECIAL_TOKEN lexical actions"); - // dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.SPECIAL_TOKEN, - // "jjmatchedKind"); - jcb.println(" }"); - jcb.println(); - - // More actions. - jcb.println(staticString + "void MoreLexicalActions() {"); - // jcb.println(" jjimageLen += (lengthOfMatch = jjmatchedPos + 1);"); - jcb.println(" // MORE lexical actions"); - dumpLexicalActions(jcb, allMatches, TokenizerData.MatchType.MORE, "jjmatchedKind"); - jcb.println(" }"); - jcb.println(); - } - - private static void dumpLexicalActions( - final JavaCodeBuilder jcb, - final Map allMatches, - final TokenizerData.MatchType matchType, - final String kindString) { - - jcb.println(" switch (" + kindString + ") {"); - for (final int i : allMatches.keySet()) { - final TokenizerData.MatchInfo matchInfo = allMatches.get(i); - if ((matchInfo.action == null) || (matchInfo.matchType != matchType)) { - continue; - } - jcb.println(" case " + i + ": {"); - // TODO check (MMa start added) comes from v7 output cf. 
JSqlParser - if (matchInfo.matchType == MatchType.SKIP) { - } else if (matchInfo.matchType == MatchType.MORE) { - jcb.println(" jjimageLen += (lengthOfMatch = jjmatchedPos);"); - } else if (matchInfo.matchType == MatchType.TOKEN) { - jcb.println( - " image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos)));"); - } - // TODO check (MMa end added) - jcb.println(" " + matchInfo.action.trim()); - jcb.println(" break;"); - jcb.println(" }"); + + private static void dumpLexicalActions( + final JavaCodeBuilder jcb, + final Map allMatches, + final TokenizerData.MatchType matchType, + final String kindString + ) { + jcb.println(" switch (" + kindString + ") {"); + for (final int i : allMatches.keySet()) { + final TokenizerData.MatchInfo matchInfo = allMatches.get(i); + if ((matchInfo.action == null) || (matchInfo.matchType != matchType)) { + continue; + } + jcb.println(" case " + i + ": {"); + // TODO check (MMa start added) comes from v7 output cf. JSqlParser + if (matchInfo.matchType == MatchType.SKIP) { + } else if (matchInfo.matchType == MatchType.MORE) { + jcb.println(" jjimageLen += (lengthOfMatch = jjmatchedPos);"); + } else if (matchInfo.matchType == MatchType.TOKEN) { + jcb.println( + " image.append(input_stream.GetSuffix(jjimageLen + (lengthOfMatch = jjmatchedPos)));" + ); + } + // TODO check (MMa end added) + jcb.println(" " + matchInfo.action.trim()); + jcb.println(" break;"); + jcb.println(" }"); + } + jcb.println(" default: break;"); + jcb.println(" }"); } - jcb.println(" default: break;"); - jcb.println(" }"); - } - - private static void generateBitVector( - final JavaCodeBuilder jcb, final String name, final BitSet bits) { - jcb.println(); - jcb.println(" private static final long[] " + name + " = {"); - final long[] longs = bits.toLongArray(); - for (int i = 0; i < longs.length; i++) { - if (i > 0) { - jcb.print(","); - } - // codeGenerator.genCode("0x" + Long.toHexString(longs[i]) + "L"); - jcb.print(" " + Long.toString(longs[i]) + 
"L"); + + private static void generateBitVector(final JavaCodeBuilder jcb, final String name, final BitSet bits) { + JavaArrayHelper.emitLongArray(jcb, " ", "private", name, bits.toLongArray()); } - jcb.println(); - jcb.println(" };"); - } } diff --git a/src/main/resources/templates/java/TokenManagerDriver.template b/src/main/resources/templates/java/TokenManagerDriver.template index fda7648..98f393d 100644 --- a/src/main/resources/templates/java/TokenManagerDriver.template +++ b/src/main/resources/templates/java/TokenManagerDriver.template @@ -350,12 +350,8 @@ ${decls} } #if TOKEN_FACTORY final Token t = ${TOKEN_FACTORY}.newToken(jjmatchedKind, curTokenImage); -#elif BINARY_NEW_TOKEN - final Token t = Token.newToken(jjmatchedKind, curTokenImage); #else - final Token t = Token.newToken(jjmatchedKind); - t.kind = jjmatchedKind; - t.image = curTokenImage; + final Token t = Token.newToken(jjmatchedKind, curTokenImage); #fi #if KEEP_LINE_COLUMN t.beginLine = beginLine; @@ -371,10 +367,10 @@ ${decls} int curPos = 0; int key = (int) curLexState << 16 | curChar; int startState = jjInitStates[curLexState]; - if (startAndSize.containsKey(key)) { - int[] arr = startAndSize.get(key); - int index = arr[0]; - for (int i = 0; i < arr[1]; i++) { + int _ssPos = java.util.Arrays.binarySearch(ssKeys, key); + if (_ssPos >= 0) { + int index = ssValues[_ssPos * 2]; + for (int i = 0; i < ssValues[_ssPos * 2 + 1]; i++) { int len = stringLiterals[index++]; int ignoreCase = stringLiterals[index++]; #if DEBUG_TOKEN_MANAGER @@ -496,10 +492,15 @@ ${decls} // Some NFA states have epsilon transitions (move on empty string). // So we just start with all of them. Note that the nextStates array already adds // the epsilon closure. Only the initial state needs to do this explicitly. 
- for (int s : jjcompositeState[startState]) { - if (moved[s] != moveIndex) { - stateSet[cnt++] = s; - moved[s] = moveIndex; + { + final int _csOff = jjcompositeState_offsets[startState]; + final int _csLen = jjcompositeState_lengths[startState]; + for (int _j = 0; _j < _csLen; _j++) { + final int s = jjcompositeState_data[_csOff + _j]; + if (moved[s] != moveIndex) { + stateSet[cnt++] = s; + moved[s] = moveIndex; + } } } @@ -538,11 +539,16 @@ ${decls} if ((jjChars[state][vectorIndex] & bitpattern) != 0L) { // Current input character can move this NFA state. // So add all the next states of the current states for use with the next input char. - for (int newState : jjnextStateSet[state]) { - if (moved[newState] != moveIndex) { - // We add each state only once. - newStateSet[newCnt++] = newState; - moved[newState] = moveIndex; + { + final int _nsOff = jjnextStateSet_offsets[state]; + final int _nsLen = jjnextStateSet_lengths[state]; + for (int _k = 0; _k < _nsLen; _k++) { + final int newState = jjnextStateSet_data[_nsOff + _k]; + if (moved[newState] != moveIndex) { + // We add each state only once. + newStateSet[newCnt++] = newState; + moved[newState] = moveIndex; + } } } int newKind = jjmatchKinds[state];