From 8b43dc77c1af598971c74e1c6552dacbed2b2414 Mon Sep 17 00:00:00 2001 From: Bertrand Martin Date: Tue, 2 Jun 2026 19:30:52 +0200 Subject: [PATCH 1/4] Optimize chained string concatenation with counted CONCAT --- .../jawk/backend/AVMExpressionBenchmark.java | 40 +++++++++++ src/main/java/io/jawk/backend/AVM.java | 23 ++++++ .../java/io/jawk/intermediate/AwkTuples.java | 67 ++++++++++++++++-- .../java/io/jawk/intermediate/Opcode.java | 10 +++ .../io/jawk/AwkTupleOptimizationTest.java | 70 +++++++++++++++++++ 5 files changed, 206 insertions(+), 4 deletions(-) diff --git a/src/jmh/java/io/jawk/backend/AVMExpressionBenchmark.java b/src/jmh/java/io/jawk/backend/AVMExpressionBenchmark.java index 2c67a53..a6b6917 100644 --- a/src/jmh/java/io/jawk/backend/AVMExpressionBenchmark.java +++ b/src/jmh/java/io/jawk/backend/AVMExpressionBenchmark.java @@ -61,6 +61,9 @@ public class AVMExpressionBenchmark { private AwkExpression fieldConcatenation; private AwkExpression fieldRegexMatch; private AwkExpression multiStringConcatenation; + private AwkExpression constantStringConcatenation; + private AwkExpression stringConstantStringConstantConcatenation; + private AwkExpression fourStringConcatenation; private AwkExpression mixedExpression; /** @@ -78,6 +81,9 @@ public void setup() throws IOException { this.fieldConcatenation = awk.compileExpression("$1 \" test\""); this.fieldRegexMatch = awk.compileExpression("$1 ~ /test/"); this.multiStringConcatenation = awk.compileExpression("$1 \" test1\" \" test2\" \" test3\""); + this.constantStringConcatenation = awk.compileExpression("\"constant\" \"constant\" \"constant\" \"constant\""); + this.stringConstantStringConstantConcatenation = awk.compileExpression("$1 \"constant\" $2 \"constant\""); + this.fourStringConcatenation = awk.compileExpression("$1 $2 $3 $4"); this.mixedExpression = awk.compileExpression("($1 + $2) \":\" ($3 ~ /test/) \":\" $4"); this.avm = new AVM(new AwkSettings(), Collections.emptyMap()); this.avm.prepareForEval("42 3.14 test-value suffix"); @@ -159,6 +165,40 @@ public Object multiStringConcatenation() throws IOException { return this.avm.eval(this.multiStringConcatenation); } + /** + * Measures the optimized constant-folded case for four constant string + * operands. + * + * @return expression result + * @throws IOException if input preparation or evaluation fails + */ + @Benchmark + public Object constantStringConcatenation() throws IOException { + return this.avm.eval(this.constantStringConcatenation); + } + + /** + * Measures alternating field and constant string concatenation. + * + * @return expression result + * @throws IOException if input preparation or evaluation fails + */ + @Benchmark + public Object stringConstantStringConstantConcatenation() throws IOException { + return this.avm.eval(this.stringConstantStringConstantConcatenation); + } + + /** + * Measures concatenation of four field string operands. + * + * @return expression result + * @throws IOException if input preparation or evaluation fails + */ + @Benchmark + public Object fourStringConcatenation() throws IOException { + return this.avm.eval(this.fourStringConcatenation); + } + /** * Measures mixed numeric, string, field, and regular expression operations. * diff --git a/src/main/java/io/jawk/backend/AVM.java b/src/main/java/io/jawk/backend/AVM.java index 089f7e7..f9cbccc 100644 --- a/src/main/java/io/jawk/backend/AVM.java +++ b/src/main/java/io/jawk/backend/AVM.java @@ -1177,6 +1177,29 @@ private void executeTuples(PositionTracker position) position.next(); break; } + case MULTI_CONCAT: { + // arg[0] = number of stack items to concatenate + // stack[0] = last concatenation operand + CountTuple countTuple = (CountTuple) tuple; + int count = (int) countTuple.getCount(); + // Store String references so appends run left-to-right. Converting + // operands to char[] would copy them once before StringBuilder + // copies them again, and front-inserting would shift existing + // content on each operand. + String[] values = new String[count]; + int resultLength = 0; + for (int i = count - 1; i >= 0; i--) { + values[i] = jrt.toAwkString(pop()); + resultLength += values[i].length(); + } + StringBuilder resultString = new StringBuilder(resultLength); + for (String value : values) { + resultString.append(value); + } + push(resultString.toString()); + position.next(); + break; + } case ASSIGN: case ASSIGN_NOPUSH: { // arg[0] = offset diff --git a/src/main/java/io/jawk/intermediate/AwkTuples.java b/src/main/java/io/jawk/intermediate/AwkTuples.java index 851e69f..57dc21d 100644 --- a/src/main/java/io/jawk/intermediate/AwkTuples.java +++ b/src/main/java/io/jawk/intermediate/AwkTuples.java @@ -32,6 +32,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.IdentityHashMap; +import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.Supplier; @@ -69,7 +70,7 @@ public class AwkTuples implements Serializable { * can be serialized and patched efficiently. A linked list would make every * lookup O(n) and complicate address reassignment. */ - private java.util.List queue = new ArrayList(100) { + private List queue = new ArrayList(100) { private static final long serialVersionUID = -6334362156408598578L; @Override @@ -1878,10 +1879,10 @@ private boolean peepholeOptimizePass() { return false; } - java.util.List original = new ArrayList(queue); + List original = new ArrayList(queue); int[] indexMapping = new int[originalSize]; Arrays.fill(indexMapping, -1); - java.util.List optimizedQueue = new ArrayList(originalSize); + List optimizedQueue = new ArrayList(originalSize); boolean[] isAddressTarget = addressTargets(original, originalSize); boolean modified = false; @@ -1889,6 +1890,25 @@ private boolean peepholeOptimizePass() { int newIndex = 0; while (oldIndex < originalSize) { Tuple tuple = original.get(oldIndex); + // If an earlier rewrite already happened in this pass, wait for the + // next pass before collapsing concat runs. That gives literal folding + // priority so fully constant chains become one PUSH_STRING instead of a + // partially folded PUSH_STRING plus MULTI_CONCAT. + ConcatRun concatRun = !modified ? concatRun(original, isAddressTarget, oldIndex) : null; + if (concatRun != null) { + // Chained concatenations compile as a run of binary CONCAT tuples + // after all operands have been pushed. Collapse that postfix run into + // one counted MULTI_CONCAT, e.g. CONCAT, CONCAT, CONCAT -> + // MULTI_CONCAT 4. + Tuple replacement = createMultiConcat(concatRun.itemCount, tuple.getLineNumber()); + optimizedQueue.add(replacement); + mapFoldedRange(indexMapping, oldIndex, concatRun.tupleCount, newIndex); + oldIndex += concatRun.tupleCount; + newIndex++; + modified = true; + continue; + } + if (tuple.getOpcode() == Opcode.ASSIGN && (oldIndex + 1) < originalSize) { Tuple nextTuple = original.get(oldIndex + 1); // Statement assignments compile as ASSIGN followed by POP because @@ -1987,7 +2007,7 @@ private boolean peepholeOptimizePass() { return true; } - private boolean[] addressTargets(java.util.List tuples, int tupleCount) { + private boolean[] addressTargets(List tuples, int tupleCount) { boolean[] targets = new boolean[tupleCount]; for (Tuple tuple : tuples) { Address address = tuple.getAddress(); @@ -2007,6 +2027,29 @@ private void mapFoldedRange(int[] indexMapping, int startIndex, int length, int } } + private ConcatRun concatRun(List original, boolean[] isAddressTarget, int oldIndex) { + Tuple tuple = original.get(oldIndex); + if (tuple.getOpcode() != Opcode.CONCAT) { + return null; + } + + int itemCount = 2; + int tupleCount = 1; + int currentIndex = oldIndex + 1; + while (currentIndex < original.size() + && original.get(currentIndex).getOpcode() == Opcode.CONCAT + && !isAddressTarget[currentIndex]) { + itemCount++; + tupleCount++; + currentIndex++; + } + + if (tupleCount < 2) { + return null; + } + return new ConcatRun(tupleCount, itemCount); + } + private Object literalValue(Tuple tuple) { switch (tuple.getOpcode()) { case PUSH_LONG: @@ -2162,6 +2205,22 @@ private Tuple createGetInputFieldConst(long fieldIndex, int lineNumber) { return tuple; } + private Tuple createMultiConcat(int itemCount, int lineNumber) { + Tuple tuple = new Tuple.CountTuple(Opcode.MULTI_CONCAT, itemCount); + tuple.setLineNumber(lineNumber); + return tuple; + } + + private static final class ConcatRun { + private final int tupleCount; + private final int itemCount; + + private ConcatRun(int tupleCount, int itemCount) { + this.tupleCount = tupleCount; + this.itemCount = itemCount; + } + } + private void remapAddresses(int[] indexMapping) { if (indexMapping.length == 0) { return; diff --git a/src/main/java/io/jawk/intermediate/Opcode.java b/src/main/java/io/jawk/intermediate/Opcode.java index b9af6d6..9297975 100644 --- a/src/main/java/io/jawk/intermediate/Opcode.java +++ b/src/main/java/io/jawk/intermediate/Opcode.java @@ -196,6 +196,16 @@ public enum Opcode { * Stack after: x-concatenated-with-y ... */ CONCAT, + /** + * Pops and concatenates N strings from the top-of-stack; push the result onto + * the stack. The number of items is passed in as a tuple argument. + *

+ * Argument: # of items (N) + *

+ * Stack before: x1 x2 x3 .. xN ...
+ * Stack after: x1-concatenated-through-xN ... + */ + MULTI_CONCAT, /** * Assigns the top-of-stack to a variable and pushes the assigned value back * onto the stack. diff --git a/src/test/java/io/jawk/AwkTupleOptimizationTest.java b/src/test/java/io/jawk/AwkTupleOptimizationTest.java index a4a2ccd..5fff44f 100644 --- a/src/test/java/io/jawk/AwkTupleOptimizationTest.java +++ b/src/test/java/io/jawk/AwkTupleOptimizationTest.java @@ -136,9 +136,60 @@ public void foldsLiteralStringConcatenation() throws Exception { AwkProgram tuples = new Awk().compile(script); List opcodes = collectOpcodes(tuples); assertFalse("Literal concatenation should eliminate CONCAT tuple", opcodes.contains(Opcode.CONCAT)); + assertFalse("Literal concatenation should eliminate MULTI_CONCAT tuple", opcodes.contains(Opcode.MULTI_CONCAT)); assertTrue("Expected folded literal push of foobar", hasLiteralPush(tuples, "foobar")); } + @Test + public void foldsChainedLiteralStringConcatenation() throws Exception { + String script = "BEGIN { print \"foo\" \"bar\" \"baz\" \"qux\" }\n"; + AwkTestSupport + .awkTest("folds chained literal string concatenation") + .script(script) + .expect("foobarbazqux\n") + .runAndAssert(); + + AwkProgram tuples = new Awk().compile(script); + List opcodes = collectOpcodes(tuples); + assertFalse("Chained literal concatenation should eliminate CONCAT tuple", opcodes.contains(Opcode.CONCAT)); + assertFalse( + "Chained literal concatenation should eliminate MULTI_CONCAT tuple", + opcodes.contains(Opcode.MULTI_CONCAT)); + assertTrue("Expected folded literal push of foobarbazqux", hasLiteralPush(tuples, "foobarbazqux")); + } + + @Test + public void optimizesChainedStringConcatenationAsSingleMultiConcat() throws Exception { + String script = "BEGIN { s1 = \"alpha\"; s2 = \"beta\"; print s1 \"-\" s2 \":\" }\n"; + AwkTestSupport + .awkTest("counted chained string concatenation") + .script(script) + .expect("alpha-beta:\n") + .runAndAssert(); + + AwkProgram tuples = new Awk().compile(script); + assertEquals( + "Expected one counted MULTI_CONCAT for the mixed chain", + 1, + countOpcodeWithCount(tuples, Opcode.MULTI_CONCAT, 4)); + assertEquals("Optimized mixed chain should not keep binary CONCAT tuples", 0, countOpcode(tuples, Opcode.CONCAT)); + } + + @Test + public void keepsParserConcatenationBinaryWhenOptimizationDisabled() throws Exception { + String script = "BEGIN { s1 = \"alpha\"; s2 = \"beta\"; print s1 \"-\" s2 \":\" }\n"; + AwkProgram tuples = new Awk().compile(script, true); + + assertEquals( + "Unoptimized parser output should keep one binary CONCAT per expression pair", + 3, + countOpcode(tuples, Opcode.CONCAT)); + assertEquals( + "Unoptimized parser output should not emit counted chain MULTI_CONCAT", + 0, + countOpcode(tuples, Opcode.MULTI_CONCAT)); + } + @Test public void foldsScalarAssignmentPopIntoNonPushingAssignment() throws Exception { String script = "BEGIN { a = -2; b = 2; c = 4; print a + b + c }\n"; @@ -204,6 +255,11 @@ public void doesNotFoldNumericConcatenation() throws Exception { AwkProgram tuples = new Awk().compile(script); List opcodes = collectOpcodes(tuples); assertTrue("Numeric literal concatenation should preserve CONCAT tuple", opcodes.contains(Opcode.CONCAT)); + assertEquals("Numeric literal concatenation should remain binary", 1, countOpcode(tuples, Opcode.CONCAT)); + assertEquals( + "Binary numeric literal concatenation should not use MULTI_CONCAT", + 0, + countOpcode(tuples, Opcode.MULTI_CONCAT)); assertFalse("Optimizer should not fold numeric/string concatenation", hasLiteralPush(tuples, "1x")); } @@ -570,6 +626,20 @@ private static int countOpcode(AwkProgram tuples, Opcode opcode) { return count; } + private static int countOpcodeWithCount(AwkProgram tuples, Opcode opcode, long expectedCount) { + int count = 0; + PositionTracker tracker = rawTuples(tuples).top(); + while (!tracker.isEOF()) { + if (tracker.opcode() == opcode + && tracker.current() instanceof Tuple.CountTuple + && ((Tuple.CountTuple) tracker.current()).getCount() == expectedCount) { + count++; + } + tracker.next(); + } + return count; + } + private static String dumpTuples(AwkProgram tuples) throws Exception { ByteArrayOutputStream out = new ByteArrayOutputStream(); try (PrintStream ps = new PrintStream(out, true, StandardCharsets.UTF_8.name())) { From 2932324139acc51cfa4df6b5bd5686b7b850f213 Mon Sep 17 00:00:00 2001 From: Bertrand Martin Date: Tue, 2 Jun 2026 20:27:09 +0200 Subject: [PATCH 2/4] Fixed grammar --- src/main/java/io/jawk/intermediate/Opcode.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/io/jawk/intermediate/Opcode.java b/src/main/java/io/jawk/intermediate/Opcode.java index 9297975..dc3222a 100644 --- a/src/main/java/io/jawk/intermediate/Opcode.java +++ b/src/main/java/io/jawk/intermediate/Opcode.java @@ -197,7 +197,7 @@ public enum Opcode { */ CONCAT, /** - * Pops and concatenates N strings from the top-of-stack; push the result onto + * Pops and concatenates N strings from the top-of-stack; pushes the result onto * the stack. The number of items is passed in as a tuple argument. *

* Argument: # of items (N) From e03ec8b1182452f2ffad384542b1c5c5ce564387 Mon Sep 17 00:00:00 2001 From: Bertrand Martin Date: Tue, 2 Jun 2026 20:27:09 +0200 Subject: [PATCH 3/4] Guard targeted CONCAT run optimization --- .../java/io/jawk/intermediate/AwkTuples.java | 2 +- .../io/jawk/AwkTupleOptimizationTest.java | 30 ++++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/jawk/intermediate/AwkTuples.java b/src/main/java/io/jawk/intermediate/AwkTuples.java index 57dc21d..b01d9fa 100644 --- a/src/main/java/io/jawk/intermediate/AwkTuples.java +++ b/src/main/java/io/jawk/intermediate/AwkTuples.java @@ -2029,7 +2029,7 @@ private void mapFoldedRange(int[] indexMapping, int startIndex, int length, int private ConcatRun concatRun(List original, boolean[] isAddressTarget, int oldIndex) { Tuple tuple = original.get(oldIndex); - if (tuple.getOpcode() != Opcode.CONCAT) { + if (tuple.getOpcode() != Opcode.CONCAT || isAddressTarget[oldIndex]) { return null; } diff --git a/src/test/java/io/jawk/AwkTupleOptimizationTest.java b/src/test/java/io/jawk/AwkTupleOptimizationTest.java index 5fff44f..8c55109 100644 --- a/src/test/java/io/jawk/AwkTupleOptimizationTest.java +++ b/src/test/java/io/jawk/AwkTupleOptimizationTest.java @@ -190,6 +190,30 @@ public void keepsParserConcatenationBinaryWhenOptimizationDisabled() throws Exce countOpcode(tuples, Opcode.MULTI_CONCAT)); } + @Test + public void keepsConcatRunWhenFirstConcatIsBranchTarget() { + AwkTuples tuples = new AwkTuples(); + tuples.pushSourceLineNumber(1); + Address concatTarget = tuples.createAddress("concat-target"); + + tuples.dereference(1, false, true); + tuples.ifFalse(concatTarget); + tuples.dereference(2, false, true); + tuples.dereference(3, false, true); + tuples.address(concatTarget); + tuples.concat(); + tuples.dereference(4, false, true); + tuples.concat(); + + tuples.optimize(); + + assertEquals("Targeted CONCAT run should remain binary", 2, countOpcode(tuples, Opcode.CONCAT)); + assertEquals( + "Targeted CONCAT run should not be folded into MULTI_CONCAT", + 0, + countOpcode(tuples, Opcode.MULTI_CONCAT)); + } + @Test public void foldsScalarAssignmentPopIntoNonPushingAssignment() throws Exception { String script = "BEGIN { a = -2; b = 2; c = 4; print a + b + c }\n"; @@ -615,8 +639,12 @@ private static boolean hasAddressTargetWithPredecessor(AwkProgram tuples, Opcode } private static int countOpcode(AwkProgram tuples, Opcode opcode) { + return countOpcode(rawTuples(tuples), opcode); + } + + private static int countOpcode(AwkTuples tuples, Opcode opcode) { int count = 0; - PositionTracker tracker = rawTuples(tuples).top(); + PositionTracker tracker = tuples.top(); while (!tracker.isEOF()) { if (tracker.opcode() == opcode) { count++; From 0f75d58524b918d6f9f28aeb1838e253ed7f7955 Mon Sep 17 00:00:00 2001 From: Bertrand Martin Date: Wed, 3 Jun 2026 11:21:52 +0200 Subject: [PATCH 4/4] Clarify MULTI_CONCAT documentation --- src/main/java/io/jawk/intermediate/Opcode.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/main/java/io/jawk/intermediate/Opcode.java b/src/main/java/io/jawk/intermediate/Opcode.java index dc3222a..b0995d2 100644 --- a/src/main/java/io/jawk/intermediate/Opcode.java +++ b/src/main/java/io/jawk/intermediate/Opcode.java @@ -197,8 +197,9 @@ public enum Opcode { */ CONCAT, /** - * Pops and concatenates N strings from the top-of-stack; pushes the result onto - * the stack. The number of items is passed in as a tuple argument. + * Pops and concatenates N values from the top-of-stack after AWK string + * conversion; pushes the result onto the stack. The number of items is passed + * in as a tuple argument. *

* Argument: # of items (N) *