From 2078851b480e90e473d37c0622106d5f29a7d07c Mon Sep 17 00:00:00 2001 From: yangjie01 Date: Tue, 2 Jun 2026 17:07:02 +0800 Subject: [PATCH] [SPARK-57208][SQL] Simplify Ascii codegen by extracting a static Java helper Move Ascii's first-character logic into a single ExpressionImplUtils.ascii(UTF8String) helper. nullSafeEval delegates to it and doGenCode becomes a one-line defineCodeGen call, so eval and codegen share one implementation instead of duplicating the substring/codePointAt block. This collapses the inlined block into one invokestatic per call site (fewer constant-pool entries, smaller generated method), helping with the JVM 64KB method / constant-pool limits, Janino compile time, and JIT work. Part of SPARK-56908. --- .../expressions/ExpressionImplUtils.java | 15 +++++++++++++ .../expressions/stringExpressions.scala | 22 +++---------------- .../expressions/StringExpressionsSuite.scala | 5 +++++ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java index 7bad7c430b862..5b32fcf99d89f 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/ExpressionImplUtils.java @@ -354,4 +354,19 @@ public static long crc32(byte[] bytes) { checksum.update(bytes, 0, bytes.length); return checksum.getValue(); } + + /** + * Returns the numeric value of the first character of the input string, or 0 if it is empty. + * Shared by the Ascii expression's eval and codegen paths so the generated Java is a single + * call rather than an inline substring/if-else block. 
+ */ + public static int ascii(UTF8String str) { + // take only the first character so `toString` decodes one character, not the whole string + UTF8String firstCharStr = str.substring(0, 1); + if (firstCharStr.numChars() > 0) { + return firstCharStr.toString().codePointAt(0); + } else { + return 0; + } + } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index 5b5a63812dcab..5d8363d0e5260 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2774,27 +2774,11 @@ case class Ascii(child: Expression) override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeWithCollation(supportsTrimCollation = true)) - protected override def nullSafeEval(string: Any): Any = { - // only pick the first character to reduce the `toString` cost - val firstCharStr = string.asInstanceOf[UTF8String].substring(0, 1) - if (firstCharStr.numChars > 0) { - firstCharStr.toString.codePointAt(0) - } else { - 0 - } - } + protected override def nullSafeEval(string: Any): Any = + ExpressionImplUtils.ascii(string.asInstanceOf[UTF8String]) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { - nullSafeCodeGen(ctx, ev, (child) => { - val firstCharStr = ctx.freshName("firstCharStr") - s""" - UTF8String $firstCharStr = $child.substring(0, 1); - if ($firstCharStr.numChars() > 0) { - ${ev.value} = $firstCharStr.toString().codePointAt(0); - } else { - ${ev.value} = 0; - } - """}) + defineCodeGen(ctx, ev, c => s"${classOf[ExpressionImplUtils].getName}.ascii($c)") } override protected def withNewChildInternal(newChild: Expression): Ascii = copy(child = newChild) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index bca4984cfac9c..aac4fafb78028 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -378,6 +378,11 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { val a = $"a".string.at(0) checkEvaluation(Ascii(Literal("efg")), 101, create_row("abdef")) checkEvaluation(Ascii(a), 97, create_row("abdef")) + // U+1F600 is a supplementary-plane code point; ascii must return the full code point + // (128512 via codePointAt), not the leading UTF-16 surrogate (55357 via charAt). + // scalastyle:off + checkEvaluation(Ascii(Literal("😀")), 128512, create_row("😀")) + // scalastyle:on checkEvaluation(Ascii(a), 0, create_row("")) checkEvaluation(Ascii(a), null, create_row(null)) checkEvaluation(Ascii(Literal.create(null, StringType)), null, create_row("abdef"))