From 9b6ad556f13da660d1cde100953c59af8f6217e1 Mon Sep 17 00:00:00 2001
From: He-Pin <hepin1989@gmail.com>
Date: Thu, 18 Jun 2026 13:04:15 +0800
Subject: [PATCH] fix: std.char() rejects surrogate codepoints with an error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Motivation:
std.char() accepted surrogate codepoints (0xD800-0xDFFF) and produced
unpaired surrogate characters in strings. The three reference
implementations diverge on this: go-jsonnet replaces with U+FFFD,
jsonnet-cpp preserves raw surrogates, and jrsonnet rejects with an
error. Surrogates are not valid Unicode codepoints per the Unicode
spec, so sjsonnet should reject them — matching jrsonnet's approach
and sjsonnet's own behavior for out-of-range codepoints like 0x110000.

Modification:
- In Char_.evalRhs, extend the invalid codepoint check to include the
  surrogate range 0xD800-0xDFFF, raising "Invalid unicode code point"
- Updated UnicodeHandlingTests: renamed stdCharReplacesSurrogates to
  stdCharRejectsSurrogates, using evalErr for all surrogate inputs
- Replaced char_surrogate_replacement golden test with
  char_surrogate_boundary (valid boundary values) and
  error.char_surrogate (surrogate rejection)

Result:
std.char(0xD800) now raises "Invalid unicode code point, got 55296"
instead of producing an unpaired surrogate. Valid codepoints adjacent
to the surrogate range (0xD7FF, 0xE000) are unaffected. This aligns
with jrsonnet and follows the Unicode specification.

References:
- jrsonnet: rejects surrogates with "invalid unicode codepoint" error
- go-jsonnet: replaces surrogates with U+FFFD (different strategy)
- jsonnet-cpp: preserves raw surrogates (different strategy)
- Unicode spec: U+D800-U+DFFF are reserved, not assignable codepoints
---
 .../src/sjsonnet/stdlib/StringModule.scala    | 11 ++++----
 .../char_surrogate_boundary.jsonnet           |  9 ++++++
 .../char_surrogate_boundary.jsonnet.golden    |  7 +++++
 .../error.char_surrogate.jsonnet              |  3 ++
 .../error.char_surrogate.jsonnet.golden       |  1 +
 .../src/sjsonnet/UnicodeHandlingTests.scala   | 28 ++++++++-----------
 6 files changed, 36 insertions(+), 23 deletions(-)
 create mode 100644 sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet
 create mode 100644 sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden
 create mode 100644 sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet
 create mode 100644 sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden

diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
index 8215ebaf..77d8e923 100644
--- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
+++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
@@ -210,13 +210,12 @@ object StringModule extends AbstractFunctionModule {
    */
   private object Char_ extends Val.Builtin1("char", "n") {
     def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = {
-      val c = n.value.asInt
-      if (!Character.isValidCodePoint(c)) {
-        Error.fail(s"Invalid unicode code point, got " + c)
+      val c0 = n.value.asInt
+      if (!Character.isValidCodePoint(c0) || (c0 >= 0xd800 && c0 <= 0xdfff)) {
+        Error.fail(s"Invalid unicode code point, got " + c0)
       }
-      val s = Character.toString(c)
-      // Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe.
-      if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s)
+      val s = Character.toString(c0)
+      if (c0 >= 0x20 && c0 < 0x7f && c0 != '"' && c0 != '\\') Val.Str.asciiSafe(pos, s)
       else Val.Str(pos, s)
     }
   }
diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet
new file mode 100644
index 00000000..c5781048
--- /dev/null
+++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet
@@ -0,0 +1,9 @@
+// Test that std.char() works correctly for valid codepoints near the surrogate range
+// and rejects surrogate codepoints (0xD800-0xDFFF) with an error.
+[
+  std.codepoint(std.char(55295)),  // 0xD7FF - last valid before surrogates
+  std.codepoint(std.char(57344)),  // 0xE000 - first valid after surrogates
+  std.codepoint(std.char(65533)),  // 0xFFFD - replacement character itself
+  std.codepoint(std.char(0)),      // 0x0000 - null
+  std.codepoint(std.char(65)),     // 0x0041 - 'A'
+]
diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden
new file mode 100644
index 00000000..47132726
--- /dev/null
+++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden
@@ -0,0 +1,7 @@
+[
+   55295,
+   57344,
+   65533,
+   0,
+   65
+]
diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet
new file mode 100644
index 00000000..2133e28a
--- /dev/null
+++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet
@@ -0,0 +1,3 @@
+// std.char() should reject surrogate codepoints (0xD800-0xDFFF)
+// High surrogate 0xD800 = 55296
+std.char(55296)
diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden
new file mode 100644
index 00000000..64de38eb
--- /dev/null
+++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden
@@ -0,0 +1 @@
+sjsonnet.Error: [std.char] Invalid unicode code point, got 55296
diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
index 25c9d70f..e746b958 100644
--- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
+++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
@@ -163,23 +163,17 @@ object UnicodeHandlingTests extends TestSuite {
 
       assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0)
       assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0)
-
-      eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==>
-      ujson.Bool(true)
-
-      eval(
-        "std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])"
-      ) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair)
     }
 
-    // Unpaired surrogate handling - sjsonnet-specific behavior
+    // Unpaired surrogate handling
     //
-    // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet:
-    // - go/C++ reject unpaired surrogates in escape sequences at parse time
-    // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD
-    // - sjsonnet was preserving unpaired surrogates throughout
+    // Note: The three reference implementations diverge on surrogate codepoints:
+    // - go-jsonnet replaces surrogates with U+FFFD
+    // - jsonnet-cpp preserves raw surrogates
+    // - jrsonnet rejects surrogates with an error
     //
-    // sjsonnet now reject these to align with go-jsonet/ c++ jsonnet
+    // sjsonnet rejects surrogates with an error (matching jrsonnet),
+    // since surrogates are not valid Unicode codepoints per the Unicode spec.
     //
 
     test("unpairedSurrogatesInEscapes") {
@@ -191,10 +185,10 @@ object UnicodeHandlingTests extends TestSuite {
       eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji
     }
 
-    test("stdCharPreservesRawSurrogates") {
-      // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD)
-      eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate
-      eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate
+    test("stdCharRejectsSurrogates") {
+      evalErr("std.char(55296)").contains("Invalid unicode code point") // 0xD800
+      evalErr("std.char(56320)").contains("Invalid unicode code point") // 0xDC00
+      evalErr("std.char(57343)").contains("Invalid unicode code point") // 0xDFFF
     }
 
     test("invalidSurrogateHandling") {