databricks · He-Pin · Jun 18, 2026
diff --git a/sjsonnet/src/sjsonnet/Format.scala b/sjsonnet/src/sjsonnet/Format.scala
@@ -716,7 +716,8 @@ object Format {
                     Error.fail("Codepoints must be >= 0, got " + codePoint)
                   if (codePoint > 0x10ffff)
                     Error.fail("Invalid unicode codepoint, got " + codePoint)
-                  widenRaw(formatted, Character.toString(codePoint))
+                  val c = if (codePoint >= 0xd800 && codePoint <= 0xdfff) 0xfffd else codePoint
+                  widenRaw(formatted, Character.toString(c))
                 case 's' =>
                   widenRaw(formatted, RenderUtils.renderDouble(s))
                 case _ =>

diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala
@@ -210,10 +210,11 @@ object StringModule extends AbstractFunctionModule {
    */
   private object Char_ extends Val.Builtin1("char", "n") {
     def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = {
-      val c = n.value.asInt
-      if (!Character.isValidCodePoint(c)) {
-        Error.fail(s"Invalid unicode code point, got " + c)
+      val c0 = n.value.asInt
+      if (!Character.isValidCodePoint(c0)) {
+        Error.fail(s"Invalid unicode code point, got " + c0)
       }
+      val c = if (c0 >= 0xd800 && c0 <= 0xdfff) 0xfffd else c0
       val s = Character.toString(c)
       // Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe.
       if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s)

diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
@@ -164,22 +164,19 @@ object UnicodeHandlingTests extends TestSuite {
       assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0)
       assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0)
 
-      eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==>
-      ujson.Bool(true)
+      // std.char maps surrogates to U+FFFD (like go-jsonnet), so only the 2nd char differs here
+      eval("(std.char(55296) + std.char(65535)) == (std.char(55296) + std.char(56320))") ==>
+      ujson.Bool(false)
 
-      eval(
-        "std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])"
-      ) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair)
+      eval("std.char(55296)") ==> ujson.Str("\uFFFD")
     }
 
-    // Unpaired surrogate handling - sjsonnet-specific behavior
+    // Unpaired surrogate handling
     //
-    // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet:
+    // sjsonnet aligns with go-jsonnet:
     // - go/C++ reject unpaired surrogates in escape sequences at parse time
     // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD
-    // - sjsonnet was preserving unpaired surrogates throughout
-    //
-    // sjsonnet now reject these to align with go-jsonet/ c++ jsonnet
+    // - sjsonnet now does both: rejects such escapes and maps std.char() surrogates to U+FFFD
     //
 
     test("unpairedSurrogatesInEscapes") {
@@ -191,10 +188,10 @@ object UnicodeHandlingTests extends TestSuite {
       eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji
     }
 
-    test("stdCharPreservesRawSurrogates") {
-      // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD)
-      eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate
-      eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate
+    test("stdCharReplacesSurrogates") {
+      // std.char() replaces surrogate codepoints with U+FFFD (matching go-jsonnet)
+      eval("std.codepoint(std.char(55296))") ==> ujson.Num(65533) // 0xD800 → U+FFFD
+      eval("std.codepoint(std.char(56320))") ==> ujson.Num(65533) // 0xDC00 → U+FFFD
     }
 
     test("invalidSurrogateHandling") {