diff --git a/sjsonnet/src/sjsonnet/Format.scala b/sjsonnet/src/sjsonnet/Format.scala index 731c341d..5e73e3a0 100644 --- a/sjsonnet/src/sjsonnet/Format.scala +++ b/sjsonnet/src/sjsonnet/Format.scala @@ -716,7 +716,8 @@ object Format { Error.fail("Codepoints must be >= 0, got " + codePoint) if (codePoint > 0x10ffff) Error.fail("Invalid unicode codepoint, got " + codePoint) - widenRaw(formatted, Character.toString(codePoint)) + val c = if (codePoint >= 0xd800 && codePoint <= 0xdfff) 0xfffd else codePoint + widenRaw(formatted, Character.toString(c)) case 's' => widenRaw(formatted, RenderUtils.renderDouble(s)) case _ => diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index 8215ebaf..070ca276 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -210,10 +210,11 @@ object StringModule extends AbstractFunctionModule { */ private object Char_ extends Val.Builtin1("char", "n") { def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = { - val c = n.value.asInt - if (!Character.isValidCodePoint(c)) { - Error.fail(s"Invalid unicode code point, got " + c) + val c0 = n.value.asInt + if (!Character.isValidCodePoint(c0)) { + Error.fail(s"Invalid unicode code point, got " + c0) } + val c = if (c0 >= 0xd800 && c0 <= 0xdfff) 0xfffd else c0 val s = Character.toString(c) // Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe. if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s) diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index 25c9d70f..f7ec27a2 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -164,22 +164,19 @@ object UnicodeHandlingTests extends TestSuite { assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0) assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0) - eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==> - ujson.Bool(true) + // std.char now replaces surrogates with U+FFFD (matching go-jsonnet) + eval("(std.char(55296) + std.char(65535)) == (std.char(55296) + std.char(56320))") ==> + ujson.Bool(false) - eval( - "std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])" - ) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair) + eval("std.char(55296)") ==> ujson.Str("\uFFFD") } - // Unpaired surrogate handling - sjsonnet-specific behavior + // Unpaired surrogate handling // - // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet: + // sjsonnet aligns with go-jsonnet: // - go/C++ reject unpaired surrogates in escape sequences at parse time // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD - // - sjsonnet was preserving unpaired surrogates throughout - // - // sjsonnet now reject these to align with go-jsonet/ c++ jsonnet + // - sjsonnet now matches both behaviors // test("unpairedSurrogatesInEscapes") { @@ -191,10 +188,10 @@ object UnicodeHandlingTests extends TestSuite { eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji } - test("stdCharPreservesRawSurrogates") { - // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD) - eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate - eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate + test("stdCharReplacesSurrogates") { + // std.char() replaces surrogate codepoints with U+FFFD (matching go-jsonnet) + eval("std.codepoint(std.char(55296))") ==> ujson.Num(65533) // 0xD800 → U+FFFD + eval("std.codepoint(std.char(56320))") ==> ujson.Num(65533) // 0xDC00 → U+FFFD } test("invalidSurrogateHandling") {