diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index 8215ebaf..77d8e923 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -210,13 +210,12 @@ object StringModule extends AbstractFunctionModule { */ private object Char_ extends Val.Builtin1("char", "n") { def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = { - val c = n.value.asInt - if (!Character.isValidCodePoint(c)) { - Error.fail(s"Invalid unicode code point, got " + c) + val c0 = n.value.asInt + if (!Character.isValidCodePoint(c0) || (c0 >= 0xd800 && c0 <= 0xdfff)) { + Error.fail(s"Invalid unicode code point, got " + c0) } - val s = Character.toString(c) - // Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe. - if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s) + val s = Character.toString(c0) + if (c0 >= 0x20 && c0 < 0x7f && c0 != '"' && c0 != '\\') Val.Str.asciiSafe(pos, s) else Val.Str(pos, s) } } diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet new file mode 100644 index 00000000..c5781048 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet @@ -0,0 +1,9 @@ +// Test that std.char() works correctly for valid codepoints near the surrogate range +// and rejects surrogate codepoints (0xD800-0xDFFF) with an error. +[ + std.codepoint(std.char(55295)), // 0xD7FF - last valid before surrogates + std.codepoint(std.char(57344)), // 0xE000 - first valid after surrogates + std.codepoint(std.char(65533)), // 0xFFFD - replacement character itself + std.codepoint(std.char(0)), // 0x0000 - null + std.codepoint(std.char(65)), // 0x0041 - 'A' +] diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden new file mode 100644 index 00000000..47132726 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden @@ -0,0 +1,7 @@ +[ + 55295, + 57344, + 65533, + 0, + 65 +] diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet new file mode 100644 index 00000000..2133e28a --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet @@ -0,0 +1,3 @@ +// std.char() should reject surrogate codepoints (0xD800-0xDFFF) +// High surrogate 0xD800 = 55296 +std.char(55296) diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden new file mode 100644 index 00000000..64de38eb --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden @@ -0,0 +1 @@ +sjsonnet.Error: [std.char] Invalid unicode code point, got 55296 diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index 25c9d70f..e746b958 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -163,23 +163,17 @@ object UnicodeHandlingTests extends TestSuite { assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0) assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0) - - eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==> - ujson.Bool(true) - - eval( - "std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])" - ) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair) } - // Unpaired surrogate handling - sjsonnet-specific behavior + // Unpaired surrogate handling // - // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet: - // - go/C++ reject unpaired surrogates in escape sequences at parse time - // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD - // - sjsonnet was preserving unpaired surrogates throughout + // Note: The three reference implementations diverge on surrogate codepoints: + // - go-jsonnet replaces surrogates with U+FFFD + // - jsonnet-cpp preserves raw surrogates + // - jrsonnet rejects surrogates with an error // - // sjsonnet now reject these to align with go-jsonet/ c++ jsonnet + // sjsonnet rejects surrogates with an error (matching jrsonnet), + // since surrogates are not valid Unicode codepoints per the Unicode spec. // test("unpairedSurrogatesInEscapes") { @@ -191,10 +185,10 @@ object UnicodeHandlingTests extends TestSuite { eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji } - test("stdCharPreservesRawSurrogates") { - // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD) - eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate - eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate + test("stdCharRejectsSurrogates") { + evalErr("std.char(55296)").contains("Invalid unicode code point") // 0xD800 + evalErr("std.char(56320)").contains("Invalid unicode code point") // 0xDC00 + evalErr("std.char(57343)").contains("Invalid unicode code point") // 0xDFFF } test("invalidSurrogateHandling") {