From 9b6ad556f13da660d1cde100953c59af8f6217e1 Mon Sep 17 00:00:00 2001 From: He-Pin Date: Thu, 18 Jun 2026 13:04:15 +0800 Subject: [PATCH] fix: std.char() rejects surrogate codepoints with an error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: std.char() accepted surrogate codepoints (0xD800-0xDFFF) and produced unpaired surrogate characters in strings. The three reference implementations diverge on this: go-jsonnet replaces with U+FFFD, jsonnet-cpp preserves raw surrogates, and jrsonnet rejects with an error. Surrogates are not valid Unicode codepoints per the Unicode spec, so sjsonnet should reject them — matching jrsonnet's approach and sjsonnet's own behavior for out-of-range codepoints like 0x110000. Modification: - In Char_.evalRhs, extend the invalid codepoint check to include the surrogate range 0xD800-0xDFFF, raising "Invalid unicode code point" - Updated UnicodeHandlingTests: renamed stdCharReplacesSurrogates to stdCharRejectsSurrogates, using evalErr for all surrogate inputs - Replaced char_surrogate_replacement golden test with char_surrogate_boundary (valid boundary values) and error.char_surrogate (surrogate rejection) Result: std.char(0xD800) now raises "Invalid unicode code point, got 55296" instead of producing an unpaired surrogate. Valid codepoints adjacent to the surrogate range (0xD7FF, 0xE000) are unaffected. This aligns with jrsonnet and follows the Unicode specification. References: - jrsonnet: rejects surrogates with "invalid unicode codepoint" error - go-jsonnet: replaces surrogates with U+FFFD (different strategy) - jsonnet-cpp: preserves raw surrogates (different strategy) - Unicode spec: U+D800-U+DFFF are reserved, not assignable codepoints --- .../src/sjsonnet/stdlib/StringModule.scala | 11 ++++---- .../char_surrogate_boundary.jsonnet | 9 ++++++ .../char_surrogate_boundary.jsonnet.golden | 7 +++++ .../error.char_surrogate.jsonnet | 3 ++ .../error.char_surrogate.jsonnet.golden | 1 + .../src/sjsonnet/UnicodeHandlingTests.scala | 28 ++++++++----------- 6 files changed, 36 insertions(+), 23 deletions(-) create mode 100644 sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet create mode 100644 sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden create mode 100644 sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet create mode 100644 sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden diff --git a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala index 8215ebaf..77d8e923 100644 --- a/sjsonnet/src/sjsonnet/stdlib/StringModule.scala +++ b/sjsonnet/src/sjsonnet/stdlib/StringModule.scala @@ -210,13 +210,12 @@ object StringModule extends AbstractFunctionModule { */ private object Char_ extends Val.Builtin1("char", "n") { def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = { - val c = n.value.asInt - if (!Character.isValidCodePoint(c)) { - Error.fail(s"Invalid unicode code point, got " + c) + val c0 = n.value.asInt + if (!Character.isValidCodePoint(c0) || (c0 >= 0xd800 && c0 <= 0xdfff)) { + Error.fail(s"Invalid unicode code point, got " + c0) } - val s = Character.toString(c) - // Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe. - if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s) + val s = Character.toString(c0) + if (c0 >= 0x20 && c0 < 0x7f && c0 != '"' && c0 != '\\') Val.Str.asciiSafe(pos, s) else Val.Str(pos, s) } } diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet new file mode 100644 index 00000000..c5781048 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet @@ -0,0 +1,9 @@ +// Test that std.char() works correctly for valid codepoints near the surrogate range +// and rejects surrogate codepoints (0xD800-0xDFFF) with an error. +[ + std.codepoint(std.char(55295)), // 0xD7FF - last valid before surrogates + std.codepoint(std.char(57344)), // 0xE000 - first valid after surrogates + std.codepoint(std.char(65533)), // 0xFFFD - replacement character itself + std.codepoint(std.char(0)), // 0x0000 - null + std.codepoint(std.char(65)), // 0x0041 - 'A' +] diff --git a/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden new file mode 100644 index 00000000..47132726 --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/char_surrogate_boundary.jsonnet.golden @@ -0,0 +1,7 @@ +[ + 55295, + 57344, + 65533, + 0, + 65 +] diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet new file mode 100644 index 00000000..2133e28a --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet @@ -0,0 +1,3 @@ +// std.char() should reject surrogate codepoints (0xD800-0xDFFF) +// High surrogate 0xD800 = 55296 +std.char(55296) diff --git a/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden new file mode 100644 index 00000000..64de38eb --- /dev/null +++ b/sjsonnet/test/resources/new_test_suite/error.char_surrogate.jsonnet.golden @@ -0,0 +1 @@ +sjsonnet.Error: [std.char] Invalid unicode code point, got 55296 diff --git a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala index 25c9d70f..e746b958 100644 --- a/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala +++ b/sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala @@ -163,23 +163,17 @@ object UnicodeHandlingTests extends TestSuite { assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0) assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0) - - eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==> - ujson.Bool(true) - - eval( - "std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])" - ) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair) } - // Unpaired surrogate handling - sjsonnet-specific behavior + // Unpaired surrogate handling // - // Note: This is an intentional divergence from go-jsonnet and C++ jsonnet: - // - go/C++ reject unpaired surrogates in escape sequences at parse time - // - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD - // - sjsonnet was preserving unpaired surrogates throughout + // Note: The three reference implementations diverge on surrogate codepoints: + // - go-jsonnet replaces surrogates with U+FFFD + // - jsonnet-cpp preserves raw surrogates + // - jrsonnet rejects surrogates with an error // - // sjsonnet now reject these to align with go-jsonet/ c++ jsonnet + // sjsonnet rejects surrogates with an error (matching jrsonnet), + // since surrogates are not valid Unicode codepoints per the Unicode spec. // test("unpairedSurrogatesInEscapes") { @@ -191,10 +185,10 @@ object UnicodeHandlingTests extends TestSuite { eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji } - test("stdCharPreservesRawSurrogates") { - // sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD) - eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate - eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate + test("stdCharRejectsSurrogates") { + evalErr("std.char(55296)").contains("Invalid unicode code point") // 0xD800 + evalErr("std.char(56320)").contains("Invalid unicode code point") // 0xDC00 + evalErr("std.char(57343)").contains("Invalid unicode code point") // 0xDFFF } test("invalidSurrogateHandling") {