Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions sjsonnet/src/sjsonnet/stdlib/StringModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -210,13 +210,12 @@ object StringModule extends AbstractFunctionModule {
*/
private object Char_ extends Val.Builtin1("char", "n") {
def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = {
val c = n.value.asInt
if (!Character.isValidCodePoint(c)) {
Error.fail(s"Invalid unicode code point, got " + c)
val c0 = n.value.asInt
if (!Character.isValidCodePoint(c0) || (c0 >= 0xd800 && c0 <= 0xdfff)) {
Error.fail(s"Invalid unicode code point, got " + c0)
}
val s = Character.toString(c)
// Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe.
if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s)
val s = Character.toString(c0)
if (c0 >= 0x20 && c0 < 0x7f && c0 != '"' && c0 != '\\') Val.Str.asciiSafe(pos, s)
else Val.Str(pos, s)
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Test that std.char() round-trips valid codepoints, including the ones immediately
// adjacent to the surrogate range. Rejection of surrogate codepoints (0xD800-0xDFFF)
// is covered by a separate error test.
[
std.codepoint(std.char(55295)), // 0xD7FF - last valid before surrogates
std.codepoint(std.char(57344)), // 0xE000 - first valid after surrogates
std.codepoint(std.char(65533)), // 0xFFFD - replacement character itself
std.codepoint(std.char(0)), // 0x0000 - null
std.codepoint(std.char(65)), // 0x0041 - 'A'
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[
55295,
57344,
65533,
0,
65
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
// std.char() should reject surrogate codepoints (0xD800-0xDFFF)
// High surrogate 0xD800 = 55296
std.char(55296)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
sjsonnet.Error: [std.char] Invalid unicode code point, got 55296
28 changes: 11 additions & 17 deletions sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -163,23 +163,17 @@ object UnicodeHandlingTests extends TestSuite {

assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0)
assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0)

eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==>
ujson.Bool(true)

eval(
"std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])"
) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair)
}

// Unpaired surrogate handling - sjsonnet-specific behavior
// Unpaired surrogate handling
//
// Note: This is an intentional divergence from go-jsonnet and C++ jsonnet:
// - go/C++ reject unpaired surrogates in escape sequences at parse time
// - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD
// - sjsonnet was preserving unpaired surrogates throughout
// Note: The three reference implementations diverge on surrogate codepoints:
// - go-jsonnet replaces surrogates with U+FFFD
// - jsonnet-cpp preserves raw surrogates
// - jrsonnet rejects surrogates with an error
//
// sjsonnet now rejects these to align with go-jsonnet / C++ jsonnet
// sjsonnet rejects surrogates with an error (matching jrsonnet),
// since surrogates are not valid Unicode codepoints per the Unicode spec.
//

test("unpairedSurrogatesInEscapes") {
Expand All @@ -191,10 +185,10 @@ object UnicodeHandlingTests extends TestSuite {
eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji
}

test("stdCharPreservesRawSurrogates") {
// sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD)
eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate
eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate
test("stdCharRejectsSurrogates") {
  // std.char() must fail for every code point in the surrogate range 0xD800-0xDFFF.
  // The Boolean returned by `contains` has to be wrapped in `assert`; a bare
  // `evalErr(...).contains(...)` discards the result and never checks the message.
  assert(evalErr("std.char(55296)").contains("Invalid unicode code point")) // 0xD800
  assert(evalErr("std.char(56320)").contains("Invalid unicode code point")) // 0xDC00
  assert(evalErr("std.char(57343)").contains("Invalid unicode code point")) // 0xDFFF
}

test("invalidSurrogateHandling") {
Expand Down
Loading