Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion sjsonnet/src/sjsonnet/Format.scala
Original file line number Diff line number Diff line change
Expand Up @@ -716,7 +716,8 @@ object Format {
Error.fail("Codepoints must be >= 0, got " + codePoint)
if (codePoint > 0x10ffff)
Error.fail("Invalid unicode codepoint, got " + codePoint)
widenRaw(formatted, Character.toString(codePoint))
val c = if (codePoint >= 0xd800 && codePoint <= 0xdfff) 0xfffd else codePoint
widenRaw(formatted, Character.toString(c))
case 's' =>
widenRaw(formatted, RenderUtils.renderDouble(s))
case _ =>
Expand Down
7 changes: 4 additions & 3 deletions sjsonnet/src/sjsonnet/stdlib/StringModule.scala
Original file line number Diff line number Diff line change
Expand Up @@ -210,10 +210,11 @@ object StringModule extends AbstractFunctionModule {
*/
private object Char_ extends Val.Builtin1("char", "n") {
def evalRhs(n: Eval, ev: EvalScope, pos: Position): Val = {
val c = n.value.asInt
if (!Character.isValidCodePoint(c)) {
Error.fail(s"Invalid unicode code point, got " + c)
val c0 = n.value.asInt
if (!Character.isValidCodePoint(c0)) {
Error.fail(s"Invalid unicode code point, got " + c0)
}
val c = if (c0 >= 0xd800 && c0 <= 0xdfff) 0xfffd else c0
val s = Character.toString(c)
// Single-codepoint result; ASCII printable except '"' and '\\' is JSON-safe.
if (c >= 0x20 && c < 0x7f && c != '"' && c != '\\') Val.Str.asciiSafe(pos, s)
Expand Down
25 changes: 11 additions & 14 deletions sjsonnet/test/src/sjsonnet/UnicodeHandlingTests.scala
Original file line number Diff line number Diff line change
Expand Up @@ -164,22 +164,19 @@ object UnicodeHandlingTests extends TestSuite {
assert(sjsonnet.Util.compareStringsByCodepoint(rawSurrogatePrefix, validSurrogatePair) < 0)
assert(sjsonnet.Util.compareStringsByCodepoint(validSurrogatePair, rawSurrogatePrefix) > 0)

eval("(std.char(55296) + std.char(65535)) < (std.char(55296) + std.char(56320))") ==>
ujson.Bool(true)
// std.char now replaces surrogates with U+FFFD (matching go-jsonnet)
eval("(std.char(55296) + std.char(65535)) == (std.char(55296) + std.char(56320))") ==>
ujson.Bool(false)

eval(
"std.sort([std.char(55296) + std.char(56320), std.char(55296) + std.char(65535)])"
) ==> ujson.Arr(rawSurrogatePrefix, validSurrogatePair)
eval("std.char(55296)") ==> ujson.Str("\uFFFD")
}

// Unpaired surrogate handling - sjsonnet-specific behavior
// Unpaired surrogate handling
//
// Note: This is an intentional divergence from go-jsonnet and C++ jsonnet:
// sjsonnet aligns with go-jsonnet:
// - go/C++ reject unpaired surrogates in escape sequences at parse time
// - go-jsonnet's std.char() replaces surrogate codepoints with U+FFFD
// - sjsonnet previously preserved unpaired surrogates throughout
//
// sjsonnet now rejects these to align with go-jsonnet / C++ jsonnet
// - sjsonnet now matches both behaviors
//

test("unpairedSurrogatesInEscapes") {
Expand All @@ -191,10 +188,10 @@ object UnicodeHandlingTests extends TestSuite {
eval("\"\\uD83C\\uDF0D\"") ==> ujson.Str("🌍") // Earth emoji
}

test("stdCharPreservesRawSurrogates") {
// sjsonnet preserves raw surrogate codepoints (go-jsonnet would replace with U+FFFD)
eval("std.codepoint(std.char(55296))") ==> ujson.Num(55296) // 0xD800 high surrogate
eval("std.codepoint(std.char(56320))") ==> ujson.Num(56320) // 0xDC00 low surrogate
test("stdCharReplacesSurrogates") {
// std.char() replaces surrogate codepoints with U+FFFD (matching go-jsonnet)
eval("std.codepoint(std.char(55296))") ==> ujson.Num(65533) // 0xD800 → U+FFFD
eval("std.codepoint(std.char(56320))") ==> ujson.Num(65533) // 0xDC00 → U+FFFD
}

test("invalidSurrogateHandling") {
Expand Down
Loading