diff --git a/src/main/java/org/apache/commons/lang3/StringUtils.java b/src/main/java/org/apache/commons/lang3/StringUtils.java index db9b225e65b..659304c1dfe 100644 --- a/src/main/java/org/apache/commons/lang3/StringUtils.java +++ b/src/main/java/org/apache/commons/lang3/StringUtils.java @@ -3002,6 +3002,12 @@ public static int indexOfDifference(final CharSequence... css) { break; } } + if (firstDiff > 0 && Character.isLowSurrogate(css[0].charAt(firstDiff)) + && Character.isHighSurrogate(css[0].charAt(firstDiff - 1))) { + // the difference splits a surrogate pair whose high half is common; report the start of the + // pair so getCommonPrefix never slices it in half and leaves a stray high surrogate. + firstDiff--; + } if (firstDiff == -1 && shortestStrLen != longestStrLen) { // we compared all of the characters up to the length of the // shortest string and didn't find a match, but the string lengths @@ -3048,6 +3054,12 @@ public static int indexOfDifference(final CharSequence cs1, final CharSequence c break; } } + if (i > 0 && i < cs1.length() && i < cs2.length() && Character.isHighSurrogate(cs1.charAt(i - 1)) + && (Character.isLowSurrogate(cs1.charAt(i)) || Character.isLowSurrogate(cs2.charAt(i)))) { + // the difference splits a surrogate pair whose high half is common; report the start of the + // pair so difference does not return a string that begins with a stray low surrogate. + i--; + } if (i < cs2.length() || i < cs1.length()) { return i; } diff --git a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java index df33386b2dc..1389e73296a 100644 --- a/src/test/java/org/apache/commons/lang3/StringUtilsTest.java +++ b/src/test/java/org/apache/commons/lang3/StringUtilsTest.java @@ -542,6 +542,11 @@ void testDifference_StringString() { assertEquals("robot", StringUtils.difference("i am a machine", "i am a robot")); assertEquals("", StringUtils.difference("abc", "abc")); assertEquals("you are a robot", StringUtils.difference("i am a robot", "you are a robot")); + // 0x10400 and 0x10401 share the same high surrogate; the difference must not begin with a lone low surrogate + final String cp10400 = new String(Character.toChars(0x10400)); + final String cp10401 = new String(Character.toChars(0x10401)); + assertEquals(cp10401, StringUtils.difference(cp10400, cp10401)); + assertEquals("Y", StringUtils.difference(cp10400 + "X", cp10400 + "Y")); } @Test @@ -563,6 +568,11 @@ void testDifferenceAt_StringArray() { assertEquals(0, StringUtils.indexOfDifference("abcde", "xyz")); assertEquals(0, StringUtils.indexOfDifference("xyz", "abcde")); assertEquals(7, StringUtils.indexOfDifference("i am a machine", "i am a robot")); + // a difference that falls inside a shared surrogate pair is reported at the start of the pair, not mid-pair + final String cp10400 = new String(Character.toChars(0x10400)); + final String cp10401 = new String(Character.toChars(0x10401)); + assertEquals(0, StringUtils.indexOfDifference(new String[] {cp10400, cp10401})); + assertEquals(2, StringUtils.indexOfDifference(new String[] {cp10400 + "X", cp10400 + "Y"})); } @Test @@ -576,6 +586,11 @@ void testDifferenceAt_StringString() { assertEquals(7, StringUtils.indexOfDifference("i am a machine", "i am a robot")); assertEquals(-1, StringUtils.indexOfDifference("foo", "foo")); assertEquals(0, StringUtils.indexOfDifference("i am a robot", "you are a robot")); + // a difference that falls inside a shared surrogate pair is reported at the start of the pair, not mid-pair + final String cp10400 = new String(Character.toChars(0x10400)); + final String cp10401 = new String(Character.toChars(0x10401)); + assertEquals(0, StringUtils.indexOfDifference(cp10400, cp10401)); + assertEquals(2, StringUtils.indexOfDifference(cp10400 + "X", cp10400 + "Y")); } /** @@ -679,6 +694,11 @@ void testGetCommonPrefix_StringArray() { assertEquals("", StringUtils.getCommonPrefix("abcde", "xyz")); assertEquals("", StringUtils.getCommonPrefix("xyz", "abcde")); assertEquals("i am a ", StringUtils.getCommonPrefix("i am a machine", "i am a robot")); + // 0x10400 and 0x10401 share the high surrogate but differ; the common prefix must not be a lone high surrogate + final String cp10400 = new String(Character.toChars(0x10400)); + final String cp10401 = new String(Character.toChars(0x10401)); + assertEquals("", StringUtils.getCommonPrefix(cp10400, cp10401)); + assertEquals(cp10400, StringUtils.getCommonPrefix(cp10400 + "X", cp10400 + "Y")); } @Test