diff --git a/ChangeLog b/ChangeLog
index f34c7e1d1..e8c46e270 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -30,7 +30,16 @@
 	exceptions or null pointer crashes in code which expects the
 	-UTF8String method to work for all strings.
 	Remove a few old testcases and add a load of new ones.
-	
+
+2026-04-22 Hendrik Huebner
+
+	* Headers/Foundation/NSString.h:
+	Declare -stringByFoldingWithOptions:locale:.
+	* Source/NSString.m:
+	Implement -stringByFoldingWithOptions:locale: using ICU transliteration.
+	* Tests/base/NSString/locale.m:
+	Add folding tests for case, diacritic, width and combined behavior.
+
 2026-04-20 Wolfgang Lux
 
 	* Headers/Foundation/NSFileManager.h:
diff --git a/Headers/Foundation/NSString.h b/Headers/Foundation/NSString.h
index e0f842ecb..74a597a75 100644
--- a/Headers/Foundation/NSString.h
+++ b/Headers/Foundation/NSString.h
@@ -657,6 +657,20 @@ GS_EXPORT_CLASS
 - (NSString *) precomposedStringWithCanonicalMapping;
 #endif
 
+#if OS_API_VERSION(MAC_OS_X_VERSION_10_5,GS_API_LATEST)
+/** Returns a copy of the receiver folded for comparison according to the
+ * supplied options and locale.<br />
+ * The options may combine NSCaseInsensitiveSearch,
+ * NSDiacriticInsensitiveSearch and NSWidthInsensitiveSearch; all other
+ * option bits are ignored.<br />
+ * The locale is used for case folding only, and nil means the system
+ * locale.<br />
+ * Diacritic and width folding need ICU transliterator support.
+ */
+- (NSString *) stringByFoldingWithOptions: (NSStringCompareOptions)options
+                                   locale: (NSLocale *)locale;
+#endif
+
 // Converting String Contents into a Property List
 - (id) propertyList;
 - (NSDictionary*) propertyListFromStringsFileFormat;
diff --git a/Source/NSString.m b/Source/NSString.m
index 27f16475b..0f676e6fe 100644
--- a/Source/NSString.m
+++ b/Source/NSString.m
@@ -116,6 +116,17 @@
 #if defined(HAVE_UNICODE_UTYPES_H)
 # include <unicode/utypes.h>
 #endif
+#if !defined(GS_HAVE_ICU_UTRANS)
+# if defined(__has_include)
+#  if __has_include(<unicode/utrans.h>)
+#   include <unicode/utrans.h>
+#   define GS_HAVE_ICU_UTRANS 1
+#  endif
+# endif
+#endif
+#ifndef GS_HAVE_ICU_UTRANS
+# define GS_HAVE_ICU_UTRANS 0
+#endif
 #if defined(HAVE_ICU_H)
 # include <icu.h>
 #endif
@@ -640,6 +651,8 @@ - (void) dealloc
 @interface NSThread (StringCollatorCache)
 - (id) _stringCollatorCache;
 - (void) _setStringCollatorCache: (id)cache;
+- (id) _stringTransliteratorCache;
+- (void) _setStringTransliteratorCache: (id)cache;
 @end
 
 // The locale parameter must not be nil at this point.
@@ -678,6 +691,361 @@ - (void) _setStringCollatorCache: (id)cache;
 }
 #endif // GS_USE_ICU
 
+#if (GS_USE_ICU == 1) && GS_HAVE_ICU_UTRANS
+/* One slot of the transliterator cache: an ICU transliterator ID and the
+ * transliterator that was opened for it.
+ */
+typedef struct
+{
+  NSString *transliteratorId;
+  UTransliterator *transliterator;
+} GSICUTransliteratorEntry;
+
+/* Small fixed-size cache of open ICU transliterators.  Each thread which
+ * performs folding keeps its own instance (see GSICUCachedTransliterator).
+ */
+@interface GSICUTransliteratorCache : NSObject
+{
+  @public
+  GSICUTransliteratorEntry entries[4];
+  NSUInteger nextEviction;
+}
+- (UTransliterator *) transliteratorForId: (NSString *)transliteratorId;
+@end
+
+/* Opens a forward ICU transliterator for the given ID.
+ * Raises NSCharacterConversionException if ICU cannot open it.
+ */
+static UTransliterator *
+GSICUCreateTransliterator(NSString *transliteratorId)
+{
+  NSUInteger transIdLength = [transliteratorId length];
+  unichar *transId;
+  UErrorCode err = U_ZERO_ERROR;
+  UParseError parseError;
+  UTransliterator *trans;
+
+  transId = (unichar *)malloc(transIdLength * sizeof(unichar));
+  [transliteratorId getCharacters: transId
+                            range: NSMakeRange(0, transIdLength)];
+  trans = utrans_openU((const UChar *)transId, (int32_t)transIdLength,
+    UTRANS_FORWARD, NULL, 0, &parseError, &err);
+  free(transId);
+
+  if (U_FAILURE(err) || trans == NULL)
+    {
+      [NSException raise: NSCharacterConversionException
+                  format: @"libicu transliterator open failed"];
+    }
+  return trans;
+}
+
+@implementation GSICUTransliteratorCache
+- (UTransliterator *) transliteratorForId: (NSString *)transliteratorId
+{
+  const NSUInteger count = sizeof(entries) / sizeof(*entries);
+  UTransliterator *trans;
+  NSUInteger slot = count;
+  NSUInteger i;
+
+  /* We only cache a few static transliterator IDs, so comparing pointers
+   * in a simple linear search is enough to find matches.  Note the first
+   * free slot on the way, in case a new entry has to be added.
+   */
+  for (i = 0; i < count; i++)
+    {
+      if (entries[i].transliteratorId == transliteratorId)
+        {
+          return entries[i].transliterator;
+        }
+      if (slot == count && entries[i].transliteratorId == nil)
+        {
+          slot = i;
+        }
+    }
+
+  /* Open the new transliterator before touching the cache.  Opening
+   * raises on failure, and the cache must never be left holding an ID
+   * whose transliterator is missing or has already been closed.
+   */
+  trans = GSICUCreateTransliterator(transliteratorId);
+
+  if (slot == count)
+    {
+      /* The cache is full, so use a FIFO eviction strategy.  */
+      slot = nextEviction;
+      nextEviction = (nextEviction + 1) % count;
+      if (entries[slot].transliterator != NULL)
+        {
+          utrans_close(entries[slot].transliterator);
+        }
+    }
+
+  /* ASSIGN releases the previous ID itself; releasing it separately as
+   * well would over-release the evicted string.
+   */
+  ASSIGN(entries[slot].transliteratorId, transliteratorId);
+  entries[slot].transliterator = trans;
+  return trans;
+}
+
+- (void) dealloc
+{
+  NSUInteger i;
+
+  for (i = 0; i < sizeof(entries) / sizeof(*entries); i++)
+    {
+      RELEASE(entries[i].transliteratorId);
+      if (entries[i].transliterator != NULL)
+        {
+          utrans_close(entries[i].transliterator);
+        }
+    }
+  [super dealloc];
+}
+@end
+
+/* Returns the calling thread's cached transliterator for the given ID,
+ * creating the per-thread cache on first use.
+ */
+static UTransliterator *
+GSICUCachedTransliterator(NSString *transliteratorId)
+{
+  NSThread *current;
+  GSICUTransliteratorCache *cache;
+
+  current = [NSThread currentThread];
+  cache = [current _stringTransliteratorCache];
+  if (nil == cache)
+    {
+      cache = [[GSICUTransliteratorCache alloc] init];
+      [current _setStringTransliteratorCache: cache];
+      [cache release];
+    }
+  return [cache transliteratorForId: transliteratorId];
+}
+#endif
+
+/* Applies an ICU transliterator to srcLength UTF-16 units at src and
+ * returns the result as an autoreleased string.  The transliterator is
+ * passed as an opaque pointer so that the signature compiles without ICU.
+ * Raises NSCharacterConversionException if ICU reports an error and
+ * NSMallocException if a working buffer cannot be allocated.
+ */
+static NSString *
+GSStringApplyTransliterator(const unichar *src,
+                            NSUInteger srcLength,
+                            void *transOpaque)
+{
+  if (srcLength == 0)
+    {
+      return @"";
+    }
+
+#if (GS_USE_ICU == 1) && GS_HAVE_ICU_UTRANS
+  {
+    UTransliterator *trans = (UTransliterator *)transOpaque;
+    UErrorCode err = U_ZERO_ERROR;
+    unichar *dst;
+    unichar stackDst[100];
+    BOOL dstOnStack = NO;
+    int32_t srcLen = (int32_t)srcLength;
+    int32_t capacity;
+    int32_t textLen;
+    int32_t limit;
+    NSString *result;
+
+    capacity = srcLen + 16;
+    if (capacity < 32)
+      {
+        capacity = 32;
+      }
+    if ((NSUInteger)capacity <= sizeof(stackDst) / sizeof(*stackDst))
+      {
+        dst = stackDst;
+        dstOnStack = YES;
+      }
+    else
+      {
+        dst = (unichar *)malloc(capacity * sizeof(unichar));
+      }
+
+    /* A transliterator can increase output size beyond the input size
+     * (for example decomposition stages), so we retry with a larger
+     * destination buffer when ICU reports overflow.
+     */
+    for (;;)
+      {
+        if (dst == NULL)
+          {
+            [NSException raise: NSMallocException
+                        format: @"Unable to allocate transliteration buffer"];
+          }
+        memcpy(dst, src, srcLen * sizeof(unichar));
+        textLen = srcLen;
+        limit = textLen;
+        err = U_ZERO_ERROR;
+        utrans_transUChars(trans, (UChar *)dst, &textLen, capacity,
+          0, &limit, &err);
+        if (err == U_BUFFER_OVERFLOW_ERROR)
+          {
+            /* textLen is taken as the length ICU needs.  Every pass
+             * refills the buffer from src, so allocate afresh instead
+             * of using realloc, which would copy stale data and lose
+             * the old buffer if it failed.
+             */
+            capacity = textLen + 16;
+            if (dstOnStack == NO)
+              {
+                free(dst);
+              }
+            dst = (unichar *)malloc(capacity * sizeof(unichar));
+            dstOnStack = NO;
+            continue;
+          }
+        if (U_FAILURE(err))
+          {
+            if (dstOnStack == NO)
+              {
+                free(dst);
+              }
+            [NSException raise: NSCharacterConversionException
+                        format: @"libicu transliteration failed"];
+          }
+        break;
+      }
+
+    result = [NSString stringWithCharacters: dst length: textLen];
+    if (dstOnStack == NO)
+      {
+        free(dst);
+      }
+    return result;
+  }
+#else
+  [NSException raise: NSInternalInconsistencyException
+              format: @"ICU transliterator support is required"];
+  return nil;
+#endif
+}
+
+#if (GS_USE_ICU == 1) && GS_HAVE_ICU_UTRANS
+/* Copies the characters of input into a temporary buffer and applies the
+ * transliterator to them.
+ */
+static NSString *
+GSStringApplyTransliteratorToString(NSString *input,
+                                    void *transOpaque)
+{
+  NSUInteger length = [input length];
+  unichar *src;
+  NSString *result;
+
+  if (length == 0)
+    {
+      return @"";
+    }
+
+  src = (unichar *)malloc(length * sizeof(unichar));
+  [input getCharacters: src range: NSMakeRange(0, length)];
+  result = GSStringApplyTransliterator(src, length, transOpaque);
+  free(src);
+
+  return result;
+}
+#endif
+
+/* Applies the calling thread's cached transliterator for transliteratorId
+ * to input; raises if ICU transliterator support was not compiled in.
+ */
+static NSString *
+GSStringApplyTransliteratorIdentifierToString(NSString *input,
+                                              NSString *transliteratorId)
+{
+#if (GS_USE_ICU == 1) && GS_HAVE_ICU_UTRANS
+  return GSStringApplyTransliteratorToString(input,
+    GSICUCachedTransliterator(transliteratorId));
+#else
+  [NSException raise: NSInternalInconsistencyException
+              format: @"ICU transliterator support is required"];
+  return nil;
+#endif
+}
+
+/* Lower-cases input for case-insensitive folding.  With ICU the rules of
+ * the given locale are used: nil means the system locale and an object
+ * which is not an NSLocale means the current locale.  Without ICU the
+ * locale is ignored and -lowercaseString is used.
+ */
+#if (GS_USE_ICU == 1)
+static NSString *
+GSStringFoldCaseWithLocale(NSString *input, id locale)
+{
+  NSUInteger length = [input length];
+  unichar *src;
+  unichar *dst;
+  int32_t newLength;
+  UErrorCode err;
+  const char *localeId = NULL;
+  NSString *result;
+
+  if (length == 0)
+    {
+      return @"";
+    }
+
+  if (locale == nil)
+    {
+      locale = [NSLocale systemLocale];
+    }
+  else if ([locale isKindOfClass: [NSLocale class]] == NO)
+    {
+      locale = [NSLocale currentLocale];
+    }
+
+  if (locale != nil)
+    {
+      localeId = [[locale localeIdentifier] UTF8String];
+    }
+
+  src = (unichar *)malloc(length * sizeof(unichar));
+  [input getCharacters: src range: NSMakeRange(0, length)];
+
+  err = U_ZERO_ERROR;
+  newLength = u_strToLower(NULL, 0, (const UChar *)src, (int32_t)length,
+    localeId, &err);
+  if (err != U_BUFFER_OVERFLOW_ERROR)
+    {
+      free(src);
+      [NSException raise: NSCharacterConversionException
+                  format: @"libicu case folding length check failed"];
+    }
+
+  dst = (unichar *)malloc(newLength * sizeof(unichar));
+  err = U_ZERO_ERROR;
+  u_strToLower((UChar *)dst, newLength, (const UChar *)src,
+    (int32_t)length, localeId, &err);
+  free(src);
+
+  if (U_FAILURE(err))
+    {
+      free(dst);
+      [NSException raise: NSCharacterConversionException
+                  format: @"libicu case folding failed"];
+    }
+
+  result = [NSString stringWithCharacters: dst length: newLength];
+  free(dst);
+  return result;
+}
+#else
+static NSString *
+GSStringFoldCaseWithLocale(NSString *input, id locale)
+{
+  return [input lowercaseString];
+}
+#endif
+
 
 @implementation NSString
 // NSString itself is an abstract class which provides factory
@@ -1968,6 +2336,72 @@ - (NSString *) decomposedStringWithCanonicalMapping
   return [self notImplemented: _cmd];
 #endif
 }
+
+#if OS_API_VERSION(MAC_OS_X_VERSION_10_5,GS_API_LATEST)
+- (NSString *) stringByFoldingWithOptions: (NSStringCompareOptions)options
+                                   locale: (NSLocale *)locale
+{
+  NSString *result = self;
+  static NSString * const widthTransliteratorId = @"Fullwidth-Halfwidth";
+  static NSString * const diacriticTransliteratorId
+    = @"NFD; [:Nonspacing Mark:] Remove; NFC";
+  static NSString * const widthDiacriticTransliteratorId
+    = @"Fullwidth-Halfwidth; NFD; [:Nonspacing Mark:] Remove; NFC";
+  BOOL foldCase;
+  BOOL foldDiacritic;
+  BOOL foldWidth;
+
+  if ([self length] == 0)
+    {
+      return @"";
+    }
+
+  foldCase = ((options & NSCaseInsensitiveSearch) == NSCaseInsensitiveSearch);
+  foldDiacritic = ((options & NSDiacriticInsensitiveSearch)
+    == NSDiacriticInsensitiveSearch);
+  foldWidth = ((options & NSWidthInsensitiveSearch) == NSWidthInsensitiveSearch);
+
+  if (foldCase == NO && foldDiacritic == NO && foldWidth == NO)
+    {
+      return IMMUTABLE(self);
+    }
+
+#if !((GS_USE_ICU == 1) && GS_HAVE_ICU_UTRANS)
+  if (foldDiacritic == YES || foldWidth == YES)
+    {
+      return [self notImplemented: _cmd];
+    }
+#endif
+
+  if (foldWidth == YES && foldDiacritic == YES)
+    {
+      result = GSStringApplyTransliteratorIdentifierToString(
+        result, widthDiacriticTransliteratorId);
+      foldWidth = NO;
+      foldDiacritic = NO;
+    }
+
+  if (foldDiacritic == YES)
+    {
+      result = GSStringApplyTransliteratorIdentifierToString(
+        result, diacriticTransliteratorId);
+    }
+
+  if (foldWidth == YES)
+    {
+      result = GSStringApplyTransliteratorIdentifierToString(
+        result, widthTransliteratorId);
+    }
+
+  if (foldCase == YES)
+    {
+      /* TODO: use `lowercaseStringWithLocale` once implemented. */
+      result = GSStringFoldCaseWithLocale(result, locale);
+    }
+
+  return IMMUTABLE(result);
+}
+#endif
 
 /**
  * Returns this string as an array of 16-bit unichar (unsigned
@@ -6947,4 +7381,3 @@ - (void) setString: (NSString*)aString
 }
 
 @end
-
diff --git a/Source/NSThread.m b/Source/NSThread.m
index 4cf8f3fc1..3f4fd5927 100644
--- a/Source/NSThread.m
+++ b/Source/NSThread.m
@@ -97,6 +97,7 @@ int pthread_spin_destroy(pthread_spinlock_t *lock)
 #define EXPOSE_NSThread_IVARS 1
 #define GS_NSThread_IVARS \
   id _stringCollatorCache; \
+  id _stringTransliteratorCache; \
   BOOL _targetIsBlock; \
   gs_thread_id_t _pthreadID; \
   NSUInteger _threadID; \
@@ -171,6 +172,7 @@ int pthread_spin_destroy(pthread_spinlock_t *lock)
 #define lockInfo (internal->_lockInfo)
 #define targetIsBlock (internal->_targetIsBlock)
 #define stringCollatorCache (internal->_stringCollatorCache)
+#define stringTransliteratorCache (internal->_stringTransliteratorCache)
 
 
 #if defined(HAVE_PTHREAD_MAIN_NP)
@@ -1205,6 +1207,7 @@ - (void) dealloc
   DESTROY(_arg);
   DESTROY(_name);
   DESTROY(stringCollatorCache);
+  DESTROY(stringTransliteratorCache);
   if (_autorelease_vars.pool_cache != 0)
     {
       [NSAutoreleasePool _endThread: self];
@@ -1601,6 +1604,15 @@ - (void) _setStringCollatorCache: (id) cache
   ASSIGN(stringCollatorCache, cache);
 }
 
+- (id) _stringTransliteratorCache
+{
+  return (id)stringTransliteratorCache;
+}
+- (void) _setStringTransliteratorCache: (id) cache
+{
+  ASSIGN(stringTransliteratorCache, cache);
+}
+
 @end
 
 
diff --git a/Tests/base/NSString/locale.m b/Tests/base/NSString/locale.m
index 1b1213e29..a77a0b68a 100644
--- a/Tests/base/NSString/locale.m
+++ b/Tests/base/NSString/locale.m
@@ -10,6 +10,18 @@
 #define NSLOCALE_SUPPORTED 1 /* Assume Apple support */
 #endif
 
+#if !defined(GS_HAVE_ICU_UTRANS)
+# if defined(__has_include)
+#  if __has_include(<unicode/utrans.h>)
+#   define GS_HAVE_ICU_UTRANS 1
+#  else
+#   define GS_HAVE_ICU_UTRANS 0
+#  endif
+# else
+#  define GS_HAVE_ICU_UTRANS 0
+# endif
+#endif
+
 static void testBasic(void)
 {
NSComparisonResult compRes; @@ -269,6 +281,112 @@ static void testDiacritics(void) PASS(compRes == NSOrderedSame, "expected 0 got %d", (int)compRes); } +#if OS_API_VERSION(MAC_OS_X_VERSION_10_5,GS_API_LATEST) +static void testFolding(void) +{ + const unichar eAcute = 0x00e9; + const unichar fullWidthA = 0xFF21; + const unichar dotlessI = 0x0131; + NSString *eAcuteStr = [[[NSString alloc] initWithCharacters: &eAcute + length: 1] autorelease]; + NSString *fullWidthAStr = [[[NSString alloc] initWithCharacters: &fullWidthA + length: 1] autorelease]; + NSString *dotlessIStr = [[[NSString alloc] initWithCharacters: &dotlessI + length: 1] autorelease]; + NSString *turkish = [[[NSLocale alloc] initWithLocaleIdentifier: @"tr_TR"] + autorelease]; + NSString *folded; + + folded = [@"HELLO" stringByFoldingWithOptions: NSCaseInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"hello", + "NSCaseInsensitiveSearch folds HELLO to hello"); + +#if GS_HAVE_ICU_UTRANS + folded = [eAcuteStr stringByFoldingWithOptions: NSDiacriticInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"e", + "NSDiacriticInsensitiveSearch folds e-acute to e"); + + folded = [fullWidthAStr stringByFoldingWithOptions: NSWidthInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"A", + "NSWidthInsensitiveSearch folds fullwidth A to ASCII A"); + + folded = [@"ABC123" + stringByFoldingWithOptions: NSWidthInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"ABC123", + "NSWidthInsensitiveSearch folds fullwidth letters and digits"); + + folded = [@"A\u0301" + stringByFoldingWithOptions: NSWidthInsensitiveSearch + | NSDiacriticInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"A", + "Width+diacritic folding removes acute after width fold"); + + folded = [@",.!ABC" + stringByFoldingWithOptions: NSWidthInsensitiveSearch + | NSCaseInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @",.!abc", + "NSWidthInsensitiveSearch and NSCaseInsensitiveSearch folds " + "fullwidth punctuation and uppercase 
letters"); + + folded = [@"Iİıi" + stringByFoldingWithOptions: NSCaseInsensitiveSearch + locale: turkish]; + PASS_EQUAL(folded, @"ıiıi", + "Turkish NSCaseInsensitiveSearch folds I-sequence correctly"); + + folded = [@"K" + stringByFoldingWithOptions: NSCaseInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"k", + "NSCaseInsensitiveSearch folds Kelvin sign to k"); + + folded = [@"ÅÇñöü" + stringByFoldingWithOptions: NSDiacriticInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"ACnou", + "NSDiacriticInsensitiveSearch folds multiple accented Latin letters"); + + folded = [@"e\u0301" + stringByFoldingWithOptions: NSDiacriticInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"e", + "NSDiacriticInsensitiveSearch folds decomposed acute sequence"); + + folded = [@"ÉAI" + stringByFoldingWithOptions: NSCaseInsensitiveSearch + | NSDiacriticInsensitiveSearch + | NSWidthInsensitiveSearch + locale: turkish]; + PASS_EQUAL(folded, @"eaı", + "combined folding applies case, diacritic, and width handling"); + + folded = [@"é" stringByFoldingWithOptions: NSWidthInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"é", + "NSWidthInsensitiveSearch does not remove diacritics"); + + folded = [@"ø" stringByFoldingWithOptions: NSDiacriticInsensitiveSearch + locale: nil]; + PASS_EQUAL(folded, @"ø", + "NSDiacriticInsensitiveSearch does not fold stroked o"); +#else + { + BOOL wasHopeful = testHopeful; + testHopeful = YES; + PASS(YES, "Skipping transliterator-dependent folding checks " + "(ICU transliterator support unavailable at compile time)"); + testHopeful = wasHopeful; + } +#endif +} +#endif + int main() { START_SET("NSString + locale") @@ -282,6 +400,14 @@ int main() testEszett(); testLithuanian(); testDiacritics(); +#if OS_API_VERSION(MAC_OS_X_VERSION_10_5,GS_API_LATEST) + { + BOOL wasHopeful = testHopeful; + testHopeful = NO; + testFolding(); + testHopeful = wasHopeful; + } +#endif END_SET("NSString + locale")