Skip to content

Commit 4ff3474

Browse files
committed
With ICU, we can have a much more comprehensive implementation for getting the range of a composed character sequence.
1 parent 1a940f7 commit 4ff3474

5 files changed

Lines changed: 368 additions & 31 deletions

File tree

ChangeLog

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
2026-05-11 Richard Frith-Macdonald <rfm@gnu.org>
2+
3+
* Source/GSPrivate.h: New function to get composed sequence range.
4+
* Source/GSString.m: Use new ICU based code if possible.
5+
* Source/NSString.m: Use new ICU based code if possible.
6+
* Tests/base/NSString/rangeOfComposedCharacter.m:
7+
Range of composed character sequence testcases to cover a fairly
8+
comprehensive range of cases rather than just the simplistic set
9+
handled by the non-ICU code.
10+
111
2026-05-09 Richard Frith-Macdonald <rfm@gnu.org>
212

313
* Source/GSSocketStream.m:

Source/GSPrivate.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -464,6 +464,11 @@ void GSPrivateNotifyIdle(NSString *mode) GS_ATTRIB_PRIVATE;
464464
*/
465465
BOOL GSPrivateNotifyMore(NSString *mode) GS_ATTRIB_PRIVATE;
466466

467+
/* Function to return the range of a composed character sequence.
468+
*/
469+
NSRange GSPrivateRangeOfComposed(const unichar *buf, NSUInteger length,
470+
NSUInteger index) GS_ATTRIB_PRIVATE;
471+
467472
/* Function to return the function for searching in a string for a range.
468473
*/
469474
typedef NSRange (*GSRSFunc)(id, id, unsigned, NSRange);

Source/GSString.m

Lines changed: 7 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3217,28 +3217,22 @@ static void GSStrWiden(GSStr s)
32173217
rangeOfSequence_c(GSStr self, unsigned anIndex)
32183218
{
32193219
if (anIndex >= self->_count)
3220-
[NSException raise: NSRangeException format:@"Invalid location."];
3220+
[NSException raise: NSRangeException format: @"Invalid location."];
32213221

32223222
return (NSRange){anIndex, 1};
32233223
}
32243224

32253225
static inline NSRange
32263226
rangeOfSequence_u(GSStr self, unsigned anIndex)
32273227
{
3228-
unsigned start;
3229-
unsigned end;
3228+
NSRange result;
32303229

32313230
if (anIndex >= self->_count)
3232-
[NSException raise: NSRangeException format:@"Invalid location."];
3233-
3234-
start = anIndex;
3235-
while (uni_isnonsp(self->_contents.u[start]) && start > 0)
3236-
start--;
3237-
end = start + 1;
3238-
if (end < self->_count)
3239-
while ((end < self->_count) && (uni_isnonsp(self->_contents.u[end])))
3240-
end++;
3241-
return (NSRange){start, end-start};
3231+
{
3232+
[NSException raise: NSRangeException format: @"Invalid location."];
3233+
}
3234+
result = GSPrivateRangeOfComposed(self->_contents.u, self->_count, anIndex);
3235+
return result;
32423236
}
32433237

32443238
static inline NSRange

Source/NSString.m

Lines changed: 110 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,7 @@
120120
# include <icu.h>
121121
#endif
122122

123+
123124
/* Create local inline versions of key functions for case-insensitive operations
124125
*/
125126
#import "Additions/unicode/caseconv.h"
@@ -138,6 +139,101 @@
138139

139140
#import "GNUstepBase/Unicode.h"
140141

142+
NSRange GSPrivateRangeOfComposed(const unichar *buf, NSUInteger length,
143+
NSUInteger anIndex)
144+
{
145+
#if GS_USE_ICU == 1
146+
UErrorCode status = U_ZERO_ERROR;
147+
int32_t len = (int32_t)length;
148+
int32_t index = (int32_t)anIndex;
149+
int32_t start;
150+
int32_t end;
151+
UBreakIterator *bi;
152+
153+
/* Create a grapheme-cluster (UBRK_CHARACTER) break iterator
154+
* over the UTF16 buffer.
155+
*/
156+
bi = ubrk_open(UBRK_CHARACTER, NULL /* default locale */,
157+
(const UChar*)buf, len, &status);
158+
159+
if (U_FAILURE(status) || NULL == bi)
160+
{
161+
return NSMakeRange(0, NSNotFound);
162+
}
163+
164+
/* Find start, end of the grapheme cluster containing index.
165+
*
166+
* ubrk_isBoundary(bi, pos) returns true when pos is a cluster boundary
167+
* AND leaves the iterator positioned at pos, ready for ubrk_next().
168+
*
169+
* Case A — index is itself a cluster-start boundary:
170+
* isBoundary returns true and positions the iterator there.
171+
* start = index, end = ubrk_next().
172+
*
173+
* Case B — index falls inside a cluster (e.g. trail surrogate, combining
174+
* mark, or non-first code unit of an emoji modifier sequence):
175+
* isBoundary returns false. ICU positions the iterator at the next
176+
* boundary strictly after index. We then call ubrk_preceding() to
177+
* step back to the cluster start, and ubrk_next() to return to the end.
178+
*
179+
* This avoids the pitfall of calling preceding(index+1) when index+1 is
180+
* itself in the middle of a surrogate pair, which returns UBRK_DONE.
181+
*/
182+
index = (int32_t)anIndex;
183+
if (ubrk_isBoundary(bi, index))
184+
{
185+
start = index;
186+
end = ubrk_next(bi);
187+
}
188+
else
189+
{
190+
int32_t next;
191+
192+
end = ubrk_current(bi);
193+
start = ubrk_preceding(bi, end);
194+
next = ubrk_next(bi);
195+
if (next != UBRK_DONE)
196+
{
197+
end = next;
198+
}
199+
}
200+
201+
ubrk_close(bi);
202+
203+
if (UBRK_DONE == start || UBRK_DONE == end)
204+
{
205+
return NSMakeRange(0, NSNotFound);
206+
}
207+
return NSMakeRange((NSUInteger)start, (NSUInteger)(end - start));
208+
#else
209+
unsigned start;
210+
unsigned end;
211+
unsigned length = [self length];
212+
unichar ch;
213+
unichar (*caiImp)(NSString*, SEL, NSUInteger);
214+
215+
caiImp = (unichar (*)(NSString*,SEL,NSUInteger))
216+
[self methodForSelector: caiSel];
217+
218+
for (start = anIndex; start > 0; start--)
219+
{
220+
ch = (*caiImp)(self, caiSel, start);
221+
if (uni_isnonsp(ch) == NO)
222+
break;
223+
}
224+
for (end = start+1; end < length; end++)
225+
{
226+
ch = (*caiImp)(self, caiSel, end);
227+
if (uni_isnonsp(ch) == NO)
228+
break;
229+
}
230+
231+
return NSMakeRange(start, end-start);
232+
#endif
233+
}
234+
235+
236+
141237
@interface NSScanner (Double)
142238
+ (BOOL) _scanDouble: (double*)value from: (NSString*)str;
143239
@end
@@ -3047,31 +3143,27 @@ - (NSUInteger) indexOfString: (NSString*)substring
30473143
*/
30483144
- (NSRange) rangeOfComposedCharacterSequenceAtIndex: (NSUInteger)anIndex
30493145
{
3050-
unsigned start;
3051-
unsigned end;
3052-
unsigned length = [self length];
3053-
unichar ch;
3054-
unichar (*caiImp)(NSString*, SEL, NSUInteger);
3146+
NSUInteger length = [self length];
30553147

30563148
if (anIndex >= length)
3057-
[NSException raise: NSRangeException format:@"Invalid location."];
3058-
caiImp = (unichar (*)(NSString*,SEL,NSUInteger))
3059-
[self methodForSelector: caiSel];
3060-
3061-
for (start = anIndex; start > 0; start--)
30623149
{
3063-
ch = (*caiImp)(self, caiSel, start);
3064-
if (uni_isnonsp(ch) == NO)
3065-
break;
3150+
[NSException raise: NSRangeException format: @"Invalid location."];
30663151
}
3067-
for (end = start+1; end < length; end++)
3152+
3153+
if (0 == length)
30683154
{
3069-
ch = (*caiImp)(self, caiSel, end);
3070-
if (uni_isnonsp(ch) == NO)
3071-
break;
3155+
return NSMakeRange(0, NSNotFound);
30723156
}
3157+
else
3158+
{
3159+
NSRange result;
3160+
GS_BEGINITEMBUF(buf, (length * sizeof(unichar)), unichar)
30733161

3074-
return NSMakeRange(start, end-start);
3162+
[self getCharacters: buf];
3163+
result = GSPrivateRangeOfComposed(buf, length, anIndex);
3164+
GS_ENDITEMBUF()
3165+
return result;
3166+
}
30753167
}
30763168

30773169
- (NSRange) rangeOfComposedCharacterSequencesForRange: (NSRange)range

0 commit comments

Comments
 (0)