Skip to content

Commit 4bb3ea4

Browse files
8365675: Add String.toCaseFold()
1 parent 84a4a36 commit 4bb3ea4

9 files changed

Lines changed: 623 additions & 28 deletions

File tree

make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java

Lines changed: 64 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -29,45 +29,82 @@
2929
import java.nio.file.Files;
3030
import java.nio.file.Paths;
3131
import java.nio.file.StandardOpenOption;
32+
import java.util.Arrays;
3233
import java.util.stream.Collectors;
34+
import java.util.stream.IntStream;
3335
import java.util.stream.Stream;
3436

3537
/**
 * Build-time generator that expands the {@code %%%Entries} placeholder line of
 * a template file with case-folding data parsed from a CaseFolding.txt file.
 *
 * <p>The fourth command-line argument selects the flavour of the output:
 * <ul>
 * <li>{@code lang_string}: one {@code new CaseFoldingEntry(cp, new char[] {...})}
 *     per line whose status is {@code C} or {@code F}; the folding is written
 *     as UTF-16 code units, so 1:M mappings are supported;</li>
 * <li>any other value (the regex makefile passes {@code util_regex}): one
 *     {@code entry(cp, folded)} per line whose status is {@code C}, {@code T}
 *     or {@code S} and whose folded code point does not map back to the source
 *     code point through {@code Character.toUpperCase/toLowerCase}.</li>
 * </ul>
 */
public class CaseFolding {

    /**
     * Entry point.
     *
     * @param args {@code TemplateFile CaseFolding.txt GeneratedFile flavour}
     * @throws Throwable if any of the files cannot be read or written, or if a
     *         data line cannot be parsed
     */
    public static void main(String[] args) throws Throwable {
        if (args.length != 4) {
            System.err.println("Usage: java CaseFolding TemplateFile CaseFolding.txt CaseFolding.java lang");
            System.exit(1);
        }
        var templateFile = Paths.get(args[0]);
        var caseFoldingTxt = Paths.get(args[1]);
        var genSrcFile = Paths.get(args[2]);
        var pkg = args[3];

        // Files.lines keeps the file open until the stream is closed, so both
        // inputs are read inside try-with-resources.
        final String entries;
        try (var lines = Files.lines(caseFoldingTxt)) {
            entries = "lang_string".equals(pkg) ? stringEntries(lines) : regexEntries(lines);
        }

        // Generate .java file
        try (var template = Files.lines(templateFile)) {
            Files.write(
                genSrcFile,
                template.map(line -> line.contains("%%%Entries") ? entries : line)
                        .collect(Collectors.toList()),
                StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
        }
    }

    /**
     * Builds the entry list for the {@code lang_string} flavour from the
     * status C and F lines (full/1:M case folding).
     */
    private static String stringEntries(Stream<String> lines) {
        var supportedTypes = "^.*; [CF]; .*$";
        return lines
            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
            .map(CaseFolding::stringEntry)
            .collect(Collectors.joining(",\n", "", ""));
    }

    /**
     * Formats one data line ({@code code; status; mapping; # name}) as a
     * {@code new CaseFoldingEntry(...)} expression.
     */
    private static String stringEntry(String line) {
        var fields = line.split("; ");
        int cp = Integer.parseInt(fields[0], 16);
        // The mapping column is a space separated list of code points; each
        // one is expanded to its UTF-16 code units before being printed.
        var foldingChars = Arrays.stream(fields[2].trim().split(" "))
            .mapToInt(hex -> Integer.parseInt(hex, 16))
            .mapToObj(Character::toChars)
            .flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i]))
            .mapToObj(c -> String.format("0x%04x", c))
            .collect(Collectors.joining(", ", "new char[] {", "}"));
        return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)", cp, foldingChars);
    }

    /**
     * Builds the entry list for the regex flavour from the status C, T and S
     * lines, keeping only the mappings that the Character case conversions do
     * not already provide.
     */
    private static String regexEntries(Stream<String> lines) {
        var supportedTypes = "^.*; [CTS]; .*$";
        var entries = lines
            .filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
            .map(line -> line.split("; "))
            .filter(cols -> {
                // the folding case doesn't map back to the original char.
                var cp1 = Integer.parseInt(cols[0], 16);
                var cp2 = Integer.parseInt(cols[2], 16);
                return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
            })
            .map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
            .collect(Collectors.joining(",\n", "", ""));

        // hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
        // 0049; T; 0131; # LATIN CAPITAL LETTER I
        return String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49) + entries;
    }
}

make/modules/java.base/gensrc/GensrcCharacterData.gmk

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,23 @@ TARGETS += $(GENSRC_CHARACTERDATA)
7272

7373
################################################################################
7474

75+
76+
# Generated source for the String case-folding table. It is produced by
# $(TOOL_GENERATECASEFOLDING) from the template and CaseFolding.txt below.
# NOTE(review): the output directory is jdk/internal/java/lang while the
# template lives under jdk/internal/lang -- confirm that this difference is
# intended.
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
77+
78+
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
79+
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
80+
81+
# The tool takes four arguments: template, data file, output file and the
# flavour to generate; "lang_string" selects the String (status C/F) output.
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
82+
$(call LogInfo, Generating $@)
83+
$(call MakeTargetDir)
84+
$(TOOL_GENERATECASEFOLDING) \
85+
$(STRINGCASEFOLDING_TEMPLATE) \
86+
$(CASEFOLDINGTXT) \
87+
$(GENSRC_STRINGCASEFOLDING) \
88+
lang_string
89+
90+
TARGETS += $(GENSRC_STRINGCASEFOLDING)
91+
92+
7593
endif # include guard
7694
include MakeIncludeEnd.gmk

make/modules/java.base/gensrc/GensrcRegex.gmk

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ $(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
6161
$(TOOL_GENERATECASEFOLDING) \
6262
$(CASEFOLDINGTEMP) \
6363
$(CASEFOLDINGTXT) \
64-
$(GENSRC_CASEFOLDING)
64+
$(GENSRC_CASEFOLDING) \
65+
util_regex
6566

6667
TARGETS += $(GENSRC_CASEFOLDING)
6768

src/java.base/share/classes/java/lang/String.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3791,6 +3791,47 @@ public String toUpperCase() {
37913791
return toUpperCase(Locale.getDefault());
37923792
}
37933793

3794+
/**
3795+
* Returns a case-folded copy of this {@code String}, using the Unicode
3796+
* case folding mappings defined in
3797+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">
3798+
* Unicode Case Folding Properties</a>.
3799+
*
3800+
* <p>Case folding is a locale-independent, language-neutral form of
3801+
* case mapping, primarily intended for case-insensitive matching.
3802+
* Unlike {@link #toLowerCase()} or {@link #toUpperCase()}, which are
3803+
* designed for locale-sensitive or display-oriented transformations,
3804+
* case folding provides a stable and consistent mapping across all
3805+
* environments. It may include one-to-many mappings; for example,
3806+
* the German sharp s ({@code U+00DF}) folds to the sequence
3807+
* {@code "ss"}.
3808+
*
3809+
* <p>This method performs the "Full" case folding as defined in the
3810+
* Unicode CaseFolding data file. The result is suitable for use in
3811+
* case-insensitive string comparison, searching, or indexing.
3812+
*
3813+
* @apiNote
3814+
* Case folding is intended for caseless matching, not for locale-sensitive
3815+
* presentation. For example:
3816+
*
3817+
* <pre>{@code
3818+
* String a = "Maße";
3819+
* String b = "MASSE";
3820+
* if (a.toCaseFold().equals(b.toCaseFold())) {
3821+
* // true, matches according to Unicode case-insensitive rules
3822+
* }
3823+
* }</pre>
3824+
*
3825+
* @return a {@code String} containing the case-folded form of this string
3826+
* @see #toLowerCase()
3827+
* @see #toUpperCase()
3828+
*/
3829+
3830+
public String toCaseFold() {
3831+
return isLatin1() ? StringLatin1.toCaseFold(this, value)
3832+
: StringUTF16.toCaseFold(this, value);
3833+
}
3834+
37943835
/**
37953836
* Returns a string whose value is this string, with all leading
37963837
* and trailing space removed, where space is defined

src/java.base/share/classes/java/lang/StringLatin1.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
import java.util.function.IntConsumer;
3333
import java.util.stream.Stream;
3434
import java.util.stream.StreamSupport;
35+
import jdk.internal.java.lang.CaseFolding;
3536
import jdk.internal.util.ArraysSupport;
3637
import jdk.internal.vm.annotation.IntrinsicCandidate;
3738

@@ -560,6 +561,54 @@ private static String toUpperCaseEx(String str, byte[] value,
560561
return StringUTF16.newString(result, 0, resultOffset);
561562
}
562563

564+
private static String toCaseFoldEx(String str, byte[] value, int first) {
565+
byte[] result = StringUTF16.newBytesFor(value.length);
566+
int resultOffset = 0;
567+
for (int i = 0; i < first; i++) {
568+
StringUTF16.putChar(result, resultOffset++, value[i] & 0xff);
569+
}
570+
for (int i = first; i < value.length; i++) {
571+
int cp = value[i] & 0xff;
572+
char[] folded = CaseFolding.fold(cp);
573+
if (folded.length == 1) {
574+
StringUTF16.putChar(result, resultOffset++, folded[0]);
575+
} else {
576+
byte[] result2 = StringUTF16.newBytesFor((result.length >> 1) + folded.length - 1);
577+
System.arraycopy(result, 0, result2, 0, resultOffset << 1);
578+
result = result2;
579+
for (int x = 0; x < folded.length; ++x) {
580+
StringUTF16.putChar(result, resultOffset++, folded[x]);
581+
}
582+
}
583+
}
584+
return StringUTF16.newString(result, 0, resultOffset);
585+
}
586+
587+
public static String toCaseFold(String str, byte[] value) {
588+
int first;
589+
final int len = value.length;
590+
// Now check if there are any characters that need to be changed
591+
for (first = 0 ; first < len; first++) {
592+
var cp = value[first] & 0xff;
593+
if (!CaseFolding.isFolded(value[first] & 0xff)) {
594+
break;
595+
}
596+
}
597+
if (first == len)
598+
return str;
599+
byte[] result = new byte[len];
600+
System.arraycopy(value, 0, result, 0, first); // Just copy the first few
601+
// fold characters
602+
for (int i = first; i < len; i++) {
603+
var folded = CaseFolding.fold(value[i] & 0xff);
604+
if (folded.length > 1 || !canEncode(folded[0])) {
605+
return toCaseFoldEx(str, value, first);
606+
}
607+
result[i] = (byte)(folded[0] & 0xff);
608+
}
609+
return new String(result, LATIN1);
610+
}
611+
563612
public static String trim(byte[] value) {
564613
int len = value.length;
565614
int st = 0;

src/java.base/share/classes/java/lang/StringUTF16.java

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import java.util.stream.Stream;
3535
import java.util.stream.StreamSupport;
3636

37+
import jdk.internal.java.lang.CaseFolding;
3738
import jdk.internal.misc.Unsafe;
3839
import jdk.internal.util.ArraysSupport;
3940
import jdk.internal.vm.annotation.ForceInline;
@@ -1164,6 +1165,77 @@ private static String toUpperCaseEx(String str, byte[] value,
11641165
return newString(result, 0, resultOffset);
11651166
}
11661167

1168+
private static String toCaseFoldEx(String str, byte[] value, byte[] result, int first) {
1169+
assert(result.length == value.length);
1170+
assert(first >= 0);
1171+
int resultOffset = first;
1172+
int length = value.length >> 1;
1173+
int srcCount;
1174+
for (int i = first; i < length; i += srcCount) {
1175+
int src = getChar(value, i);
1176+
srcCount = 1;
1177+
if (Character.isSurrogate((char)src)) {
1178+
src = codePointAt(value, i, length);
1179+
srcCount = Character.charCount(src);
1180+
}
1181+
char[] folded = CaseFolding.fold(src);
1182+
if (folded.length > srcCount) {
1183+
// tbd: should we pre-scan?
1184+
byte[] result2 = newBytesFor((result.length >> 1) + folded.length - srcCount);
1185+
System.arraycopy(result, 0, result2, 0, resultOffset << 1);
1186+
result = result2;
1187+
}
1188+
assert resultOffset >= 0;
1189+
assert resultOffset + folded.length <= length(result);
1190+
for (int x = 0; x < folded.length; ++x) {
1191+
putChar(result, resultOffset++, folded[x]);
1192+
}
1193+
}
1194+
return newString(result, 0, resultOffset);
1195+
}
1196+
1197+
public static String toCaseFold(String str, byte[] value) {
1198+
int first;
1199+
final int len = value.length >> 1;
1200+
int cpCount = 1;
1201+
1202+
// Now check if there are any characters that need to be changed, or are surrogate
1203+
for (first = 0 ; first < len; first += cpCount) {
1204+
int cp = (int)getChar(value, first);
1205+
if (Character.isSurrogate((char)cp)) {
1206+
cp = codePointAt(value, first, len);
1207+
}
1208+
if (!CaseFolding.isFolded(cp)) {
1209+
break;
1210+
}
1211+
cpCount = Character.charCount(cp);
1212+
}
1213+
if (first == len) {
1214+
return str;
1215+
}
1216+
byte[] result = new byte[value.length];
1217+
System.arraycopy(value, 0, result, 0, first << 1); // Just copy the first few
1218+
// case-fold characters.
1219+
int bits = 0;
1220+
for (int i = first; i < len; i++) {
1221+
int cp = (int)getChar(value, i);
1222+
if (Character.isSurrogate((char)cp)) {
1223+
return toCaseFoldEx(str, value, result, i);
1224+
}
1225+
char[] folded = CaseFolding.fold(cp);
1226+
if (folded.length != 1) { // 1:M or surrogate pair
1227+
return toCaseFoldEx(str, value, result, i);
1228+
}
1229+
bits |= folded[0];
1230+
putChar(result, i, folded[0]);
1231+
}
1232+
if (bits < 0 || bits > 0xff) {
1233+
return new String(result, UTF16);
1234+
} else {
1235+
return newString(result, 0, len);
1236+
}
1237+
}
1238+
11671239
public static String trim(byte[] value) {
11681240
int length = value.length >> 1;
11691241
int len = length;

0 commit comments

Comments
 (0)