Skip to content

Commit 9c8f02c

Browse files
8365675: Add String.toCaseFold() to support Unicode case-folding
to update with new approach
1 parent 84a4a36 commit 9c8f02c

12 files changed

Lines changed: 924 additions & 157 deletions

File tree

make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java

Lines changed: 79 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,14 @@
2222
* or visit www.oracle.com if you need additional information or have any
2323
* questions.
2424
*/
25-
2625
package build.tools.generatecharacter;
2726

28-
import java.io.IOException;
2927
import java.nio.file.Files;
3028
import java.nio.file.Paths;
3129
import java.nio.file.StandardOpenOption;
30+
import java.util.Arrays;
3231
import java.util.stream.Collectors;
33-
import java.util.stream.Stream;
32+
import java.util.stream.IntStream;
3433

3534
public class CaseFolding {
3635

@@ -42,32 +41,89 @@ public static void main(String[] args) throws Throwable {
4241
var templateFile = Paths.get(args[0]);
4342
var caseFoldingTxt = Paths.get(args[1]);
4443
var genSrcFile = Paths.get(args[2]);
45-
var supportedTypes = "^.*; [CTS]; .*$";
44+
45+
// java.lang
46+
var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
4647
var caseFoldingEntries = Files.lines(caseFoldingTxt)
47-
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
48-
.map(line -> {
49-
String[] cols = line.split("; ");
50-
return new String[] {cols[0], cols[1], cols[2]};
51-
})
52-
.filter(cols -> {
53-
// the folding case doesn't map back to the original char.
54-
var cp1 = Integer.parseInt(cols[0], 16);
55-
var cp2 = Integer.parseInt(cols[2], 16);
56-
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
57-
})
58-
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
59-
.collect(Collectors.joining(",\n", "", ""));
48+
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
49+
.map(line -> {
50+
var fields = line.split("; ");
51+
var cp = Integer.parseInt(fields[0], 16);
52+
fields = fields[2].trim().split(" ");
53+
var folding = new int[fields.length];
54+
for (int i = 0; i < folding.length; i++) {
55+
folding[i] = Integer.parseInt(fields[i], 16);
56+
}
57+
var foldingChars = Arrays.stream(folding)
58+
.mapToObj(Character::toChars)
59+
.flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i]))
60+
.toArray();
61+
return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)",
62+
cp,
63+
Arrays.stream(foldingChars)
64+
.mapToObj(c -> String.format("0x%04x", c))
65+
.collect(Collectors.joining(", ", "new char[] {", "}"))
66+
);
67+
})
68+
.collect(Collectors.joining(",\n", "", ""));
69+
// util.regex
70+
var expandedSupportedTypes = "^.*; [CTS]; .*$";
71+
var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt)
72+
.filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
73+
.map(line -> {
74+
String[] cols = line.split("; ");
75+
return new String[]{cols[0], cols[1], cols[2]};
76+
})
77+
.filter(cols -> {
78+
// the folding case doesn't map back to the original char.
79+
var cp1 = Integer.parseInt(cols[0], 16);
80+
var cp2 = Integer.parseInt(cols[2], 16);
81+
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
82+
})
83+
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
84+
.collect(Collectors.joining(",\n", "", ""));
6085

6186
// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
6287
// 0049; T; 0131; # LATIN CAPITAL LETTER I
6388
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
6489

65-
// Generate .java file
6690
Files.write(
67-
genSrcFile,
68-
Files.lines(templateFile)
69-
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
70-
.collect(Collectors.toList()),
71-
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
91+
genSrcFile,
92+
Files.lines(templateFile)
93+
.map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line)
94+
.map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line)
95+
.collect(Collectors.toList()),
96+
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
7297
}
7398
}
99+
/*
100+
} else {
101+
var supportedTypes = "^.*; [CTS]; .*$";
102+
var caseFoldingEntries = Files.lines(caseFoldingTxt)
103+
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
104+
.map(line -> {
105+
String[] cols = line.split("; ");
106+
return new String[]{cols[0], cols[1], cols[2]};
107+
})
108+
.filter(cols -> {
109+
// the folding case doesn't map back to the original char.
110+
var cp1 = Integer.parseInt(cols[0], 16);
111+
var cp2 = Integer.parseInt(cols[2], 16);
112+
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
113+
})
114+
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
115+
.collect(Collectors.joining(",\n", "", ""));
116+
117+
// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
118+
// 0049; T; 0131; # LATIN CAPITAL LETTER I
119+
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
120+
121+
// Generate .java file
122+
Files.write(
123+
genSrcFile,
124+
Files.lines(templateFile)
125+
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
126+
.collect(Collectors.toList()),
127+
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
128+
}
129+
*/

make/modules/java.base/gensrc/GensrcCharacterData.gmk

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)
7272

7373
################################################################################
7474

75+
76+
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
77+
78+
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
79+
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
80+
81+
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
82+
$(call LogInfo, Generating $@)
83+
$(call MakeTargetDir)
84+
$(TOOL_GENERATECASEFOLDING) \
85+
$(STRINGCASEFOLDING_TEMPLATE) \
86+
$(CASEFOLDINGTXT) \
87+
$(GENSRC_STRINGCASEFOLDING)
88+
89+
TARGETS += $(GENSRC_STRINGCASEFOLDING)
90+
91+
7592
endif # include guard
7693
include MakeIncludeEnd.gmk

make/modules/java.base/gensrc/GensrcRegex.gmk

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
5050

5151
################################################################################
5252

53-
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
54-
55-
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
56-
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
57-
58-
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
59-
$(call LogInfo, Generating $@)
60-
$(call MakeTargetDir)
61-
$(TOOL_GENERATECASEFOLDING) \
62-
$(CASEFOLDINGTEMP) \
63-
$(CASEFOLDINGTXT) \
64-
$(GENSRC_CASEFOLDING)
65-
66-
TARGETS += $(GENSRC_CASEFOLDING)
67-
68-
################################################################################
69-
7053
endif # include guard
7154
include MakeIncludeEnd.gmk

src/java.base/share/classes/java/lang/String.java

Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,6 +2039,73 @@ public boolean equalsIgnoreCase(String anotherString) {
20392039
&& regionMatches(true, 0, anotherString, 0, length());
20402040
}
20412041

2042+
/**
2043+
* Compares this {@code String} to another {@code String} for equality,
2044+
* using <em>Unicode case folding</em>.
2045+
* <p>
2046+
* Two strings are considered equal by this method if their case-folded
2047+
* forms are identical. Case folding is defined by the Unicode Standard in
2048+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
2049+
* including 1:M mappings. For example, {@code "Maße".equalsCaseFold("MASSE")}
2050+
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
2051+
* to {@code "ss"}.
2052+
* <p>
2053+
* Case folding is locale-independent and language-neutral, unlike
2054+
* locale-sensitive transformations such as {@link #toLowerCase()} or
2055+
* {@link #toUpperCase()}. It is intended for caseless matching,
2056+
* searching, and indexing.
2057+
*
2058+
* @apiNote
2059+
* This method is the Unicode-compliant alternative to
2060+
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
2061+
* defined by the Unicode Standard, which may differ from the simpler
2062+
* per-character mapping performed by {@code equalsIgnoreCase}.
2063+
* For example:
2064+
* <pre>{@code
2065+
* String a = "Maße";
2066+
* String b = "MASSE";
2067+
* boolean equalCaseFold = a.equalsCaseFold(b); // returns true
2068+
* boolean equalIgnoreCase = a.equalsIgnoreCase(b); // returns false
2069+
* }</pre>
2070+
*
2071+
* @param anotherString
2072+
* The {@code String} to compare this {@code String} against
2073+
*
2074+
* @return {@code true} if the given object is a {@code String}
2075+
* that represents the same sequence of characters as this
2076+
* string under Unicode case folding; {@code false} otherwise.
2077+
*
2078+
* @see #compareToCaseFold(String)
2079+
* @see #equalsIgnoreCase(String)
2080+
* @see java.text.Collator
2081+
* @since 26
2082+
*/
2083+
public boolean equalsCaseFold(String anotherString) {
2084+
if (this == anotherString) {
2085+
return true;
2086+
}
2087+
if (anotherString == null) {
2088+
return false;
2089+
}
2090+
byte[] v1 = this.value;
2091+
byte[] v2 = anotherString.value;
2092+
var ltr1 = this.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v1)
2093+
: StringCaseFoldedCharIterator.ofUTF16(v1);
2094+
var ltr2 = this.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v2)
2095+
: StringCaseFoldedCharIterator.ofUTF16(v2);
2096+
while (ltr1.hasNext() && ltr2.hasNext()) {
2097+
int ch1 = ltr1.nextChar();
2098+
int ch2 = ltr2.nextChar();
2099+
if (ch1 != ch2) {
2100+
return false;
2101+
}
2102+
}
2103+
if (ltr1.hasNext() || ltr2.hasNext()) {
2104+
return false;
2105+
}
2106+
return true;
2107+
}
2108+
20422109
/**
20432110
* Compares two strings lexicographically.
20442111
* The comparison is based on the Unicode value of each character in
@@ -2160,6 +2227,85 @@ public int compareToIgnoreCase(String str) {
21602227
return CASE_INSENSITIVE_ORDER.compare(this, str);
21612228
}
21622229

2230+
/**
2231+
* A Comparator that orders {@code String} objects as by
2232+
* {@link #compareToCaseFold(String) compareToCaseFold()}.
2233+
*
2234+
* @see #compareToCaseFold(String)
2235+
* @since 26
2236+
*/
2237+
public static final Comparator<String> CASE_FOLD_ORDER
2238+
= new CaseFoldComparator();
2239+
2240+
private static class CaseFoldComparator implements Comparator<String> {
2241+
2242+
@Override
2243+
public int compare(String s1, String s2) {
2244+
byte[] v1 = s1.value;
2245+
byte[] v2 = s2.value;
2246+
var ltr1 = s1.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v1)
2247+
: StringCaseFoldedCharIterator.ofUTF16(v1);
2248+
var ltr2 = s2.coder == LATIN1 ? StringCaseFoldedCharIterator.ofLatin1(v2)
2249+
: StringCaseFoldedCharIterator.ofUTF16(v2);
2250+
while (ltr1.hasNext() && ltr2.hasNext()) {
2251+
int ch1 = ltr1.nextChar();
2252+
int ch2 = ltr2.nextChar();
2253+
if (ch1 != ch2) {
2254+
return ch1 - ch2;
2255+
}
2256+
}
2257+
if (ltr1.hasNext()) return 1;
2258+
if (ltr2.hasNext()) return -1;
2259+
return 0;
2260+
}
2261+
}
2262+
2263+
/**
2264+
* Compares two strings lexicographically using Unicode case folding.
2265+
* <p>
2266+
* This method returns an integer whose sign is that of calling {@code compareTo}
2267+
* on the case folded versions of the strings. Unicode Case folding eliminates
2268+
* differences in case according to the Unicode Standard, using the mappings
2269+
* defined in
2270+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
2271+
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
2272+
* <p>
2273+
* Case folding is a locale-independent, language-neutral form of case mapping,
2274+
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
2275+
* which applies a simpler locale-insensitive uppercase mapping, this method
2276+
* follows the Unicode-defined <em>full</em> case folding, providing stable and
2277+
* consistent results across all environments.
2278+
* <p>
2279+
* Note that this method does <em>not</em> take locale into account, and may
2280+
* produce results that differ from locale-sensitive ordering. For locale-aware
2281+
* comparisons, use {@link java.text.Collator}.
2282+
*
2283+
* @apiNote
2284+
* This method is the Unicode-compliant alternative to
2285+
* {@link #compareToIgnoreCase(String)}. It implements full case folding
2286+
* as defined by the Unicode Standard, which may differ from the simpler
2287+
* per-character mapping performed by {@code compareToIgnoreCase}.
2288+
* For example:
2289+
* <pre>{@code
2290+
* String a = "Maße";
2291+
* String b = "MASSE";
2292+
* int cmpCaseFold = a.compareToCaseFold(b); // returns 0
2293+
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
2294+
* }</pre>
2295+
*
2296+
* @param str the {@code String} to be compared.
2297+
* @return a negative integer, zero, or a positive integer as the specified
2298+
* String is greater than, equal to, or less than this String,
2299+
* ignoring case considerations by case folding.
2300+
* @see #equalsCaseFold(String)
2301+
* @see #compareToIgnoreCase(String)
2302+
* @see java.text.Collator
2303+
* @since 26
2304+
*/
2305+
public int compareToCaseFold(String str) {
2306+
// Delegates to the shared CASE_FOLD_ORDER comparator so that this method and
// the public comparator constant always produce the same ordering.
return CASE_FOLD_ORDER.compare(this, str);
2307+
}
2308+
21632309
/**
21642310
* Tests if two string regions are equal.
21652311
* <p>
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
package java.lang;
2+
3+
import jdk.internal.java.lang.CaseFolding;
4+
5+
abstract class StringCaseFoldedCharIterator {
6+
7+
protected final byte[] value; // underlying byte array
8+
protected final int length; // length of the char unit in byte array
9+
protected int index; // current position in byte array
10+
protected char[] folded; // buffer for folded expansion
11+
protected int foldedIndex; // position in folded[]
12+
13+
StringCaseFoldedCharIterator(byte[] value, int length) {
14+
this.value = value;
15+
this.length = length;
16+
this.index = 0;
17+
this.folded = null;
18+
this.foldedIndex = 0;
19+
}
20+
21+
public boolean hasNext() {
22+
return (folded != null && foldedIndex < folded.length) || index < length;
23+
}
24+
25+
public int nextChar() {
26+
if (folded != null && foldedIndex < folded.length) {
27+
return folded[foldedIndex++];
28+
}
29+
if (index >= length) {
30+
return -1;
31+
}
32+
int cp = codePointAt(value, index);
33+
index += Character.charCount(cp);
34+
folded = CaseFolding.fold(cp);
35+
foldedIndex = 0;
36+
return folded[foldedIndex++];
37+
}
38+
39+
protected abstract int codePointAt(byte[] value, int index);
40+
41+
// Factory for Latin1
42+
static StringCaseFoldedCharIterator ofLatin1(byte[] value) {
43+
return new StringCaseFoldedCharIterator(value, value.length) {
44+
@Override
45+
protected int codePointAt(byte[] value, int index) {
46+
return StringLatin1.codePointAt(value, index, value.length);
47+
}
48+
};
49+
}
50+
51+
// Factory for UTF16
52+
static StringCaseFoldedCharIterator ofUTF16(byte[] value) {
53+
return new StringCaseFoldedCharIterator(value, value.length >> 1) {
54+
@Override
55+
protected int codePointAt(byte[] value, int index) {
56+
return StringUTF16.codePointAt(value, index, value.length);
57+
}
58+
};
59+
}
60+
}

0 commit comments

Comments
 (0)