Skip to content

Commit 4c47ef2

Browse files
8365675: Add String.toCaseFold() to support Unicode case-folding
to update api
1 parent 84a4a36 commit 4c47ef2

13 files changed

Lines changed: 1187 additions & 172 deletions

File tree

make/jdk/src/classes/build/tools/generatecharacter/CaseFolding.java

Lines changed: 79 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -22,15 +22,14 @@
2222
* or visit www.oracle.com if you need additional information or have any
2323
* questions.
2424
*/
25-
2625
package build.tools.generatecharacter;
2726

28-
import java.io.IOException;
2927
import java.nio.file.Files;
3028
import java.nio.file.Paths;
3129
import java.nio.file.StandardOpenOption;
30+
import java.util.Arrays;
3231
import java.util.stream.Collectors;
33-
import java.util.stream.Stream;
32+
import java.util.stream.IntStream;
3433

3534
public class CaseFolding {
3635

@@ -42,32 +41,89 @@ public static void main(String[] args) throws Throwable {
4241
var templateFile = Paths.get(args[0]);
4342
var caseFoldingTxt = Paths.get(args[1]);
4443
var genSrcFile = Paths.get(args[2]);
45-
var supportedTypes = "^.*; [CTS]; .*$";
44+
45+
// java.lang
46+
var supportedTypes = "^.*; [CF]; .*$"; // full/1:M case folding
4647
var caseFoldingEntries = Files.lines(caseFoldingTxt)
47-
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
48-
.map(line -> {
49-
String[] cols = line.split("; ");
50-
return new String[] {cols[0], cols[1], cols[2]};
51-
})
52-
.filter(cols -> {
53-
// the folding case doesn't map back to the original char.
54-
var cp1 = Integer.parseInt(cols[0], 16);
55-
var cp2 = Integer.parseInt(cols[2], 16);
56-
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
57-
})
58-
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
59-
.collect(Collectors.joining(",\n", "", ""));
48+
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
49+
.map(line -> {
50+
var fields = line.split("; ");
51+
var cp = Integer.parseInt(fields[0], 16);
52+
fields = fields[2].trim().split(" ");
53+
var folding = new int[fields.length];
54+
for (int i = 0; i < folding.length; i++) {
55+
folding[i] = Integer.parseInt(fields[i], 16);
56+
}
57+
var foldingChars = Arrays.stream(folding)
58+
.mapToObj(Character::toChars)
59+
.flatMapToInt(chars -> IntStream.range(0, chars.length).map(i -> (int) chars[i]))
60+
.toArray();
61+
return String.format("\t\tnew CaseFoldingEntry(0x%04x, %s)",
62+
cp,
63+
Arrays.stream(foldingChars)
64+
.mapToObj(c -> String.format("0x%04x", c))
65+
.collect(Collectors.joining(", ", "new char[] {", "}"))
66+
);
67+
})
68+
.collect(Collectors.joining(",\n", "", ""));
69+
// util.regex
70+
var expandedSupportedTypes = "^.*; [CTS]; .*$";
71+
var expanded_caseFoldingEntries = Files.lines(caseFoldingTxt)
72+
.filter(line -> !line.startsWith("#") && line.matches(expandedSupportedTypes))
73+
.map(line -> {
74+
String[] cols = line.split("; ");
75+
return new String[]{cols[0], cols[1], cols[2]};
76+
})
77+
.filter(cols -> {
78+
// the folding case doesn't map back to the original char.
79+
var cp1 = Integer.parseInt(cols[0], 16);
80+
var cp2 = Integer.parseInt(cols[2], 16);
81+
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
82+
})
83+
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
84+
.collect(Collectors.joining(",\n", "", ""));
6085

6186
// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
6287
// 0049; T; 0131; # LATIN CAPITAL LETTER I
6388
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
6489

65-
// Generate .java file
6690
Files.write(
67-
genSrcFile,
68-
Files.lines(templateFile)
69-
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
70-
.collect(Collectors.toList()),
71-
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
91+
genSrcFile,
92+
Files.lines(templateFile)
93+
.map(line -> line.contains("%%%Entries") ? caseFoldingEntries : line)
94+
.map(line -> line.contains("%%%Expanded_Case_Map_Entries") ? T_0x0131_0x49 + expanded_caseFoldingEntries : line)
95+
.collect(Collectors.toList()),
96+
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
7297
}
7398
}
99+
/*
100+
} else {
101+
var supportedTypes = "^.*; [CTS]; .*$";
102+
var caseFoldingEntries = Files.lines(caseFoldingTxt)
103+
.filter(line -> !line.startsWith("#") && line.matches(supportedTypes))
104+
.map(line -> {
105+
String[] cols = line.split("; ");
106+
return new String[]{cols[0], cols[1], cols[2]};
107+
})
108+
.filter(cols -> {
109+
// the folding case doesn't map back to the original char.
110+
var cp1 = Integer.parseInt(cols[0], 16);
111+
var cp2 = Integer.parseInt(cols[2], 16);
112+
return Character.toUpperCase(cp2) != cp1 && Character.toLowerCase(cp2) != cp1;
113+
})
114+
.map(cols -> String.format(" entry(0x%s, 0x%s)", cols[0], cols[2]))
115+
.collect(Collectors.joining(",\n", "", ""));
116+
117+
// hack, hack, hack! the logic does not pick 0131. just add manually to support 'I's.
118+
// 0049; T; 0131; # LATIN CAPITAL LETTER I
119+
final String T_0x0131_0x49 = String.format(" entry(0x%04x, 0x%04x),\n", 0x0131, 0x49);
120+
121+
// Generate .java file
122+
Files.write(
123+
genSrcFile,
124+
Files.lines(templateFile)
125+
.map(line -> line.contains("%%%Entries") ? T_0x0131_0x49 + caseFoldingEntries : line)
126+
.collect(Collectors.toList()),
127+
StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING);
128+
}
129+
*/

make/modules/java.base/gensrc/GensrcCharacterData.gmk

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,5 +72,22 @@ TARGETS += $(GENSRC_CHARACTERDATA)
7272

7373
################################################################################
7474

75+
76+
# Location of the generated CaseFolding.java source file.
# NOTE(review): this output directory is jdk/internal/java/lang, while the
# template below is under jdk/internal/lang -- confirm the generated file is
# meant to land in a directory that differs from the template's.
GENSRC_STRINGCASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/java/lang/CaseFolding.java
77+
78+
# Inputs handed to the generator: the Java template and the CaseFolding.txt data file.
STRINGCASEFOLDING_TEMPLATE := $(MODULE_SRC)/share/classes/jdk/internal/lang/CaseFolding.java.template
79+
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
80+
81+
# Regenerate when the build tools, the template, or the data file change.
# The tool is invoked with three arguments: template, data file, output file.
$(GENSRC_STRINGCASEFOLDING): $(BUILD_TOOLS_JDK) $(STRINGCASEFOLDING_TEMPLATE) $(CASEFOLDINGTXT)
82+
$(call LogInfo, Generating $@)
83+
$(call MakeTargetDir)
84+
$(TOOL_GENERATECASEFOLDING) \
85+
$(STRINGCASEFOLDING_TEMPLATE) \
86+
$(CASEFOLDINGTXT) \
87+
$(GENSRC_STRINGCASEFOLDING)
88+
89+
# Register the generated source with the gensrc targets.
TARGETS += $(GENSRC_STRINGCASEFOLDING)
90+
91+
7592
endif # include guard
7693
include MakeIncludeEnd.gmk

make/modules/java.base/gensrc/GensrcRegex.gmk

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -50,22 +50,5 @@ TARGETS += $(GENSRC_INDICCONJUNCTBREAK)
5050

5151
################################################################################
5252

53-
GENSRC_CASEFOLDING := $(SUPPORT_OUTPUTDIR)/gensrc/java.base/jdk/internal/util/regex/CaseFolding.java
54-
55-
CASEFOLDINGTEMP := $(MODULE_SRC)/share/classes/jdk/internal/util/regex/CaseFolding.java.template
56-
CASEFOLDINGTXT := $(MODULE_SRC)/share/data/unicodedata/CaseFolding.txt
57-
58-
$(GENSRC_CASEFOLDING): $(BUILD_TOOLS_JDK) $(CASEFOLDINGTEMP) $(CASEFOLDINGTXT)
59-
$(call LogInfo, Generating $@)
60-
$(call MakeTargetDir)
61-
$(TOOL_GENERATECASEFOLDING) \
62-
$(CASEFOLDINGTEMP) \
63-
$(CASEFOLDINGTXT) \
64-
$(GENSRC_CASEFOLDING)
65-
66-
TARGETS += $(GENSRC_CASEFOLDING)
67-
68-
################################################################################
69-
7053
endif # include guard
7154
include MakeIncludeEnd.gmk

src/java.base/share/classes/java/lang/String.java

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,6 +2039,56 @@ public boolean equalsIgnoreCase(String anotherString) {
20392039
&& regionMatches(true, 0, anotherString, 0, length());
20402040
}
20412041

2042+
/**
2043+
* Compares this {@code String} to another {@code String} for equality,
2044+
* using <em>Unicode case folding</em>. Two strings are considered equal
2045+
* by this method if their case-folded forms are identical.
2046+
* <p>
2047+
* Case folding is defined by the Unicode Standard in
2048+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
2049+
* including 1:M mappings. For example, {@code "Maße".equalsFoldCase("MASSE")}
2050+
* returns {@code true}, since the character {@code U+00DF} (sharp s) folds
2051+
* to {@code "ss"}.
2052+
* <p>
2053+
* Case folding is locale-independent and language-neutral, unlike
2054+
* locale-sensitive transformations such as {@link #toLowerCase()} or
2055+
* {@link #toUpperCase()}. It is intended for caseless matching,
2056+
* searching, and indexing.
2057+
*
2058+
* @apiNote
2059+
* This method is the Unicode-compliant alternative to
2060+
* {@link #equalsIgnoreCase(String)}. It implements full case folding as
2061+
* defined by the Unicode Standard, which may differ from the simpler
2062+
* per-character mapping performed by {@code equalsIgnoreCase}.
2063+
* For example:
2064+
* <pre>{@code
2065+
* String a = "Maße";
2066+
* String b = "MASSE";
2067+
* boolean equalFoldCase = a.equalsFoldCase(b); // returns true
2068+
* boolean equalIgnoreCase = a.equalsIgnoreCase(b); // returns false
2069+
* }</pre>
2070+
*
2071+
* @param anotherString
2072+
* The {@code String} to compare this {@code String} against
2073+
*
2074+
* @return {@code true} if the argument is not {@code null} and represents
2075+
* the same sequence of characters as this string under Unicode case
2076+
* folding; {@code false} otherwise.
2077+
*
2078+
* @see #compareToFoldCase(String)
2079+
* @see #equalsIgnoreCase(String)
2080+
* @since 26
2081+
*/
2082+
public boolean equalsFoldCase(String anotherString) {
2083+
if (this == anotherString) {
2084+
return true;
2085+
}
2086+
if (anotherString == null) {
2087+
return false;
2088+
}
2089+
return UNICODE_CASELESS_ORDER.compare(this, anotherString) == 0;
2090+
}
2091+
20422092
/**
20432093
* Compares two strings lexicographically.
20442094
* The comparison is based on the Unicode value of each character in
@@ -2160,6 +2210,76 @@ public int compareToIgnoreCase(String str) {
21602210
return CASE_INSENSITIVE_ORDER.compare(this, str);
21612211
}
21622212

2213+
/**
2214+
* A Comparator that orders {@code String} objects as by
2215+
* {@link #compareToFoldCase(String) compareToFoldCase()}.
2216+
*
2217+
* @see #compareToFoldCase(String)
2218+
* @since 26
2219+
*/
2220+
public static final Comparator<String> UNICODE_CASELESS_ORDER
2221+
= new FoldCaseComparator();
2222+
2223+
private static class FoldCaseComparator implements Comparator<String> {
2224+
2225+
@Override
2226+
public int compare(String s1, String s2) {
2227+
byte[] v1 = s1.value;
2228+
byte[] v2 = s2.value;
2229+
if (s1.coder == s2.coder()) {
2230+
return s1.coder == LATIN1 ? StringLatin1.compareToFC(v1, v2)
2231+
: StringUTF16.compareToFC(v1, v2);
2232+
}
2233+
return s1.coder == LATIN1 ? StringLatin1.compareToFC_UTF16(v1, v2)
2234+
: StringUTF16.compareToFC_Latin1(v1, v2);
2235+
}
2236+
}
2237+
2238+
/**
2239+
* Compares two strings lexicographically using <em>Unicode case folding</em>.
2240+
* This method returns an integer whose sign is that of calling {@code compareTo}
2241+
* on the Unicode case-folded versions of the strings. Unicode case folding
2242+
* eliminates differences in case according to the Unicode Standard, using the
2243+
* mappings defined in
2244+
* <a href="https://www.unicode.org/Public/UCD/latest/ucd/CaseFolding.txt">CaseFolding.txt</a>,
2245+
* including 1:M mappings, such as {@code "ß"} → {@code "ss"}.
2246+
* <p>
2247+
* Case folding is a locale-independent, language-neutral form of case mapping,
2248+
* primarily intended for caseless matching. Unlike {@link #compareToIgnoreCase(String)},
2249+
* which applies a simpler locale-insensitive uppercase mapping, this method
2250+
* follows the Unicode <em>full</em> case folding, providing stable and
2251+
* consistent results across all environments.
2252+
* <p>
2253+
* Note that this method does <em>not</em> take locale into account, and may
2254+
* produce results that differ from locale-sensitive ordering. Use
2255+
* {@link java.text.Collator} for locale-sensitive comparison.
2256+
*
2257+
* @apiNote
2258+
* This method is the Unicode-compliant alternative to
2259+
* {@link #compareToIgnoreCase(String)}. It implements the <em>full</em> case folding
2260+
* as defined by the Unicode Standard, which may differ from the simpler
2261+
* per-character mapping performed by {@code compareToIgnoreCase}.
2262+
* For example:
2263+
* <pre>{@code
2264+
* String a = "Maße";
2265+
* String b = "MASSE";
2266+
* int cmpFoldCase = a.compareToFoldCase(b); // returns 0
2267+
* int cmpIgnoreCase = a.compareToIgnoreCase(b); // returns > 0
2268+
* }</pre>
2269+
*
2270+
* @param str the {@code String} to be compared.
2271+
* @return a negative integer, zero, or a positive integer as the specified
2272+
* String is greater than, equal to, or less than this String,
2273+
* ignoring case considerations by case folding.
2274+
* @see #equalsFoldCase(String)
2275+
* @see #compareToIgnoreCase(String)
2276+
* @see java.text.Collator
2277+
* @since 26
2278+
*/
2279+
public int compareToFoldCase(String str) {
2280+
// Delegates to the shared fold-case comparator. That comparator reads
// the value of its second argument directly, so a null str is not
// tolerated here.
return UNICODE_CASELESS_ORDER.compare(this, str);
2281+
}
2282+
21632283
/**
21642284
* Tests if two string regions are equal.
21652285
* <p>

0 commit comments

Comments
 (0)