Skip to content

CLDR-14139 investigate exemplar issues #4649

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import java.util.TreeSet;
import org.unicode.cldr.tool.LanguageCodeConverter;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
Expand Down Expand Up @@ -329,8 +330,8 @@ public static ExemplarInfo make(String language, Set<String> missingExemplars) {
System.out.print("");
}
CLDRFile file = ExemplarInfo.cldrFactory.make(cldrLanguage, true);
exemplars1 = file.getExemplarSet("", WinningChoice.WINNING, 0);
auxiliary1 = file.getExemplarSet("auxiliary", WinningChoice.WINNING, 0);
exemplars1 = file.getExemplarSet(ExemplarType.main, WinningChoice.WINNING, 0);
auxiliary1 = file.getExemplarSet(ExemplarType.auxiliary, WinningChoice.WINNING, 0);
} catch (Exception e) {
System.out.println(
"Can't read "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
import org.unicode.cldr.util.Builder;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CollatorHelper;
Expand Down Expand Up @@ -335,7 +336,9 @@ private static void showExemplarSize() {
}
String englishName = english.nameGetter().getNameFromIdentifier(baseLanguage);
CLDRFile cldrFile = factory.make(baseLanguage, false);
UnicodeSet set = cldrFile.getExemplarSet("", WinningChoice.WINNING);
UnicodeSet set =
cldrFile.getExemplarSet(
ExemplarType.main, WinningChoice.WINNING, UnicodeSet.CASE_INSENSITIVE);
int script = -1;
for (String s : set) {
int cp = s.codePointAt(0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import org.unicode.cldr.test.CheckConsistentCasing.Category;
import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
Expand Down Expand Up @@ -141,7 +142,9 @@ private Map<String, Boolean> generateCasingInformation(String localePattern) {

// Save casing information about the locale.
CLDRFile file = cldrFactory.make(localeID, true);
UnicodeSet examplars = file.getExemplarSet("", WinningChoice.NORMAL);
UnicodeSet examplars =
file.getExemplarSet(
ExemplarType.main, WinningChoice.NORMAL, UnicodeSet.CASE_INSENSITIVE);
localeUsesCasing.put(localeID, examplars.containsSome(allCaps));
createCasingXml(localeID, CheckConsistentCasing.getSamples(file));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ public enum ExemplarType {
public final UnicodeSet toRemove;
public final String message;
public final boolean convertUppercase;
public final CLDRFile.ExemplarType cldr;

ExemplarType(UnicodeSet allowed, String message, boolean convertUppercase) {
if (!allowed.isFrozen()) {
Expand All @@ -127,6 +128,7 @@ public enum ExemplarType {
this.message = message;
this.toRemove = new UnicodeSet(allowed).complement().freeze();
this.convertUppercase = convertUppercase;
cldr = CLDRFile.ExemplarType.fromString(name());
}

public static ExemplarType from(String name) {
Expand Down Expand Up @@ -159,7 +161,10 @@ public CheckCLDR handleSetCldrFileToCheck(
if (!SUPPRESS_AUX_EMPTY_CHECK) {
UnicodeSet auxiliarySet =
getResolvedCldrFileToCheck()
.getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
.getExemplarSet(
ExemplarType.auxiliary.cldr,
CLDRFile.WinningChoice.WINNING,
UnicodeSet.CASE_INSENSITIVE);

if (auxiliarySet == null) {
possibleErrors.add(
Expand Down Expand Up @@ -193,7 +198,11 @@ public CheckCLDR handleCheck(
// check relation to auxiliary set
try {
UnicodeSet mainSet =
getResolvedCldrFileToCheck().getExemplarSet("", CLDRFile.WinningChoice.WINNING);
getResolvedCldrFileToCheck()
.getExemplarSet(
ExemplarType.main.cldr,
CLDRFile.WinningChoice.WINNING,
UnicodeSet.CASE_INSENSITIVE);
if (type == ExemplarType.auxiliary) {
UnicodeSet auxiliarySet = SimpleUnicodeSetFormatter.parseLenient(value);

Expand Down Expand Up @@ -252,7 +261,10 @@ public CheckCLDR handleCheck(
// auxiliary exemplars
UnicodeSet auxiliarySet =
getResolvedCldrFileToCheck()
.getExemplarSet("auxiliary", CLDRFile.WinningChoice.WINNING);
.getExemplarSet(
ExemplarType.auxiliary.cldr,
CLDRFile.WinningChoice.WINNING,
UnicodeSet.CASE_INSENSITIVE);
if (auxiliarySet == null) {
auxiliarySet = new UnicodeSet();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.unicode.cldr.test.CheckCLDR.CheckStatus.Subtype;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.Status;
import org.unicode.cldr.util.DateConstants;
import org.unicode.cldr.util.Factory;
Expand Down Expand Up @@ -224,7 +225,7 @@ public CheckCLDR handleSetCldrFileToCheck(

CLDRFile resolvedFile = getResolvedCldrFileToCheck();
boolean[] ok = new boolean[1];
exemplars = safeGetExemplars("", possibleErrors, resolvedFile, ok);
exemplars = safeGetExemplars(ExemplarType.main, possibleErrors, resolvedFile, ok);

if (exemplars == null) {
CheckStatus item =
Expand All @@ -251,7 +252,7 @@ public CheckCLDR handleSetCldrFileToCheck(
// if (temp != null) exemplars.addAll(temp);
UnicodeSet auxiliary =
safeGetExemplars(
"auxiliary",
ExemplarType.auxiliary,
possibleErrors,
resolvedFile,
ok); // resolvedFile.getExemplarSet("auxiliary",
Expand All @@ -263,7 +264,7 @@ public CheckCLDR handleSetCldrFileToCheck(
if (CheckExemplars.USE_PUNCTUATION) {
UnicodeSet punctuation =
safeGetExemplars(
"punctuation",
ExemplarType.punctuation,
possibleErrors,
resolvedFile,
ok); // resolvedFile.getExemplarSet("auxiliary",
Expand Down Expand Up @@ -294,10 +295,15 @@ private UnicodeSet getNumberSystemExemplars() {
}

private UnicodeSet safeGetExemplars(
String type, List<CheckStatus> possibleErrors, CLDRFile resolvedFile, boolean[] ok) {
ExemplarType type,
List<CheckStatus> possibleErrors,
CLDRFile resolvedFile,
boolean[] ok) {
UnicodeSet result = null;
try {
result = resolvedFile.getExemplarSet(type, CLDRFile.WinningChoice.WINNING);
result =
resolvedFile.getExemplarSet(
type, CLDRFile.WinningChoice.WINNING, UnicodeSet.CASE_INSENSITIVE);
ok[0] = true;
} catch (IllegalArgumentException iae) {
possibleErrors.add(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import java.util.TreeMap;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.Factory;
Expand Down Expand Up @@ -114,7 +115,9 @@ public class DeriveScripts {
+ ")",
nsde);
}
UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING);
UnicodeSet exemplars =
cldrFile.getExemplarSet(
ExemplarType.main, WinningChoice.WINNING, UnicodeSet.CASE_INSENSITIVE);
for (String s : exemplars) {
int scriptNum = UScript.getScript(s.codePointAt(0));
if (scriptNum != UScript.COMMON
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import java.util.regex.Matcher;
import org.unicode.cldr.draft.FileUtilities;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CldrUtility;
Expand Down Expand Up @@ -274,7 +275,9 @@ public void setLocale(String locale) {
throw new RuntimeException("Skipping " + locale);
}
cldrFile = cldrFactory.make(locale, false);
UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING);
UnicodeSet exemplars =
cldrFile.getExemplarSet(
ExemplarType.main, WinningChoice.WINNING, UnicodeSet.CASE_INSENSITIVE);
usesLatin = exemplars != null && exemplars.containsSome(LATIN_SCRIPT);
for (DataHandler dataHandler : dataHandlers) {
dataHandler.reset(cldrFile);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.unicode.cldr.test.CheckConsistentCasing.Category;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.ChainedMap;
Expand Down Expand Up @@ -103,7 +104,9 @@ public static void main(String[] args) {
continue;
}
CLDRFile cldrFile = factory.make(locale, true);
UnicodeSet exemplars = cldrFile.getExemplarSet("", WinningChoice.WINNING);
UnicodeSet exemplars =
cldrFile.getExemplarSet(
ExemplarType.main, WinningChoice.WINNING, UnicodeSet.CASE_INSENSITIVE);

M3<ContextTransformUsage, ContextTransformType, ContextTransformValue> data =
ChainedMap.of(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,11 @@ public static String getConstructedIndexSet(String locale, CLDRFile cFile) {
// ICU.
AlphabeticIndex<String> index = new AlphabeticIndex<>(uLocale);
index.clearRecords();
UnicodeSet indexLabels = cFile.getExemplarSet("index", WinningChoice.WINNING);
// Fetch the "index" exemplar set (the type the replaced String-based call asked for),
// not the main set: the result is used as the labels of the AlphabeticIndex below.
UnicodeSet indexLabels =
cFile.getExemplarSet(
org.unicode.cldr.util.CLDRFile.ExemplarType.index,
WinningChoice.WINNING,
UnicodeSet.CASE_INSENSITIVE);
if (indexLabels != null && indexLabels.size() > 0) {
index.addLabels(indexLabels);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
import org.unicode.cldr.util.ArrayComparator;
import org.unicode.cldr.util.CLDRConfig;
import org.unicode.cldr.util.CLDRFile;
import org.unicode.cldr.util.CLDRFile.ExemplarType;
import org.unicode.cldr.util.CLDRFile.WinningChoice;
import org.unicode.cldr.util.CLDRLocale;
import org.unicode.cldr.util.CLDRPaths;
Expand Down Expand Up @@ -619,7 +620,11 @@ private static void addLanguageScriptCells2(
} catch (RuntimeException e) {
scriptSet = new UnicodeSet();
}
UnicodeSet exemplars = nativeLanguage.getExemplarSet("", WinningChoice.WINNING);
UnicodeSet exemplars =
nativeLanguage.getExemplarSet(
ExemplarType.main,
WinningChoice.WINNING,
UnicodeSet.CASE_INSENSITIVE);
if (scriptSet.containsNone(exemplars)) {
System.out.println(
"Skipping CLDR file -- exemplars differ: "
Expand Down Expand Up @@ -1009,14 +1014,14 @@ public LanguageInfo(Factory cldrFactory) throws IOException {
script_languages = Multimaps.invertFrom(language_scripts, TreeMultimap.create());

// now get some metadata
localeAliasInfo.put("language", new TreeMap<String, String>());
localeAliasInfo.put("script", new TreeMap<String, String>());
localeAliasInfo.put("territory", new TreeMap<String, String>());
localeAliasInfo.put("variant", new TreeMap<String, String>());
localeAliasInfo.put("zone", new TreeMap<String, String>());
localeAliasInfo.put("subdivision", new TreeMap<String, String>());
localeAliasInfo.put("unit", new TreeMap<String, String>());
localeAliasInfo.put("usage", new TreeMap<String, String>());
localeAliasInfo.put("language", new TreeMap<>());
localeAliasInfo.put("script", new TreeMap<>());
localeAliasInfo.put("territory", new TreeMap<>());
localeAliasInfo.put("variant", new TreeMap<>());
localeAliasInfo.put("zone", new TreeMap<>());
localeAliasInfo.put("subdivision", new TreeMap<>());
localeAliasInfo.put("unit", new TreeMap<>());
localeAliasInfo.put("usage", new TreeMap<>());

// localeAliasInfo.get("language").put("nb", "no");
localeAliasInfo.get("language").put("zh_CN", "zh_Hans_CN");
Expand Down Expand Up @@ -2717,7 +2722,7 @@ public void printContains(PrintWriter index) throws IOException {
new FormattedFileWriter(null, title, null, SUPPLEMENTAL_INDEX_ANCHORS));
// doTitle(pw, title);
List<String[]> rows = new ArrayList<>();
printContains3("001", rows, new ArrayList<String>());
printContains3("001", rows, new ArrayList<>());
TablePrinter tablePrinter =
new TablePrinter()
.addColumn("World", "class='source'", null, "class='z0'", true)
Expand Down
29 changes: 19 additions & 10 deletions tools/cldr-code/src/main/java/org/unicode/cldr/util/CLDRFile.java
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,7 @@ public static Set<String> getMatchingXMLFiles(File[] sourceDirs, Matcher m) {

private final boolean DEFAULT_ITERATION_INCLUDES_EXTRAS = true;

@Override
public Iterator<String> iterator() {
if (DEFAULT_ITERATION_INCLUDES_EXTRAS) {
return Iterators.filter(fullIterable().iterator(), p -> getStringValue(p) != null);
Expand Down Expand Up @@ -2369,10 +2370,6 @@ public CLDRFile makeDraft(DraftStatus draftStatus) {
return this;
}

public UnicodeSet getExemplarSet(String type, WinningChoice winningChoice) {
return getExemplarSet(type, winningChoice, UnicodeSet.CASE_INSENSITIVE);
}

/**
 * Convenience overload: fetches the exemplar set for {@code type} by delegating to the
 * three-argument overload with {@link UnicodeSet#CASE_INSENSITIVE} as the option.
 *
 * @param type which exemplar set to fetch
 * @param winningChoice forwarded unchanged to the three-argument overload
 * @return whatever the three-argument overload returns for these arguments
 */
public UnicodeSet getExemplarSet(ExemplarType type, WinningChoice winningChoice) {
final int defaultOption = UnicodeSet.CASE_INSENSITIVE;
return getExemplarSet(type, winningChoice, defaultOption);
}
Expand All @@ -2386,16 +2383,28 @@ public enum ExemplarType {
main,
auxiliary,
index,
numbers,
numbers_auxiliary,
punctuation,
numbers;
punctuation_auxiliary,
punctuation_person,
;
private final String pathID;
static final Map<String, ExemplarType> stringToPathID =
Arrays.asList(ExemplarType.values()).stream()
.collect(Collectors.toMap(x -> x.pathID, x -> x));

private ExemplarType() {
pathID = name().replace('_', '-');
}

public static ExemplarType fromString(String type) {
return type.isEmpty() ? main : valueOf(type);
return type == null || type.isEmpty() ? main : stringToPathID.get(type);
}
}

public UnicodeSet getExemplarSet(String type, WinningChoice winningChoice, int option) {
return getExemplarSet(ExemplarType.fromString(type), winningChoice, option);
String pathID() {
return pathID;
}
}

public UnicodeSet getExemplarSet(ExemplarType type, WinningChoice winningChoice, int option) {
Expand Down Expand Up @@ -2424,7 +2433,7 @@ public UnicodeSet getRawExemplarSet(ExemplarType type, WinningChoice winningChoi

public static String getExemplarPath(ExemplarType type) {
return "//ldml/characters/exemplarCharacters"
+ (type == ExemplarType.main ? "" : "[@type=\"" + type + "\"]");
+ (type == ExemplarType.main ? "" : "[@type=\"" + type.pathID() + "\"]");
}

public enum NumberingSystem {
Expand Down
Loading
Loading