Skip to content

Commit 76cc677

Browse files
committed
Be more cautious when guessing language from filename
Use stricter validation when attempting to interpret part of a filename as a language code. Don't try to parse things that don't look like language tags, because Language::TryParse() is too permissive. A particular failure mode is that ICU parses language tags so leniently that a string such as "with-many-components-like-this" may be interpreted as a valid language with e.g. a nonsense country or variants.
1 parent b879c9f commit 76cc677

2 files changed

Lines changed: 22 additions & 3 deletions

File tree

src/language.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,9 @@ const std::wregex RE_LANG_CODE(L"([a-z]){2,3}(_([A-Z]{2}|[0-9]{3}))?(@[a-z]+)?")
6464
// a more permissive variant of the same that TryNormalize() would fix
6565
const std::wregex RE_LANG_CODE_PERMISSIVE(L"([a-zA-Z]){2,3}([_-]([a-zA-Z]{2}|[0-9]{3}))?(@[a-zA-Z]+)?");
6666

67+
// approximate match for BCP 47 language tags
68+
const std::wregex RE_LANG_CODE_BCP47(LR"(^[a-zA-Z]{2,3}(-[A-Z][a-z]{3})?(-([A-Z]{2}|\d{3}))?$)");
69+
6770
// try some normalizations: s/-/_/, case adjustments
6871
void TryNormalize(std::wstring& s)
6972
{
@@ -324,6 +327,11 @@ bool Language::IsValidCode(const std::wstring& s)
324327
return std::regex_match(s, RE_LANG_CODE);
325328
}
326329

330+
bool Language::IsPlausibleCode(const std::wstring& s)
331+
{
332+
return std::regex_match(s, RE_LANG_CODE_PERMISSIVE) || std::regex_match(s, RE_LANG_CODE_BCP47);
333+
}
334+
327335
std::string Language::Lang() const
328336
{
329337
return m_code.substr(0, m_code.find_first_of("_@"));
@@ -636,7 +644,8 @@ Language Language::TryGuessFromFilename(const wxString& filename, wxString *wild
636644
while (pos != wxString::npos)
637645
{
638646
auto part = name.substr(pos+1);
639-
lang = Language::TryParseWithValidation(part);
647+
if (Language::IsPlausibleCode(part))
648+
lang = Language::TryParseWithValidation(part);
640649
if (lang.IsValid())
641650
{
642651
if (wildcard)
@@ -663,12 +672,16 @@ Language Language::TryGuessFromFilename(const wxString& filename, wxString *wild
663672
wxString rest, wmatch;
664673
if (dirs[i].EndsWith(".lproj", &rest))
665674
{
666-
lang = Language::TryParseWithValidation(rest.ToStdWstring());
675+
auto l = rest.ToStdWstring();
676+
if (Language::IsPlausibleCode(l))
677+
lang = Language::TryParseWithValidation(l);
667678
wmatch = "*.lproj";
668679
}
669680
else
670681
{
671-
lang = Language::TryParseWithValidation(dirs[i].ToStdWstring());
682+
auto l = dirs[i].ToStdWstring();
683+
if (Language::IsPlausibleCode(l))
684+
lang = Language::TryParseWithValidation(l);
672685
wmatch = "*";
673686
}
674687
if (lang.IsValid())

src/language.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,12 @@ class Language
184184
Checks if @a s has the form of language code.
185185
*/
186186
static bool IsValidCode(const std::wstring& s);
187+
/**
188+
Checks if @a s has the form of language code, being more permissive and allowing
189+
e.g. BCP 47 -- i.e. something suitable for passing to TryParse().
190+
*/
191+
static bool IsPlausibleCode(const std::wstring& s);
192+
187193

188194
bool operator==(const Language& other) const { return m_code == other.m_code; }
189195
bool operator!=(const Language& other) const { return m_code != other.m_code; }

0 commit comments

Comments
 (0)