Skip to content

Commit 4ca4b23

Browse files
committed
ENH case-insensitive matching and ignore leading/trailing whitespace
1 parent 13e27ab commit 4ca4b23

5 files changed

Lines changed: 124 additions & 31 deletions

File tree

CHANGELOG.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,16 @@ major/minor/micro version numbers like `05` (it'd have to be just `5`).
1717
### Fixed
1818
### Security
1919

20+
## [2025.2.18]
21+
22+
### Added
23+
* `Language.match` now supports case-insensitive matching
24+
and ignores leading/trailing whitespace by default.
25+
2026
## [2025.2.8]
2127

2228
### Added
23-
* Explcitly indicated that the library is type-annotated.
29+
* Explicitly indicated that the library is type-annotated.
2430

2531
## [2025.1.28]
2632

README.md

Lines changed: 19 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,13 +45,13 @@ Create a `Language` instance by one of the class methods.
4545
Language(part3='fra', part2b='fre', part2t='fra', part1='fr', scope='I', type='L', name='French', comment=None, other_names=None, macrolanguage=None, retire_reason=None, retire_change_to=None, retire_remedy=None, retire_date=None)
4646
```
4747

48-
Fast object instantiation for retrieving language information (run on Python 3.13, macOS 15.2, Apple M1 Pro)
48+
Fast object instantiation for retrieving language information (run on Python 3.13, macOS 15.3.1, Apple M1 Pro)
4949

5050
```python
5151
In [1]: import iso639
5252

5353
In [2]: %timeit iso639.Language.from_part3("fra")
54-
220 ns ± 0.658 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
54+
217 ns ± 0.139 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
5555
```
5656

5757
#### From Another ISO 639 Code Set or a Reference Name
@@ -116,11 +116,27 @@ Use the `match` classmethod:
116116
True
117117
```
118118

119+
By default, the classmethod `match` supports case-insensitive matching
120+
and ignores leading/trailing whitespace.
121+
To enforce exact matching instead, pass in `exact=True`:
122+
123+
```python
124+
>>> lang5 = iso639.Language.match('FRA')
125+
>>> lang6 = iso639.Language.match('fra ')
126+
>>> lang7 = iso639.Language.match('french')
127+
>>> lang4 == lang5 == lang6 == lang7
128+
True
129+
>>> iso639.Language.match("french", exact=True)
130+
Traceback (most recent call last):
131+
File "<stdin>", line 1, in <module>
132+
LanguageNotFoundError: 'french' isn't an ISO language code or name
133+
```
134+
119135
The classmethod `match` is particularly useful for consistently
120136
accessing a specific attribute from unknown inputs, e.g., the ISO 639-3 code.
121137

122138
```python
123-
>>> 'fra' == lang1.part3 == lang2.part3 == lang3.part3 == lang4.part3
139+
>>> 'fra' == lang1.part3 == lang2.part3 == lang3.part3 == lang4.part3 == lang5.part3 == lang6.part3 == lang7.part3
124140
True
125141
```
126142

@@ -223,7 +239,6 @@ Beyond that, the precise order in matching is as follows:
223239
* ISO 639-3 alternative language names (the "print" ones)
224240
* ISO 639-3 alternative language names (the "inverted" ones)
225241

226-
Only exact matching is done (there's no fuzzy string matching of any sort).
227242
As soon as a match is found, `Language.match` returns a `Language` instance.
228243
If there isn't a match, a `LanguageNotFoundError` is raised.
229244

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
44

55
[project]
66
name = "python-iso639"
7-
version = "2025.2.8"
7+
version = "2025.2.18"
88
description = "ISO 639 language codes, names, and other associated information"
99
readme = "README.md"
1010
requires-python = ">= 3.9"

src/iso639/language.py

Lines changed: 72 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import datetime
66
from dataclasses import dataclass
77

8-
from typing import Dict, List, Union, Set
8+
from typing import Dict, List, NoReturn, Optional, Union, Set
99

1010
from ._data import (
1111
_PART3_TO_CODES,
@@ -26,6 +26,12 @@
2626
)
2727

2828

29+
# Normalizations applied, in order, to user input when non-exact matching is
# requested and the raw input did not match. The first cleaned form that
# matches wins.
#   1. strip only  -- keeps the original casing, so a padded input behaves
#      exactly like its unpadded form (covers names that are neither fully
#      lowercase nor title-cased, e.g. "Ancient Greek (to 1453)").
#   2. strip + lower -- codes are stored in lowercase.
#   3. strip + title -- most names are stored in title case.
_STRING_CLEANING_FUNCS = [
    lambda x: x.strip(),
    lambda x: x.strip().lower(),
    lambda x: x.strip().title(),
]
33+
34+
2935
class LanguageNotFoundError(Exception):
3036
pass
3137

@@ -100,13 +106,17 @@ def __eq__(self, other) -> bool:
100106
return isinstance(other, Language) and self.part3 == other.part3
101107

102108
@classmethod
103-
def match(cls, user_input: str, /) -> Language:
109+
def match(cls, user_input: str, /, *, exact: bool = False) -> Language:
104110
"""Return a ``Language`` instance by matching on the user input.
105111
106112
Parameters
107113
----------
108114
user_input : str
109115
A language code or name.
116+
exact : bool, optional
117+
Whether to enforce exact matching against the user input.
118+
Defaults to `False`. If `False`, matching is case-insensitive
119+
and ignores leading/trailing whitespace.
110120
111121
Returns
112122
-------
@@ -140,29 +150,29 @@ def match(cls, user_input: str, /) -> Language:
140150
_NameIndexColumn.PRINT_NAME,
141151
_NameIndexColumn.INVERTED_NAME,
142152
]
143-
return _PART3_TO_LANGUAGES[_get_part3(user_input, query_order)]
153+
return _PART3_TO_LANGUAGES[_get_part3(user_input, query_order, exact)]
144154

145155
@classmethod
146156
def from_part3(cls, user_input: str, /) -> Language:
147157
"""Return a ``Language`` instance from an ISO 639-3 code."""
148158
return _PART3_TO_LANGUAGES[
149-
_get_part3(user_input, [_CodesColumn.ID, _RetirementsColumn.ID])
159+
_get_part3_exact(user_input, [_CodesColumn.ID, _RetirementsColumn.ID])
150160
]
151161

152162
@classmethod
153163
def from_part2b(cls, user_input: str, /) -> Language:
154164
"""Return a ``Language`` instance from an ISO 639-2 (bibliographic) code."""
155-
return _PART3_TO_LANGUAGES[_get_part3(user_input, [_CodesColumn.PART2B])]
165+
return _PART3_TO_LANGUAGES[_get_part3_exact(user_input, [_CodesColumn.PART2B])]
156166

157167
@classmethod
158168
def from_part2t(cls, user_input: str, /) -> Language:
159169
"""Return a ``Language`` instance from an ISO 639-2 (terminological) code."""
160-
return _PART3_TO_LANGUAGES[_get_part3(user_input, [_CodesColumn.PART2T])]
170+
return _PART3_TO_LANGUAGES[_get_part3_exact(user_input, [_CodesColumn.PART2T])]
161171

162172
@classmethod
163173
def from_part1(cls, user_input: str, /) -> Language:
164174
"""Return a ``Language`` instance from an ISO 639-1 code."""
165-
return _PART3_TO_LANGUAGES[_get_part3(user_input, [_CodesColumn.PART1])]
175+
return _PART3_TO_LANGUAGES[_get_part3_exact(user_input, [_CodesColumn.PART1])]
166176

167177
@classmethod
168178
def from_name(cls, user_input: str, /) -> Language:
@@ -172,10 +182,57 @@ def from_name(cls, user_input: str, /) -> Language:
172182
_NameIndexColumn.PRINT_NAME,
173183
_NameIndexColumn.INVERTED_NAME,
174184
]
175-
return _PART3_TO_LANGUAGES[_get_part3(user_input, query_order)]
185+
return _PART3_TO_LANGUAGES[_get_part3_exact(user_input, query_order)]
186+
176187

188+
def _raise_language_not_found_error(user_input: str) -> NoReturn:
    """Raise ``LanguageNotFoundError`` with a message quoting ``user_input``.

    This function never returns normally.
    """
    message = f"{user_input!r} isn't an ISO language code or name"
    raise LanguageNotFoundError(message)
190+
191+
192+
def _get_part3(
    user_input: str, query_order: List[_COLUMN_TYPE], exact: bool = True
) -> str:
    """Get the part 3 code of a language.

    The raw user input is always tried first. If it doesn't match and
    ``exact`` is `False`, each function in ``_STRING_CLEANING_FUNCS`` is
    applied to the user input in turn, and the first cleaned form that
    matches is used.

    Parameters
    ----------
    user_input : str
        The user-provided language code or name.
    query_order : List[_COLUMN_TYPE]
        A list of columns to specify query order.
    exact : bool, optional
        Whether to enforce exact matching against the user input. Defaults to `True`.
        If `False`, basic string cleaning is applied to the user input.

    Returns
    -------
    str

    Raises
    ------
    LanguageNotFoundError
        If `user_input` (raw, or cleaned when `exact` is `False`)
        isn't a language name or code
    """
    try:
        return _get_part3_exact(user_input, query_order)
    except LanguageNotFoundError:
        if exact:
            # Bare ``raise`` re-raises the original error with its traceback.
            raise

    # The fallback runs outside the ``except`` block on purpose: the final
    # error is then raised without the failed exact lookup as its context.
    for func in _STRING_CLEANING_FUNCS:
        try:
            return _get_part3_exact(func(user_input), query_order, user_input)
        except LanguageNotFoundError:
            continue

    # Nothing matched; report the input exactly as the user provided it.
    _raise_language_not_found_error(user_input)
229+
230+
231+
def _get_part3_exact(
232+
user_input: str,
233+
query_order: List[_COLUMN_TYPE],
234+
original_user_input: Optional[str] = None,
235+
) -> str:
179236
"""Get the part 3 code of a language.
180237
181238
Parameters
@@ -184,6 +241,9 @@ def _get_part3(user_input: str, query_order: List[_COLUMN_TYPE]) -> str:
184241
The user-provided language code or name.
185242
query_order : List[_COLUMN_TYPE]
186243
A list of columns to specify query order.
244+
original_user_input : str, optional
245+
The original user input. Default is `None`.
246+
This argument is used when the user input has been cleaned.
187247
188248
Returns
189249
-------
@@ -198,8 +258,7 @@ def _get_part3(user_input: str, query_order: List[_COLUMN_TYPE]) -> str:
198258
for column in query_order:
199259
if column == _CodesColumn.ID:
200260
if user_input in _PART3_TO_CODES:
201-
part3 = user_input
202-
break
261+
return user_input
203262
elif column == _CodesColumn.PART2B:
204263
part3 = _PART2B_TO_PART3.get(user_input)
205264
elif column == _CodesColumn.PART2T:
@@ -208,8 +267,7 @@ def _get_part3(user_input: str, query_order: List[_COLUMN_TYPE]) -> str:
208267
part3 = _PART1_TO_PART3.get(user_input)
209268
elif column == _RetirementsColumn.ID:
210269
if user_input in _PART3_TO_RETIREMENTS:
211-
part3 = user_input
212-
break
270+
return user_input
213271
elif column == _CodesColumn.REF_NAME:
214272
part3 = _REF_NAME_TO_PART3.get(user_input)
215273
elif column == _NameIndexColumn.PRINT_NAME:
@@ -222,9 +280,7 @@ def _get_part3(user_input: str, query_order: List[_COLUMN_TYPE]) -> str:
222280
break
223281

224282
if part3 is None:
225-
raise LanguageNotFoundError(
226-
f"{user_input!r} isn't an ISO language code or name"
227-
)
283+
_raise_language_not_found_error(original_user_input or user_input)
228284

229285
return part3
230286

tests/test_language.py

Lines changed: 25 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,34 @@
77

88

99
@pytest.mark.parametrize(
10-
"user_input, expected_part3",
10+
"user_input, exact, expected_part3",
1111
[
12-
("fra", "fra"),
13-
("fre", "fra"),
14-
("fr", "fra"),
15-
("French", "fra"),
16-
("Castilian", "spa"),
12+
("fra", True, "fra"),
13+
("fra", False, "fra"),
14+
("FRA", True, None),
15+
("FRA", False, "fra"),
16+
(" FRA ", False, "fra"),
17+
("Fra", True, None),
18+
("Fra", False, "fra"),
19+
(" Fra ", False, "fra"),
20+
("French", True, "fra"),
21+
("French", False, "fra"),
22+
("FRENCH", True, None),
23+
("FRENCH", False, "fra"),
24+
(" FRENCH ", False, "fra"),
25+
("french", True, None),
26+
("french", False, "fra"),
27+
(" french ", False, "fra"),
28+
("Castilian", True, "spa"),
1729
],
1830
)
19-
def test_match(user_input, expected_part3):
20-
actual_part3 = Language.match(user_input).part3
21-
assert actual_part3 == expected_part3
31+
def test_match(user_input, exact, expected_part3):
    """Check ``Language.match`` for both exact and non-exact matching.

    An ``expected_part3`` of ``None`` means the lookup must raise
    ``LanguageNotFoundError``.
    """
    if expected_part3 is not None:
        matched = Language.match(user_input, exact=exact)
        assert matched.part3 == expected_part3
        return

    with pytest.raises(LanguageNotFoundError):
        Language.match(user_input, exact=exact)
2238

2339

2440
def test_name():

0 commit comments

Comments
 (0)