Skip to content

Commit 9581b01

Browse files
committed
RFC/DEPR: deprecate yt.funcs.levenshtein_distance in favor of difflib.get_close_matches
1 parent 0ba2cfa commit 9581b01

File tree

7 files changed

+77
-78
lines changed

7 files changed

+77
-78
lines changed

nose_ignores.txt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,3 +50,4 @@
5050
--ignore-file=test_sph_pixelization_pytestonly\.py
5151
--ignore-file=test_time_series\.py
5252
--ignore-file=test_cf_radial_pytest\.py
53+
--ignore-file=test_levenshtein_pytest\.py

tests/tests.yaml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -231,5 +231,6 @@ other_tests:
231231
- "--ignore-file=test_offaxisprojection_pytestonly\\.py"
232232
- "--ignore-file=test_sph_pixelization_pytestonly\\.py"
233233
- "--ignore-file=test_cf_radial_pytest\\.py"
234+
- "--ignore-file=test_levenshtein_pytest\\.py"
234235
cookbook:
235236
- 'doc/source/cookbook/tests/test_cookbook.py'

yt/funcs.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1414,6 +1414,12 @@ def levenshtein_distance(seq1, seq2, max_dist=None):
14141414
as the number of edits goes above the value. This allows for an earlier break
14151415
and speeds calculations up.
14161416
"""
1417+
issue_deprecation_warning(
1418+
"yt.funcs.levenshtein_distance is deprecated. "
1419+
"Please prefer difflib.get_close_matches.",
1420+
since="4.5.0",
1421+
stacklevel=3,
1422+
)
14171423
size_x = len(seq1) + 1
14181424
size_y = len(seq2) + 1
14191425
if max_dist is None:

yt/loaders.py

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,7 @@
1010
import types
1111
import warnings
1212
from collections.abc import Mapping
13+
from difflib import get_close_matches
1314
from pathlib import Path
1415
from typing import TYPE_CHECKING, Any, Union, cast
1516
from urllib.parse import urlsplit
@@ -23,7 +24,6 @@
2324
)
2425
from yt._typing import AnyFieldKey, AxisOrder, FieldKey
2526
from yt.data_objects.static_output import Dataset
26-
from yt.funcs import levenshtein_distance
2727
from yt.sample_data.api import lookup_on_disk_data
2828
from yt.utilities.decompose import decompose_array, get_psize
2929
from yt.utilities.exceptions import (
@@ -1513,10 +1513,7 @@ def _get_sample_data(
15131513
known_names: list[str] = registry_table.dropna()["filename"].to_list()
15141514
if topdir not in known_names:
15151515
msg = f"'{topdir}' is not an available dataset."
1516-
lexical_distances: list[tuple[str, int]] = [
1517-
(name, levenshtein_distance(name, topdir)) for name in known_names
1518-
]
1519-
suggestions: list[str] = [name for name, dist in lexical_distances if dist < 4]
1516+
suggestions = get_close_matches(topdir, known_names)
15201517
if len(suggestions) == 1:
15211518
msg += f" Did you mean '{suggestions[0]}' ?"
15221519
elif suggestions:

yt/tests/test_funcs.py

Lines changed: 1 addition & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -3,13 +3,7 @@
33
from nose.tools import assert_raises
44
from numpy.testing import assert_equal
55

6-
from yt.funcs import (
7-
just_one,
8-
levenshtein_distance,
9-
simple_download_file,
10-
validate_axis,
11-
validate_center,
12-
)
6+
from yt.funcs import just_one, simple_download_file, validate_axis, validate_center
137
from yt.testing import fake_amr_ds
148
from yt.units import YTArray, YTQuantity
159

@@ -71,41 +65,6 @@ def test_just_one():
7165
assert jo == expected
7266

7367

74-
def test_levenshtein():
75-
assert_equal(levenshtein_distance("abcdef", "abcdef"), 0)
76-
77-
# Deletions / additions
78-
assert_equal(levenshtein_distance("abcdef", "abcde"), 1)
79-
assert_equal(levenshtein_distance("abcdef", "abcd"), 2)
80-
assert_equal(levenshtein_distance("abcdef", "abc"), 3)
81-
82-
assert_equal(levenshtein_distance("abcdf", "abcdef"), 1)
83-
assert_equal(levenshtein_distance("cdef", "abcdef"), 2)
84-
assert_equal(levenshtein_distance("bde", "abcdef"), 3)
85-
86-
# Substitutions
87-
assert_equal(levenshtein_distance("abcd", "abc_"), 1)
88-
assert_equal(levenshtein_distance("abcd", "ab__"), 2)
89-
assert_equal(levenshtein_distance("abcd", "a___"), 3)
90-
assert_equal(levenshtein_distance("abcd", "____"), 4)
91-
92-
# Deletion + Substitutions
93-
assert_equal(levenshtein_distance("abcd", "abc_z"), 2)
94-
assert_equal(levenshtein_distance("abcd", "ab__zz"), 4)
95-
assert_equal(levenshtein_distance("abcd", "a___zzz"), 6)
96-
assert_equal(levenshtein_distance("abcd", "____zzzz"), 8)
97-
98-
# Max distance
99-
assert_equal(levenshtein_distance("abcd", "", max_dist=0), 1)
100-
assert_equal(levenshtein_distance("abcd", "", max_dist=3), 4)
101-
assert_equal(levenshtein_distance("abcd", "", max_dist=10), 4)
102-
assert_equal(levenshtein_distance("abcd", "", max_dist=1), 2)
103-
assert_equal(levenshtein_distance("abcd", "a", max_dist=2), 3)
104-
assert_equal(levenshtein_distance("abcd", "ad", max_dist=2), 2)
105-
assert_equal(levenshtein_distance("abcd", "abd", max_dist=2), 1)
106-
assert_equal(levenshtein_distance("abcd", "abcd", max_dist=2), 0)
107-
108-
10968
def test_simple_download_file():
11069
fn = simple_download_file("http://yt-project.org", "simple-download-file")
11170
try:

yt/tests/test_funcs_pytest.py

Lines changed: 49 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,49 @@
1+
import pytest
2+
3+
from yt.funcs import levenshtein_distance
4+
5+
6+
@pytest.mark.parametrize(
7+
"a, b, expected",
8+
[
9+
("abcdef", "abcdef", 0),
10+
# Deletions / additions
11+
("abcdef", "abcde", 1),
12+
("abcdef", "abcd", 2),
13+
("abcdef", "abc", 3),
14+
("abcdf", "abcdef", 1),
15+
("cdef", "abcdef", 2),
16+
("bde", "abcdef", 3),
17+
# Substitutions
18+
("abcd", "abc_", 1),
19+
("abcd", "ab__", 2),
20+
("abcd", "a___", 3),
21+
("abcd", "____", 4),
22+
# Deletion + Substitutions
23+
("abcd", "abc_z", 2),
24+
("abcd", "ab__zz", 4),
25+
("abcd", "a___zzz", 6),
26+
("abcd", "____zzzz", 8),
27+
],
28+
)
29+
def test_levenshtein(a, b, expected):
30+
with pytest.deprecated_call():
31+
assert levenshtein_distance(a, b) == expected
32+
33+
34+
@pytest.mark.parametrize(
35+
"a, b, max_dist, expected",
36+
[
37+
("abcd", "", 0, 1),
38+
("abcd", "", 3, 4),
39+
("abcd", "", 10, 4),
40+
("abcd", "", 1, 2),
41+
("abcd", "a", 2, 3),
42+
("abcd", "ad", 2, 2),
43+
("abcd", "abd", 2, 1),
44+
("abcd", "abcd", 2, 0),
45+
],
46+
)
47+
def test_levenshtein_with_max_dist(a, b, max_dist, expected):
48+
with pytest.deprecated_call():
49+
assert levenshtein_distance(a, b, max_dist=max_dist) == expected

yt/utilities/exceptions.py

Lines changed: 17 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
# We don't need to import 'exceptions'
22
import os.path
3+
from difflib import get_close_matches
34

45
from unyt.exceptions import UnitOperationError
56

@@ -97,51 +98,36 @@ def __init__(self, field, ds):
9798
self.ds = ds
9899

99100
def _get_suggestions(self) -> list[FieldKey]:
100-
from yt.funcs import levenshtein_distance
101-
102101
field = self.field
103102
ds = self.ds
104103

105-
suggestions = {}
106104
if not isinstance(field, tuple):
107105
ftype, fname = None, field
108106
elif field[1] is None:
109107
ftype, fname = None, field[0]
110108
else:
111109
ftype, fname = field
112110

113-
# Limit the suggestions to a distance of 3 (at most 3 edits)
114-
# This is very arbitrary, but is picked so that...
115-
# - small typos lead to meaningful suggestions (e.g. `densty` -> `density`)
116-
# - we don't suggest unrelated things (e.g. `pressure` -> `density` has a distance
117-
# of 6, we definitely do not want it)
118-
# A threshold of 3 seems like a good middle point.
119-
max_distance = 3
111+
suggestions: list[FieldKey] = []
112+
if ftype is not None:
113+
fields_str: dict[str, FieldKey] = {
114+
str(df).lower(): df for df in ds.derived_field_list
115+
}
116+
field_str = str(field).lower()
117+
suggestions.extend(
118+
fields_str[k] for k in get_close_matches(field_str, fields_str.keys())
119+
)
120120

121-
# Suggest (ftype, fname), with alternative ftype
121+
# Ensure we suggest (ftype, fname), with alternative ftype
122122
for ft, fn in ds.derived_field_list:
123-
if fn.lower() == fname.lower() and (
124-
ftype is None or ft.lower() != ftype.lower()
123+
if (
124+
fn.lower() == fname.lower()
125+
and (ftype is None or ft.lower() != ftype.lower())
126+
and (ft, fn) not in suggestions
125127
):
126-
suggestions[ft, fn] = 0
127-
128-
if ftype is not None:
129-
# Suggest close matches using levenshtein distance
130-
fields_str = {_: str(_).lower() for _ in ds.derived_field_list}
131-
field_str = str(field).lower()
128+
suggestions.insert(0, (ft, fn))
132129

133-
for (ft, fn), fs in fields_str.items():
134-
distance = levenshtein_distance(field_str, fs, max_dist=max_distance)
135-
if distance < max_distance:
136-
if (ft, fn) in suggestions:
137-
continue
138-
suggestions[ft, fn] = distance
139-
140-
# Return suggestions sorted by increasing distance (first are most likely)
141-
return [
142-
(ft, fn)
143-
for (ft, fn), distance in sorted(suggestions.items(), key=lambda v: v[1])
144-
]
130+
return suggestions
145131

146132
def __str__(self):
147133
msg = f"Could not find field {self.field!r} in {self.ds}."

0 commit comments

Comments
 (0)