Skip to content

Commit be28298

Browse files
committed
add bounded version of restricted edit distance
1 parent f19e792 commit be28298

File tree

9 files changed

+289
-58
lines changed

9 files changed

+289
-58
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ Edits::Levenshtein.distance "acer", "earn"
4242
# Max distance
4343
Edits::Levenshtein.distance_with_max "iota", "atom", 2
4444
# => 2
45-
Edits::Levenshtein.most_similar "atom", %w[tram atlas rota racer]
46-
# => "atlas"
45+
Edits::Levenshtein.most_similar "atom", %w[tree rota toes racer]
46+
# => "toes"
4747
```
4848

4949
### Restricted Edit (Optimal Alignment)
@@ -59,6 +59,12 @@ Edits::RestrictedEdit.distance "iota", "atom"
5959
# => 3
6060
Edits::RestrictedEdit.distance "acer", "earn"
6161
# => 4
62+
63+
# Max distance
64+
Edits::RestrictedEdit.distance_with_max "iota", "atom", 2
65+
# => 2
66+
Edits::RestrictedEdit.most_similar "atom", %w[tree rota toes racer]
67+
# => "rota"
6268
```
6369

6470
### Damerau-Levenshtein

lib/edits.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
require "edits/version"
44

5+
require "edits/compare"
56
require "edits/damerau_levenshtein"
67
require "edits/hamming"
78
require "edits/jaro"

lib/edits/compare.rb

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# frozen_string_literal: true
2+
3+
module Edits
  # Comparison helpers.
  #
  # Intended to be mixed in with `extend` by a module that itself provides
  # `distance(seq1, seq2)` and `distance_with_max(seq1, seq2, max)`.
  module Compare
    # Given a prototype string and a collection of strings, determines which
    # string is most similar to the prototype.
    #
    # `most_similar("foo", strings)` is functionally equivalent to
    # `strings.min_by { |s| distance("foo", s) }`, leveraging
    # {.distance_with_max} so that each candidate is only evaluated up to the
    # best distance found so far.
    #
    # When several strings share the lowest distance, the first one
    # encountered is returned.
    #
    # @example Using Levenshtein distances
    #   Edits::Levenshtein.most_similar("atom", %w[tree rota toes racer])
    #   # => "toes"
    # @param prototype [String]
    # @param strings [<String>] any collection responding to `each`
    # @return [String, nil] most similar string, or nil for empty collection
    def most_similar(prototype, strings)
      min_s = nil
      min_d = nil

      strings.each do |s|
        # The first candidate needs its true distance; every later one only
        # has to be measured up to the current best.
        d = min_d ? distance_with_max(prototype, s, min_d) : distance(prototype, s)

        # Strict comparison keeps the earliest candidate on ties.
        next unless min_d.nil? || d < min_d

        min_s = s
        min_d = d

        # Nothing can beat an exact match, so stop looking.
        break if min_d.zero?
      end

      min_s
    end
  end
end

lib/edits/levenshtein.rb

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ module Edits
88
# * Deletion
99
# * Substitution
1010
module Levenshtein
11+
extend Compare
12+
1113
# Calculate the Levenshtein (edit) distance of two sequences.
1214
#
1315
# @note A true distance metric, satisfies triangle inequality.
@@ -125,35 +127,5 @@ def self.distance_with_max(seq1, seq2, max)
125127

126128
last_row[cols] > max ? max : last_row[cols]
127129
end
128-
129-
# Given a prototype string and an array of strings, determines which
130-
# string is most similar to the prototype.
131-
#
132-
# `Levenshtein.most_similar("foo", strings)` is functionally equivalent to
133-
# `strings.min_by { |s| Levenshtein.distance("foo", s) }`, leveraging
134-
# {.distance_with_max}.
135-
#
136-
# @example
137-
# Edits::Levenshtein.most_similar("atom", %w[tram atlas rota racer])
138-
# # => "atlas"
139-
# @param prototype [String]
140-
# @param strings [<String>]
141-
# @return [String, nil] most similar string, or nil for empty array
142-
def self.most_similar(prototype, strings)
143-
return nil if strings.empty?
144-
min_s = strings[0]
145-
min_d = distance(prototype, min_s)
146-
147-
strings[1..-1].each do |s|
148-
return min_s if min_d.zero?
149-
d = distance_with_max(prototype, s, min_d)
150-
if d < min_d
151-
min_d = d
152-
min_s = s
153-
end
154-
end
155-
156-
min_s
157-
end
158130
end
159131
end

lib/edits/restricted_edit.rb

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ module Edits
1313
# This variant is restricted by the condition that no sub-string is edited
1414
# more than once.
1515
module RestrictedEdit
16+
extend Compare
17+
1618
# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
1719
# of two sequences.
1820
#
@@ -82,5 +84,84 @@ def self.distance(seq1, seq2)
8284

8385
curr_row[cols]
8486
end
87+
88+
# Calculate the Restricted Damerau-Levenshtein distance (Optimal Alignment)
# of two sequences, bounded by a maximum value.
#
# Only cells of the cost matrix within `max` columns of the current row
# index are evaluated, and the result never exceeds `max`.
#
# @example
#   Edits::RestrictedEdit.distance("cloud", "crayon")
#   # => 5
#   Edits::RestrictedEdit.distance_with_max("cloud", "crayon", 2)
#   # => 2
# @param seq1 [String, Array]
# @param seq2 [String, Array]
# @param max [Integer] maximum distance
# @return [Integer] the distance, or `max` when the distance is `max` or more
def self.distance_with_max(seq1, seq2, max)
  # make seq1 the shorter sequence, so that rows <= cols from here on
  seq1, seq2 = seq2, seq1 if seq1.length > seq2.length

  rows = seq1.length
  cols = seq2.length

  # rows <= cols, so this single guard covers every empty-input case
  return cols > max ? max : cols if rows.zero?

  # the difference in length is a lower bound on the distance
  return max if (cols - rows) >= max

  # array of codepoints outperforms String
  seq1 = seq1.codepoints if seq1.is_a? String
  seq2 = seq2.codepoints if seq2.is_a? String

  # 'infinite' edit distance for padding cost matrix.
  # Can be any value > max[rows, cols]
  inf = cols + 1

  # retain previous two rows of cost matrix,
  # padded with "inf" as matrix is not fully evaluated
  lastlast_row = Array.new(inf, inf)
  last_row = Array.new(inf, inf)
  curr_row = 0.upto(cols).to_a

  rows.times do |row|
    # rotate row arrays
    curr_row, last_row, lastlast_row = lastlast_row, curr_row, last_row

    # Ukkonen cut-off
    min_col = row > max ? row - max : 0
    max_col = row + max
    max_col = cols - 1 if max_col > cols - 1

    # left edge of the band: a real cost in column 0, padding otherwise
    curr_row[min_col] = min_col.zero? ? row + 1 : inf
    seq1_item = seq1[row]

    # column in this row lying on the diagonal that ends in the final cell
    diagonal = cols - rows + row

    min_col.upto(max_col) do |col|
      # give up once the cost on that diagonal has already reached max
      return max if diagonal == col && last_row[col] >= max

      sub_cost = seq1_item == seq2[col] ? 0 : 1
      is_swap = sub_cost.positive? &&
                row.positive? && col.positive? &&
                seq1_item == seq2[col - 1] &&
                seq1[row - 1] == seq2[col]

      # | Xt |    |    |
      # |    | Xs | Xd |
      # |    | Xi | ?  |
      # substitution, deletion, insertion, transposition
      cost = [
        last_row[col] + sub_cost,
        last_row[col + 1] + 1,
        curr_row[col] + 1
      ].min

      if is_swap
        swap = lastlast_row[col - 1] + 1
        cost = swap if swap < cost
      end

      curr_row[col + 1] = cost
    end
  end

  curr_row[cols] > max ? max : curr_row[cols]
end
85166
end
86167
end

spec/edits/damerau_levenshtein_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
include_examples "levenshtein"
1111

1212
[
13-
# swaps
13+
# simple transpositions
1414
["a cat", "an act", 2],
1515
["abc", "acb", 1],
1616
["abc", "bac", 1],

spec/edits/levenshtein_spec.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
RSpec.describe Edits::Levenshtein do
77
cases = [
8-
# swaps
8+
# simple transpositions
99
["a cat", "an act", 3],
1010
["abc", "acb", 2],
1111
["abc", "bac", 2],

spec/edits/restricted_edit_spec.rb

Lines changed: 99 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -4,34 +4,36 @@
44
require "edits/levenshtein_shared"
55

66
RSpec.describe Edits::RestrictedEdit do
7+
cases = [
8+
# simple transpositions
9+
["a cat", "an act", 2],
10+
["abc", "acb", 1],
11+
["abc", "bac", 1],
12+
["abcdef", "abcdfe", 1],
13+
["abcdefghij", "acbdegfhji", 3],
14+
["acre", "acer", 1],
15+
["art", "ran", 2],
16+
["caned", "acned", 1],
17+
["iota", "atom", 3],
18+
["minion", "noir", 4],
19+
20+
# complex transpositions
21+
["a cat", "a tc", 3],
22+
["a cat", "an abct", 4],
23+
["acer", "earn", 4],
24+
["craned", "read", 4],
25+
["information", "informant", 4],
26+
["raced", "dear", 5],
27+
["roam", "art", 4],
28+
["tram", "rota", 4]
29+
]
30+
731
describe ".distance" do
832
subject { described_class.distance a, b }
933

1034
include_examples "levenshtein"
1135

12-
[
13-
# swaps
14-
["a cat", "an act", 2],
15-
["abc", "acb", 1],
16-
["abc", "bac", 1],
17-
["abcdef", "abcdfe", 1],
18-
["abcdefghij", "acbdegfhji", 3],
19-
["acre", "acer", 1],
20-
["art", "ran", 2],
21-
["caned", "acned", 1],
22-
["iota", "atom", 3],
23-
["minion", "noir", 4],
24-
25-
# complex transpositions
26-
["a cat", "a tc", 3],
27-
["a cat", "an abct", 4],
28-
["acer", "earn", 4],
29-
["craned", "read", 4],
30-
["information", "informant", 4],
31-
["raced", "dear", 5],
32-
["roam", "art", 4],
33-
["tram", "rota", 4]
34-
].each do |(a, b, distance)|
36+
cases.each do |(a, b, distance)|
3537
context "with '#{a}', '#{b}'" do
3638
let(:a) { a }
3739
let(:b) { b }
@@ -40,4 +42,78 @@
4042
end
4143
end
4244
end
45+
46+
describe ".distance_with_max" do
47+
subject { described_class.distance_with_max a, b, max }
48+
49+
context "when max is 100" do
50+
let(:max) { 100 }
51+
52+
include_examples "levenshtein"
53+
54+
cases.each do |(a, b, distance)|
55+
context "with '#{a}', '#{b}'" do
56+
let(:a) { a }
57+
let(:b) { b }
58+
59+
it { is_expected.to eq distance }
60+
end
61+
end
62+
end
63+
64+
context "when max is 4" do
65+
let(:max) { 4 }
66+
67+
cases.each do |(a, b, distance)|
68+
context "with '#{a}', '#{b}'" do
69+
let(:a) { a }
70+
let(:b) { b }
71+
72+
it { is_expected.to eq(distance > max ? max : distance) }
73+
end
74+
end
75+
76+
context "with '', 'abcdfe'" do
77+
let(:a) { "" }
78+
let(:b) { "abcdfe" }
79+
80+
it { is_expected.to eq max }
81+
end
82+
83+
context "with 'abcdfe', ''" do
84+
let(:a) { "abcdfe" }
85+
let(:b) { "" }
86+
87+
it { is_expected.to eq max }
88+
end
89+
end
90+
end
91+
92+
describe ".most_similar" do
93+
let(:prototype) { "atom" }
94+
95+
subject { described_class.most_similar prototype, words }
96+
97+
context "with empty array" do
98+
let(:words) { [] }
99+
100+
it { is_expected.to be_nil }
101+
end
102+
103+
context "when a single word has the lowest distance" do
104+
let(:words) { %w[light at atlas beer iota train] }
105+
106+
it "returns the word with lowest distance from prototype" do
107+
expect(subject).to eq "at"
108+
end
109+
end
110+
111+
context "when two words share the lowest distance" do
112+
let(:words) { %w[light beer iota train] }
113+
114+
it "returns the first with lowest distance from prototype" do
115+
expect(subject).to eq "iota"
116+
end
117+
end
118+
end
43119
end

0 commit comments

Comments
 (0)