Skip to content

Commit ee91e3c

Browse files
committed
Improve DamerauLevenshtein performance
1 parent 4206930 commit ee91e3c

File tree

1 file changed

+39
-43
lines changed

1 file changed

+39
-43
lines changed

lib/edits/damerau_levenshtein.rb

Lines changed: 39 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -21,70 +21,66 @@ def self.distance(seq1, seq2)
2121
seq1, seq2 = seq2, seq1 if seq1.length > seq2.length
2222

2323
# array of codepoints outperforms String
24-
seq1 = seq1.codepoints if seq1.is_a? String
25-
seq2 = seq2.codepoints if seq2.is_a? String
24+
if seq1.is_a?(String) && seq2.is_a?(String)
25+
seq1 = seq1.codepoints
26+
seq2 = seq2.codepoints
27+
end
2628

2729
rows = seq1.length
2830
cols = seq2.length
2931
return cols if rows == 0
3032
return rows if cols == 0
3133

32-
# 'infinite' edit distance for padding cost matrix.
33-
# Can be any value > max[rows, cols]
34-
inf = rows + cols
35-
36-
# Initialize first two rows of cost matrix.
37-
# The full initial state where cols=3, rows=2 (inf=5) would be:
38-
# [[5, 5, 5, 5, 5],
39-
# [5, 0, 1, 2, 3],
40-
# [5, 1, 0, 0, 0],
41-
# [5, 2, 0, 0, 0]]
42-
matrix = [Array.new(cols + 2, inf)]
43-
matrix << 0.upto(cols).to_a.unshift(inf)
34+
# 'infinite' edit distance to pad cost matrix.
35+
# Any value > max[rows, cols]
36+
inf = cols + 1
4437

4538
# element => last row seen
46-
item_history = Hash.new(0)
39+
row_history = Hash.new(0)
4740

48-
1.upto(rows) do |row|
49-
# generate next row of cost matrix
50-
new_row = Array.new(cols + 2, 0)
51-
new_row[0] = inf
52-
new_row[1] = row
53-
matrix << new_row
41+
# initialize alphabet-keyed cost matrix
42+
matrix = {}
43+
curr_row = 0.upto(cols).to_a
5444

55-
last_match_col = 0
56-
seq1_item = seq1[row - 1]
45+
rows.times do |row|
46+
seq1_item = seq1[row]
47+
match_col = 0
5748

58-
1.upto(cols) do |col|
59-
seq2_item = seq2[col - 1]
60-
last_match_row = item_history[seq2_item]
49+
# rotate row arrays & generate next
50+
matrix[seq1_item] = last_row = curr_row
51+
curr_row = Array.new(cols + 1, inf)
52+
curr_row[0] = row + 1
6153

54+
cols.times do |col|
55+
seq2_item = seq2[col]
6256
sub_cost = seq1_item == seq2_item ? 0 : 1
6357

64-
transposition = 1 + matrix[last_match_row][last_match_col]
65-
transposition += row - last_match_row - 1
66-
transposition += col - last_match_col - 1
67-
68-
# TODO: do insertion/deletion need to be considered when
69-
# seq1_item == seq2_item ?
70-
#
71-
# substitution, deletion, insertion, transposition
58+
# | Xs | Xd |
59+
# | Xi | ? |
60+
# substitution, deletion, insertion
7261
cost = [
73-
matrix[row][col] + sub_cost,
74-
matrix[row][col + 1] + 1,
75-
matrix[row + 1][col] + 1,
76-
transposition
62+
last_row[col] + sub_cost,
63+
last_row[col + 1] + 1,
64+
curr_row[col] + 1
7765
].min
7866

79-
matrix[row + 1][col + 1] = cost
80-
81-
last_match_col = col if sub_cost == 0
67+
# transposition cost
68+
# skip missed matrix lookup (inf cost)
69+
if sub_cost > 0 && row > 0 && (m = matrix[seq2_item])
70+
transpose = 1 + m[match_col] \
71+
+ (row - row_history[seq2_item] - 1) \
72+
+ (col - match_col - 1)
73+
cost = transpose if transpose < cost
74+
end
75+
76+
match_col = col if sub_cost == 0
77+
curr_row[col + 1] = cost
8278
end
8379

84-
item_history[seq1_item] = row
80+
row_history[seq1_item] = row
8581
end
8682

87-
matrix[rows + 1][cols + 1]
83+
curr_row[cols]
8884
end
8985
end
9086
end

0 commit comments

Comments
 (0)