@@ -21,70 +21,66 @@ def self.distance(seq1, seq2)
2121 seq1 , seq2 = seq2 , seq1 if seq1 . length > seq2 . length
2222
2323 # array of codepoints outperforms String
24- seq1 = seq1 . codepoints if seq1 . is_a? String
25- seq2 = seq2 . codepoints if seq2 . is_a? String
24+ if seq1 . is_a? ( String ) && seq2 . is_a? ( String )
25+ seq1 = seq1 . codepoints
26+ seq2 = seq2 . codepoints
27+ end
2628
2729 rows = seq1 . length
2830 cols = seq2 . length
2931 return cols if rows == 0
3032 return rows if cols == 0
3133
32- # 'infinite' edit distance for padding cost matrix.
33- # Can be any value > max[rows, cols]
34- inf = rows + cols
35-
36- # Initialize first two rows of cost matrix.
37- # The full initial state where cols=3, rows=2 (inf=5) would be:
38- # [[5, 5, 5, 5, 5],
39- # [5, 0, 1, 2, 3],
40- # [5, 1, 0, 0, 0],
41- # [5, 2, 0, 0, 0]]
42- matrix = [ Array . new ( cols + 2 , inf ) ]
43- matrix << 0 . upto ( cols ) . to_a . unshift ( inf )
34+ # 'infinite' edit distance to pad cost matrix.
35+ # Any value > max[rows, cols]
36+ inf = cols + 1
4437
4538 # element => last row seen
46- item_history = Hash . new ( 0 )
39+ row_history = Hash . new ( 0 )
4740
48- 1 . upto ( rows ) do |row |
49- # generate next row of cost matrix
50- new_row = Array . new ( cols + 2 , 0 )
51- new_row [ 0 ] = inf
52- new_row [ 1 ] = row
53- matrix << new_row
41+ # initialize alphabet-keyed cost matrix
42+ matrix = { }
43+ curr_row = 0 . upto ( cols ) . to_a
5444
55- last_match_col = 0
56- seq1_item = seq1 [ row - 1 ]
45+ rows . times do |row |
46+ seq1_item = seq1 [ row ]
47+ match_col = 0
5748
58- 1 . upto ( cols ) do |col |
59- seq2_item = seq2 [ col - 1 ]
60- last_match_row = item_history [ seq2_item ]
49+ # rotate row arrays & generate next
50+ matrix [ seq1_item ] = last_row = curr_row
51+ curr_row = Array . new ( cols + 1 , inf )
52+ curr_row [ 0 ] = row + 1
6153
54+ cols . times do |col |
55+ seq2_item = seq2 [ col ]
6256 sub_cost = seq1_item == seq2_item ? 0 : 1
6357
64- transposition = 1 + matrix [ last_match_row ] [ last_match_col ]
65- transposition += row - last_match_row - 1
66- transposition += col - last_match_col - 1
67-
68- # TODO: do insertion/deletion need to be considered when
69- # seq1_item == seq2_item ?
70- #
71- # substitution, deletion, insertion, transposition
58+ # | Xs | Xd |
59+ # | Xi | ? |
60+ # substitution, deletion, insertion
7261 cost = [
73- matrix [ row ] [ col ] + sub_cost ,
74- matrix [ row ] [ col + 1 ] + 1 ,
75- matrix [ row + 1 ] [ col ] + 1 ,
76- transposition
62+ last_row [ col ] + sub_cost ,
63+ last_row [ col + 1 ] + 1 ,
64+ curr_row [ col ] + 1
7765 ] . min
7866
79- matrix [ row + 1 ] [ col + 1 ] = cost
80-
81- last_match_col = col if sub_cost == 0
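67+ # transposition cost: only when matrix holds a row for seq2_item; a missing entry is treated as infinite cost and skipped
68+ # NOTE(review): match_col is 0 both before any match in this row and after a match at col 0 - confirm m[0] is intended in the no-match case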
69+ if sub_cost > 0 && row > 0 && ( m = matrix [ seq2_item ] )
70+ transpose = 1 + m [ match_col ] \
71+ + ( row - row_history [ seq2_item ] - 1 ) \
72+ + ( col - match_col - 1 )
73+ cost = transpose if transpose < cost
74+ end
75+
76+ match_col = col if sub_cost == 0
77+ curr_row [ col + 1 ] = cost
8278 end
8379
84- item_history [ seq1_item ] = row
80+ row_history [ seq1_item ] = row
8581 end
8682
87- matrix [ rows + 1 ] [ cols + 1 ]
83+ curr_row [ cols ]
8884 end
8985 end
9086end
0 commit comments