Skip to content

Commit 09766c3

Browse files
committed
fix regression in median/median_improve
1 parent d3396a5 commit 09766c3

File tree

5 files changed

+17
-19
lines changed

5 files changed

+17
-19
lines changed

HISTORY.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@
55
- implement all Python wrappers mostly with cython
66
- replace usage of deprecated Python APIs
77

8+
#### Fixed
9+
- fix regression in the behavior of `median` and `median_improve`
10+
811
### v0.18.2
912
#### Changed
1013
- Allow installation from system installed versions of `rapidfuzz-cpp`

docs/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
author = 'Max Bachmann'
2323

2424
# The full version, including alpha/beta/rc tags
25-
release = '0.18.2'
25+
release = '0.19.0'
2626

2727
# -- General configuration ---------------------------------------------------
2828

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
setup(
77
name="Levenshtein",
8-
version="0.18.2",
8+
version="0.19.0",
99
url="https://github.com/maxbachmann/Levenshtein",
1010
author="Max Bachmann",
1111
install_requires=["rapidfuzz >= 2.0.1, < 3.0.0"],

src/Levenshtein/Levenshtein-c/_levenshtein.hpp

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
#include <memory>
99
#include <vector>
1010
#include <string>
11-
#include <unordered_set>
11+
#include <set>
1212
#include <rapidfuzz/distance/Indel.hpp>
1313
#include <rapidfuzz/distance/Levenshtein.hpp>
1414

@@ -190,7 +190,7 @@ static inline std::vector<uint32_t> make_symlist(const std::vector<RF_String>& s
190190
return symlist;
191191
}
192192

193-
std::unordered_set<uint32_t> symmap;
193+
std::set<uint32_t> symmap;
194194
for (const auto& string : strings) {
195195
visit(string, [&](auto first1, auto last1){
196196
for (; first1 != last1; ++first1) {
@@ -351,20 +351,16 @@ static inline double finish_distance_computations(size_t len1, uint32_t* string1
351351
const std::vector<double>& weights, std::vector<std::unique_ptr<size_t[]>>& rows,
352352
std::unique_ptr<size_t[]>& row)
353353
{
354-
size_t *end;
355-
size_t i, j;
356-
size_t offset; /* row[0]; offset + len1 give together real len of string1 */
357354
double distsum = 0.0; /* sum of distances */
358-
359355
/* catch trivial case */
360356
if (len1 == 0) {
361-
for (j = 0; j < strings.size(); j++)
357+
for (size_t j = 0; j < strings.size(); j++)
362358
distsum += (double)rows[j][strings[j].length]*weights[j];
363359
return distsum;
364360
}
365361

366362
/* iterate through the strings and sum the distances */
367-
for (j = 0; j < strings.size(); j++) {
363+
for (size_t j = 0; j < strings.size(); j++) {
368364
visit(strings[j], [&](auto first1, auto last1){
369365
size_t* rowi = rows[j].get(); /* current row */
370366
size_t leni = (size_t)std::distance(first1, last1); /* current length */
@@ -381,17 +377,18 @@ static inline double finish_distance_computations(size_t len1, uint32_t* string1
381377
distsum += (double)rowi[leni]*weights[j];
382378
return;
383379
}
384-
offset = rowi[0];
380+
/* offset is row[0]; offset + len1 together give the real length of string1 */
381+
size_t offset = rowi[0];
385382
if (leni == 0) {
386383
distsum += (double)(offset + len)*weights[j];
387384
return;
388385
}
389386

390387
/* complete the matrix */
391388
memcpy(row.get(), rowi, (leni + 1)*sizeof(size_t));
392-
end = row.get() + leni;
389+
size_t* end = row.get() + leni;
393390

394-
for (i = 1; i <= len; i++) {
391+
for (size_t i = 1; i <= len; i++) {
395392
size_t* p = row.get() + 1;
396393
const uint32_t char1 = string1[i - 1];
397394
auto char2p = first1;
@@ -447,18 +444,16 @@ static inline std::basic_string<uint32_t> lev_median_improve(const RF_String& st
447444
/* allocate and initialize per-string matrix rows and a common work buffer */
448445
std::vector<std::unique_ptr<size_t[]>> rows(strings.size());
449446
size_t maxlen = 0;
450-
for (const auto& str : strings) {
451-
maxlen = std::max(maxlen, (size_t)str.length);
452-
}
453-
454447
for (size_t i = 0; i < strings.size(); i++) {
455448
size_t leni = (size_t)strings[i].length;
449+
if (leni > maxlen)
450+
maxlen = leni;
456451
rows[i] = std::make_unique<size_t[]>(leni + 1);
457452
std::iota(rows[i].get(), rows[i].get() + leni + 1, 0);
458453
}
459454

460455
size_t stoplen = 2*maxlen + 1;
461-
auto row = std::make_unique<size_t[]>(stoplen + 1);
456+
auto row = std::make_unique<size_t[]>(stoplen + 2);
462457

463458
/* initialize median to given string */
464459
auto _median = std::make_unique<uint32_t[]>(stoplen + 1);

src/Levenshtein/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616

1717
__author__: str = "Max Bachmann"
1818
__license__: str = "GPL"
19-
__version__: str = "0.18.2"
19+
__version__: str = "0.19.0"
2020

2121
from rapidfuzz.distance.Levenshtein import distance
2222
from rapidfuzz.distance.Indel import normalized_similarity as ratio

0 commit comments

Comments
 (0)