Skip to content

Commit e4e0c11

Browse files
committed
prevents duplicates in simstring database
1 parent 021f914 commit e4e0c11

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

install.py

+6 -2
Original file line number | Diff line number | Diff line change
@@ -92,11 +92,15 @@ def parse_and_encode_ngrams(extracted_it, simstring_dir, cuisty_dir):
9292
mkdir(cuisty_dir)
9393

9494
ss_db = SimstringDBWriter(simstring_dir)
95-
9695
cuisty_db = CuiSemTypesDB(cuisty_dir)
9796

97+
simstring_terms = set()
98+
9899
for i, (term, cui, stys, preferred) in enumerate(extracted_it, start=1):
99-
ss_db.insert(term)
100+
if term not in simstring_terms:
101+
ss_db.insert(term)
102+
simstring_terms.add(term)
103+
100104
cuisty_db.insert(term, cui, stys, preferred)
101105

102106

0 commit comments

Comments
 (0)