Skip to content

Commit d5bc7e0

Browse files
Compute frequency scores on the fly while dictionaries are gradually loaded
Signed-off-by: moonbeamcelery <moonbeamcelery@proton.me>
1 parent 1c35899 commit d5bc7e0

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

nlpcore/src/latin/dictionary.cppm

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,8 @@ export class LatinDictionary : public Dictionary {
193193
std::vector<fl::str::UniString> ngram;
194194
std::map<WordIdT, fl::str::UniString> id_to_words_map;
195195
WordIdT current_word_id = 1;
196+
total_scores_.clear();
197+
vocab_sizes_.clear();
196198

197199
while (std::getline(istream, line)) {
198200
fl::str::trim(line);
@@ -236,6 +238,10 @@ export class LatinDictionary : public Dictionary {
236238
}
237239
// Assign word to word ID map
238240
id_to_words_map[current_word_id++] = std::move(word);
241+
// compute frequency scores
242+
auto type = EntryType::word();
243+
total_scores_[type] += properties->absolute_score;
244+
vocab_sizes_[type]++;
239245
} else if (section == LatinDictionarySection::NGRAMS) {
240246
fl::str::split(line, FLDIC_SEPARATOR, line_components);
241247
if (line_components.size() < 2) {
@@ -251,6 +257,9 @@ export class LatinDictionary : public Dictionary {
251257
auto node = insertNgram(ngram);
252258
auto properties = node->value(dict_id_)->ngramProperties();
253259
properties->absolute_score = std::stoll(line_components[1]);
260+
auto type = EntryType::ngram(line_components.size());
261+
total_scores_[type] += properties->absolute_score;
262+
vocab_sizes_[type]++;
254263
} else if (section == LatinDictionarySection::SHORTCUTS) {
255264
fl::str::split(line, FLDIC_SEPARATOR, line_components);
256265
if (line_components.size() < 2) {
@@ -262,12 +271,14 @@ export class LatinDictionary : public Dictionary {
262271
auto properties = node->valueOrCreate(dict_id_)->shortcutPropertiesOrCreate();
263272
properties->absolute_score = 1;
264273
properties->shortcut_phrase = line_components[1];
274+
auto type = EntryType::shortcut();
275+
total_scores_[type] += properties->absolute_score;
276+
vocab_sizes_[type]++;
265277
}
266278
}
267279

268-
// TODO: do this directly when reading the words/ngrams and avoid this heavy op
269-
// TODO: this is necessary for the performance to be good during gradual loading
270-
recalculateAllFrequencyScores();
280+
// Already done incrementally: total_scores_ and vocab_sizes_ are updated per entry while reading above.
281+
// recalculateAllFrequencyScores();
271282
}
272283

273284
void serializeContent(std::ostream& ostream) override {

0 commit comments

Comments
 (0)