Skip to content

Commit c2552a4

Browse files
authored
Merge pull request #187 from bab2min/dev/fix_sb_errors
Fix SB errors & Optimize path search
2 parents d3503bd + d8740fc commit c2552a4

13 files changed

+1026
-335
lines changed

Diff for: ModelGenerator/morphemes.txt

+101-97
Large diffs are not rendered by default.

Diff for: include/kiwi/SubstringExtractor.h

+45
Original file line numberDiff line numberDiff line change
@@ -62,4 +62,49 @@ namespace kiwi
6262
size_t cluster(size_t i) const;
6363
float score(size_t i) const;
6464
};
65+
66+
class Kiwi;
67+
68+
// Collects text via addText()/addTexts() and returns n-gram Candidate records from extract().
// Only the declaration is visible here; the member definitions live elsewhere.
// NOTE(review): the roles of the private members below are inferred from their names — confirm against the implementation.
class NgramExtractor
69+
{
70+
// Pointer to a const Kiwi instance; null by default (set presumably by the Kiwi-taking constructor).
const Kiwi* kiwi = nullptr;
71+
// Flag mirroring the constructor argument of the same name; defaults to true.
bool gatherLmScore = true;
72+
// String -> index table and (presumably) its inverse index -> string table.
UnorderedMap<std::u16string, size_t> morph2id;
73+
Vector<std::u16string> id2morph;
74+
// Flat 16-bit storage buffers; presumably the encoded token ids and per-token scores — TODO confirm.
Vector<uint16_t> buf;
75+
Vector<int16_t> scores;
76+
// Presumably offsets delimiting each added document within the flat buffers — TODO confirm.
Vector<size_t> docBoundaries;
77+
Vector<uint32_t> positions;
78+
Vector<std::u16string> rawDocs;
79+
80+
// Private helper taking a list of TokenInfo; returns a size_t (meaning not visible in this declaration).
size_t addTokens(const std::vector<TokenInfo>& tokens);
81+
82+
public:
83+
// Result record returned by extract(). All numeric fields default to zero.
// NOTE(review): field semantics (count, document frequency, NPMI, branching measures, LM score)
// are suggested by the names only — verify against the implementation.
struct Candidate
84+
{
85+
std::u16string text;
86+
std::vector<std::u16string> tokens;
87+
std::vector<float> tokenScores;
88+
size_t cnt = 0;
89+
size_t df = 0;
90+
float score = 0;
91+
float npmi = 0;
92+
float leftBranch = 0;
93+
float rightBranch = 0;
94+
float lmScore = 0;
95+
};
96+
97+
// All five special member functions are user-declared (defined out of line); moves are noexcept.
NgramExtractor();
98+
// Constructs from a Kiwi reference; gatherLmScore defaults to true.
NgramExtractor(const Kiwi& kiwi, bool gatherLmScore = true);
99+
NgramExtractor(const NgramExtractor&);
100+
NgramExtractor(NgramExtractor&&) noexcept;
101+
NgramExtractor& operator=(const NgramExtractor&);
102+
NgramExtractor& operator=(NgramExtractor&&) noexcept;
103+
~NgramExtractor();
104+
105+
// Add a single UTF-16 text, or texts supplied by a U16Reader. Both return a size_t
// (presumably a count of items added — TODO confirm).
size_t addText(const std::u16string& text);
106+
size_t addTexts(const U16Reader& reader);
107+
108+
// Returns a vector of Candidate. Declared const, so it does not modify the extractor's non-mutable state.
// Defaults: maxCandidates = 1000, minCnt = 10, maxLength = 5, minScore = 1e-3, numWorkers = 1.
std::vector<Candidate> extract(size_t maxCandidates = 1000, size_t minCnt = 10, size_t maxLength = 5, float minScore = 1e-3, size_t numWorkers = 1) const;
109+
};
65110
}

Diff for: src/KTrie.cpp

+1-11
Original file line numberDiff line numberDiff line change
@@ -972,16 +972,6 @@ size_t kiwi::splitByTrie(
972972
if (curNode->fail())
973973
{
974974
curNode = curNode->fail();
975-
for (auto submatcher = curNode; submatcher; submatcher = submatcher->fail())
976-
{
977-
const Form* cand = submatcher->val(trie);
978-
if (!cand) break;
979-
else if (!trie.hasSubmatch(cand))
980-
{
981-
zCodaFollowable = zCodaFollowable || getZCodaAppendable<typoTolerant>(cand, formBase);
982-
if (!insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces)) break;
983-
}
984-
}
985975
nextNode = curNode->template nextOpt<arch>(trie, c);
986976
}
987977
else
@@ -1161,7 +1151,7 @@ size_t kiwi::splitByTrie(
11611151
const Form* cand = node.second->val(trie);
11621152
if (cand && !trie.hasSubmatch(cand))
11631153
{
1164-
insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * node.first, node.first);
1154+
insertCandidates(candidates, cand, formBase, typoPtrs, str, nonSpaces, 0, 0, lengtheningTypoCost * (3 + node.first), node.first);
11651155
}
11661156
}
11671157
}

Diff for: src/Kiwi.cpp

+19-2
Original file line numberDiff line numberDiff line change
@@ -775,8 +775,25 @@ namespace kiwi
775775
spStateCnt[r.curState]++;
776776
validTarget++;
777777
}
778-
ret.erase(ret.begin() + validTarget, ret.end());
779-
spStatesByRet.erase(spStatesByRet.begin() + validTarget, spStatesByRet.end());
778+
Vector<size_t> idx(validTarget);
779+
iota(idx.begin(), idx.end(), 0);
780+
sort(idx.begin(), idx.end(), [&](size_t a, size_t b) { return ret[a].second > ret[b].second; });
781+
782+
Vector<TokenResult> sortedRet;
783+
Vector<SpecialState> sortedSpStatesByRet;
784+
const size_t maxCands = min(topN * 2, validTarget);
785+
for (size_t i = 0; i < maxCands; ++i)
786+
{
787+
sortedRet.emplace_back(move(ret[idx[i]]));
788+
sortedSpStatesByRet.emplace_back(spStatesByRet[idx[i]]);
789+
}
790+
ret.clear();
791+
spStatesByRet.clear();
792+
for (size_t i = 0; i < maxCands; ++i)
793+
{
794+
ret.emplace_back(move(sortedRet[i]));
795+
spStatesByRet.emplace_back(sortedSpStatesByRet[i]);
796+
}
780797
}
781798

782799
inline void makePretokenizedSpanGroup(

0 commit comments

Comments
 (0)