-
Notifications
You must be signed in to change notification settings - Fork 22
Expand file tree
/
Copy pathberttokenizer.h
More file actions
64 lines (51 loc) · 1.95 KB
/
berttokenizer.h
File metadata and controls
64 lines (51 loc) · 1.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#ifndef BERTTOKENIZER_H
#define BERTTOKENIZER_H
// https://gist.github.com/luistung/ace4888cf5fd1bad07844021cb2c7ecf
#include <fstream>
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
// Lookup table keyed by wide string, yielding a size_t.
// Presumably token text -> token id; confirm against the vocabulary loader.
using Vocab = std::unordered_map<std::wstring, size_t>;
// Reverse-direction table: size_t key, wide-string value.
// Presumably token id -> token text; confirm against the vocabulary loader.
using InvVocab = std::unordered_map<size_t, std::wstring>;
// Tokenizer that accepts narrow (std::string) text and produces a list of
// wide-string tokens. Only declarations live in this header; the definitions
// are out-of-line, so any note below that goes beyond a signature is an
// assumption to be confirmed against the implementation.
class BasicTokenizer {
public:
    // doLowerCase: presumably stored in mDoLowerCase and used to enable
    // lower-casing while tokenizing -- confirm in the implementation.
    // Marked explicit so a bare bool cannot silently convert to a tokenizer.
    explicit BasicTokenizer(bool doLowerCase);

    // Splits `text` into tokens. Input is a narrow string, output tokens are
    // wide strings (the narrow-to-wide conversion happens in the definition).
    std::vector<std::wstring> tokenize(const std::string& text) const;

private:
    // --- Whole-string helpers: take a wide string, return a transformed copy
    // or a list of pieces. None of them mutate the tokenizer (all const).
    std::wstring cleanText(const std::wstring& text) const;

    // --- Single-character predicates.
    // NOTE(review): "isControol" looks like a misspelling of "isControl"; the
    // name is kept because the out-of-line definition must match it.
    bool isControol(const wchar_t& ch) const;
    bool isWhitespace(const wchar_t& ch) const;
    bool isPunctuation(const wchar_t& ch) const;
    bool isChineseChar(const wchar_t& ch) const;

    std::wstring tokenizeChineseChars(const std::wstring& text) const;

    bool isStripChar(const wchar_t& ch) const;
    std::wstring strip(const std::wstring& text) const;
    std::vector<std::wstring> split(const std::wstring& text) const;
    std::wstring runStripAccents(const std::wstring& text) const;
    std::vector<std::wstring> runSplitOnPunc(const std::wstring& text) const;

    // Lower-casing switch; presumably set from the constructor argument.
    bool mDoLowerCase;
};
// Tokenizer that works on wide strings and holds shared ownership of a Vocab.
// Only declarations live in this header; the definitions are out-of-line, so
// notes that go beyond a signature are assumptions to be confirmed.
class WordpieceTokenizer {
public:
    // vocab:                shared vocabulary table; taken by value, so the
    //                       caller's pointer is copied (shared ownership).
    // unkToken:             defaults to L"[UNK]"; presumably the token emitted
    //                       for input that cannot be matched -- confirm.
    // maxInputCharsPerWord: defaults to 200; presumably a per-word length
    //                       limit -- confirm how over-long words are handled.
    // Marked explicit: with the defaults this is callable with one argument,
    // and a std::shared_ptr<Vocab> should not silently convert to a tokenizer.
    explicit WordpieceTokenizer(std::shared_ptr<Vocab> vocab,
                                const std::wstring& unkToken = L"[UNK]",
                                size_t maxInputCharsPerWord = 200);

    // Splits `text` into a list of wide-string tokens.
    std::vector<std::wstring> tokenize(const std::wstring& text) const;

private:
    std::shared_ptr<Vocab> mVocab;   // shared vocabulary table
    std::wstring mUnkToken;          // presumably a copy of the unkToken argument
    size_t mMaxInputCharsPerWord;    // presumably a copy of maxInputCharsPerWord
};
// Front-end tokenizer that owns a BasicTokenizer, a WordpieceTokenizer and the
// vocabulary tables. Only declarations live in this header; the definitions
// are out-of-line, so notes that go beyond a signature are assumptions to be
// confirmed against the implementation.
class FullTokenizer {
public:
    // vocabFile:   path-like string; presumably the vocabulary file that is
    //              read to fill mVocab / mInvVocab -- confirm.
    // doLowerCase: defaults to true; presumably forwarded to the
    //              BasicTokenizer member -- confirm.
    // Marked explicit: with the default this is callable with one argument,
    // and a std::string should not silently convert to a tokenizer.
    explicit FullTokenizer(const std::string& vocabFile, bool doLowerCase = true);

    // Splits narrow `text` into a list of wide-string tokens.
    std::vector<std::wstring> tokenize(const std::string& text) const;

    // Maps a list of wide-string tokens to a list of size_t values.
    // NOTE(review): the parameter is named `text` but holds tokens; behavior
    // for tokens missing from the vocabulary is not visible here -- confirm.
    std::vector<size_t> convertTokensToIds(const std::vector<std::wstring>& text) const;

private:
    // Members are initialized in declaration order. mVocab is declared before
    // mWordpieceTokenizer, whose constructor takes a std::shared_ptr<Vocab>
    // (presumably this one) -- do not reorder.
    std::shared_ptr<Vocab> mVocab;
    InvVocab mInvVocab;
    std::string mVocabFile;
    bool mDoLowerCase;
    BasicTokenizer mBasicTokenizer;
    WordpieceTokenizer mWordpieceTokenizer;
};
#endif // BERTTOKENIZER_H