1313
1414#include " lldb/ValueObject/DILLexer.h"
1515#include " clang/Basic/CharInfo.h"
16- // #include "llvm/ADT/StringMap.h"
1716#include " llvm/ADT/StringSwitch.h"
1817#include " llvm/Support/ConvertUTF.h"
1918#include " llvm/Support/Unicode.h"
2221
2322namespace lldb_private ::dil {
2423
25- /*
26- const llvm::StringMap<Token::Kind> Keywords = {
27- {"bool", Token::kw_bool},
28- {"char", Token::kw_char},
29- {"char16_t", Token::kw_char16_t},
30- {"char32_t", Token::kw_char32_t},
31- {"const", Token::kw_const},
32- {"double", Token::kw_double},
33- {"dynamic_cast", Token::kw_dynamic_cast},
34- {"false", Token::kw_false},
35- {"float", Token::kw_float},
36- {"int", Token::kw_int},
37- {"long", Token::kw_long},
38- {"namespace", Token::kw_namespace},
39- {"nullptr", Token::kw_nullptr},
40- {"reinterpret_cast", Token::kw_reinterpret_cast},
41- {"short", Token::kw_short},
42- {"signed", Token::kw_signed},
43- {"sizeof", Token::kw_sizeof},
44- {"static_cast", Token::kw_static_cast},
45- {"this", Token::kw_this},
46- {"true", Token::kw_true},
47- {"unsigned", Token::kw_unsigned},
48- {"void", Token::kw_void},
49- {"volatile", Token::kw_volatile},
50- {"wchar_t", Token::kw_wchar_t}};
51- */
52-
5324llvm::StringRef Token::GetTokenName (Kind kind) {
5425 switch (kind){
5526 case Token::amp: return " amp" ;
@@ -164,71 +135,39 @@ static bool IsLetter(char c) {
164135
/// Returns true if \p c is an ASCII decimal digit, i.e. lies in the range
/// '0' through '9' inclusive.
static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
166137
167- static bool isValidIdentifierContinuationCodePoint (uint32_t c) {
168- if (c < 0x80 )
169- return clang::isAsciiIdentifierContinue (c, /* dollar*/ true );
170-
171- // N1518: Recommendations for extended identifier characters for C and C++
172- // Proposed Annex X.1: Ranges of characters allowed
173- return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
174- (c >= 0x00B2 && c <= 0x00B5 ) || (c >= 0x00B7 && c <= 0x00BA ) ||
175- (c >= 0x00BC && c <= 0x00BE ) || (c >= 0x00C0 && c <= 0x00D6 ) ||
176- (c >= 0x00D8 && c <= 0x00F6 ) || (c >= 0x00F8 && c <= 0x00FF )
177-
178- || (c >= 0x0100 && c <= 0x167F ) || (c >= 0x1681 && c <= 0x180D ) ||
179- (c >= 0x180F && c <= 0x1FFF )
180-
181- || (c >= 0x200B && c <= 0x200D ) || (c >= 0x202A && c <= 0x202E ) ||
182- (c >= 0x203F && c <= 0x2040 ) || c == 0x2054 ||
183- (c >= 0x2060 && c <= 0x206F )
184-
185- || (c >= 0x2070 && c <= 0x218F ) || (c >= 0x2460 && c <= 0x24FF ) ||
186- (c >= 0x2776 && c <= 0x2793 ) || (c >= 0x2C00 && c <= 0x2DFF ) ||
187- (c >= 0x2E80 && c <= 0x2FFF )
188-
189- || (c >= 0x3004 && c <= 0x3007 ) || (c >= 0x3021 && c <= 0x302F ) ||
190- (c >= 0x3031 && c <= 0x303F )
138+ static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
139+ {0x0085 , 0x0085 }, {0x00A0 , 0x00A0 }, {0x1680 , 0x1680 },
140+ {0x180E , 0x180E }, {0x2000 , 0x200A }, {0x2028 , 0x2029 },
141+ {0x202F , 0x202F }, {0x205F , 0x205F }, {0x3000 , 0x3000 }};
191142
192- || (c >= 0x3040 && c <= 0xD7FF )
143+ static bool IsUnicodeWhitespace (uint32_t Codepoint) {
144+ static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars (
145+ UnicodeWhitespaceCharRanges);
146+ return UnicodeWhitespaceChars.contains (Codepoint);
147+ }
193148
194- || (c >= 0xF900 && c <= 0xFD3D ) || (c >= 0xFD40 && c <= 0xFDCF ) ||
195- (c >= 0xFDF0 && c <= 0xFE44 ) || (c >= 0xFE47 && c <= 0xFFF8 )
149+ inline bool IsOperator (unsigned char c) {
150+ using namespace clang ::charinfo;
151+ return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0 ;
152+ }
196153
197- || (c >= 0x10000 && c <= 0x1FFFD ) || (c >= 0x20000 && c <= 0x2FFFD ) ||
198- (c >= 0x30000 && c <= 0x3FFFD ) || (c >= 0x40000 && c <= 0x4FFFD ) ||
199- (c >= 0x50000 && c <= 0x5FFFD ) || (c >= 0x60000 && c <= 0x6FFFD ) ||
200- (c >= 0x70000 && c <= 0x7FFFD ) || (c >= 0x80000 && c <= 0x8FFFD ) ||
201- (c >= 0x90000 && c <= 0x9FFFD ) || (c >= 0xA0000 && c <= 0xAFFFD ) ||
202- (c >= 0xB0000 && c <= 0xBFFFD ) || (c >= 0xC0000 && c <= 0xCFFFD ) ||
203- (c >= 0xD0000 && c <= 0xDFFFD ) || (c >= 0xE0000 && c <= 0xEFFFD );
154+ static bool IsValidIdentifierContinuationCodePoint ( uint32_t c) {
155+ if (c < 0x80 ) {
156+ if (c == ' $ ' )
157+ return true ;
158+ return ! IsOperator (c) && ! clang::isWhitespace (c);
159+ }
160+ return ! IsUnicodeWhitespace (c );
204161}
205162
206- static bool isValidIdentifierStartCodePoint (uint32_t c) {
207- if (!isValidIdentifierContinuationCodePoint (c))
163+ static bool IsValidIdentifierStartCodePoint (uint32_t c) {
164+ if (!IsValidIdentifierContinuationCodePoint (c))
208165 return false ;
209166 if (c < 0x80 && IsDigit (c))
210167 return false ;
211-
212- // N1518: Recommendations for extended identifier characters for C and C++
213- // Proposed Annex X.2: Ranges of characters disallowed initially
214- if ((c >= 0x0300 && c <= 0x036F ) || (c >= 0x1DC0 && c <= 0x1DFF ) ||
215- (c >= 0x20D0 && c <= 0x20FF ) || (c >= 0xFE20 && c <= 0xFE2F ))
216- return false ;
217-
218168 return true ;
219169}
220170
221- static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
222- {0x0085 , 0x0085 }, {0x00A0 , 0x00A0 }, {0x1680 , 0x1680 },
223- {0x180E , 0x180E }, {0x2000 , 0x200A }, {0x2028 , 0x2029 },
224- {0x202F , 0x202F }, {0x205F , 0x205F }, {0x3000 , 0x3000 }};
225-
226- static bool isUnicodeWhitespace (uint32_t Codepoint) {
227- static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars (
228- UnicodeWhitespaceCharRanges);
229- return UnicodeWhitespaceChars.contains (Codepoint);
230- }
231-
232171static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t >
233172convertUTF8SequenceAndAdvance (llvm::StringRef::iterator &cur_pos,
234173 llvm::StringRef::iterator end) {
@@ -246,7 +185,7 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
246185 while (true ) {
247186 auto [Status, CodePoint, size] =
248187 convertUTF8SequenceAndAdvance (cur_pos, remainder.end ());
249- if (Status != llvm::conversionOK || !isUnicodeWhitespace (CodePoint))
188+ if (Status != llvm::conversionOK || !IsUnicodeWhitespace (CodePoint))
250189 break ;
251190 length += size;
252191 }
@@ -262,23 +201,21 @@ static void SkipWhitespaces(llvm::StringRef &remainder) {
262201 } while (remainder.begin () != cur_pos);
263202}
264203
265- static std::optional<llvm::StringRef>
266- IsWord (llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length ) {
204+ static std::optional<llvm::StringRef> IsWord (llvm::StringRef expr,
205+ llvm::StringRef &remainder) {
267206 llvm::StringRef::iterator cur_pos = remainder.begin ();
268207 llvm::StringRef::iterator start = cur_pos;
269208 auto [Status, CodePoint, size] =
270209 convertUTF8SequenceAndAdvance (cur_pos, remainder.end ());
271210 if (Status == llvm::conversionOK &&
272- isValidIdentifierStartCodePoint (CodePoint)) {
273- utf_length = 1 ;
211+ IsValidIdentifierStartCodePoint (CodePoint)) {
274212 unsigned length = size;
275213 while (true ) {
276214 auto [Status, CodePoint, size] =
277215 convertUTF8SequenceAndAdvance (cur_pos, remainder.end ());
278216 if (Status != llvm::conversionOK ||
279- !isValidIdentifierContinuationCodePoint (CodePoint))
217+ !IsValidIdentifierContinuationCodePoint (CodePoint))
280218 break ;
281- utf_length++;
282219 length += size;
283220 }
284221
@@ -374,9 +311,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
374311 position += number.size ();
375312 return token;
376313 } else {
377- uint32_t utf_length = 0 ;
378- std::optional<llvm::StringRef> maybe_word =
379- IsWord (expr, remainder, utf_length);
314+ std::optional<llvm::StringRef> maybe_word = IsWord (expr, remainder);
380315 if (maybe_word) {
381316 llvm::StringRef word = *maybe_word;
382317 Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -406,7 +341,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
406341 .Case (" wchar_t" , Token::kw_wchar_t )
407342 .Default (Token::identifier);
408343 auto token = Token (kind, word.str (), position);
409- position += utf_length ;
344+ position += llvm::sys::unicode::columnWidthUTF8 (word. str ()) ;
410345 return token;
411346 }
412347 }
0 commit comments