Skip to content

Commit 6520218

Browse files
committed
Allow identifiers to contain anything except operators and whitespace
1 parent 4e79115 commit 6520218

File tree

3 files changed

+39
-98
lines changed

3 files changed

+39
-98
lines changed

lldb/source/ValueObject/DILLexer.cpp

Lines changed: 29 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313

1414
#include "lldb/ValueObject/DILLexer.h"
1515
#include "clang/Basic/CharInfo.h"
16-
//#include "llvm/ADT/StringMap.h"
1716
#include "llvm/ADT/StringSwitch.h"
1817
#include "llvm/Support/ConvertUTF.h"
1918
#include "llvm/Support/Unicode.h"
@@ -22,34 +21,6 @@
2221

2322
namespace lldb_private::dil {
2423

25-
/*
26-
const llvm::StringMap<Token::Kind> Keywords = {
27-
{"bool", Token::kw_bool},
28-
{"char", Token::kw_char},
29-
{"char16_t", Token::kw_char16_t},
30-
{"char32_t", Token::kw_char32_t},
31-
{"const", Token::kw_const},
32-
{"double", Token::kw_double},
33-
{"dynamic_cast", Token::kw_dynamic_cast},
34-
{"false", Token::kw_false},
35-
{"float", Token::kw_float},
36-
{"int", Token::kw_int},
37-
{"long", Token::kw_long},
38-
{"namespace", Token::kw_namespace},
39-
{"nullptr", Token::kw_nullptr},
40-
{"reinterpret_cast", Token::kw_reinterpret_cast},
41-
{"short", Token::kw_short},
42-
{"signed", Token::kw_signed},
43-
{"sizeof", Token::kw_sizeof},
44-
{"static_cast", Token::kw_static_cast},
45-
{"this", Token::kw_this},
46-
{"true", Token::kw_true},
47-
{"unsigned", Token::kw_unsigned},
48-
{"void", Token::kw_void},
49-
{"volatile", Token::kw_volatile},
50-
{"wchar_t", Token::kw_wchar_t}};
51-
*/
52-
5324
llvm::StringRef Token::GetTokenName(Kind kind) {
5425
switch (kind){
5526
case Token::amp: return "amp";
@@ -164,71 +135,39 @@ static bool IsLetter(char c) {
164135

165136
static bool IsDigit(char c) { return ('0' <= c && c <= '9'); }
166137

167-
static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
168-
if (c < 0x80)
169-
return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
170-
171-
// N1518: Recommendations for extended identifier characters for C and C++
172-
// Proposed Annex X.1: Ranges of characters allowed
173-
return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
174-
(c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
175-
(c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
176-
(c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
177-
178-
|| (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
179-
(c >= 0x180F && c <= 0x1FFF)
180-
181-
|| (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
182-
(c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
183-
(c >= 0x2060 && c <= 0x206F)
184-
185-
|| (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
186-
(c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
187-
(c >= 0x2E80 && c <= 0x2FFF)
188-
189-
|| (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
190-
(c >= 0x3031 && c <= 0x303F)
138+
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
139+
{0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
140+
{0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
141+
{0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
191142

192-
|| (c >= 0x3040 && c <= 0xD7FF)
143+
static bool IsUnicodeWhitespace(uint32_t Codepoint) {
144+
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
145+
UnicodeWhitespaceCharRanges);
146+
return UnicodeWhitespaceChars.contains(Codepoint);
147+
}
193148

194-
|| (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
195-
(c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
149+
inline bool IsOperator(unsigned char c) {
150+
using namespace clang::charinfo;
151+
return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0;
152+
}
196153

197-
|| (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
198-
(c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
199-
(c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
200-
(c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
201-
(c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
202-
(c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
203-
(c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
154+
static bool IsValidIdentifierContinuationCodePoint(uint32_t c) {
155+
if (c < 0x80) {
156+
if (c == '$')
157+
return true;
158+
return !IsOperator(c) && !clang::isWhitespace(c);
159+
}
160+
return !IsUnicodeWhitespace(c);
204161
}
205162

206-
static bool isValidIdentifierStartCodePoint(uint32_t c) {
207-
if (!isValidIdentifierContinuationCodePoint(c))
163+
static bool IsValidIdentifierStartCodePoint(uint32_t c) {
164+
if (!IsValidIdentifierContinuationCodePoint(c))
208165
return false;
209166
if (c < 0x80 && IsDigit(c))
210167
return false;
211-
212-
// N1518: Recommendations for extended identifier characters for C and C++
213-
// Proposed Annex X.2: Ranges of characters disallowed initially
214-
if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
215-
(c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
216-
return false;
217-
218168
return true;
219169
}
220170

221-
static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
222-
{0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
223-
{0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
224-
{0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
225-
226-
static bool isUnicodeWhitespace(uint32_t Codepoint) {
227-
static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
228-
UnicodeWhitespaceCharRanges);
229-
return UnicodeWhitespaceChars.contains(Codepoint);
230-
}
231-
232171
static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t>
233172
convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
234173
llvm::StringRef::iterator end) {
@@ -246,7 +185,7 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
246185
while (true) {
247186
auto [Status, CodePoint, size] =
248187
convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
249-
if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint))
188+
if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint))
250189
break;
251190
length += size;
252191
}
@@ -262,23 +201,21 @@ static void SkipWhitespaces(llvm::StringRef &remainder) {
262201
} while (remainder.begin() != cur_pos);
263202
}
264203

265-
static std::optional<llvm::StringRef>
266-
IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) {
204+
static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
205+
llvm::StringRef &remainder) {
267206
llvm::StringRef::iterator cur_pos = remainder.begin();
268207
llvm::StringRef::iterator start = cur_pos;
269208
auto [Status, CodePoint, size] =
270209
convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
271210
if (Status == llvm::conversionOK &&
272-
isValidIdentifierStartCodePoint(CodePoint)) {
273-
utf_length = 1;
211+
IsValidIdentifierStartCodePoint(CodePoint)) {
274212
unsigned length = size;
275213
while (true) {
276214
auto [Status, CodePoint, size] =
277215
convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
278216
if (Status != llvm::conversionOK ||
279-
!isValidIdentifierContinuationCodePoint(CodePoint))
217+
!IsValidIdentifierContinuationCodePoint(CodePoint))
280218
break;
281-
utf_length++;
282219
length += size;
283220
}
284221

@@ -374,9 +311,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
374311
position += number.size();
375312
return token;
376313
} else {
377-
uint32_t utf_length = 0;
378-
std::optional<llvm::StringRef> maybe_word =
379-
IsWord(expr, remainder, utf_length);
314+
std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
380315
if (maybe_word) {
381316
llvm::StringRef word = *maybe_word;
382317
Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -406,7 +341,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
406341
.Case("wchar_t", Token::kw_wchar_t)
407342
.Default(Token::identifier);
408343
auto token = Token(kind, word.str(), position);
409-
position += utf_length;
344+
position += llvm::sys::unicode::columnWidthUTF8(word.str());
410345
return token;
411346
}
412347
}

lldb/unittests/DIL/DILTests.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3681,13 +3681,15 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
36813681
}
36823682
#endif
36833683

3684-
TEST_F(EvalTest, TestUnicodeIdentifiers) {
3684+
TEST_F(EvalTest, TestUnicodeInput) {
36853685
EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
36863686
EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
36873687
EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
36883688
EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
36893689
EXPECT_THAT(Eval(" 1 +   föo   "), // Contains Unicode whitespaces
36903690
IsEqual("4"));
3691+
3692+
// Check diagnostic pointer location
36913693
EXPECT_THAT(Eval("фу + бар"),
36923694
IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
36933695
"фу + бар\n"
@@ -3696,4 +3698,8 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) {
36963698
IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
36973699
"фу + бар\n"
36983700
" ^"));
3701+
EXPECT_THAT(Eval("フー + бар"),
3702+
IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
3703+
"フー + бар\n"
3704+
" ^"));
36993705
}

lldb/unittests/DIL/Inputs/test_binary.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,12 +1196,12 @@ static void TestStringParsing() {
11961196
// BREAK(TestStringParsing)
11971197
}
11981198

1199-
static void TestUnicodeIdentifiers() {
1199+
static void TestUnicodeInput() {
12001200
int フー = 1;
12011201
int фу = 2;
12021202
int föo = 3;
12031203
int שלום = 4;
1204-
// BREAK(TestUnicodeIdentifiers)
1204+
// BREAK(TestUnicodeInput)
12051205
}
12061206

12071207
namespace test_binary {
@@ -1258,7 +1258,7 @@ void main() {
12581258

12591259
TestCharParsing();
12601260
TestStringParsing();
1261-
TestUnicodeIdentifiers();
1261+
TestUnicodeInput();
12621262

12631263
// BREAK HERE
12641264
}

0 commit comments

Comments
 (0)