Allow identifiers to contain anything except operators and whitespace

kuilpd · kuilpd · commit 6520218fbd4b · 2025-03-04T01:23:38.000+05:00
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
@@ -13,7 +13,6 @@
 
 #include "lldb/ValueObject/DILLexer.h"
 #include "clang/Basic/CharInfo.h"
-//#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Unicode.h"
@@ -22,34 +21,6 @@
 
 namespace lldb_private::dil {
 
-/*
-const llvm::StringMap<Token::Kind> Keywords = {
-    {"bool", Token::kw_bool},
-    {"char", Token::kw_char},
-    {"char16_t", Token::kw_char16_t},
-    {"char32_t", Token::kw_char32_t},
-    {"const", Token::kw_const},
-    {"double", Token::kw_double},
-    {"dynamic_cast", Token::kw_dynamic_cast},
-    {"false", Token::kw_false},
-    {"float", Token::kw_float},
-    {"int", Token::kw_int},
-    {"long", Token::kw_long},
-    {"namespace", Token::kw_namespace},
-    {"nullptr", Token::kw_nullptr},
-    {"reinterpret_cast", Token::kw_reinterpret_cast},
-    {"short", Token::kw_short},
-    {"signed", Token::kw_signed},
-    {"sizeof", Token::kw_sizeof},
-    {"static_cast", Token::kw_static_cast},
-    {"this", Token::kw_this},
-    {"true", Token::kw_true},
-    {"unsigned", Token::kw_unsigned},
-    {"void", Token::kw_void},
-    {"volatile", Token::kw_volatile},
-    {"wchar_t", Token::kw_wchar_t}};
-*/
-
 llvm::StringRef Token::GetTokenName(Kind kind) {
   switch (kind){
     case Token::amp: return "amp";
@@ -164,71 +135,39 @@ static bool IsLetter(char c) {
 
 static bool IsDigit(char c) { return ('0' <= c && c <= '9'); }
 
-static bool isValidIdentifierContinuationCodePoint(uint32_t c) {
-  if (c < 0x80)
-    return clang::isAsciiIdentifierContinue(c, /*dollar*/ true);
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.1: Ranges of characters allowed
-  return c == 0x00A8 || c == 0x00AA || c == 0x00AD || c == 0x00AF ||
-         (c >= 0x00B2 && c <= 0x00B5) || (c >= 0x00B7 && c <= 0x00BA) ||
-         (c >= 0x00BC && c <= 0x00BE) || (c >= 0x00C0 && c <= 0x00D6) ||
-         (c >= 0x00D8 && c <= 0x00F6) || (c >= 0x00F8 && c <= 0x00FF)
-
-         || (c >= 0x0100 && c <= 0x167F) || (c >= 0x1681 && c <= 0x180D) ||
-         (c >= 0x180F && c <= 0x1FFF)
-
-         || (c >= 0x200B && c <= 0x200D) || (c >= 0x202A && c <= 0x202E) ||
-         (c >= 0x203F && c <= 0x2040) || c == 0x2054 ||
-         (c >= 0x2060 && c <= 0x206F)
-
-         || (c >= 0x2070 && c <= 0x218F) || (c >= 0x2460 && c <= 0x24FF) ||
-         (c >= 0x2776 && c <= 0x2793) || (c >= 0x2C00 && c <= 0x2DFF) ||
-         (c >= 0x2E80 && c <= 0x2FFF)
-
-         || (c >= 0x3004 && c <= 0x3007) || (c >= 0x3021 && c <= 0x302F) ||
-         (c >= 0x3031 && c <= 0x303F)
+static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
+    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
+    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
+    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
 
-         || (c >= 0x3040 && c <= 0xD7FF)
+static bool IsUnicodeWhitespace(uint32_t Codepoint) {
+  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
+      UnicodeWhitespaceCharRanges);
+  return UnicodeWhitespaceChars.contains(Codepoint);
+}
 
-         || (c >= 0xF900 && c <= 0xFD3D) || (c >= 0xFD40 && c <= 0xFDCF) ||
-         (c >= 0xFDF0 && c <= 0xFE44) || (c >= 0xFE47 && c <= 0xFFF8)
+inline bool IsOperator(unsigned char c) {
+  using namespace clang::charinfo;
+  return (InfoTable[c] & (CHAR_PUNCT | CHAR_PERIOD)) != 0;
+}
 
-         || (c >= 0x10000 && c <= 0x1FFFD) || (c >= 0x20000 && c <= 0x2FFFD) ||
-         (c >= 0x30000 && c <= 0x3FFFD) || (c >= 0x40000 && c <= 0x4FFFD) ||
-         (c >= 0x50000 && c <= 0x5FFFD) || (c >= 0x60000 && c <= 0x6FFFD) ||
-         (c >= 0x70000 && c <= 0x7FFFD) || (c >= 0x80000 && c <= 0x8FFFD) ||
-         (c >= 0x90000 && c <= 0x9FFFD) || (c >= 0xA0000 && c <= 0xAFFFD) ||
-         (c >= 0xB0000 && c <= 0xBFFFD) || (c >= 0xC0000 && c <= 0xCFFFD) ||
-         (c >= 0xD0000 && c <= 0xDFFFD) || (c >= 0xE0000 && c <= 0xEFFFD);
+static bool IsValidIdentifierContinuationCodePoint(uint32_t c) {
+  if (c < 0x80) {
+    if (c == '$')
+      return true;
+    return !IsOperator(c) && !clang::isWhitespace(c);
+  }
+  return !IsUnicodeWhitespace(c);
 }
 
-static bool isValidIdentifierStartCodePoint(uint32_t c) {
-  if (!isValidIdentifierContinuationCodePoint(c))
+static bool IsValidIdentifierStartCodePoint(uint32_t c) {
+  if (!IsValidIdentifierContinuationCodePoint(c))
     return false;
   if (c < 0x80 && IsDigit(c))
     return false;
-
-  // N1518: Recommendations for extended identifier characters for C and C++
-  // Proposed Annex X.2: Ranges of characters disallowed initially
-  if ((c >= 0x0300 && c <= 0x036F) || (c >= 0x1DC0 && c <= 0x1DFF) ||
-      (c >= 0x20D0 && c <= 0x20FF) || (c >= 0xFE20 && c <= 0xFE2F))
-    return false;
-
   return true;
 }
 
-static const llvm::sys::UnicodeCharRange UnicodeWhitespaceCharRanges[] = {
-    {0x0085, 0x0085}, {0x00A0, 0x00A0}, {0x1680, 0x1680},
-    {0x180E, 0x180E}, {0x2000, 0x200A}, {0x2028, 0x2029},
-    {0x202F, 0x202F}, {0x205F, 0x205F}, {0x3000, 0x3000}};
-
-static bool isUnicodeWhitespace(uint32_t Codepoint) {
-  static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
-      UnicodeWhitespaceCharRanges);
-  return UnicodeWhitespaceChars.contains(Codepoint);
-}
-
 static std::tuple<llvm::ConversionResult, llvm::UTF32, uint32_t>
 convertUTF8SequenceAndAdvance(llvm::StringRef::iterator &cur_pos,
                               llvm::StringRef::iterator end) {
@@ -246,7 +185,7 @@ static void SkipUnicodeWhitespaces(llvm::StringRef &remainder) {
   while (true) {
     auto [Status, CodePoint, size] =
         convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
-    if (Status != llvm::conversionOK || !isUnicodeWhitespace(CodePoint))
+    if (Status != llvm::conversionOK || !IsUnicodeWhitespace(CodePoint))
       break;
     length += size;
   }
@@ -262,23 +201,21 @@ static void SkipWhitespaces(llvm::StringRef &remainder) {
   } while (remainder.begin() != cur_pos);
 }
 
-static std::optional<llvm::StringRef>
-IsWord(llvm::StringRef expr, llvm::StringRef &remainder, uint32_t &utf_length) {
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
+                                             llvm::StringRef &remainder) {
   llvm::StringRef::iterator cur_pos = remainder.begin();
   llvm::StringRef::iterator start = cur_pos;
   auto [Status, CodePoint, size] =
       convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
   if (Status == llvm::conversionOK &&
-      isValidIdentifierStartCodePoint(CodePoint)) {
-    utf_length = 1;
+      IsValidIdentifierStartCodePoint(CodePoint)) {
     unsigned length = size;
     while (true) {
       auto [Status, CodePoint, size] =
           convertUTF8SequenceAndAdvance(cur_pos, remainder.end());
       if (Status != llvm::conversionOK ||
-          !isValidIdentifierContinuationCodePoint(CodePoint))
+          !IsValidIdentifierContinuationCodePoint(CodePoint))
         break;
-      utf_length++;
       length += size;
     }
 
@@ -374,9 +311,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
     position += number.size();
     return token;
   } else {
-    uint32_t utf_length = 0;
-    std::optional<llvm::StringRef> maybe_word =
-        IsWord(expr, remainder, utf_length);
+    std::optional<llvm::StringRef> maybe_word = IsWord(expr, remainder);
     if (maybe_word) {
       llvm::StringRef word = *maybe_word;
       Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
@@ -406,7 +341,7 @@ llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
                             .Case("wchar_t", Token::kw_wchar_t)
                             .Default(Token::identifier);
       auto token = Token(kind, word.str(), position);
-      position += utf_length;
+      position += llvm::sys::unicode::columnWidthUTF8(word.str());
       return token;
     }
   }
diff --git a/lldb/unittests/DIL/DILTests.cpp b/lldb/unittests/DIL/DILTests.cpp
@@ -3681,13 +3681,15 @@ TEST_F(EvalTest, DISABLED_TestStringParsing) {
 }
 #endif
 
-TEST_F(EvalTest, TestUnicodeIdentifiers) {
+TEST_F(EvalTest, TestUnicodeInput) {
   EXPECT_THAT(Eval("フー + 1"), IsEqual("2"));
   EXPECT_THAT(Eval("1 + フー"), IsEqual("2"));
   EXPECT_THAT(Eval("föo + 1"), IsEqual("4"));
   EXPECT_THAT(Eval("שלום + 1"), IsEqual("5"));
   EXPECT_THAT(Eval(" 1　+   föo   "), // Contains Unicode whitespaces
               IsEqual("4"));
+
+  // Check diagnostic pointer location
   EXPECT_THAT(Eval("фу + бар"),
               IsError("<expr:1:6>: use of undeclared identifier 'бар'\n"
                       "фу + бар\n"
@@ -3696,4 +3698,8 @@ TEST_F(EvalTest, TestUnicodeIdentifiers) {
               IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
                       "фу　+　бар\n"
                       "       ^"));
+  EXPECT_THAT(Eval("フー + бар"),
+              IsError("<expr:1:8>: use of undeclared identifier 'бар'\n"
+                      "フー + бар\n"
+                      "       ^"));
 }
diff --git a/lldb/unittests/DIL/Inputs/test_binary.cc b/lldb/unittests/DIL/Inputs/test_binary.cc
@@ -1196,12 +1196,12 @@ static void TestStringParsing() {
   // BREAK(TestStringParsing)
 }
 
-static void TestUnicodeIdentifiers() {
+static void TestUnicodeInput() {
   int フー = 1;
   int фу = 2;
   int föo = 3;
   int שלום = 4;
-  // BREAK(TestUnicodeIdentifiers)
+  // BREAK(TestUnicodeInput)
 }
 
 namespace test_binary {
@@ -1258,7 +1258,7 @@ void main() {
 
   TestCharParsing();
   TestStringParsing();
-  TestUnicodeIdentifiers();
+  TestUnicodeInput();
 
   // BREAK HERE
 }

Original file line number	Diff line number	Diff line change
`@@ -1196,12 +1196,12 @@ static void TestStringParsing() {`
`1196`	`1196`	`// BREAK(TestStringParsing)`
`1197`	`1197`	`}`
`1198`	`1198`
`1199`		`-static void TestUnicodeIdentifiers() {`
	`1199`	`+static void TestUnicodeInput() {`
`1200`	`1200`	`int フー = 1;`
`1201`	`1201`	`int фу = 2;`
`1202`	`1202`	`int föo = 3;`
`1203`	`1203`	`int שלום = 4;`
`1204`		`- // BREAK(TestUnicodeIdentifiers)`
	`1204`	`+ // BREAK(TestUnicodeInput)`
`1205`	`1205`	`}`
`1206`	`1206`
`1207`	`1207`	`namespace test_binary {`
`@@ -1258,7 +1258,7 @@ void main() {`
`1258`	`1258`
`1259`	`1259`	`TestCharParsing();`
`1260`	`1260`	`TestStringParsing();`
`1261`		`- TestUnicodeIdentifiers();`
	`1261`	`+ TestUnicodeInput();`
`1262`	`1262`
`1263`	`1263`	`// BREAK HERE`
`1264`	`1264`	`}`