feat(c++): Support the UTF-8 to UTF-16 with SIMD (#1990)

pandalee99 · web-flow · commit 1c250e1d2f20 · 2025-01-01T21:56:18.000+08:00
## What does this PR do? To support the utf8 utf16 and using simd to accelerate the optimization ``` c++ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) ``` The logic of converting UTF-8 to UTF-16 isn't that complicated. but there are still lots of optimizations that I haven't come up with yet. So, I'll first design a version that's a bit faster than the original one, and then think about how to make further optimizations. Judging from the tests, the logic is correct: ``` text [----------] 9 tests from UTF8ToUTF16Test [ RUN ] UTF8ToUTF16Test.BasicConversion [ OK ] UTF8ToUTF16Test.BasicConversion (0 ms) [ RUN ] UTF8ToUTF16Test.EmptyString [ OK ] UTF8ToUTF16Test.EmptyString (0 ms) [ RUN ] UTF8ToUTF16Test.SurrogatePairs [ OK ] UTF8ToUTF16Test.SurrogatePairs (0 ms) [ RUN ] UTF8ToUTF16Test.BoundaryValues [ OK ] UTF8ToUTF16Test.BoundaryValues (0 ms) [ RUN ] UTF8ToUTF16Test.SpecialCharacters [ OK ] UTF8ToUTF16Test.SpecialCharacters (0 ms) [ RUN ] UTF8ToUTF16Test.LittleEndian [ OK ] UTF8ToUTF16Test.LittleEndian (0 ms) [ RUN ] UTF8ToUTF16Test.BigEndian [ OK ] UTF8ToUTF16Test.BigEndian (0 ms) [ RUN ] UTF8ToUTF16Test.RoundTripConversion [ OK ] UTF8ToUTF16Test.RoundTripConversion (0 ms) ``` <img width="264" alt="image" src="https://github.com/user-attachments/assets/7b9033ad-001f-4a36-a27e-6a8362f3a6df" /> And from the performance perspective, it's improved compared to serial processing: <img width="394" alt="image" src="https://github.com/user-attachments/assets/93379aeb-9080-449e-b889-567dd207f5fc" /> The speed of execution has been significantly improved Actually, this code doesn't use libraries like AVX2 or really apply SIMD to process. The main reason is that the structure of UTF-8 encoding is complex and not fixed. It involves multi-byte encoding, and we need to analyze it byte by byte when dealing with different bytes. So, without clear rules and a uniform length, it becomes really hard to directly parallelize the processing of each byte. During the process of converting UTF-8 to UTF-16, we have to handle characters of different lengths, ranging from 1 to 4 bytes, which makes it difficult to break it down into structures that can be directly applied to SIMD operations. There are also some code style changes, uniform writing  ## Related issues Close #1964  ## Does this PR introduce any user-facing change?  - [x] Does this PR introduce any public API change? - [ ] Does this PR introduce any binary protocol compatibility change? ## Benchmark
diff --git a/cpp/fury/util/string_util.cc b/cpp/fury/util/string_util.cc
@@ -58,6 +58,129 @@ inline void utf16SurrogatePairToUtf8(uint16_t high, uint16_t low, char *&utf8) {
   *utf8++ = static_cast<char>((code_point & 0x3F) | 0x80);
 }
 
+std::u16string utf8ToUtf16SIMD(const std::string &utf8, bool is_little_endian) {
+  std::u16string utf16;
+  utf16.reserve(utf8.size()); // Reserve space to avoid frequent reallocations
+
+  char buffer[64]; // Buffer to hold temporary UTF-16 results
+  char16_t *output =
+      reinterpret_cast<char16_t *>(buffer); // Use char16_t for output
+
+  size_t i = 0;
+  size_t n = utf8.size();
+
+  while (i + 32 <= n) {
+
+    for (int j = 0; j < 32; ++j) {
+      uint8_t byte = utf8[i + j];
+
+      if (byte < 0x80) {
+        // 1-byte character (ASCII)
+        *output++ = static_cast<char16_t>(byte);
+      } else if (byte < 0xE0) {
+        // 2-byte character
+        uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + j + 1] & 0x3F);
+        if (!is_little_endian) {
+          utf16_char = (utf16_char >> 8) |
+                       (utf16_char << 8); // Swap bytes for big-endian
+        }
+        *output++ = utf16_char;
+        ++j;
+      } else if (byte < 0xF0) {
+        // 3-byte character
+        uint16_t utf16_char = ((byte & 0x0F) << 12) |
+                              ((utf8[i + j + 1] & 0x3F) << 6) |
+                              (utf8[i + j + 2] & 0x3F);
+        if (!is_little_endian) {
+          utf16_char = (utf16_char >> 8) |
+                       (utf16_char << 8); // Swap bytes for big-endian
+        }
+        *output++ = utf16_char;
+        j += 2;
+      } else {
+        // 4-byte character (surrogate pair handling required)
+        uint32_t code_point =
+            ((byte & 0x07) << 18) | ((utf8[i + j + 1] & 0x3F) << 12) |
+            ((utf8[i + j + 2] & 0x3F) << 6) | (utf8[i + j + 3] & 0x3F);
+
+        // Convert the code point to a surrogate pair
+        uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
+        uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);
+
+        if (!is_little_endian) {
+          high_surrogate = (high_surrogate >> 8) |
+                           (high_surrogate << 8); // Swap bytes for big-endian
+          low_surrogate = (low_surrogate >> 8) |
+                          (low_surrogate << 8); // Swap bytes for big-endian
+        }
+
+        *output++ = high_surrogate;
+        *output++ = low_surrogate;
+
+        j += 3;
+      }
+    }
+
+    // Append the processed buffer to the final utf16 string
+    utf16.append(reinterpret_cast<char16_t *>(buffer),
+                 output - reinterpret_cast<char16_t *>(buffer));
+    output =
+        reinterpret_cast<char16_t *>(buffer); // Reset output buffer pointer
+    i += 32;
+  }
+
+  // Handle remaining characters
+  while (i < n) {
+    uint8_t byte = utf8[i];
+
+    if (byte < 0x80) {
+      *output++ = static_cast<char16_t>(byte);
+    } else if (byte < 0xE0) {
+      uint16_t utf16_char = ((byte & 0x1F) << 6) | (utf8[i + 1] & 0x3F);
+      if (!is_little_endian) {
+        utf16_char =
+            (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
+      }
+      *output++ = utf16_char;
+      ++i;
+    } else if (byte < 0xF0) {
+      uint16_t utf16_char = ((byte & 0x0F) << 12) |
+                            ((utf8[i + 1] & 0x3F) << 6) | (utf8[i + 2] & 0x3F);
+      if (!is_little_endian) {
+        utf16_char =
+            (utf16_char >> 8) | (utf16_char << 8); // Swap bytes for big-endian
+      }
+      *output++ = utf16_char;
+      i += 2;
+    } else {
+      uint32_t code_point = ((byte & 0x07) << 18) |
+                            ((utf8[i + 1] & 0x3F) << 12) |
+                            ((utf8[i + 2] & 0x3F) << 6) | (utf8[i + 3] & 0x3F);
+
+      uint16_t high_surrogate = 0xD800 + ((code_point - 0x10000) >> 10);
+      uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF);
+
+      if (!is_little_endian) {
+        high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
+        low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
+      }
+
+      *output++ = high_surrogate;
+      *output++ = low_surrogate;
+
+      i += 3;
+    }
+
+    ++i;
+  }
+
+  // Append the last part of the buffer to the utf16 string
+  utf16.append(reinterpret_cast<char16_t *>(buffer),
+               output - reinterpret_cast<char16_t *>(buffer));
+
+  return utf16;
+}
+
 #if defined(__x86_64__) || defined(_M_X64)
 
 bool isLatin(const std::string &str) {
@@ -168,6 +291,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #elif defined(__ARM_NEON) || defined(__ARM_NEON__)
 
 bool isLatin(const std::string &str) {
@@ -264,6 +391,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #elif defined(__riscv) && __riscv_vector
 
 bool isLatin(const std::string &str) {
@@ -365,6 +496,10 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  return utf8ToUtf16SIMD(utf8, is_little_endian);
+}
+
 #else
 
 bool isLatin(const std::string &str) {
@@ -414,6 +549,78 @@ std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian) {
   return utf8;
 }
 
+// Fallback implementation without SIMD acceleration
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian) {
+  std::u16string utf16;   // Resulting UTF-16 string
+  size_t i = 0;           // Index for traversing the UTF-8 string
+  size_t n = utf8.size(); // Total length of the UTF-8 string
+
+  // Loop through each byte of the UTF-8 string
+  while (i < n) {
+    uint32_t code_point = 0;   // The Unicode code point
+    unsigned char c = utf8[i]; // Current byte of the UTF-8 string
+
+    // Determine the number of bytes for this character based on its first byte
+    if ((c & 0x80) == 0) {
+      // 1-byte character (ASCII)
+      code_point = c;
+      ++i;
+    } else if ((c & 0xE0) == 0xC0) {
+      // 2-byte character
+      code_point = c & 0x1F;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      i += 2;
+    } else if ((c & 0xF0) == 0xE0) {
+      // 3-byte character
+      code_point = c & 0x0F;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+      i += 3;
+    } else if ((c & 0xF8) == 0xF0) {
+      // 4-byte character
+      code_point = c & 0x07;
+      code_point = (code_point << 6) | (utf8[i + 1] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 2] & 0x3F);
+      code_point = (code_point << 6) | (utf8[i + 3] & 0x3F);
+      i += 4;
+    } else {
+      // Invalid UTF-8 byte sequence
+      throw std::invalid_argument("Invalid UTF-8 encoding.");
+    }
+
+    // If the code point is beyond the BMP range, use surrogate pairs
+    if (code_point >= 0x10000) {
+      code_point -= 0x10000; // Subtract 0x10000 to get the surrogate pair
+      uint16_t high_surrogate = 0xD800 + (code_point >> 10);  // High surrogate
+      uint16_t low_surrogate = 0xDC00 + (code_point & 0x3FF); // Low surrogate
+
+      // If not little-endian, swap bytes of the surrogates
+      if (!is_little_endian) {
+        high_surrogate = (high_surrogate >> 8) | (high_surrogate << 8);
+        low_surrogate = (low_surrogate >> 8) | (low_surrogate << 8);
+      }
+
+      // Add both high and low surrogates to the UTF-16 string
+      utf16.push_back(high_surrogate);
+      utf16.push_back(low_surrogate);
+    } else {
+      // For code points within the BMP range, directly store as a 16-bit value
+      uint16_t utf16_char = static_cast<uint16_t>(code_point);
+
+      // If not little-endian, swap the bytes of the 16-bit character
+      if (!is_little_endian) {
+        utf16_char = (utf16_char >> 8) | (utf16_char << 8);
+      }
+
+      // Add the UTF-16 character to the string
+      utf16.push_back(utf16_char);
+    }
+  }
+
+  // Return the resulting UTF-16 string
+  return utf16;
+}
+
 #endif
 
-} // namespace fury
+} // namespace fury
diff --git a/cpp/fury/util/string_util.h b/cpp/fury/util/string_util.h
@@ -27,4 +27,6 @@ bool isLatin(const std::string &str);
 
 std::string utf16ToUtf8(const std::u16string &utf16, bool is_little_endian);
 
-} // namespace fury
+std::u16string utf8ToUtf16(const std::string &utf8, bool is_little_endian);
+
+} // namespace fury
diff --git a/cpp/fury/util/string_util_test.cc b/cpp/fury/util/string_util_test.cc