From 55a1e631c60d3c9f41421fab490d50dec7914b5e Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sun, 22 Jan 2023 14:16:25 -0500
Subject: [PATCH] Sketch for UTF-16 support

---
 include/ada/checkers.h |  4 +--
 include/ada/parser.h   |  1 -
 include/ada/unicode.h  |  2 +-
 src/implementation.cpp | 66 ++++++++++++++++++++++++++++++++++++++----
 src/parser.cpp         | 16 +++++-----
 src/unicode.cpp        | 62 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 134 insertions(+), 17 deletions(-)
diff --git a/include/ada/checkers.h b/include/ada/checkers.h
index 4c006a369..54f4f3f9e 100644
--- a/include/ada/checkers.h
+++ b/include/ada/checkers.h
@@ -20,8 +20,8 @@ namespace ada::checkers {
   // safe if input.size() >=2. See has_hex_prefix.
   inline bool has_hex_prefix_unsafe(std::string_view input) {
     // This is actualy efficient code, see has_hex_prefix for the assembly.
-    uint32_t value = 1;
-    bool is_little_endian = (static_cast<uint8_t>(value) == 1);
+    uint32_t value_one = 1;
+    bool is_little_endian = (reinterpret_cast<char*>(&value_one)[0] == 1);
     uint16_t word0x{};
     std::memcpy(&word0x, "0x", 2); // we would use bit_cast in C++20 and the function could be constexpr.
     uint16_t two_first_bytes{};
diff --git a/include/ada/parser.h b/include/ada/parser.h
index 1d815fd79..c475f6e2c 100644
--- a/include/ada/parser.h
+++ b/include/ada/parser.h
@@ -11,7 +11,6 @@
 namespace ada::parser {
   url parse_url(std::string_view user_input,
                 std::optional<ada::url> base_url = std::nullopt,
-                ada::encoding_type encoding = ada::encoding_type::UTF8,
                 std::optional<ada::url> optional_url = std::nullopt);
 
 } // namespace ada
diff --git a/include/ada/unicode.h b/include/ada/unicode.h
index 1864bf237..0f08248e0 100644
--- a/include/ada/unicode.h
+++ b/include/ada/unicode.h
@@ -28,7 +28,7 @@ namespace ada::unicode {
   std::string percent_decode(const std::string_view input, size_t first_percent);
   std::string percent_encode(const std::string_view input, const uint8_t character_set[]);
   ada_really_inline bool to_lower_ascii_string(std::optional<std::string>& out, size_t first_percent) noexcept;
-
+  size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type);
 } // namespace ada::unicode
 
 #endif // ADA_UNICODE_H
diff --git a/src/implementation.cpp b/src/implementation.cpp
index 60a54725d..05d7f4abf 100644
--- a/src/implementation.cpp
+++ b/src/implementation.cpp
@@ -1,5 +1,7 @@
 #include <charconv>
 #include <iostream>
+#include <memory>
+#include <vector>
 #include <string_view>
 #include <utility>
 
@@ -26,10 +28,24 @@ namespace ada {
                             std::optional<ada::url> base_url,
                             ada::encoding_type encoding) {
     if(encoding != encoding_type::UTF8) {
-      // todo: unsupported !
+      // If there is a BOM, prune it out.
+      if(input.size() >= 2) {
+        if((uint8_t(input[0]) == 0xff) && (uint8_t(input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
+          input.remove_prefix(2);
+        } else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
+          input.remove_prefix(2);
+        }
+      }
+      if(!input.empty()) {
+        std::unique_ptr<char[]> utf8buffer(new char[input.size() * 2]);
+        size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(input.data()), input.size()/2,utf8buffer.get(), encoding);
+        if((input.size() % 2) != 0) { utf8_length = 0; }
+        std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0
+        return ada::parser::parse_url(utf8_input, std::move(base_url));
+      }
     }
     // TODO std::move(base_url) might be unwise. Check.
-    return ada::parser::parse_url(input, std::move(base_url), encoding);
+    return ada::parser::parse_url(input, std::move(base_url));
   }
 
   /*
@@ -44,7 +60,20 @@ namespace ada {
    */
   bool set_scheme(ada::url& base, std::string input, ada::encoding_type encoding) noexcept {
     if(encoding != encoding_type::UTF8) {
-      return false; // unsupported !
+      std::string_view initial_input = input;
+      // Strip a leading BOM when it matches the declared byte order.
+      if(initial_input.size() >= 2) {
+        if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
+          initial_input.remove_prefix(2);
+        } else if ((uint8_t(initial_input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
+          initial_input.remove_prefix(2);
+        }
+      }
+      if((initial_input.size() % 2) != 0) { return false; } // an odd byte count cannot be UTF-16
+      std::unique_ptr<char[]> utf8buffer(new char[initial_input.size() * 2]);
+      size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
+      if(utf8_length == 0 && !initial_input.empty()) { return false; } // invalid UTF-16: fail instead of applying an empty value
+      return set_scheme(base, std::string(utf8buffer.get(), utf8_length), encoding_type::UTF8);
     }
     if (!input.empty()) {
       input.append(":");
@@ -110,7 +139,20 @@ namespace ada {
    */
   bool set_host(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept {
     if(encoding != encoding_type::UTF8) {
-      return false; // unsupported !
+      std::string_view initial_input = input;
+      // Strip a leading BOM when it matches the declared byte order.
+      if(initial_input.size() >= 2) {
+        if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
+          initial_input.remove_prefix(2);
+        } else if ((uint8_t(initial_input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
+          initial_input.remove_prefix(2);
+        }
+      }
+      if((initial_input.size() % 2) != 0) { return false; } // an odd byte count cannot be UTF-16
+      std::unique_ptr<char[]> utf8buffer(new char[initial_input.size() * 2]);
+      size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
+      if(utf8_length == 0 && !initial_input.empty()) { return false; } // invalid UTF-16: fail instead of applying an empty value
+      return set_host(base, std::string_view(utf8buffer.get(), utf8_length), encoding_type::UTF8);
     }
     // If this’s URL has an opaque path, then return.
     if (base.has_opaque_path) {
@@ -199,9 +241,21 @@ namespace ada {
    * @see https://url.spec.whatwg.org/#dom-url-pathname
    */
   bool set_pathname(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept {
-
     if(encoding != encoding_type::UTF8) {
-      return false; // unsupported !
+      std::string_view initial_input = input;
+      // Strip a leading BOM when it matches the declared byte order.
+      if(initial_input.size() >= 2) {
+        if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) {
+          initial_input.remove_prefix(2);
+        } else if ((uint8_t(initial_input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) {
+          initial_input.remove_prefix(2);
+        }
+      }
+      if((initial_input.size() % 2) != 0) { return false; } // an odd byte count cannot be UTF-16
+      std::unique_ptr<char[]> utf8buffer(new char[initial_input.size() * 2]);
+      size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast<const char16_t*>(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding);
+      if(utf8_length == 0 && !initial_input.empty()) { return false; } // invalid UTF-16: fail instead of applying an empty value
+      return set_pathname(base, std::string_view(utf8buffer.get(), utf8_length), encoding_type::UTF8);
     }
     // If this’s URL has an opaque path, then return.
     if (base.has_opaque_path) {
diff --git a/src/parser.cpp b/src/parser.cpp
index cfccd1a1c..ccf1600b0 100644
--- a/src/parser.cpp
+++ b/src/parser.cpp
@@ -17,7 +17,6 @@ namespace ada::parser {
 
   url parse_url(std::string_view user_input,
                 std::optional<ada::url> base_url,
-                ada::encoding_type encoding,
                 std::optional<ada::url> optional_url) {
     // Let state be state override if given, or scheme start state otherwise.
     ada::state state = ada::state::SCHEME_START;
@@ -395,12 +394,15 @@ namespace ada::parser {
           // If encoding is not UTF-8 and one of the following is true:
           // - url is not special
           // - url’s scheme is "ws" or "wss"
-          if (encoding != ada::encoding_type::UTF8) {
-            if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) {
-              // then set encoding to UTF-8.
-              encoding = ada::encoding_type::UTF8;
-            }
-          }
+          //////////////
+          // parse_url no longer takes an encoding: input is transcoded to UTF-8 before it is called, so this spec step is a no-op (kept for reference):
+          /////////////
+          //if (encoding != ada::encoding_type::UTF8) {
+          //  if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) {
+          //    // then set encoding to UTF-8.
+          //    encoding = ada::encoding_type::UTF8;
+          //  }
+          //}
 
           // Let queryPercentEncodeSet be the special-query percent-encode set if url is special;
           // otherwise the query percent-encode set.
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e1ff870f3..28618e2f5 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -433,4 +433,66 @@ constexpr static bool is_forbidden_domain_code_point_table[] = {
     return to_ascii(out, out.value(), false, first_percent);
   }
 
+
+  // Transcodes `len` UTF-16 code units at `buf` (byte order given by `type`) to UTF-8
+  // in `utf8_output`; at most 3 * len bytes are written. Returns the byte count, or 0
+  // on an unpaired surrogate, so a 0 for non-empty input means the input was invalid.
+  size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type) {
+    uint32_t value_one = 1;
+    bool is_little_endian = (reinterpret_cast<char*>(&value_one)[0] == 1);
+    bool need_flip = (is_little_endian) ? (type == encoding_type::UTF_16BE) : (type == encoding_type::UTF_16LE);
+    // buf may be reinterpreted, unaligned char data: fetch code units with memcpy, never by dereferencing.
+    auto load_word = [buf, need_flip](size_t index) {
+      uint16_t unit;
+      ::memcpy(&unit, reinterpret_cast<const char*>(buf) + 2 * index, sizeof(unit));
+      return need_flip ? uint16_t((unit >> 8) | (unit << 8)) : unit;
+    };
+    size_t pos = 0;
+    char* start{utf8_output};
+    while (pos < len) {
+      // Fast path: when the next 4 code units (8 bytes) are all ASCII, emit them as 4 bytes.
+      if (pos + 4 <= len) {
+        uint64_t v;
+        ::memcpy(&v, reinterpret_cast<const char*>(buf) + 2 * pos, sizeof(v));
+        // A one-byte rotate (not a full swap) is enough to line high bytes up with the 0xFF mask bytes.
+        if (need_flip) v = (v >> 8) | (v << (64 - 8));
+        if ((v & 0xFF80FF80FF80FF80) == 0) {
+          for (size_t stop = pos + 4; pos < stop; pos++) { *utf8_output++ = char(load_word(pos)); }
+          continue;
+        }
+      }
+      uint16_t word = load_word(pos);
+      if((word & 0xFF80)==0) {
+        // one UTF-8 byte: 0b0XXXXXXX
+        *utf8_output++ = char(word);
+        pos++;
+      } else if((word & 0xF800)==0) {
+        // two UTF-8 bytes: 0b110XXXXX 0b10XXXXXX
+        *utf8_output++ = char((word>>6) | 0b11000000);
+        *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        pos++;
+      } else if((word &0xF800 ) != 0xD800) {
+        // three UTF-8 bytes: 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+        *utf8_output++ = char((word>>12) | 0b11100000);
+        *utf8_output++ = char(((word>>6) & 0b111111) | 0b10000000);
+        *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        pos++;
+      } else {
+        // surrogate pair: a high surrogate must be immediately followed by a low surrogate
+        if(pos + 1 >= len) { return 0; }
+        uint16_t diff = uint16_t(word - 0xD800);
+        if(diff > 0x3FF) { return 0; }
+        uint16_t diff2 = uint16_t(load_word(pos + 1) - 0xDC00);
+        if(diff2 > 0x3FF) { return 0; }
+        uint32_t value = (diff << 10) + diff2 + 0x10000;
+        // four UTF-8 bytes: 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+        *utf8_output++ = char((value>>18) | 0b11110000);
+        *utf8_output++ = char(((value>>12) & 0b111111) | 0b10000000);
+        *utf8_output++ = char(((value>>6) & 0b111111) | 0b10000000);
+        *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        pos += 2;
+      }
+    }
+    return size_t(utf8_output - start);
+  }
 } // namespace ada::unicode