From 55a1e631c60d3c9f41421fab490d50dec7914b5e Mon Sep 17 00:00:00 2001 From: Daniel Lemire Date: Sun, 22 Jan 2023 14:16:25 -0500 Subject: [PATCH] Sketch for UTF-16 support --- include/ada/checkers.h | 4 +-- include/ada/parser.h | 1 - include/ada/unicode.h | 2 +- src/implementation.cpp | 66 ++++++++++++++++++++++++++++++++++++++---- src/parser.cpp | 16 +++++----- src/unicode.cpp | 62 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 134 insertions(+), 17 deletions(-) diff --git a/include/ada/checkers.h b/include/ada/checkers.h index 4c006a369..54f4f3f9e 100644 --- a/include/ada/checkers.h +++ b/include/ada/checkers.h @@ -20,8 +20,8 @@ namespace ada::checkers { // safe if input.size() >=2. See has_hex_prefix. inline bool has_hex_prefix_unsafe(std::string_view input) { // This is actualy efficient code, see has_hex_prefix for the assembly. - uint32_t value = 1; - bool is_little_endian = (static_cast(value) == 1); + uint32_t value_one = 1; + bool is_little_endian = (reinterpret_cast(&value_one)[0] == 1); uint16_t word0x{}; std::memcpy(&word0x, "0x", 2); // we would use bit_cast in C++20 and the function could be constexpr. 
uint16_t two_first_bytes{}; diff --git a/include/ada/parser.h b/include/ada/parser.h index 1d815fd79..c475f6e2c 100644 --- a/include/ada/parser.h +++ b/include/ada/parser.h @@ -11,7 +11,6 @@ namespace ada::parser { url parse_url(std::string_view user_input, std::optional base_url = std::nullopt, - ada::encoding_type encoding = ada::encoding_type::UTF8, std::optional optional_url = std::nullopt); } // namespace ada diff --git a/include/ada/unicode.h b/include/ada/unicode.h index 1864bf237..0f08248e0 100644 --- a/include/ada/unicode.h +++ b/include/ada/unicode.h @@ -28,7 +28,7 @@ namespace ada::unicode { std::string percent_decode(const std::string_view input, size_t first_percent); std::string percent_encode(const std::string_view input, const uint8_t character_set[]); ada_really_inline bool to_lower_ascii_string(std::optional& out, size_t first_percent) noexcept; - + size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type); } // namespace ada::unicode #endif // ADA_UNICODE_H diff --git a/src/implementation.cpp b/src/implementation.cpp index 60a54725d..05d7f4abf 100644 --- a/src/implementation.cpp +++ b/src/implementation.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include #include #include @@ -26,10 +28,24 @@ namespace ada { std::optional base_url, ada::encoding_type encoding) { if(encoding != encoding_type::UTF8) { - // todo: unsupported ! + // If there is a BOM, prune it out. 
+ if(input.size() >= 2) { + if((uint8_t(input[0]) == 0xff) && (uint8_t(input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) { + input.remove_prefix(2); + } else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) { + input.remove_prefix(2); + } + } + if(!input.empty()) { + std::unique_ptr utf8buffer(new char[input.size() * 2]); + size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast(input.data()), input.size()/2,utf8buffer.get(), encoding); + if((input.size() % 2) != 0) { utf8_length = 0; } + std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0 + return ada::parser::parse_url(utf8_input, std::move(base_url)); + } } // TODO std::move(base_url) might be unwise. Check. - return ada::parser::parse_url(input, std::move(base_url), encoding); + return ada::parser::parse_url(input, std::move(base_url)); } /* @@ -44,7 +60,20 @@ namespace ada { */ bool set_scheme(ada::url& base, std::string input, ada::encoding_type encoding) noexcept { if(encoding != encoding_type::UTF8) { - return false; // unsupported ! + std::string_view initial_input = input; + // If there is a BOM, prune it out. 
+ if(initial_input.size() >= 2) { + if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) { + initial_input.remove_prefix(2); + } else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) { + initial_input.remove_prefix(2); + } + } + std::unique_ptr utf8buffer(new char[input.size() * 2]); + size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding); + if((input.size() % 2) != 0) { utf8_length = 0; } + std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0 + return set_scheme(base, std::string(utf8_input), encoding_type::UTF8); } if (!input.empty()) { input.append(":"); @@ -110,7 +139,20 @@ namespace ada { */ bool set_host(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept { if(encoding != encoding_type::UTF8) { - return false; // unsupported ! + std::string_view initial_input = input; + // If there is a BOM, prune it out. + if(initial_input.size() >= 2) { + if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) { + initial_input.remove_prefix(2); + } else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) { + initial_input.remove_prefix(2); + } + } + std::unique_ptr utf8buffer(new char[input.size() * 2]); + size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding); + if((input.size() % 2) != 0) { utf8_length = 0; } + std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0 + return set_host(base, utf8_input, encoding_type::UTF8); } // If this’s URL has an opaque path, then return. 
if (base.has_opaque_path) { @@ -199,9 +241,21 @@ namespace ada { * @see https://url.spec.whatwg.org/#dom-url-pathname */ bool set_pathname(ada::url& base, std::string_view input, ada::encoding_type encoding) noexcept { - if(encoding != encoding_type::UTF8) { - return false; // unsupported ! + std::string_view initial_input = input; + // If there is a BOM, prune it out. + if(initial_input.size() >= 2) { + if((uint8_t(initial_input[0]) == 0xff) && (uint8_t(initial_input[1]) == 0xfe) && encoding == encoding_type::UTF_16LE) { + initial_input.remove_prefix(2); + } else if ((uint8_t(input[0]) == 0xfe) && (uint8_t(initial_input[1]) == 0xff) && encoding == encoding_type::UTF_16BE) { + initial_input.remove_prefix(2); + } + } + std::unique_ptr utf8buffer(new char[input.size() * 2]); + size_t utf8_length = unicode::utf16_to_utf8(reinterpret_cast(initial_input.data()), initial_input.size()/2,utf8buffer.get(), encoding); + if((input.size() % 2) != 0) { utf8_length = 0; } + std::string_view utf8_input(utf8buffer.get(), utf8_length); // in case of error utf8_length == 0 + return set_pathname(base, utf8_input, encoding_type::UTF8); } // If this’s URL has an opaque path, then return. if (base.has_opaque_path) { diff --git a/src/parser.cpp b/src/parser.cpp index cfccd1a1c..ccf1600b0 100644 --- a/src/parser.cpp +++ b/src/parser.cpp @@ -17,7 +17,6 @@ namespace ada::parser { url parse_url(std::string_view user_input, std::optional base_url, - ada::encoding_type encoding, std::optional optional_url) { // Let state be state override if given, or scheme start state otherwise. ada::state state = ada::state::SCHEME_START; @@ -395,12 +394,15 @@ namespace ada::parser { // If encoding is not UTF-8 and one of the following is true: // - url is not special // - url’s scheme is "ws" or "wss" - if (encoding != ada::encoding_type::UTF8) { - if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) { - // then set encoding to UTF-8. 
-              encoding = ada::encoding_type::UTF8;
-            }
-          }
+          //////////////
+          // All of the processing *requires* UTF-8. So we would never get here:
+          /////////////
+          //if (encoding != ada::encoding_type::UTF8) {
+          //  if (!url.is_special() || url.get_scheme_type() == ada::scheme::type::WS || url.get_scheme_type() == ada::scheme::type::WSS) {
+          //    // then set encoding to UTF-8.
+          //    encoding = ada::encoding_type::UTF8;
+          //  }
+          //}
 
           // Let queryPercentEncodeSet be the special-query percent-encode set if url is special;
           // otherwise the query percent-encode set.
diff --git a/src/unicode.cpp b/src/unicode.cpp
index e1ff870f3..28618e2f5 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -433,4 +433,89 @@ constexpr static bool is_forbidden_domain_code_point_table[] = {
     return to_ascii(out, out.value(), false, first_percent);
   }
+
+  /**
+   * Transcodes `len` UTF-16 code units starting at `buf` into UTF-8.
+   *
+   * `type` is the byte order of the input (UTF_16LE or UTF_16BE); it is compared
+   * with the host byte order to decide whether each code unit is byte-swapped.
+   * The input is only read through memcpy, so `buf` may point into a plain byte
+   * buffer that is not 2-byte aligned.
+   *
+   * `utf8_output` must have room for 3 * len bytes (worst case of three UTF-8
+   * bytes per code unit); no bounds checking is done here.
+   *
+   * @return the number of UTF-8 bytes written, or 0 when a surrogate is not
+   *         part of a valid high/low pair. Output bytes may already have been
+   *         written when 0 is returned.
+   */
+  size_t utf16_to_utf8(const char16_t* buf, size_t len, char* utf8_output, encoding_type type) {
+    const uint32_t value_one = 1;
+    const bool is_little_endian = (reinterpret_cast<const uint8_t*>(&value_one)[0] == 1);
+    const bool need_flip = is_little_endian ? (type == encoding_type::UTF_16BE) : (type == encoding_type::UTF_16LE);
+    // The callers pass a reinterpreted byte buffer, so read it byte-wise with
+    // memcpy rather than dereferencing a possibly misaligned 16-bit pointer.
+    const char* const bytes = reinterpret_cast<const char*>(buf);
+    auto load_word = [bytes, need_flip](size_t index) -> uint16_t {
+      uint16_t word;
+      ::memcpy(&word, bytes + 2 * index, sizeof(word));
+      return need_flip ? uint16_t((word >> 8) | (word << 8)) : word;
+    };
+    // Bits that must be clear in four raw (not yet swapped) code units for all
+    // of them to be ASCII; when swapping is needed the two bytes trade places.
+    const uint64_t non_ascii_mask = need_flip ? 0x80FF80FF80FF80FFULL : 0xFF80FF80FF80FF80ULL;
+    char* const start = utf8_output;
+    size_t pos = 0;
+    while (pos < len) {
+      // Fast path: the next four code units (8 bytes) are all ASCII.
+      if (pos + 4 <= len) {
+        uint64_t v;
+        ::memcpy(&v, bytes + 2 * pos, sizeof(v));
+        if ((v & non_ascii_mask) == 0) {
+          const size_t final_pos = pos + 4;
+          while (pos < final_pos) {
+            *utf8_output++ = char(load_word(pos));
+            pos++;
+          }
+          continue;
+        }
+      }
+      const uint16_t word = load_word(pos);
+      if ((word & 0xFF80) == 0) {
+        // will generate one UTF-8 byte
+        *utf8_output++ = char(word);
+        pos++;
+      } else if ((word & 0xF800) == 0) {
+        // will generate two UTF-8 bytes
+        // we have 0b110XXXXX 0b10XXXXXX
+        *utf8_output++ = char((word >> 6) | 0b11000000);
+        *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        pos++;
+      } else if ((word & 0xF800) != 0xD800) {
+        // will generate three UTF-8 bytes
+        // we have 0b1110XXXX 0b10XXXXXX 0b10XXXXXX
+        *utf8_output++ = char((word >> 12) | 0b11100000);
+        *utf8_output++ = char(((word >> 6) & 0b111111) | 0b10000000);
+        *utf8_output++ = char((word & 0b111111) | 0b10000000);
+        pos++;
+      } else {
+        // must be a surrogate pair: a high surrogate followed by a low one
+        if (pos + 1 >= len) { return 0; }
+        const uint16_t diff = uint16_t(word - 0xD800);
+        if (diff > 0x3FF) { return 0; }
+        const uint16_t next_word = load_word(pos + 1);
+        const uint16_t diff2 = uint16_t(next_word - 0xDC00);
+        if (diff2 > 0x3FF) { return 0; }
+        const uint32_t value = (uint32_t(diff) << 10) + diff2 + 0x10000;
+        // will generate four UTF-8 bytes
+        // we have 0b11110XXX 0b10XXXXXX 0b10XXXXXX 0b10XXXXXX
+        *utf8_output++ = char((value >> 18) | 0b11110000);
+        *utf8_output++ = char(((value >> 12) & 0b111111) | 0b10000000);
+        *utf8_output++ = char(((value >> 6) & 0b111111) | 0b10000000);
+        *utf8_output++ = char((value & 0b111111) | 0b10000000);
+        pos += 2;
+      }
+    }
+    return static_cast<size_t>(utf8_output - start);
+  }
 
 } // namespace ada::unicode