-
-
Notifications
You must be signed in to change notification settings - Fork 117
optimize canonicalize methods for url_pattern #1033
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -2,9 +2,16 @@ | |
| #include "ada/url_pattern_helpers-inl.h" | ||
|
|
||
| #include <algorithm> | ||
| #include <charconv> | ||
| #include <optional> | ||
| #include <ranges> | ||
| #include <string> | ||
|
|
||
| #include "ada/character_sets.h" | ||
| #include "ada/helpers.h" | ||
| #include "ada/scheme.h" | ||
| #include "ada/unicode.h" | ||
|
|
||
| namespace ada::url_pattern_helpers { | ||
|
|
||
| std::tuple<std::string, std::vector<std::string>> | ||
|
|
@@ -224,15 +231,16 @@ tl::expected<std::string, errors> canonicalize_username( | |
| if (input.empty()) [[unlikely]] { | ||
| return ""; | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr); | ||
| ADA_ASSERT_TRUE(url.has_value()); | ||
| // Set the username given dummyURL and value. | ||
| if (!url->set_username(input)) { | ||
| return tl::unexpected(errors::type_error); | ||
| // Percent-encode the input using the userinfo percent-encode set. | ||
| size_t idx = ada::unicode::percent_encode_index( | ||
| input, character_sets::USERINFO_PERCENT_ENCODE); | ||
| if (idx == input.size()) { | ||
| // No encoding needed, return input as-is | ||
| return std::string(input); | ||
| } | ||
| // Return dummyURL's username. | ||
| return std::string(url->get_username()); | ||
| // Percent-encode from the first character that needs encoding | ||
| return ada::unicode::percent_encode( | ||
| input, character_sets::USERINFO_PERCENT_ENCODE, idx); | ||
| } | ||
|
|
||
| tl::expected<std::string, errors> canonicalize_password( | ||
|
|
@@ -241,16 +249,16 @@ tl::expected<std::string, errors> canonicalize_password( | |
| if (input.empty()) [[unlikely]] { | ||
| return ""; | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| // Set the password given dummyURL and value. | ||
| auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr); | ||
|
|
||
| ADA_ASSERT_TRUE(url.has_value()); | ||
| if (!url->set_password(input)) { | ||
| return tl::unexpected(errors::type_error); | ||
| // Percent-encode the input using the userinfo percent-encode set. | ||
| size_t idx = ada::unicode::percent_encode_index( | ||
| input, character_sets::USERINFO_PERCENT_ENCODE); | ||
| if (idx == input.size()) { | ||
| // No encoding needed, return input as-is | ||
| return std::string(input); | ||
| } | ||
| // Return dummyURL's password. | ||
| return std::string(url->get_password()); | ||
| // Percent-encode from the first character that needs encoding | ||
| return ada::unicode::percent_encode( | ||
| input, character_sets::USERINFO_PERCENT_ENCODE, idx); | ||
| } | ||
|
|
||
| tl::expected<std::string, errors> canonicalize_hostname( | ||
|
|
@@ -300,17 +308,41 @@ tl::expected<std::string, errors> canonicalize_port( | |
| if (port_value.empty()) [[unlikely]] { | ||
| return ""; | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| // If protocolValue was given, then set dummyURL's scheme to protocolValue. | ||
| // Let parseResult be the result of running basic URL parser given portValue | ||
| // with dummyURL as url and port state as state override. | ||
| auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr); | ||
| ADA_ASSERT_TRUE(url); | ||
| if (url->set_port(port_value)) { | ||
| // Return dummyURL's port, serialized, or empty string if it is null. | ||
| return std::string(url->get_port()); | ||
|
|
||
| // Remove ASCII tab or newline characters | ||
| std::string trimmed(port_value); | ||
| helpers::remove_ascii_tab_or_newline(trimmed); | ||
|
|
||
| if (trimmed.empty()) { | ||
| return ""; | ||
| } | ||
| // If parseResult is failure, then throw a TypeError. | ||
|
|
||
| // Input should start with a digit character | ||
| if (!unicode::is_ascii_digit(trimmed.front())) { | ||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
| // Find the first non-digit character | ||
| auto first_non_digit = | ||
| std::ranges::find_if_not(trimmed, unicode::is_ascii_digit); | ||
| std::string_view digits_to_parse = | ||
| std::string_view(trimmed.data(), first_non_digit - trimmed.begin()); | ||
|
|
||
| // Parse the port number | ||
| uint16_t parsed_port{}; | ||
| auto result = std::from_chars(digits_to_parse.data(), | ||
| digits_to_parse.data() + digits_to_parse.size(), | ||
| parsed_port); | ||
|
|
||
| if (result.ec == std::errc::result_out_of_range) { | ||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
| if (result.ec == std::errc()) { | ||
| // Successfully parsed, return as string | ||
| return std::to_string(parsed_port); | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Presumably, this could be avoided by using the range. The whole logic here is to ensure that we have a string representing a 16-bit unsigned integer. This could be better. Still: this is fine.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah, nice touch. Agreed. Let's fix it! I'll open a PR. |
||
| } | ||
|
|
||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
|
|
@@ -321,34 +353,55 @@ tl::expected<std::string, errors> canonicalize_port_with_protocol( | |
| return ""; | ||
| } | ||
|
|
||
| // TODO: Remove this | ||
| // We have an empty protocol because get_protocol() returns an empty string | ||
| // We should handle this in the caller rather than here. | ||
| // Handle empty or trailing colon in protocol | ||
| if (protocol.empty()) { | ||
| protocol = "fake"; | ||
| } else if (protocol.ends_with(":")) { | ||
| protocol.remove_suffix(1); | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| // If protocolValue was given, then set dummyURL's scheme to protocolValue. | ||
| // Let parseResult be the result of running basic URL parser given portValue | ||
| // with dummyURL as url and port state as state override. | ||
| auto url = ada::parse<url_aggregator>(std::string(protocol) + "://dummy.test", | ||
| nullptr); | ||
| // TODO: Remove has_port() check. | ||
| // This is actually a bug with url parser where set_port() returns true for | ||
| // "invalid80" port value. | ||
| if (url && url->set_port(port_value) && url->has_port()) { | ||
| // Return dummyURL's port, serialized, or empty string if it is null. | ||
| return std::string(url->get_port()); | ||
| } | ||
| // TODO: Remove this once the previous has_port() check is removed. | ||
| if (url) { | ||
| if (scheme::is_special(protocol) && url->get_port().empty()) { | ||
|
|
||
| // Remove ASCII tab or newline characters | ||
| std::string trimmed(port_value); | ||
| helpers::remove_ascii_tab_or_newline(trimmed); | ||
|
|
||
| if (trimmed.empty()) { | ||
| return ""; | ||
| } | ||
|
|
||
| // Input should start with a digit character | ||
| if (!unicode::is_ascii_digit(trimmed.front())) { | ||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
| // Find the first non-digit character | ||
| auto first_non_digit = | ||
| std::ranges::find_if_not(trimmed, unicode::is_ascii_digit); | ||
| std::string_view digits_to_parse = | ||
| std::string_view(trimmed.data(), first_non_digit - trimmed.begin()); | ||
|
|
||
| // Parse the port number | ||
| uint16_t parsed_port{}; | ||
| auto result = std::from_chars(digits_to_parse.data(), | ||
| digits_to_parse.data() + digits_to_parse.size(), | ||
| parsed_port); | ||
|
|
||
| if (result.ec == std::errc::result_out_of_range) { | ||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
| if (result.ec == std::errc()) { | ||
| // Check if this is the default port for the scheme | ||
| uint16_t default_port = scheme::get_special_port(protocol); | ||
|
|
||
| // If it's the default port for a special scheme, return empty string | ||
| if (default_port != 0 && default_port == parsed_port) { | ||
| return ""; | ||
| } | ||
|
|
||
| // Successfully parsed, return as string | ||
| return std::to_string(parsed_port); | ||
| } | ||
| // If parseResult is failure, then throw a TypeError. | ||
|
|
||
| return tl::unexpected(errors::type_error); | ||
| } | ||
|
|
||
|
|
@@ -401,44 +454,55 @@ tl::expected<std::string, errors> canonicalize_search(std::string_view input) { | |
| if (input.empty()) [[unlikely]] { | ||
| return ""; | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| // Set dummyURL's query to the empty string. | ||
| // Let parseResult be the result of running basic URL parser given value with | ||
| // dummyURL as url and query state as state override. | ||
| auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr); | ||
| ADA_ASSERT_TRUE(url.has_value()); | ||
| url->set_search(input); | ||
| if (url->has_search()) { | ||
| const auto search = url->get_search(); | ||
| if (!search.empty()) { | ||
| return std::string(search.substr(1)); | ||
| } | ||
| // Remove leading '?' if present | ||
| std::string new_value; | ||
| new_value = input[0] == '?' ? input.substr(1) : input; | ||
| // Remove ASCII tab or newline characters | ||
| helpers::remove_ascii_tab_or_newline(new_value); | ||
|
|
||
| if (new_value.empty()) { | ||
| return ""; | ||
| } | ||
| return tl::unexpected(errors::type_error); | ||
|
|
||
| // Percent-encode using QUERY_PERCENT_ENCODE (for non-special URLs) | ||
| // Note: "fake://dummy.test" is not a special URL, so we use | ||
| // QUERY_PERCENT_ENCODE | ||
| size_t idx = ada::unicode::percent_encode_index( | ||
| new_value, character_sets::QUERY_PERCENT_ENCODE); | ||
| if (idx == new_value.size()) { | ||
| // No encoding needed | ||
| return new_value; | ||
| } | ||
| // Percent-encode from the first character that needs encoding | ||
| return ada::unicode::percent_encode( | ||
| new_value, character_sets::QUERY_PERCENT_ENCODE, idx); | ||
| } | ||
|
|
||
| tl::expected<std::string, errors> canonicalize_hash(std::string_view input) { | ||
| // If value is the empty string, return value. | ||
| if (input.empty()) [[unlikely]] { | ||
| return ""; | ||
| } | ||
| // Let dummyURL be a new URL record. | ||
| // Set dummyURL's fragment to the empty string. | ||
| // Let parseResult be the result of running basic URL parser given value with | ||
| // dummyURL as url and fragment state as state override. | ||
| auto url = ada::parse<url_aggregator>("fake://dummy.test", nullptr); | ||
| ADA_ASSERT_TRUE(url.has_value()); | ||
| url->set_hash(input); | ||
| // Return dummyURL's fragment. | ||
| if (url->has_hash()) { | ||
| const auto hash = url->get_hash(); | ||
| if (!hash.empty()) { | ||
| return std::string(hash.substr(1)); | ||
| } | ||
| // Remove leading '#' if present | ||
| std::string new_value; | ||
| new_value = input[0] == '#' ? input.substr(1) : input; | ||
| // Remove ASCII tab or newline characters | ||
| helpers::remove_ascii_tab_or_newline(new_value); | ||
|
|
||
| if (new_value.empty()) { | ||
| return ""; | ||
| } | ||
| return tl::unexpected(errors::type_error); | ||
|
|
||
| // Percent-encode using FRAGMENT_PERCENT_ENCODE | ||
| size_t idx = ada::unicode::percent_encode_index( | ||
| new_value, character_sets::FRAGMENT_PERCENT_ENCODE); | ||
| if (idx == new_value.size()) { | ||
| // No encoding needed | ||
| return new_value; | ||
| } | ||
| // Percent-encode from the first character that needs encoding | ||
| return ada::unicode::percent_encode( | ||
| new_value, character_sets::FRAGMENT_PERCENT_ENCODE, idx); | ||
| } | ||
|
|
||
| tl::expected<std::vector<token>, errors> tokenize(std::string_view input, | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good.