|
1 | 1 | #include "ada/implementation-inl.h" |
2 | 2 |
|
| 3 | +#include <optional> |
3 | 4 | #include <string_view> |
4 | 5 |
|
| 6 | +#include "ada/checkers-inl.h" |
| 7 | +#include "ada/checkers.h" |
5 | 8 | #include "ada/common_defs.h" |
6 | 9 | #include "ada/parser.h" |
| 10 | +#include "ada/scheme.h" |
| 11 | +#include "ada/unicode-inl.h" |
7 | 12 | #include "ada/url.h" |
8 | 13 | #include "ada/url_aggregator.h" |
9 | 14 |
|
10 | 15 | namespace ada { |
11 | 16 |
|
| 17 | +// ============================================================ |
| 18 | +// Fast-path validator for can_parse. |
| 19 | +// |
| 20 | +// Validates absolute special (non-file) URLs without constructing any |
| 21 | +// url_aggregator object and without running the state machine. |
| 22 | +// Performs a single forward scan over the input bytes. |
| 23 | +// |
| 24 | +// Returns: |
| 25 | +// true -- URL is structurally valid |
| 26 | +// false -- URL is definitely invalid |
| 27 | +// nullopt -- edge case; fall through to the full parser |
| 28 | +// (credentials, IDNA, IPv4/6, tabs/newlines, relative URLs, ...) |
| 29 | +// ============================================================ |
| 30 | +static std::optional<bool> try_can_parse_absolute_fast( |
| 31 | + std::string_view input) noexcept { |
| 32 | + const uint8_t* b = reinterpret_cast<const uint8_t*>(input.data()); |
| 33 | + size_t len = input.size(); |
| 34 | + |
| 35 | + // -- Inline C0 whitespace trim (no allocation) -------------------------- |
| 36 | + while (len > 0 && b[0] <= 0x20) { |
| 37 | + b++; |
| 38 | + len--; |
| 39 | + } |
| 40 | + while (len > 0 && b[len - 1] <= 0x20) { |
| 41 | + len--; |
| 42 | + } |
| 43 | + if (len == 0) return false; |
| 44 | + |
| 45 | + // Tabs/newlines are rare and require tmp_buffer allocation; defer to full |
| 46 | + // parser. |
| 47 | + if (unicode::has_tabs_or_newline({reinterpret_cast<const char*>(b), len})) { |
| 48 | + return std::nullopt; |
| 49 | + } |
| 50 | + |
| 51 | + // -- Scheme detection ----------------------------------------------------- |
| 52 | + if (!checkers::is_alpha(static_cast<char>(b[0]))) return false; |
| 53 | + |
| 54 | + // Scan for ':' within the first 7 bytes. All special schemes are <= 5 chars |
| 55 | + // ("https"), so any URL whose first ':' is beyond byte 6 is either |
| 56 | + // non-special or relative -- both require the full parser. |
| 57 | + size_t colon_pos = 0; |
| 58 | + for (size_t i = 1;; ++i) { |
| 59 | + if (i >= 7 || i >= len) return std::nullopt; |
| 60 | + const char c = static_cast<char>(b[i]); |
| 61 | + if (c == ':') { |
| 62 | + colon_pos = i; |
| 63 | + break; |
| 64 | + } |
| 65 | + if (!unicode::is_alnum_plus(c)) return false; |
| 66 | + } |
| 67 | + |
| 68 | + // Lowercase scheme bytes inline and classify via the existing perfect hash. |
| 69 | + char scheme_buf[6]; |
| 70 | + scheme_buf[0] = static_cast<char>(b[0] | 0x20); |
| 71 | + for (size_t i = 1; i < colon_pos; ++i) |
| 72 | + scheme_buf[i] = static_cast<char>(b[i] | 0x20); |
| 73 | + |
| 74 | + const ada::scheme::type scheme_type = |
| 75 | + ada::scheme::get_scheme_type({scheme_buf, colon_pos}); |
| 76 | + |
| 77 | + // Only handle special, non-file schemes. |
| 78 | + if (scheme_type == ada::scheme::NOT_SPECIAL) return std::nullopt; |
| 79 | + if (scheme_type == ada::scheme::FILE) return std::nullopt; |
| 80 | + |
| 81 | + // Per WHATWG, special URLs don't require "//": "http:example.com" is valid |
| 82 | + // (SPECIAL_AUTHORITY_IGNORE_SLASHES just skips leading slashes and proceeds |
| 83 | + // to AUTHORITY). Defer to the inline fallback for any input without "://". |
| 84 | + size_t pos = colon_pos + 1; |
| 85 | + if (pos + 2 > len || b[pos] != '/' || b[pos + 1] != '/') { |
| 86 | + return std::nullopt; |
| 87 | + } |
| 88 | + pos += 2; |
| 89 | + |
| 90 | + // -- Single-pass authority scan -------------------------------------------- |
| 91 | + const size_t auth_start = pos; |
| 92 | + size_t auth_end = pos; |
| 93 | + size_t port_colon = SIZE_MAX; |
| 94 | + bool has_x = false; |
| 95 | + |
| 96 | + for (; auth_end < len; ++auth_end) { |
| 97 | + const uint8_t c = b[auth_end]; |
| 98 | + if (c == '/' || c == '?' || c == '#' || c == '\\') break; |
| 99 | + if (c == '@') return std::nullopt; // credentials -> full parse |
| 100 | + if (c >= 0x80) return std::nullopt; // non-ASCII -> IDNA -> full parse |
| 101 | + if (c == ':') { |
| 102 | + if (port_colon == SIZE_MAX) port_colon = auth_end; |
| 103 | + continue; |
| 104 | + } |
| 105 | + if (c == 'x' || c == 'X') has_x = true; |
| 106 | + } |
| 107 | + |
| 108 | + // IPv6 literal |
| 109 | + if (auth_start < auth_end && b[auth_start] == '[') return std::nullopt; |
| 110 | + |
| 111 | + const size_t host_end = (port_colon != SIZE_MAX) ? port_colon : auth_end; |
| 112 | + |
| 113 | + // Empty host is invalid for special URLs. |
| 114 | + if (auth_start == host_end) return false; |
| 115 | + |
| 116 | + const char* host_ptr = reinterpret_cast<const char*>(b + auth_start); |
| 117 | + const size_t host_len = host_end - auth_start; |
| 118 | + |
| 119 | + // -- Host validation ------------------------------------------------------- |
| 120 | + // Bit 0x01: forbidden domain code point -> invalid. |
| 121 | + // Bit 0x02: uppercase letter -> still valid (parser lowercases), not checked |
| 122 | + // here. |
| 123 | + const uint8_t domain_check = |
| 124 | + unicode::contains_forbidden_domain_code_point_or_upper(host_ptr, |
| 125 | + host_len); |
| 126 | + if (domain_check & 0x01) return false; |
| 127 | + |
| 128 | + // xn-- labels require full IDNA validation. |
| 129 | + if (has_x) { |
| 130 | + for (size_t i = 0; i + 4 <= host_len; ++i) { |
| 131 | + if ((host_ptr[i] | 0x20) == 'x' && (host_ptr[i + 1] | 0x20) == 'n' && |
| 132 | + host_ptr[i + 2] == '-' && host_ptr[i + 3] == '-') { |
| 133 | + return std::nullopt; |
| 134 | + } |
| 135 | + } |
| 136 | + } |
| 137 | + |
| 138 | + // IPv4 detection: all-decimal-and-dot host -> try the fast IPv4 parser. |
| 139 | + { |
| 140 | + bool all_dec_dots = true; |
| 141 | + for (size_t i = 0; i < host_len && all_dec_dots; ++i) { |
| 142 | + const uint8_t c = static_cast<uint8_t>(host_ptr[i]); |
| 143 | + if (c != '.' && (c < '0' || c > '9')) all_dec_dots = false; |
| 144 | + } |
| 145 | + if (all_dec_dots) { |
| 146 | + return checkers::try_parse_ipv4_fast({host_ptr, host_len}) != |
| 147 | + checkers::ipv4_fast_fail; |
| 148 | + } |
| 149 | + |
| 150 | + // Last-significant-character heuristic for non-decimal IPv4 (hex/octal): |
| 151 | + // if the last non-dot char is a digit, 'a'-'f', or 'x' the host might be |
| 152 | + // an IPv4 address that the fast path can't validate -- fall through. |
| 153 | + uint8_t last = 0; |
| 154 | + for (size_t i = host_len; i > 0; --i) { |
| 155 | + if (host_ptr[i - 1] != '.') { |
| 156 | + last = static_cast<uint8_t>(host_ptr[i - 1]); |
| 157 | + break; |
| 158 | + } |
| 159 | + } |
| 160 | + const uint8_t lc = last | 0x20; |
| 161 | + if ((last >= '0' && last <= '9') || (lc >= 'a' && lc <= 'f') || lc == 'x') { |
| 162 | + return std::nullopt; |
| 163 | + } |
| 164 | + } |
| 165 | + |
| 166 | + // -- Port validation ------------------------------------------------------- |
| 167 | + if (port_colon != SIZE_MAX) { |
| 168 | + const uint8_t* pp = b + port_colon + 1; |
| 169 | + const size_t pl = auth_end - port_colon - 1; |
| 170 | + if (pl > 0) { |
| 171 | + if (pl > 5) return false; // > 99999 cannot be a valid port |
| 172 | + uint32_t pv = 0; |
| 173 | + for (size_t i = 0; i < pl; ++i) { |
| 174 | + if (pp[i] < '0' || pp[i] > '9') return false; |
| 175 | + pv = pv * 10 + (pp[i] - '0'); |
| 176 | + } |
| 177 | + if (pv > 65535) return false; |
| 178 | + } |
| 179 | + } |
| 180 | + |
| 181 | + // Path, query, and fragment are structurally always valid for can_parse -- |
| 182 | + // the parser would encode whatever is there. |
| 183 | + return true; |
| 184 | +} |
| 185 | + |
12 | 186 | template <class result_type> |
13 | 187 | ada_warn_unused tl::expected<result_type, errors> parse( |
14 | 188 | std::string_view input, const result_type* base_url) { |
15 | | - result_type u = |
16 | | - ada::parser::parse_url_impl<result_type, true>(input, base_url); |
| 189 | + result_type u = ada::parser::parse_url_impl<result_type>(input, base_url); |
17 | 190 | if (!u.is_valid) { |
18 | 191 | return tl::unexpected(errors::type_error); |
19 | 192 | } |
@@ -49,22 +222,31 @@ std::string href_from_file(std::string_view input) { |
49 | 222 | } |
50 | 223 |
|
51 | 224 | bool can_parse(std::string_view input, const std::string_view* base_input) { |
52 | | - ada::url_aggregator base_aggregator; |
53 | | - ada::url_aggregator* base_pointer = nullptr; |
| 225 | + // Fast path: handles the overwhelming majority of inputs -- absolute special |
| 226 | + // URLs with an ASCII domain, no credentials, and no base -- with a single |
| 227 | + // forward scan and zero allocations. |
| 228 | + if (base_input == nullptr) { |
| 229 | + if (const auto r = try_can_parse_absolute_fast(input)) { |
| 230 | + return *r; |
| 231 | + } |
| 232 | + } |
54 | 233 |
|
| 234 | + // Fallback: run the parser in validation-only mode (store_values=false), |
| 235 | + // which skips all the expensive work that isn't needed to determine validity: |
| 236 | + // buffer reservation, credential encoding, path normalisation, query and |
| 237 | + // fragment percent-encoding. The host is still fully validated (IDNA, IPv4, |
| 238 | + // IPv6) because parse_host() must run for correctness. |
| 239 | + ada::url_aggregator base_agg; |
| 240 | + ada::url_aggregator* base_ptr = nullptr; |
55 | 241 | if (base_input != nullptr) { |
56 | | - base_aggregator = ada::parser::parse_url_impl<ada::url_aggregator, false>( |
| 242 | + base_agg = ada::parser::parse_url_impl<ada::url_aggregator, false>( |
57 | 243 | *base_input, nullptr); |
58 | | - if (!base_aggregator.is_valid) { |
59 | | - return false; |
60 | | - } |
61 | | - base_pointer = &base_aggregator; |
| 244 | + if (!base_agg.is_valid) return false; |
| 245 | + base_ptr = &base_agg; |
62 | 246 | } |
63 | | - |
64 | | - ada::url_aggregator result = |
65 | | - ada::parser::parse_url_impl<ada::url_aggregator, false>(input, |
66 | | - base_pointer); |
67 | | - return result.is_valid; |
| 247 | + return ada::parser::parse_url_impl<ada::url_aggregator, false>(input, |
| 248 | + base_ptr) |
| 249 | + .is_valid; |
68 | 250 | } |
69 | 251 |
|
70 | 252 | ada_warn_unused std::string_view to_string(ada::encoding_type type) { |
|
0 commit comments