Skip to content

Commit 1d8a88d

Browse files
committed
optimize url::can_parse method
1 parent a3cbb2c commit 1d8a88d

File tree

6 files changed

+265
-22
lines changed

6 files changed

+265
-22
lines changed

.github/workflows/abi-check.yml

Lines changed: 15 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -37,8 +37,21 @@ jobs:
3737
- name: Find latest release tag
3838
id: baseline
3939
run: |
40-
# Find the most recent vX.Y.Z tag reachable from the current commit's history
41-
LATEST_TAG=$(git tag --list 'v*.*.*' --sort=-version:refname | head -1)
40+
# Find the most recent vX.Y.Z tag that does NOT point to the current HEAD.
41+
# Excluding HEAD ensures we compare against a previous release even when a
42+
# new release tag was just pushed to main alongside this workflow run.
43+
CURRENT_SHA=$(git rev-parse HEAD)
44+
LATEST_TAG=$(git tag --list 'v*.*.*' --sort=-version:refname | while IFS= read -r tag; do
45+
TAG_SHA=$(git rev-parse "${tag}^{}" 2>/dev/null)
46+
if [ "$TAG_SHA" != "$CURRENT_SHA" ]; then
47+
echo "$tag"
48+
break
49+
fi
50+
done)
51+
if [ -z "$LATEST_TAG" ]; then
52+
echo "No previous release tag found — cannot establish a baseline."
53+
exit 1
54+
fi
4255
echo "Latest release tag: $LATEST_TAG"
4356
echo "tag=$LATEST_TAG" >> "$GITHUB_OUTPUT"
4457

include/ada/parser.h

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -65,10 +65,12 @@ template <typename result_type = url_aggregator, bool store_values = true>
6565
result_type parse_url_impl(std::string_view user_input,
6666
const result_type* base_url = nullptr);
6767

68-
extern template url_aggregator parse_url_impl<url_aggregator>(
68+
extern template url_aggregator parse_url_impl<url_aggregator, true>(
6969
std::string_view user_input, const url_aggregator* base_url);
70-
extern template url parse_url_impl<url>(std::string_view user_input,
71-
const url* base_url);
70+
extern template url_aggregator parse_url_impl<url_aggregator, false>(
71+
std::string_view user_input, const url_aggregator* base_url);
72+
extern template url parse_url_impl<url, true>(std::string_view user_input,
73+
const url* base_url);
7274

7375
#if ADA_INCLUDE_URL_PATTERN
7476
template <url_pattern_regex::regex_concept regex_provider>

include/ada/url.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -367,6 +367,9 @@ struct url : url_base {
367367
const ada::url *);
368368
friend ada::url_aggregator ada::parser::parse_url_impl<
369369
ada::url_aggregator, true>(std::string_view, const ada::url_aggregator *);
370+
friend ada::url_aggregator
371+
ada::parser::parse_url_impl<ada::url_aggregator, false>(
372+
std::string_view, const ada::url_aggregator *);
370373

371374
inline void update_unencoded_base_hash(std::string_view input);
372375
inline void update_base_hostname(std::string_view input);

src/checkers.cpp

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -129,4 +129,42 @@ ada_really_inline constexpr bool verify_dns_length(
129129

130130
return true;
131131
}
132+
namespace {
133+
134+
// Returns true if port_bytes (already tab/newline-free) is a valid ASCII
// decimal port number in the range [0, 65535]. An empty string is accepted
// and means "no port was specified".
//
// Leading zeros are permitted ("000080" is port 80): the WHATWG port state
// converts the collected digits to an integer before range-checking it, so
// the textual length alone never makes a port invalid.
//
// port_bytes: pointer to the first byte after the ':' port delimiter.
// length:     number of bytes that make up the port text.
ada_really_inline bool validate_port(const uint8_t* port_bytes,
                                     size_t length) noexcept {
  // Drop leading zeros so the length cap below applies to significant digits
  // only. An empty or all-zero input ends up with length == 0 and is accepted
  // by the final comparison (value stays 0).
  while (length > 0 && *port_bytes == '0') {
    ++port_bytes;
    --length;
  }
  // More than five significant digits means a value above 99999; rejecting
  // here also keeps `value` well clear of uint32_t overflow.
  if (length > 5) return false;
  uint32_t value = 0;
  for (size_t i = 0; i < length; ++i) {
    const uint8_t byte = port_bytes[i];
    if (byte < '0' || byte > '9') return false;
    value = value * 10 + static_cast<uint32_t>(byte - '0');
  }
  return value <= 65535;
}
148+
149+
// Character-set screen for the text found between the '[' and ']' of an IPv6
// literal. Every byte must be a hex digit (0-9, a-f, A-F), ':', '.', or '%'
// (zone-ID prefix); an empty span is rejected. Nothing structural is checked
// here (group count, '::' placement, etc.) -- that is left to the full parser.
ada_really_inline bool validate_ipv6_inner(const uint8_t* address_bytes,
                                           size_t length) noexcept {
  if (length == 0) return false;
  const uint8_t* cursor = address_bytes;
  const uint8_t* const stop = address_bytes + length;
  while (cursor != stop) {
    const uint8_t ch = *cursor++;
    const bool is_decimal = ch >= '0' && ch <= '9';
    const bool is_lower_hex = ch >= 'a' && ch <= 'f';
    const bool is_upper_hex = ch >= 'A' && ch <= 'F';
    const bool is_separator = ch == ':' || ch == '.' || ch == '%';
    // The first byte outside the allowed set settles the answer.
    if (!(is_decimal || is_lower_hex || is_upper_hex || is_separator)) {
      return false;
    }
  }
  return true;
}
167+
168+
} // anonymous namespace
169+
132170
} // namespace ada::checkers

src/implementation.cpp

Lines changed: 196 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,192 @@
11
#include "ada/implementation-inl.h"
22

3+
#include <optional>
34
#include <string_view>
45

6+
#include "ada/checkers-inl.h"
7+
#include "ada/checkers.h"
58
#include "ada/common_defs.h"
69
#include "ada/parser.h"
10+
#include "ada/scheme.h"
11+
#include "ada/unicode-inl.h"
712
#include "ada/url.h"
813
#include "ada/url_aggregator.h"
914

1015
namespace ada {
1116

17+
// ============================================================
18+
// Fast-path validator for can_parse.
19+
//
20+
// Validates absolute special (non-file) URLs without constructing any
21+
// url_aggregator object and without running the state machine.
22+
// Performs a single forward scan over the input bytes.
23+
//
24+
// Returns:
25+
// true -- URL is structurally valid
26+
// false -- URL is definitely invalid
27+
// nullopt -- edge case; fall through to the full parser
28+
// (credentials, IDNA, IPv4/6, tabs/newlines, relative URLs, ...)
29+
// ============================================================
30+
static std::optional<bool> try_can_parse_absolute_fast(
31+
std::string_view input) noexcept {
32+
const uint8_t* b = reinterpret_cast<const uint8_t*>(input.data());
33+
size_t len = input.size();
34+
35+
// -- Inline C0 whitespace trim (no allocation) --------------------------
36+
while (len > 0 && b[0] <= 0x20) {
37+
b++;
38+
len--;
39+
}
40+
while (len > 0 && b[len - 1] <= 0x20) {
41+
len--;
42+
}
43+
if (len == 0) return false;
44+
45+
// Tabs/newlines are rare and require tmp_buffer allocation; defer to full
46+
// parser.
47+
if (unicode::has_tabs_or_newline({reinterpret_cast<const char*>(b), len})) {
48+
return std::nullopt;
49+
}
50+
51+
// -- Scheme detection -----------------------------------------------------
52+
if (!checkers::is_alpha(static_cast<char>(b[0]))) return false;
53+
54+
// Scan for ':' within the first 7 bytes. All special schemes are <= 5 chars
55+
// ("https"), so any URL whose first ':' is beyond byte 6 is either
56+
// non-special or relative -- both require the full parser.
57+
size_t colon_pos = 0;
58+
for (size_t i = 1;; ++i) {
59+
if (i >= 7 || i >= len) return std::nullopt;
60+
const char c = static_cast<char>(b[i]);
61+
if (c == ':') {
62+
colon_pos = i;
63+
break;
64+
}
65+
if (!unicode::is_alnum_plus(c)) return false;
66+
}
67+
68+
// Lowercase scheme bytes inline and classify via the existing perfect hash.
69+
char scheme_buf[6];
70+
scheme_buf[0] = static_cast<char>(b[0] | 0x20);
71+
for (size_t i = 1; i < colon_pos; ++i)
72+
scheme_buf[i] = static_cast<char>(b[i] | 0x20);
73+
74+
const ada::scheme::type scheme_type =
75+
ada::scheme::get_scheme_type({scheme_buf, colon_pos});
76+
77+
// Only handle special, non-file schemes.
78+
if (scheme_type == ada::scheme::NOT_SPECIAL) return std::nullopt;
79+
if (scheme_type == ada::scheme::FILE) return std::nullopt;
80+
81+
// Per WHATWG, special URLs don't require "//": "http:example.com" is valid
82+
// (SPECIAL_AUTHORITY_IGNORE_SLASHES just skips leading slashes and proceeds
83+
// to AUTHORITY). Defer to the inline fallback for any input without "://".
84+
size_t pos = colon_pos + 1;
85+
if (pos + 2 > len || b[pos] != '/' || b[pos + 1] != '/') {
86+
return std::nullopt;
87+
}
88+
pos += 2;
89+
90+
// -- Single-pass authority scan --------------------------------------------
91+
const size_t auth_start = pos;
92+
size_t auth_end = pos;
93+
size_t port_colon = SIZE_MAX;
94+
bool has_x = false;
95+
96+
for (; auth_end < len; ++auth_end) {
97+
const uint8_t c = b[auth_end];
98+
if (c == '/' || c == '?' || c == '#' || c == '\\') break;
99+
if (c == '@') return std::nullopt; // credentials -> full parse
100+
if (c >= 0x80) return std::nullopt; // non-ASCII -> IDNA -> full parse
101+
if (c == ':') {
102+
if (port_colon == SIZE_MAX) port_colon = auth_end;
103+
continue;
104+
}
105+
if (c == 'x' || c == 'X') has_x = true;
106+
}
107+
108+
// IPv6 literal
109+
if (auth_start < auth_end && b[auth_start] == '[') return std::nullopt;
110+
111+
const size_t host_end = (port_colon != SIZE_MAX) ? port_colon : auth_end;
112+
113+
// Empty host is invalid for special URLs.
114+
if (auth_start == host_end) return false;
115+
116+
const char* host_ptr = reinterpret_cast<const char*>(b + auth_start);
117+
const size_t host_len = host_end - auth_start;
118+
119+
// -- Host validation -------------------------------------------------------
120+
// Bit 0x01: forbidden domain code point -> invalid.
121+
// Bit 0x02: uppercase letter -> still valid (parser lowercases), not checked
122+
// here.
123+
const uint8_t domain_check =
124+
unicode::contains_forbidden_domain_code_point_or_upper(host_ptr,
125+
host_len);
126+
if (domain_check & 0x01) return false;
127+
128+
// xn-- labels require full IDNA validation.
129+
if (has_x) {
130+
for (size_t i = 0; i + 4 <= host_len; ++i) {
131+
if ((host_ptr[i] | 0x20) == 'x' && (host_ptr[i + 1] | 0x20) == 'n' &&
132+
host_ptr[i + 2] == '-' && host_ptr[i + 3] == '-') {
133+
return std::nullopt;
134+
}
135+
}
136+
}
137+
138+
// IPv4 detection: all-decimal-and-dot host -> try the fast IPv4 parser.
139+
{
140+
bool all_dec_dots = true;
141+
for (size_t i = 0; i < host_len && all_dec_dots; ++i) {
142+
const uint8_t c = static_cast<uint8_t>(host_ptr[i]);
143+
if (c != '.' && (c < '0' || c > '9')) all_dec_dots = false;
144+
}
145+
if (all_dec_dots) {
146+
return checkers::try_parse_ipv4_fast({host_ptr, host_len}) !=
147+
checkers::ipv4_fast_fail;
148+
}
149+
150+
// Last-significant-character heuristic for non-decimal IPv4 (hex/octal):
151+
// if the last non-dot char is a digit, 'a'-'f', or 'x' the host might be
152+
// an IPv4 address that the fast path can't validate -- fall through.
153+
uint8_t last = 0;
154+
for (size_t i = host_len; i > 0; --i) {
155+
if (host_ptr[i - 1] != '.') {
156+
last = static_cast<uint8_t>(host_ptr[i - 1]);
157+
break;
158+
}
159+
}
160+
const uint8_t lc = last | 0x20;
161+
if ((last >= '0' && last <= '9') || (lc >= 'a' && lc <= 'f') || lc == 'x') {
162+
return std::nullopt;
163+
}
164+
}
165+
166+
// -- Port validation -------------------------------------------------------
167+
if (port_colon != SIZE_MAX) {
168+
const uint8_t* pp = b + port_colon + 1;
169+
const size_t pl = auth_end - port_colon - 1;
170+
if (pl > 0) {
171+
if (pl > 5) return false; // > 99999 cannot be a valid port
172+
uint32_t pv = 0;
173+
for (size_t i = 0; i < pl; ++i) {
174+
if (pp[i] < '0' || pp[i] > '9') return false;
175+
pv = pv * 10 + (pp[i] - '0');
176+
}
177+
if (pv > 65535) return false;
178+
}
179+
}
180+
181+
// Path, query, and fragment are structurally always valid for can_parse --
182+
// the parser would encode whatever is there.
183+
return true;
184+
}
185+
12186
template <class result_type>
13187
ada_warn_unused tl::expected<result_type, errors> parse(
14188
std::string_view input, const result_type* base_url) {
15-
result_type u =
16-
ada::parser::parse_url_impl<result_type, true>(input, base_url);
189+
result_type u = ada::parser::parse_url_impl<result_type>(input, base_url);
17190
if (!u.is_valid) {
18191
return tl::unexpected(errors::type_error);
19192
}
@@ -49,22 +222,31 @@ std::string href_from_file(std::string_view input) {
49222
}
50223

51224
bool can_parse(std::string_view input, const std::string_view* base_input) {
52-
ada::url_aggregator base_aggregator;
53-
ada::url_aggregator* base_pointer = nullptr;
225+
// Fast path: handles the overwhelming majority of inputs -- absolute special
226+
// URLs with an ASCII domain, no credentials, and no base -- with a single
227+
// forward scan and zero allocations.
228+
if (base_input == nullptr) {
229+
if (const auto r = try_can_parse_absolute_fast(input)) {
230+
return *r;
231+
}
232+
}
54233

234+
// Fallback: run the parser in validation-only mode (store_values=false),
235+
// which skips all the expensive work that isn't needed to determine validity:
236+
// buffer reservation, credential encoding, path normalisation, query and
237+
// fragment percent-encoding. The host is still fully validated (IDNA, IPv4,
238+
// IPv6) because parse_host() must run for correctness.
239+
ada::url_aggregator base_agg;
240+
ada::url_aggregator* base_ptr = nullptr;
55241
if (base_input != nullptr) {
56-
base_aggregator = ada::parser::parse_url_impl<ada::url_aggregator, false>(
242+
base_agg = ada::parser::parse_url_impl<ada::url_aggregator, false>(
57243
*base_input, nullptr);
58-
if (!base_aggregator.is_valid) {
59-
return false;
60-
}
61-
base_pointer = &base_aggregator;
244+
if (!base_agg.is_valid) return false;
245+
base_ptr = &base_agg;
62246
}
63-
64-
ada::url_aggregator result =
65-
ada::parser::parse_url_impl<ada::url_aggregator, false>(input,
66-
base_pointer);
67-
return result.is_valid;
247+
return ada::parser::parse_url_impl<ada::url_aggregator, false>(input,
248+
base_ptr)
249+
.is_valid;
68250
}
69251

70252
ada_warn_unused std::string_view to_string(ada::encoding_type type) {

src/parser.cpp

Lines changed: 8 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,9 @@
77
#include "ada/common_defs.h"
88
#include "ada/log.h"
99
#include "ada/unicode.h"
10+
#include "ada/url.h"
11+
#include "ada/url_aggregator.h"
12+
#include "ada/url_aggregator-inl.h"
1013

1114
namespace ada::parser {
1215

@@ -926,9 +929,11 @@ result_type parse_url_impl(std::string_view user_input,
926929
return url;
927930
}
928931

929-
template url parse_url_impl(std::string_view user_input,
930-
const url* base_url = nullptr);
931-
template url_aggregator parse_url_impl(
932+
template url parse_url_impl<url, true>(std::string_view user_input,
933+
const url* base_url = nullptr);
934+
template url_aggregator parse_url_impl<url_aggregator, true>(
935+
std::string_view user_input, const url_aggregator* base_url = nullptr);
936+
template url_aggregator parse_url_impl<url_aggregator, false>(
932937
std::string_view user_input, const url_aggregator* base_url = nullptr);
933938

934939
template <class result_type>

0 commit comments

Comments
 (0)