refactor(security): SanitizeUrl uses Uri.TryCreate, not manual scheme parse

codemonkeychris · claude · codemonkeychris · commit 50a068c453fd · 2026-05-18T18:56:06.000-07:00
The original comment claimed Uri.TryCreate was "too lenient" because it
parses `javascript:alert(1)` as a valid Uri. That's true, but the
SafeUrlSchemes allow-list filter on uri.Scheme runs *after* parsing
and rejects `javascript` (and every other non-http/https/mailto scheme)
the same way. So the System.Uri version is equally safe and avoids the
hand-rolled RFC 3986 scheme-character validator that was the actual
maintenance risk.

Behavior preserved:
- http/https/mailto pass through unchanged (allow-list, case-insensitive)
- javascript:/data:/vbscript:/file:/ftp:/ssh: → "about:blank"
- Relative URLs (no scheme) pass through unchanged
- Empty input passes through unchanged
- unsafeAllowed=true bypasses the filter entirely

Tests added (new file, 25 tests): TASK-045 XSS-fence had zero direct
coverage before this commit. The theory rows pin each known XSS-vector
scheme and the case-insensitive normalization contract.

Co-Authored-By: Claude Opus 4.7 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/src/Reactor/Markdown/MarkdownHtml.cs b/src/Reactor/Markdown/MarkdownHtml.cs
@@ -89,23 +89,17 @@ internal static string SanitizeUrl(string url, bool unsafeAllowed)
     {
         if (unsafeAllowed) return url;
         if (string.IsNullOrEmpty(url)) return url;
-        // Relative URLs (no scheme) are safe; only absolute schemes need
-        // checking. Pull the scheme by hand because Uri.TryCreate is too
-        // lenient — it would parse `javascript:alert(1)` as a valid Uri.
-        int colon = url.IndexOf(':');
-        if (colon <= 0) return url; // no scheme = treat as path-relative
-        // Schemes per RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
-        for (int i = 0; i < colon; i++)
-        {
-            var c = url[i];
-            bool ok = (i == 0)
-                ? (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
-                : (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
-                    || (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
-            if (!ok) return url; // not a real scheme — treat as relative
-        }
-        var scheme = url[..colon];
-        return SafeUrlSchemes.Contains(scheme) ? url : "about:blank";
+
+        // Uri.TryCreate happily parses `javascript:alert(1)` as a valid
+        // absolute Uri — that's fine, because the allow-list filter on
+        // uri.Scheme below rejects it. The point is to use System.Uri's
+        // RFC 3986 scheme/host decomposition rather than hand-rolled
+        // string parsing, which is fragile and a known source of bypass
+        // bugs in URL sanitizers.
+        if (!Uri.TryCreate(url, UriKind.Absolute, out var uri))
+            return url; // Not absolute — treat as path-relative; safe.
+
+        return SafeUrlSchemes.Contains(uri.Scheme) ? url : "about:blank";
     }
 
     private sealed class HtmlRenderer
diff --git a/tests/Reactor.Tests/Markdown/SanitizeUrlTests.cs b/tests/Reactor.Tests/Markdown/SanitizeUrlTests.cs
@@ -0,0 +1,103 @@
+using System.Reflection;
+using Xunit;
+
+namespace Microsoft.UI.Reactor.Tests.Markdown;
+
+/// <summary>
+/// Tests for the internal MarkdownHtml.SanitizeUrl scheme-allow-list.
+/// TASK-045 — the XSS fence on href/src attributes emitted by the
+/// Markdown renderer. Allow-list: http, https, mailto. Anything else
+/// becomes `about:blank`.
+/// </summary>
+public class SanitizeUrlTests
+{
+    private static readonly Type MarkdownHtmlType =
+        typeof(Microsoft.UI.Reactor.Factories).Assembly
+            .GetType("Microsoft.UI.Reactor.Markdown.MarkdownHtml")
+        ?? throw new InvalidOperationException(
+            "MarkdownHtml type not found in Reactor assembly.");
+
+    private static string Sanitize(string url, bool unsafeAllowed = false)
+    {
+        var mi = MarkdownHtmlType.GetMethod("SanitizeUrl",
+            BindingFlags.Static | BindingFlags.NonPublic)!;
+        return (string)mi.Invoke(null, new object?[] { url, unsafeAllowed })!;
+    }
+
+    // ── Allow-listed schemes pass through unchanged ───────────────
+
+    [Theory]
+    [InlineData("http://example.com/")]
+    [InlineData("https://example.com/path?q=1")]
+    [InlineData("HTTPS://EXAMPLE.COM")]         // case-insensitive scheme
+    [InlineData("mailto:foo@bar.com")]
+    [InlineData("mailto:foo@bar.com?subject=hi")]
+    public void Safe_Schemes_Pass_Through(string url)
+    {
+        Assert.Equal(url, Sanitize(url));
+    }
+
+    // ── Relative URLs are not absolute — pass through unchanged ───
+
+    [Theory]
+    [InlineData("/path/to/thing")]
+    [InlineData("relative/path.html")]
+    [InlineData("../up/one")]
+    [InlineData("#fragment-only")]
+    [InlineData("?query=only")]
+    public void Relative_Urls_Pass_Through(string url)
+    {
+        Assert.Equal(url, Sanitize(url));
+    }
+
+    // ── XSS vectors are rewritten to about:blank ──────────────────
+
+    [Theory]
+    // The canonical XSS payload. A renderer that emits this in an href
+    // attribute lets the page execute arbitrary JS on click.
+    [InlineData("javascript:alert(1)")]
+    [InlineData("JAVASCRIPT:alert(1)")]            // case-insensitive
+    [InlineData("javascript:void(0)")]
+    // data: URIs can carry script in text/html or SVG payloads.
+    [InlineData("data:text/html,<script>alert(1)</script>")]
+    [InlineData("data:image/svg+xml,<svg onload=alert(1)>")]
+    // vbscript: — IE legacy but still a known fingerprint payload.
+    [InlineData("vbscript:msgbox(1)")]
+    // file:// — disclosure / SSRF surface even on non-script targets.
+    [InlineData("file:///C:/windows/win.ini")]
+    // ftp, ssh, etc. — not on the allow-list.
+    [InlineData("ftp://example.com/")]
+    [InlineData("ssh://user@host")]
+    // Capitalized variants — System.Uri normalizes scheme to lowercase
+    // before comparison, and SafeUrlSchemes is OrdinalIgnoreCase, so
+    // both layers fail closed regardless.
+    [InlineData("Javascript:alert(1)")]
+    [InlineData("DATA:text/html,x")]
+    public void Disallowed_Schemes_Become_AboutBlank(string url)
+    {
+        Assert.Equal("about:blank", Sanitize(url));
+    }
+
+    // ── Empty / null-ish input ────────────────────────────────────
+
+    [Fact]
+    public void Empty_String_Passes_Through()
+    {
+        Assert.Equal("", Sanitize(""));
+    }
+
+    // ── unsafeAllowed escape hatch bypasses the filter entirely ───
+
+    [Theory]
+    [InlineData("javascript:alert(1)")]
+    [InlineData("data:text/html,x")]
+    [InlineData("file:///C:/x")]
+    public void UnsafeAllowed_Bypasses_AllowList(string url)
+    {
+        // Pin: when the renderer is configured with AllowUnsafeUrls (host
+        // is presenting trusted markdown, e.g. local devtools), every URL
+        // passes through. A regression that filtered even in unsafe mode
+        // would silently break that opt-in.
+        Assert.Equal(url, Sanitize(url, unsafeAllowed: true));
+    }
+}