fsprojects · Thorium · Nov 11, 2025 · Nov 16, 2025 · Nov 16, 2025 · nojaf
diff --git a/src/FSharp.Formatting.Markdown/HtmlFormatting.fs b/src/FSharp.Formatting.Markdown/HtmlFormatting.fs
@@ -21,6 +21,52 @@ open MarkdownUtils
 let internal htmlEncode (code: string) =
     code.Replace("&", "&amp;").Replace("<", "&lt;").Replace(">", "&gt;")
 
+/// Encodes emojis and symbol characters in the given text as decimal HTML numeric character references.
+/// Two groups of characters are encoded:
+///  - every valid surrogate pair, i.e. every code point outside the BMP (U+10000 and above);
+///  - every BMP character in U+2000-U+2BFF (general punctuation, arrows, dingbats, other symbols).
+/// Everything else - including BMP letters such as Cyrillic or CJK, and unpaired surrogates - is
+/// copied through unchanged. A null or empty input is returned as-is.
+let internal encodeHighUnicode (text: string) =
+    // Single definition of the encoded BMP range, shared by the fast-path scan and the rewrite loop.
+    let isEncodedBmpChar (c: char) = int c >= 0x2000 && int c <= 0x2BFF
+
+    if String.IsNullOrEmpty text then
+        text
+    else
+        // Fast path: return the original string when no character needs encoding.
+        let needsEncoding =
+            text |> Seq.exists (fun c -> Char.IsSurrogate c || isEncodedBmpChar c)
+
+        if not needsEncoding then
+            text
+        else
+            let sb = System.Text.StringBuilder(text.Length)
+            // Append the reference piecewise instead of formatting a temporary string with sprintf.
+            let appendEntity (codePoint: int) =
+                sb.Append("&#").Append(codePoint).Append(';') |> ignore
+            let mutable i = 0
+            while i < text.Length do
+                let c = text.[i]
+                // A high surrogate directly followed by a low surrogate is one code point outside
+                // the BMP: emit a single reference and consume both UTF-16 code units.
+                if
+                    Char.IsHighSurrogate c
+                    && i + 1 < text.Length
+                    && Char.IsLowSurrogate(text.[i + 1])
+                then
+                    appendEntity (Char.ConvertToUtf32(c, text.[i + 1]))
+                    i <- i + 2
+                else
+                    // Unpaired surrogates also reach this branch; they are outside the range below.
+                    if isEncodedBmpChar c then
+                        appendEntity (int c)
+                    else
+                        sb.Append c |> ignore
+                    i <- i + 1
+
+            sb.ToString()
+
 /// Basic escaping as done by Markdown including quotes
 let internal htmlEncodeQuotes (code: string) =
     (htmlEncode code).Replace("\"", "&quot;")
@@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span =
 
     | AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\">&#160;</a>")
     | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render())
-    | Literal(str, _) -> ctx.Writer.Write(str)
+    | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str)
     | HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline)
     | IndirectLink(body, _, LookupKey ctx.Links (link, title), _)
     | DirectLink(body, link, title, _) ->

diff --git a/tests/FSharp.Markdown.Tests/Markdown.fs b/tests/FSharp.Markdown.Tests/Markdown.fs
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () =
     |> Markdown.ToHtml
     |> should contain "<p><code>a &amp;gt; &amp; b</code></p>"
 
+[<Test>]
+let ``Emojis are encoded as HTML numeric entities`` () =
+    // Every emoji in the input must appear in the output as its decimal numeric character reference.
+    let rendered = Markdown.ToHtml "Like this 🎉🚧⭐⚠️✅"
+
+    // In order: 🎉 party popper, 🚧 construction, ⭐ star, ⚠️ warning, ✅ check mark
+    for entity in [ "&#127881;"; "&#128679;"; "&#11088;"; "&#9888;"; "&#9989;" ] do
+        rendered |> should contain entity
+
+[<Test>]
+let ``Regular text without emojis is not modified`` () =
+    // Nothing in this input falls in an encoded range, so the text takes the unchanged fast path.
+    let rendered = Markdown.ToHtml "This is regular text with пристаням Cyrillic and 中文 Chinese"
+    for word in [ "пристаням"; "中文" ] do rendered |> should contain word
+    // Ordinary international text must not pick up any numeric character reference.
+    rendered |> should not' (contain "&#")
+
+[<Test>]
+let ``List without blank line after heading`` () =
+    // Covers the case reported in https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382
+    let rendered =
+        """# This is my title
+- this list
+- should render"""
+        |> Markdown.ToHtml
+
+    // The heading and every list item must come out as separate elements, not merged into the heading.
+    let expectedFragments =
+        [ "<h1>This is my title</h1>"; "<ul>"; "<li>this list</li>"; "<li>should render</li>" ]
+    for fragment in expectedFragments do
+        rendered |> should contain fragment
+
 [<Test>]
 let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () =
     let doc = """<a href="mailto:[email protected]">hi</a>""" |> Markdown.Parse