-
Notifications
You must be signed in to change notification settings - Fork 162
Markdown to html improvement #965
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,52 @@ open MarkdownUtils | |
| let internal htmlEncode (code: string) = | ||
| code.Replace("&", "&").Replace("<", "<").Replace(">", ">") | ||
|
|
||
| /// Encode emojis and problematic Unicode characters as HTML numeric entities | ||
| /// Encodes characters in emoji ranges and symbols, but preserves common international text | ||
| let internal encodeHighUnicode (text: string) = | ||
| if String.IsNullOrEmpty text then | ||
| text | ||
| else | ||
| // Fast path: check if string needs encoding at all | ||
| let needsEncoding = | ||
| text | ||
| |> Seq.exists (fun c -> | ||
| let codePoint = int c | ||
| Char.IsSurrogate c || (codePoint >= 0x2000 && codePoint <= 0x2BFF)) | ||
|
|
||
| if not needsEncoding then | ||
| text | ||
| else | ||
| // Tail-recursive function with StringBuilder accumulator | ||
| let rec processChars i (sb: System.Text.StringBuilder) = | ||
| if i >= text.Length then | ||
| sb.ToString() | ||
| else | ||
| let c = text.[i] | ||
| // Check for surrogate pairs first (emojis and other characters outside BMP) | ||
| if | ||
| Char.IsHighSurrogate c | ||
| && i + 1 < text.Length | ||
| && Char.IsLowSurrogate(text.[i + 1]) | ||
| then | ||
| let fullCodePoint = Char.ConvertToUtf32(c, text.[i + 1]) | ||
| // Encode all characters outside BMP (>= 0x10000) as they're typically emojis | ||
| sb.Append(sprintf "&#%d;" fullCodePoint) |> ignore | ||
|
||
| processChars (i + 2) sb // Skip both surrogate chars | ||
| else | ||
| let codePoint = int c | ||
| // Encode specific ranges that contain emojis and symbols: | ||
| // U+2000-U+2BFF: General Punctuation, Superscripts, Currency, Dingbats, Arrows, Math, Technical, Box Drawing, etc. | ||
| // U+1F000-U+1FFFF: Supplementary Multilingual Plane emojis (handled above via surrogates) | ||
| if codePoint >= 0x2000 && codePoint <= 0x2BFF then | ||
| sb.Append(sprintf "&#%d;" codePoint) |> ignore | ||
| else | ||
| sb.Append c |> ignore | ||
|
|
||
| processChars (i + 1) sb | ||
|
|
||
| processChars 0 (System.Text.StringBuilder text.Length) | ||
|
|
||
| /// Basic escaping as done by Markdown including quotes | ||
| let internal htmlEncodeQuotes (code: string) = | ||
| (htmlEncode code).Replace("\"", """) | ||
|
|
@@ -78,7 +124,7 @@ let rec internal formatSpan (ctx: FormattingContext) span = | |
|
|
||
| | AnchorLink(id, _) -> ctx.Writer.Write("<a name=\"" + htmlEncodeQuotes id + "\"> </a>") | ||
| | EmbedSpans(cmd, _) -> formatSpans ctx (cmd.Render()) | ||
| | Literal(str, _) -> ctx.Writer.Write(str) | ||
| | Literal(str, _) -> ctx.Writer.Write(encodeHighUnicode str) | ||
| | HardLineBreak(_) -> ctx.Writer.Write("<br />" + ctx.Newline) | ||
| | IndirectLink(body, _, LookupKey ctx.Links (link, title), _) | ||
| | DirectLink(body, link, title, _) -> | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,6 +30,38 @@ let ``Escape HTML entities inside of code`` () = | |
| |> Markdown.ToHtml | ||
| |> should contain "<p><code>a &gt; & b</code></p>" | ||
|
|
||
| [<Test>] | ||
| let ``Emojis are encoded as HTML numeric entities`` () = | ||
| let html = "Like this 🎉🚧⭐⚠️✅" |> Markdown.ToHtml | ||
| html |> should contain "🎉" // 🎉 party popper | ||
| html |> should contain "🚧" // 🚧 construction | ||
| html |> should contain "⭐" // ⭐ star | ||
| html |> should contain "⚠" // ⚠️ warning | ||
| html |> should contain "✅" // ✅ check mark | ||
|
|
||
| [<Test>] | ||
| let ``Regular text without emojis is not modified`` () = | ||
| // Fast path optimization: regular text should pass through unchanged | ||
| let html = "This is regular text with пристаням Cyrillic and 中文 Chinese" |> Markdown.ToHtml | ||
| html |> should contain "пристаням" | ||
| html |> should contain "中文" | ||
| html |> should not' (contain "&#") // No HTML entities for regular international text | ||
|
|
||
| [<Test>] | ||
| let ``List without blank line after heading`` () = | ||
| // Test the issue mentioned in comment: https://github.com/fsprojects/FSharp.Formatting/issues/964#issuecomment-3515381382 | ||
| let markdown = | ||
| """# This is my title | ||
| - this list | ||
| - should render""" | ||
|
|
||
| let html = Markdown.ToHtml markdown | ||
| // Check if list is rendered as a separate element, not part of heading | ||
| html |> should contain "<h1>This is my title</h1>" | ||
| html |> should contain "<ul>" | ||
| html |> should contain "<li>this list</li>" | ||
| html |> should contain "<li>should render</li>" | ||
|
|
||
| [<Test>] | ||
| let ``Inline HTML tag containing 'at' is not turned into hyperlink`` () = | ||
| let doc = """<a href="mailto:[email protected]">hi</a>""" |> Markdown.Parse | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This scans the string to check if encoding is needed, but you traverse it again when you encoding.
Can we do this in one pass?
I'm a little afraid this code is in a hot path.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The question I had is that, expecting this is a minor use-case (but AI does more and more documentation in the future) is it worth of checking every character ever that goes through formatting? Will it slowdown the process? So could we do somehow like a high-level check, if the replacement-run is relevant at all?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you mean here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sorry, I meant F# Formatting of course not Fantomas. I was thinkin only in context of html generation: It is called in basically every .NET tool build-scripts, we want it to be fast if possible.