From 7585fbf5898a02445481905cfe5241446b404d4c Mon Sep 17 00:00:00 2001
From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:59:46 +0200
Subject: [PATCH] fix: regex in is_md_equal is too greedy

---
 src/mdformat/_util.py | 14 ++++++++++++--
 tests/test_util.py    | 27 +++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
index 83b25d1..ad5492c 100644
--- a/src/mdformat/_util.py
+++ b/src/mdformat/_util.py
@@ -47,6 +47,15 @@ def build_mdit(
     return mdit
 
 
+# Chars that markdown-it-py escapes when rendering code_inline:
+# https://github.com/executablebooks/markdown-it-py/blob/c5161b550f3c6c0a98d77e8389872405e8f9f9ee/markdown_it/common/utils.py#L138
+# Note that "&" is not included as it is used in the escape sequences of
+# these characters.
+_invalid_html_code_chars = '<>"'
+# A regex string that matches any single character except the ones above
+_valid_html_code_char_re = rf"[^{re.escape(_invalid_html_code_chars)}]"
+
+
 def is_md_equal(
     md1: str,
     md2: str,
@@ -71,10 +80,11 @@ def is_md_equal(
         if codeformatters:
             langs_re = "|".join(re.escape(lang) for lang in codeformatters)
             html = re.sub(
-                rf'<code class="language-(?:{langs_re})">.*</code>',
+                rf'<code class="language-(?:{langs_re})">'
+                rf"{_valid_html_code_char_re}*"
+                r"</code>",
                 "",
                 html,
-                flags=re.DOTALL,
             )
 
         # Reduce all whitespace to a single space
diff --git a/tests/test_util.py b/tests/test_util.py
index e3bc89f..62c8dcd 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -21,3 +21,30 @@ def test_is_md_equal():
 paragr"""
     assert not is_md_equal(md1, md2)
     assert is_md_equal(md1, md2, codeformatters=("js", "go"))
+
+
+def test_is_md_equal__not():
+    md1 = """
+```js
+console.log()
+```
+
+paragr
+
+```js
+console.log()
+```
+"""
+    md2 = """
+```js
+bonsole.l()g
+```
+
+A different paragraph
+
+```js
+console.log()
+```
+"""
+    assert not is_md_equal(md1, md2)  # first code block and paragraph differ
+    assert not is_md_equal(md1, md2, codeformatters=("js",))  # paragraphs still differ