From 7585fbf5898a02445481905cfe5241446b404d4c Mon Sep 17 00:00:00 2001
From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com>
Date: Wed, 18 Dec 2024 12:59:46 +0200
Subject: [PATCH] fix: regex in is_md_equal is too greedy
---
src/mdformat/_util.py | 14 ++++++++++++--
tests/test_util.py | 27 +++++++++++++++++++++++++++
2 files changed, 39 insertions(+), 2 deletions(-)
diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py
index 83b25d1..ad5492c 100644
--- a/src/mdformat/_util.py
+++ b/src/mdformat/_util.py
@@ -47,6 +47,15 @@ def build_mdit(
return mdit
+# Chars that markdown-it-py escapes when rendering code_inline:
+# https://github.com/executablebooks/markdown-it-py/blob/c5161b550f3c6c0a98d77e8389872405e8f9f9ee/markdown_it/common/utils.py#L138
+# Note that "&" is not included as it is used in the escape sequences of
+# these characters.
+_invalid_html_code_chars = '<>"'
+# A regex string matching any single character except the ones listed above
+_valid_html_code_char_re = rf"[^{re.escape(_invalid_html_code_chars)}]"
+
+
def is_md_equal(
md1: str,
md2: str,
@@ -71,10 +80,11 @@ def is_md_equal(
if codeformatters:
langs_re = "|".join(re.escape(lang) for lang in codeformatters)
html = re.sub(
- rf'.*',
+ rf''
+ rf"{_valid_html_code_char_re}*"
+ r"",
"",
html,
- flags=re.DOTALL,
)
# Reduce all whitespace to a single space
diff --git a/tests/test_util.py b/tests/test_util.py
index e3bc89f..62c8dcd 100644
--- a/tests/test_util.py
+++ b/tests/test_util.py
@@ -21,3 +21,30 @@ def test_is_md_equal():
paragr"""
assert not is_md_equal(md1, md2)
assert is_md_equal(md1, md2, codeformatters=("js", "go"))
+
+
+def test_is_md_equal__not():
+ md1 = """
+```js
+console.log()
+```
+
+paragr
+
+```js
+console.log()
+```
+"""
+ md2 = """
+```js
+bonsole.l()g
+```
+
+A different paragraph
+
+```js
+console.log()
+```
+"""
+ assert not is_md_equal(md1, md2)
+ assert not is_md_equal(md1, md2, codeformatters=("js",))