From 7585fbf5898a02445481905cfe5241446b404d4c Mon Sep 17 00:00:00 2001 From: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com> Date: Wed, 18 Dec 2024 12:59:46 +0200 Subject: [PATCH] fix: regex in is_md_equal is too greedy --- src/mdformat/_util.py | 14 ++++++++++++-- tests/test_util.py | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/src/mdformat/_util.py b/src/mdformat/_util.py index 83b25d1..ad5492c 100644 --- a/src/mdformat/_util.py +++ b/src/mdformat/_util.py @@ -47,6 +47,15 @@ def build_mdit( return mdit +# Chars that markdown-it-py escapes when rendering code_inline: +# https://github.com/executablebooks/markdown-it-py/blob/c5161b550f3c6c0a98d77e8389872405e8f9f9ee/markdown_it/common/utils.py#L138 +# Note that "&" is not included as it is used in the escape sequences of +# these characters. +_invalid_html_code_chars = '<>"' +# a regex str that matches all except above chars +_valid_html_code_char_re = rf"[^{re.escape(_invalid_html_code_chars)}]" + + + def is_md_equal( md1: str, md2: str, @@ -71,10 +80,11 @@ def is_md_equal( if codeformatters: langs_re = "|".join(re.escape(lang) for lang in codeformatters) html = re.sub( - rf'<code class="language-(?:{langs_re})">.*</code>', + rf'<code class="language-(?:{langs_re})">' + rf"{_valid_html_code_char_re}*" + r"</code>", "", html, - flags=re.DOTALL, ) # Reduce all whitespace to a single space diff --git a/tests/test_util.py b/tests/test_util.py index e3bc89f..62c8dcd 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -21,3 +21,30 @@ def test_is_md_equal(): paragr""" assert not is_md_equal(md1, md2) assert is_md_equal(md1, md2, codeformatters=("js", "go")) + + +def test_is_md_equal__not(): + md1 = """ +```js +console.log() +``` + +paragr + +```js +console.log() +``` +""" + md2 = """ +```js +bonsole.l()g +``` + +A different paragraph + +```js +console.log() +``` +""" + assert not is_md_equal(md1, md2) + assert not is_md_equal(md1, md2, codeformatters=("js",))