improve: improve import time with lazy regex and imports

hukkin · hukkin · commit b9aac112540d · 2025-01-08T11:31:43.000+02:00
diff --git a/pyproject.toml b/pyproject.toml
@@ -156,6 +156,13 @@ commands = [
     ["python", "-m", "timeit", "from mdformat._cli import run", 'run(["README.md", "docs/", "--check", "--wrap", "50"])'],
 ]
 
+[tool.tox.env."benchmark-import"]
+description = "Measure module import times. Tox sends mdformat output to stderr, so to filter use e.g. `tox -e benchmark-import 2> >(grep mdformat)`."
+deps = []
+commands = [
+    ["python", "-X", "importtime", "-m", "mdformat"],
+]
+
 
 [tool.coverage.run]
 source = ["mdformat"]
diff --git a/src/mdformat/codepoints/__init__.py b/src/mdformat/codepoints/__init__.py
@@ -5,7 +5,6 @@
     "ASCII_WHITESPACE",
 )
 
-import warnings
 
 from mdformat.codepoints._unicode_punctuation import UNICODE_PUNCTUATION
 from mdformat.codepoints._unicode_whitespace import UNICODE_WHITESPACE
@@ -19,6 +18,8 @@ def __getattr__(name: str) -> frozenset[str]:
     Used during the deprecation period of `ASCII_WHITESPACE`.
     """
     if name == "ASCII_WHITESPACE":
+        import warnings
+
         warnings.warn(
             "ASCII_WHITESPACE is deprecated because CommonMark v0.30 no longer "
             "defines ASCII whitespace.",
diff --git a/src/mdformat/plugins.py b/src/mdformat/plugins.py
@@ -4,11 +4,11 @@
 from collections.abc import Callable, Mapping
 from typing import TYPE_CHECKING, Any, Protocol
 
-from markdown_it import MarkdownIt
-
 from mdformat._compat import importlib_metadata
 
 if TYPE_CHECKING:
+    from markdown_it import MarkdownIt
+
     from mdformat.renderer.typing import Postprocess, Render
 
 
diff --git a/src/mdformat/renderer/__init__.py b/src/mdformat/renderer/__init__.py
@@ -13,13 +13,15 @@
 import logging
 import string
 from types import MappingProxyType
-from typing import Any
-
-from markdown_it.token import Token
+from typing import TYPE_CHECKING, Any
 
 from mdformat.renderer._context import DEFAULT_RENDERERS, WRAP_POINT, RenderContext
 from mdformat.renderer._tree import RenderTreeNode
-from mdformat.renderer.typing import Postprocess
+
+if TYPE_CHECKING:
+    from markdown_it.token import Token
+
+    from mdformat.renderer.typing import Postprocess
 
 LOGGER = logging.getLogger(__name__)
 
diff --git a/src/mdformat/renderer/_context.py b/src/mdformat/renderer/_context.py
@@ -15,7 +15,6 @@
 from mdformat import codepoints
 from mdformat._conf import DEFAULT_OPTS
 from mdformat.renderer._util import (
-    RE_CHAR_REFERENCE,
     decimalify_leading,
     decimalify_trailing,
     escape_asterisk_emphasis,
@@ -27,6 +26,7 @@
     is_tight_list_item,
     longest_consecutive_sequence,
     maybe_add_link_brackets,
+    re_char_reference,
 )
 
 if TYPE_CHECKING:
@@ -137,7 +137,7 @@ def text(node: RenderTreeNode, context: RenderContext) -> str:
 
     # Escape "&" if it starts a sequence that can be interpreted as
     # a character reference.
-    text = RE_CHAR_REFERENCE.sub(r"\\\g<0>", text)
+    text = re_char_reference().sub(r"\\\g<0>", text)
 
     # The parser can give us consecutive newlines which can break
     # the markdown structure. Replace two or more consecutive newlines
diff --git a/src/mdformat/renderer/_util.py b/src/mdformat/renderer/_util.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from collections.abc import Iterable
+import functools
 import html.entities
 import re
 from typing import TYPE_CHECKING
@@ -10,20 +11,28 @@
 if TYPE_CHECKING:
     from mdformat.renderer import RenderTreeNode
 
-# Regex that finds character references.
-# The reference can be either
-#   1. decimal representation, e.g. &#11;
-#   2. hex representation, e.g. &#x1e;
-#   3. HTML5 entity reference, e.g. &nbsp;
-RE_CHAR_REFERENCE = re.compile(
-    "&(?:"
-    + "#[0-9]{1,7}"
-    + "|"
-    + "#[Xx][0-9A-Fa-f]{1,6}"
-    + "|"
-    + "|".join({c.rstrip(";") for c in html.entities.html5})
-    + ");"
-)
+
+@functools.cache
+def re_char_reference() -> re.Pattern[str]:
+    """Return a regex that finds character references.
+
+    The reference can be either:
+    1. decimal representation, e.g. &#11;
+    2. hex representation, e.g. &#x1e;
+    3. HTML5 entity reference, e.g. &nbsp;
+
+    This cached function compiles the regex lazily,
+    as compilation can take over 20ms.
+    """
+    return re.compile(
+        "&(?:"
+        + "#[0-9]{1,7}"
+        + "|"
+        + "#[Xx][0-9A-Fa-f]{1,6}"
+        + "|"
+        + "|".join({c.rstrip(";") for c in html.entities.html5})
+        + ");"
+    )
 
 
 def is_tight_list(node: RenderTreeNode) -> bool:
diff --git a/tests/test_api.py b/tests/test_api.py
@@ -146,3 +146,12 @@ def test_mdrenderer_no_finalize(tmp_path):
 def test_ascii_whitespace_deprecation():
     with pytest.warns(DeprecationWarning):
         mdformat.codepoints.ASCII_WHITESPACE
+
+
+def test_import_typing():
+    """Try to import mdformat.renderer.typing.
+
+    The module consists of annotation types only, so mdformat never
+    imports it at runtime. This test ensures that it still imports cleanly.
+    """
+    import mdformat.renderer.typing  # noqa: F401
diff --git a/tests/utils.py b/tests/utils.py
@@ -1,11 +1,15 @@
-import json
+from __future__ import annotations
 
-from markdown_it import MarkdownIt
+import json
+from typing import TYPE_CHECKING
 
 from mdformat._cli import run
 from mdformat._conf import read_toml_opts
 from mdformat.renderer import RenderContext, RenderTreeNode
 
+if TYPE_CHECKING:
+    from markdown_it import MarkdownIt
+
 UNFORMATTED_MARKDOWN = "\n\n# A header\n\n"
 FORMATTED_MARKDOWN = "# A header\n"