Skip to content

Commit 28a9778

Browse files
authored
✨ Add GFM autolink and composite GFM plugins (#135)
## Summary

Adds two new plugins that require markdown-it-py >= 4.1.0:

### `gfm_autolink` — GFM autolink literals

Implements the [GFM autolinks extension](https://github.github.com/gfm/#autolinks-extension-) with three inline scanners:

- `www.` URLs (trigger char: `w`, via `add_terminator_char`)
- `http://`/`https://`/`mailto:`/`xmpp:` URLs (trigger char: `:`)
- Bare email addresses (trigger char: `@`)

Matching logic is ported from the Rust [gfm_autolinks](https://github.com/markdown-it-rust/markdown-it-plugins.rs/tree/main/crates/gfm-autolinks) crate. Covers GFM spec examples 622–635 plus additional edge cases (trailing delimiters, emphasis integration, parentheses balancing, etc.).

### `gfm` — Composite GFM plugin

A single-call plugin that enables a GFM-like configuration:

- Tables (built-in)
- Strikethrough with single and double tildes (built-in)
- GFM autolinks (`gfm_autolink`)
- Task lists (built-in)
- Alerts (built-in)
- Footnotes (`footnote_plugin`, inline=False)
- Dollar math (optional, disabled by default)
- Front matter (optional, disabled by default)

Tag filtering is noted as a TODO.

### Other changes

- `tox.ini`: test envs pin `markdown-it-py>=4.1.0`
- `pyproject.toml`: added `pytest-timeout` to testing extras

### References

- [GFM spec](https://github.github.com/gfm/)
- [GitHub basic formatting syntax](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax)
- markdown-it-py 4.1.0 `add_terminator_char` API (PR executablebooks/markdown-it-py#391)
1 parent 39c681f commit 28a9778

11 files changed

Lines changed: 1081 additions & 0 deletions

File tree

docs/index.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,18 @@ html_string = md.render("some *Markdown*")
4343
.. autofunction:: mdit_py_plugins.front_matter.front_matter_plugin
4444
```
4545

46+
## GFM (GitHub Flavored Markdown)
47+
48+
```{eval-rst}
49+
.. autofunction:: mdit_py_plugins.gfm.gfm_plugin
50+
```
51+
52+
## GFM Autolinks
53+
54+
```{eval-rst}
55+
.. autofunction:: mdit_py_plugins.gfm_autolink.gfm_autolink_plugin
56+
```
57+
4658
## Footnotes
4759

4860
```{eval-rst}

mdit_py_plugins/gfm/__init__.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,102 @@
1+
"""Composite GFM (GitHub Flavored Markdown) plugin.
2+
3+
Enables a set of plugins that together approximate GitHub's Markdown rendering:
4+
5+
- Tables (built-in)
6+
- Strikethrough with single and double tildes (built-in)
7+
- Autolinks (gfm_autolink plugin)
8+
- Task lists (built-in, markdown-it-py >= 4.1.0)
9+
- Alerts (built-in, markdown-it-py >= 4.1.0)
10+
- Footnotes (``[^label]`` references and definitions)
11+
12+
Optional extras:
13+
14+
- Dollar math (``$...$`` / ``$$...$$``)
15+
- Front matter (YAML)
16+
17+
.. note::
18+
Tag filtering (disallowed raw HTML tags) is not yet implemented.
19+
20+
.. seealso::
21+
- `GitHub Flavored Markdown Spec <https://github.github.com/gfm/>`__
22+
- `GitHub basic formatting syntax
23+
<https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax>`__
24+
25+
.. versionadded:: 0.5.0
26+
27+
Requires markdown-it-py >= 4.1.0.
28+
"""
29+
30+
from __future__ import annotations
31+
32+
from functools import lru_cache
33+
34+
from markdown_it import MarkdownIt
35+
from markdown_it import __version__ as _mdit_version
36+
37+
from mdit_py_plugins.dollarmath import dollarmath_plugin
38+
from mdit_py_plugins.footnote import footnote_plugin
39+
from mdit_py_plugins.front_matter import front_matter_plugin
40+
from mdit_py_plugins.gfm_autolink import gfm_autolink_plugin
41+
42+
__all__ = ("gfm_plugin",)
43+
44+
_MIN_VERSION = (4, 1, 0)
45+
46+
47+
@lru_cache(maxsize=8)
def _parse_version(v: str) -> tuple[int, ...]:
    """Parse a version string like '4.1.0' into a tuple of ints.

    Only the first three dot-separated components are looked at, and only the
    leading ASCII digits of each one are used.  Pre-release, dev and local
    suffixes (``'4.1.0rc1'``, ``'4.2.0.dev1'``, ``'4.1.0+local'``) therefore
    compare by their release number instead of raising ``ValueError``.
    Parsing stops at the first component that has no leading digits.

    :param v: The version string to parse.
    :returns: A tuple of at most three ints.
    """
    release: list[int] = []
    for component in v.split(".")[:3]:
        digits = ""
        for ch in component:
            # Explicit ASCII range: ``str.isdigit`` also accepts characters
            # such as superscripts that ``int`` cannot convert.
            if not ("0" <= ch <= "9"):
                break
            digits += ch
        if not digits:
            break
        release.append(int(digits))
    return tuple(release)
51+
52+
53+
def gfm_plugin(
    md: MarkdownIt,
    *,
    dollarmath: bool = False,
    front_matter: bool = False,
    tasklists_editable: bool = False,
) -> None:
    """Configure *md* for GitHub-Flavored-Markdown-like rendering.

    The parser's existing configuration is kept; the GFM rules, options and
    plugins are switched on in addition to it.

    :param md: The parser instance to configure in place.
    :param dollarmath: Enable dollar-delimited math (``$...$``, ``$$...$$``).
    :param front_matter: Enable YAML front matter (``---``).
    :param tasklists_editable: If True, rendered task list checkboxes are not
        disabled (i.e. they are interactive).
    :raises RuntimeError: If the installed markdown-it-py is older than the
        minimum supported version.
    """
    installed = _parse_version(_mdit_version)
    if installed < _MIN_VERSION:
        required = ".".join(map(str, _MIN_VERSION))
        raise RuntimeError(
            f"gfm_plugin requires markdown-it-py >= {required} "
            f"(installed: {_mdit_version})"
        )

    # Rules that ship with markdown-it-py itself.
    for rule_name in ("table", "strikethrough"):
        md.enable(rule_name)

    # Parser options introduced in markdown-it-py 4.1.0.
    gfm_options = {
        "tasklists": True,
        "tasklists_editable": tasklists_editable,
        "alerts": True,
        "strikethrough_single_tilde": True,
    }
    for option_name, option_value in gfm_options.items():
        md.options[option_name] = option_value

    # Bare URL / email autolinks.
    md.use(gfm_autolink_plugin)

    # Footnotes; the inline form ^[...] is not part of GFM, so it stays off.
    md.use(footnote_plugin, inline=False)

    # Optional: inline $...$ and block $$...$$ math.
    if dollarmath:
        md.use(dollarmath_plugin, allow_blank_lines=False)

    # TODO: Tag filter — replace leading `<` with `&lt;` for disallowed raw
    # HTML tags: <title>, <textarea>, <style>, <xmp>, <iframe>, <noembed>,
    # <noframes>, <script>, <plaintext>.
    # See https://github.github.com/gfm/#disallowed-raw-html-extension-

    # Optional: YAML front matter.
    if front_matter:
        md.use(front_matter_plugin)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
"""GFM autolink extension plugin for markdown-it-py.
2+
3+
Implements the `GFM autolink extension
4+
<https://github.github.com/gfm/#autolinks-extension->`_,
5+
which recognises bare URLs (``http://``, ``https://``, ``www.``),
6+
protocol links (``mailto:``, ``xmpp:``),
7+
and bare email addresses without requiring angle brackets.
8+
9+
Ported from the Rust crate
10+
`markdown_it_autolink <https://github.com/markdown-it-rust/markdown-it-plugins.rs>`_.
11+
"""
12+
13+
from .index import gfm_autolink_plugin
14+
15+
__all__ = ("gfm_autolink_plugin",)
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
"""URL / email matching helpers for the GFM autolink extension.
2+
3+
Ported from the Rust ``gfm_autolinks`` crate.
4+
"""
5+
6+
from __future__ import annotations
7+
8+
import unicodedata
9+
10+
# ---------------------------------------------------------------------------
11+
# Character classification helpers
12+
# ---------------------------------------------------------------------------
13+
14+
# Whitespace plus the delimiters "*", "_", "~" and "(".
_VALID_PREV_CHARS = frozenset((" ", "\t", "\r", "\n", "*", "_", "~", "("))


def check_prev(ch: str) -> bool:
    """Tell whether an autolink may begin right after the character *ch*.

    :param ch: The character immediately preceding the candidate link.
    :returns: ``True`` when *ch* is one of the accepted preceding characters
        (space, tab, CR, LF, ``*``, ``_``, ``~`` or ``(``), else ``False``.
    """
    is_allowed = ch in _VALID_PREV_CHARS
    return is_allowed
20+
21+
22+
def _is_valid_hostchar(ch: str) -> bool:
    """Tell whether the single character *ch* may appear in a domain label.

    A character is rejected when it is whitespace (``str.isspace``) or when
    its Unicode general category is one of the punctuation categories
    (Pc, Pd, Pe, Pf, Pi, Po, Ps); every other character is accepted.
    """
    return not (ch.isspace() or unicodedata.category(ch)[0] == "P")
29+
30+
31+
# Characters that terminate a URL (before autolink_delim trimming).
32+
_SPACE_CHARS = frozenset((" ", "\t", "\r", "\n", "\x00", "\x0b", "\x0c"))


def _isspace(ch: str) -> bool:
    """Return ``True`` if *ch* is one of the characters in ``_SPACE_CHARS``.

    The set holds space, tab, CR, LF, NUL, vertical tab and form feed.
    """
    found = ch in _SPACE_CHARS
    return found
37+
38+
39+
_LINK_END_ASSORTMENT = frozenset("?!.,:*_~'\"[]")


def _autolink_delim(data: str, link_end: int) -> int:
    """Trim trailing punctuation from a URL according to GFM rules.

    Starting from the candidate ``data[:link_end]``:

    * the candidate is cut at the first ``<``;
    * trailing characters from ``_LINK_END_ASSORTMENT`` are dropped;
    * a trailing ``;`` is dropped, together with the preceding
      ``&`` + ASCII letters when they look like an entity reference
      (``&amp;``);
    * a trailing ``)`` is dropped only while the candidate holds more ``)``
      than ``(``.

    :param data: Text that contains the candidate link at its start.
    :param link_end: Exclusive end index of the candidate inside *data*.
    :returns: The new (possibly smaller) exclusive end index.
    """
    # The link can never extend past the first "<".
    lt_pos = data.find("<", 0, link_end)
    if lt_pos != -1:
        link_end = lt_pos

    # Parenthesis counts are computed on first use and then kept up to date.
    # Only the ")" branch below removes a parenthesis, so "opening" never
    # changes and "closing" drops by one per trimmed ")".  This avoids
    # rescanning the whole candidate for every trailing ")".
    opening: int | None = None
    closing = 0

    while link_end > 0:
        last = data[link_end - 1]

        if last in _LINK_END_ASSORTMENT:
            link_end -= 1
        elif last == ";":
            # Walk back over the letters of a potential entity name.  Entity
            # names are ASCII, so a Unicode-aware isalpha() would wrongly
            # treat e.g. "&é;" as an entity reference.
            new_end = link_end - 2
            while new_end > 0:
                ch = data[new_end]
                if not (ch.isascii() and ch.isalpha()):
                    break
                new_end -= 1
            if new_end < link_end - 2 and data[new_end] == "&":
                link_end = new_end
            else:
                link_end -= 1
        elif last == ")":
            if opening is None:
                opening = data.count("(", 0, link_end)
                closing = data.count(")", 0, link_end)
            if closing <= opening:
                # Balanced (or more "(" than ")"): the ")" belongs to the URL.
                break
            closing -= 1
            link_end -= 1
        else:
            break

    return link_end
75+
76+
77+
# ---------------------------------------------------------------------------
78+
# Domain validation
79+
# ---------------------------------------------------------------------------
80+
81+
82+
def _check_domain(data: str, allow_short: bool) -> int | None:
    """Validate a domain name and return the length consumed, or ``None``.

    *data* is scanned from its start up to the first character that is not
    ``_``, ``.``, ``-`` or a valid host character (or to the end of *data*).
    The scanned prefix is accepted when both of these hold:

    * neither of the last two dot-separated segments contains an underscore,
      unless the prefix has more than 10 dots, in which case underscores are
      tolerated;
    * it contains at least one dot, unless *allow_short* is true.

    The same acceptance rule is applied whether the scan stopped at the end
    of *data* or at a terminating character, so a domain is judged the same
    way with or without a path following it.

    :param data: Text starting at the first character of the domain.
    :param allow_short: Accept a domain without any dot.
    :returns: Number of characters of *data* that form the domain, or
        ``None`` if *data* is empty or the domain is rejected.
    """
    if not data:
        return None

    dots = 0
    # Underscores seen in the segment before the last dot / after the last dot.
    uscore_prev = 0
    uscore_last = 0
    end = len(data)

    for i, ch in enumerate(data):
        if ch == "_":
            uscore_last += 1
        elif ch == ".":
            uscore_prev = uscore_last
            uscore_last = 0
            dots += 1
        elif ch != "-" and not _is_valid_hostchar(ch):
            end = i
            break

    if (uscore_prev > 0 or uscore_last > 0) and dots <= 10:
        return None
    if allow_short or dots > 0:
        return end
    return None
109+
110+
111+
# ---------------------------------------------------------------------------
112+
# www matching
113+
# ---------------------------------------------------------------------------
114+
115+
_EMAIL_OK = frozenset(".+-_")
116+
117+
118+
def match_www(text: str) -> tuple[str, int] | None:
    """Try to recognise a bare ``www.`` link at the very beginning of *text*.

    The text after ``www.`` must pass ``_check_domain`` with short domains
    disallowed.  The candidate is then extended up to the next character
    accepted by ``_isspace`` (or the end of *text*) and finally trimmed by
    ``_autolink_delim``.

    :returns: ``(href, consumed)`` where *href* is the matched text with
        ``http://`` prepended and *consumed* is the length of the matched
        text, or ``None`` when no link is recognised.
    """
    prefix = "www."
    if not text.startswith(prefix):
        return None

    domain_len = _check_domain(text[len(prefix):], False)
    if domain_len is None:
        return None

    # Skip the prefix and the domain, then run on to whitespace / end of text.
    end = len(prefix) + domain_len
    total = len(text)
    while end < total and not _isspace(text[end]):
        end += 1

    matched = text[: _autolink_delim(text, end)]
    return "http://" + matched, len(matched)
141+
142+
143+
# ---------------------------------------------------------------------------
144+
# http(s):// matching
145+
# ---------------------------------------------------------------------------
146+
147+
148+
def match_http(text: str) -> tuple[str, int] | None:
    """Try to recognise an ``http://`` / ``https://`` link at the start of *text*.

    The text after the scheme must pass ``_check_domain`` with short domains
    allowed.  The candidate is then extended up to the next character
    accepted by ``_isspace`` (or the end of *text*) and finally trimmed by
    ``_autolink_delim``.

    :returns: ``(url, consumed)`` where *url* is the matched text and
        *consumed* its length, or ``None`` when no link is recognised.
    """
    for scheme in ("http://", "https://"):
        if text.startswith(scheme):
            scheme_len = len(scheme)
            break
    else:
        return None

    domain_len = _check_domain(text[scheme_len:], True)
    if domain_len is None:
        return None

    # Skip the scheme and the domain, then run on to whitespace / end of text.
    end = scheme_len + domain_len
    total = len(text)
    while end < total and not _isspace(text[end]):
        end += 1

    url = text[: _autolink_delim(text, end)]
    return url, len(url)
172+
173+
174+
# ---------------------------------------------------------------------------
175+
# Email matching
176+
# ---------------------------------------------------------------------------
177+
178+
179+
def match_email(text: str) -> tuple[str, int] | None:
    """Match an email address at the start of *text*.

    A leading ``mailto:`` or ``xmpp:`` is recognised and skipped; the actual
    matching is delegated to ``match_any_email`` with the scan starting right
    after that prefix (or at 0 for a bare address).

    :returns: Whatever ``match_any_email`` returns: ``(url, char_count)`` or
        ``None``.
    """
    for scheme in ("mailto", "xmpp"):
        prefix = scheme + ":"
        if text.startswith(prefix):
            return match_any_email(text, len(prefix), scheme)
    return match_any_email(text, 0, None)
191+
192+
193+
def match_any_email(
    text: str, pos: int, protocol: str | None
) -> tuple[str, int] | None:
    """Match an email address in *text*, scanning the local part from *pos*.

    The local part may contain ASCII letters/digits and the characters in
    ``_EMAIL_OK``; it must be non-empty and followed by ``@``.  The domain may
    contain ASCII letters/digits, ``-``, ``_`` and dots that are directly
    followed by an ASCII letter/digit.  With the ``xmpp`` protocol, further
    ``@`` characters and a single ``/`` are accepted as well.  The domain
    must contain at least one dot and end in an ASCII letter or ``.``.

    :param text: Text whose start is the candidate (including any protocol
        prefix).
    :param pos: Index at which the local part begins.
    :param protocol: ``"mailto"``, ``"xmpp"``, or ``None`` (bare address).
    :returns: ``(url, char_count)`` or ``None``.  For a bare address the url
        gets ``mailto:`` prepended; *char_count* is the end index of the
        match in *text*.
    """
    total = len(text)
    is_xmpp = protocol == "xmpp"

    # --- local part: everything up to the "@" -----------------------------
    cursor = pos
    while cursor < total:
        c = text[cursor]
        if c == "@":
            break
        if not (c.isascii() and (c.isalnum() or c in _EMAIL_OK)):
            return None
        cursor += 1

    if cursor == pos:
        # Empty local part.
        return None

    # --- domain: everything after the "@" ---------------------------------
    end = cursor + 1
    dots = 0
    slashes = 0

    while end < total:
        c = text[end]
        if c.isascii() and c.isalnum():
            pass
        elif c == "@":
            # A second "@" is only tolerated in xmpp addresses.
            if not is_xmpp:
                return None
        elif (
            c == "."
            and end < total - 1
            and text[end + 1].isascii()
            and text[end + 1].isalnum()
        ):
            dots += 1
        elif c == "/" and is_xmpp and slashes == 0:
            slashes += 1
        elif c not in ("-", "_"):
            break
        end += 1

    if end < 2 or dots == 0:
        return None

    tail = text[end - 1]
    if tail != "." and not (tail.isascii() and tail.isalpha()):
        return None

    matched = text[:end]
    if protocol is None:
        url = "mailto:" + matched
    else:
        url = matched

    return url, end

0 commit comments

Comments
 (0)