diff --git a/CHANGES.md b/CHANGES.md index 1d204b72..de90da95 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,7 +2,7 @@ ## python-markdown2 2.5.4 (not yet released) -(nothing yet) +- [pull #617] Add MarkdownFileLinks extra (#528) ## python-markdown2 2.5.3 diff --git a/lib/markdown2.py b/lib/markdown2.py index bcd2b8b4..8387c0fd 100755 --- a/lib/markdown2.py +++ b/lib/markdown2.py @@ -66,6 +66,7 @@ references, revision number references). * link-shortrefs: allow shortcut reference links, not followed by `[]` or a link label. +* markdown-file-links: Replace links to `.md` files with `.html` links * markdown-in-html: Allow the use of `markdown="1"` in a block HTML tag to have markdown processing be done on its contents. Similar to but with @@ -1446,25 +1447,6 @@ def _find_balanced(self, text: str, start: int, open_c: str, close_c: str) -> in i += 1 return i - def _extract_url_and_title(self, text: str, start: int) -> Union[tuple[str, str, int], tuple[None, None, None]]: - """Extracts the url and (optional) title from the tail of a link""" - # text[start] equals the opening parenthesis - idx = self._find_non_whitespace(text, start+1) - if idx == len(text): - return None, None, None - end_idx = idx - has_anglebrackets = text[idx] == "<" - if has_anglebrackets: - end_idx = self._find_balanced(text, end_idx+1, "<", ">") - end_idx = self._find_balanced(text, end_idx, "(", ")") - match = self._inline_link_title.search(text, idx, end_idx) - if not match: - return None, None, None - url, title = text[idx:match.start()], match.group("title") - if has_anglebrackets: - url = self._strip_anglebrackets.sub(r'\1', url) - return url, title, end_idx - # https://developer.mozilla.org/en-US/docs/web/http/basics_of_http/data_urls # https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types _data_url_re = re.compile(r''' @@ -1523,180 +1505,9 @@ def _do_links(self, text: str) -> str: Markdown.pl because of the lack of atomic matching support in Python's regex engine used in $g_nested_brackets. """ - MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 - - # `anchor_allowed_pos` is used to support img links inside - # anchors, but not anchors inside anchors. An anchor's start - # pos must be `>= anchor_allowed_pos`. - anchor_allowed_pos = 0 - - curr_pos = 0 - - while True: - # The next '[' is the start of: - # - an inline anchor: [text](url "title") - # - a reference anchor: [text][id] - # - an inline img: ![text](url "title") - # - a reference img: ![text][id] - # - a footnote ref: [^id] - # (Only if 'footnotes' extra enabled) - # - a footnote defn: [^id]: ... - # (Only if 'footnotes' extra enabled) These have already - # been stripped in _strip_footnote_definitions() so no - # need to watch for them. - # - a link definition: [id]: url "title" - # These have already been stripped in - # _strip_link_definitions() so no need to watch for them. - # - not markup: [...anything else... - try: - start_idx = text.index('[', curr_pos) - except ValueError: - break - text_length = len(text) - - # Find the matching closing ']'. - # Markdown.pl allows *matching* brackets in link text so we - # will here too. Markdown.pl *doesn't* currently allow - # matching brackets in img alt text -- we'll differ in that - # regard. - bracket_depth = 0 - - for p in range( - start_idx + 1, - min(start_idx + MAX_LINK_TEXT_SENTINEL, text_length) - ): - ch = text[p] - if ch == ']': - bracket_depth -= 1 - if bracket_depth < 0: - break - elif ch == '[': - bracket_depth += 1 - else: - # Closing bracket not found within sentinel length. - # This isn't markup. - curr_pos = start_idx + 1 - continue - link_text = text[start_idx + 1: p] - - # Fix for issue 341 - Injecting XSS into link text - if self.safe_mode: - link_text = self._hash_html_spans(link_text) - link_text = self._unhash_html_spans(link_text) - - # Possibly a footnote ref? - if "footnotes" in self.extras and link_text.startswith("^"): - normed_id = re.sub(r'\W', '-', link_text[1:]) - if normed_id in self.footnotes: - result = ( - f'' - # insert special footnote marker that's easy to find and match against later - f'{self._footnote_marker}-{normed_id}' - ) - text = text[:start_idx] + result + text[p+1:] - else: - # This id isn't defined, leave the markup alone. - curr_pos = p + 1 - continue - - # Now determine what this is by the remainder. - p += 1 - - # -- Extract the URL, title and end index from the link - - # inline anchor or inline img - if text[p:p + 1] == '(': - url, title, url_end_idx = self._extract_url_and_title(text, p) - if url is None: - # text isn't markup - curr_pos = start_idx + 1 - continue - url = self._unhash_html_spans(url, code=True) - # reference anchor or reference img - else: - match = None - if 'link-shortrefs' in self.extras: - # check if there's no tailing id section - if link_text and re.match(r'[ ]?(?:\n[ ]*)?(?!\[)', text[p:]): - # try a match with `[]` inserted into the text - match = self._tail_of_reference_link_re.match(f'{text[:p]}[]{text[p:]}', p) - if match: - # if we get a match, we'll have to modify the `text` variable to insert the `[]` - # but we ONLY want to do that if the link_id is valid. This makes sure that we - # don't get stuck in any loops and also that when a user inputs `[abc]` we don't - # output `[abc][]` in the final HTML - if (match.group("id").lower() or link_text.lower()) in self.urls: - text = f'{text[:p]}[]{text[p:]}' - else: - match = None - - match = match or self._tail_of_reference_link_re.match(text, p) - if not match: - # text isn't markup - curr_pos = start_idx + 1 - continue - - link_id = match.group("id").lower() or link_text.lower() # for links like [this][] - - if link_id not in self.urls: - # This id isn't defined, leave the markup alone. - # set current pos to end of link title and continue from there - curr_pos = p - continue - - url = self.urls[link_id] - title = self.titles.get(link_id) - url_end_idx = match.end() - - # -- Encode and hash the URL and title to avoid conflicts with italics/bold - - url = ( - url - .replace('*', self._escape_table['*']) - .replace('_', self._escape_table['_']) - ) - if title: - title = ( - _xml_escape_attr(title) - .replace('*', self._escape_table['*']) - .replace('_', self._escape_table['_']) - ) - title_str = f' title="{title}"' - else: - title_str = '' - - # -- Process the anchor/image - - is_img = start_idx > 0 and text[start_idx-1] == "!" - if is_img: - start_idx -= 1 - img_class_str = self._html_class_str_from_tag("img") - result = result_head = ( - f'{self._hash_span(_xml_escape_attr(link_text))}= anchor_allowed_pos: - if self.safe_mode and not self._safe_href.match(url): - result_head = f'' - else: - result_head = f'' - result = f'{result_head}{link_text}' - else: - # anchor not allowed here/invalid markup - curr_pos = start_idx + 1 - continue - - if "smarty-pants" in self.extras: - result = result.replace('"', self._escape_table['"']) - - # allowed from curr_pos onwards, allowed from anchor_allowed_pos onwards. - # this means images can exist within `` tags but anchors can only come after the - # current anchor has been closed - curr_pos = start_idx + len(result_head) - anchor_allowed_pos = start_idx + len(result) - text = text[:start_idx] + result + text[url_end_idx:] - + link_processor = LinkProcessor(self, None) + if link_processor.test(text): + text = link_processor.run(text) return text def header_id_from_text(self, @@ -2682,6 +2493,342 @@ def test(self, text): return '*' in text or '_' in text return self.hash_table and re.search(r'md5-[0-9a-z]{32}', text) + +class _LinkProcessorExtraOpts(TypedDict, total=False): + '''Options for the `LinkProcessor` extra''' + tags: List[str] + '''List of tags to be processed by the extra. Default is `['a', 'img']`''' + inline: bool + '''Whether to process inline links. Default: True''' + ref: bool + '''Whether to process reference links. Default: True''' + + +class LinkProcessor(Extra): + name = 'link-processor' + order = (Stage.ITALIC_AND_BOLD,), (Stage.ESCAPE_SPECIAL,) + options: _LinkProcessorExtraOpts + + def __init__(self, md: Markdown, options: Optional[dict]): + options = options or {} + super().__init__(md, options) + + def parse_inline_anchor_or_image(self, text: str, _link_text: str, start_idx: int) -> Optional[Tuple[str, str, Optional[str], int]]: + ''' + Parse a string and extract a link from it. This can be an inline anchor or an image. + + Args: + text: the whole text containing the link + link_text: the human readable text inside the link + start_idx: the index of the link within `text` + + Returns: + None if a link was not able to be parsed from `text`. + If successful, a tuple is returned containing: + + 1. potentially modified version of the `text` param + 2. the URL + 3. the title (can be None if not present) + 4. the index where the link ends within text + ''' + idx = self.md._find_non_whitespace(text, start_idx + 1) + if idx == len(text): + return + end_idx = idx + has_anglebrackets = text[idx] == "<" + if has_anglebrackets: + end_idx = self.md._find_balanced(text, end_idx+1, "<", ">") + end_idx = self.md._find_balanced(text, end_idx, "(", ")") + match = self.md._inline_link_title.search(text, idx, end_idx) + if not match: + return + url, title = text[idx:match.start()], match.group("title") + if has_anglebrackets: + url = self.md._strip_anglebrackets.sub(r'\1', url) + return text, url, title, end_idx + + def process_link_shortrefs(self, text: str, link_text: str, start_idx: int) -> Tuple[Optional[re.Match], str]: + ''' + Detects shortref links within a string and converts them to normal references + + Args: + text: the whole text containing the link + link_text: the human readable text inside the link + start_idx: the index of the link within `text` + + Returns: + A tuple containing: + + 1. A potential `re.Match` against the link reference within `text` (will be None if not found) + 2. potentially modified version of the `text` param + ''' + match = None + # check if there's no tailing id section + if link_text and re.match(r'[ ]?(?:\n[ ]*)?(?!\[)', text[start_idx:]): + # try a match with `[]` inserted into the text + match = self.md._tail_of_reference_link_re.match(f'{text[:start_idx]}[]{text[start_idx:]}', start_idx) + if match: + # if we get a match, we'll have to modify the `text` variable to insert the `[]` + # but we ONLY want to do that if the link_id is valid. This makes sure that we + # don't get stuck in any loops and also that when a user inputs `[abc]` we don't + # output `[abc][]` in the final HTML + if (match.group("id").lower() or link_text.lower()) in self.md.urls: + text = f'{text[:start_idx]}[]{text[start_idx:]}' + else: + match = None + + return match, text + + def parse_ref_anchor_or_ref_image(self, text: str, link_text: str, start_idx: int) -> Optional[Tuple[str, Optional[str], Optional[str], int]]: + ''' + Parse a string and extract a link from it. This can be a reference anchor or image. + + Args: + text: the whole text containing the link + link_text: the human readable text inside the link + start_idx: the index of the link within `text` + + Returns: + None if a link was not able to be parsed from `text`. + If successful, a tuple is returned containing: + + 1. potentially modified version of the `text` param + 2. the URL (can be None if the reference doesn't exist) + 3. the title (can be None if not present) + 4. the index where the link ends within text + ''' + match = None + if 'link-shortrefs' in self.md.extras: + match, text = self.process_link_shortrefs(text, link_text, start_idx) + + match = match or self.md._tail_of_reference_link_re.match(text, start_idx) + if not match: + # text isn't markup + return + + link_id = match.group("id").lower() or link_text.lower() # for links like [this][] + + url = self.md.urls.get(link_id) + title = self.md.titles.get(link_id) + url_end_idx = match.end() + + return text, url, title, url_end_idx + + def process_image(self, url: str, title_attr: str, link_text: str) -> Tuple[str, int]: + ''' + Takes a URL, title and link text and returns an HTML `` tag + + Args: + url: the image URL/src + title_attr: a string containing the title attribute of the tag (eg: `' title="..."'`) + link_text: the human readable text portion of the link + + Returns: + A tuple containing: + + 1. The HTML string + 2. The length of the opening HTML tag in the string. For `` it's the whole string. + This section will be skipped by the link processor + ''' + img_class_str = self.md._html_class_str_from_tag("img") + result = ( + f'{self.md._hash_span(_xml_escape_attr(link_text))} Tuple[str, int]: + ''' + Takes a URL, title and link text and returns an HTML `` tag + + Args: + url: the URL + title_attr: a string containing the title attribute of the tag (eg: `' title="..."'`) + link_text: the human readable text portion of the link + + Returns: + A tuple containing: + + 1. The HTML string + 2. The length of the opening HTML tag in the string. This section will be skipped + by the link processor + ''' + if self.md.safe_mode and not self.md._safe_href.match(url): + result_head = f'' + else: + result_head = f'' + + return f'{result_head}{link_text}', len(result_head) + + def run(self, text: str): + MAX_LINK_TEXT_SENTINEL = 3000 # markdown2 issue 24 + + # `anchor_allowed_pos` is used to support img links inside + # anchors, but not anchors inside anchors. An anchor's start + # pos must be `>= anchor_allowed_pos`. + anchor_allowed_pos = 0 + + curr_pos = 0 + + while True: + # The next '[' is the start of: + # - an inline anchor: [text](url "title") + # - a reference anchor: [text][id] + # - an inline img: ![text](url "title") + # - a reference img: ![text][id] + # - a footnote ref: [^id] + # (Only if 'footnotes' extra enabled) + # - a footnote defn: [^id]: ... + # (Only if 'footnotes' extra enabled) These have already + # been stripped in _strip_footnote_definitions() so no + # need to watch for them. + # - a link definition: [id]: url "title" + # These have already been stripped in + # _strip_link_definitions() so no need to watch for them. + # - not markup: [...anything else... + try: + start_idx = text.index('[', curr_pos) + except ValueError: + break + text_length = len(text) + + # Find the matching closing ']'. + # Markdown.pl allows *matching* brackets in link text so we + # will here too. Markdown.pl *doesn't* currently allow + # matching brackets in img alt text -- we'll differ in that + # regard. + bracket_depth = 0 + + for p in range( + start_idx + 1, + min(start_idx + MAX_LINK_TEXT_SENTINEL, text_length) + ): + ch = text[p] + if ch == ']': + bracket_depth -= 1 + if bracket_depth < 0: + break + elif ch == '[': + bracket_depth += 1 + else: + # Closing bracket not found within sentinel length. + # This isn't markup. + curr_pos = start_idx + 1 + continue + link_text = text[start_idx + 1: p] + + # Fix for issue 341 - Injecting XSS into link text + if self.md.safe_mode: + link_text = self.md._hash_html_spans(link_text) + link_text = self.md._unhash_html_spans(link_text) + + # Possibly a footnote ref? + if "footnotes" in self.md.extras and link_text.startswith("^"): + normed_id = re.sub(r'\W', '-', link_text[1:]) + if normed_id in self.md.footnotes: + result = ( + f'' + # insert special footnote marker that's easy to find and match against later + f'{self.md._footnote_marker}-{normed_id}' + ) + text = text[:start_idx] + result + text[p+1:] + else: + # This id isn't defined, leave the markup alone. + curr_pos = p + 1 + continue + + # Now determine what this is by the remainder. + p += 1 + + # -- Extract the URL, title and end index from the link + + # inline anchor or inline img + if text[p:p + 1] == '(': + if not self.options.get('inline', True): + curr_pos = start_idx + 1 + continue + + parsed = self.parse_inline_anchor_or_image(text, link_text, p) + if not parsed: + # text isn't markup + curr_pos = start_idx + 1 + continue + + text, url, title, url_end_idx = parsed + url = self.md._unhash_html_spans(url, code=True) + # reference anchor or reference img + else: + if not self.options.get('ref', True): + curr_pos = start_idx + 1 + continue + + parsed = self.parse_ref_anchor_or_ref_image(text, link_text, p) + if not parsed: + curr_pos = start_idx + 1 + continue + + text, url, title, url_end_idx = parsed + if url is None: + # This id isn't defined, leave the markup alone. + # set current pos to end of link title and continue from there + curr_pos = p + continue + + # -- Encode and hash the URL and title to avoid conflicts with italics/bold + + url = ( + url + .replace('*', self.md._escape_table['*']) + .replace('_', self.md._escape_table['_']) + ) + if title: + title = ( + _xml_escape_attr(title) + .replace('*', self.md._escape_table['*']) + .replace('_', self.md._escape_table['_']) + ) + title_str = f' title="{title}"' + else: + title_str = '' + + # -- Process the anchor/image + + is_img = start_idx > 0 and text[start_idx-1] == "!" + if is_img: + if 'img' not in self.options.get('tags', ['img']): + curr_pos = start_idx + 1 + continue + + start_idx -= 1 + result, skip = self.process_image(url, title_str, link_text) + elif start_idx >= anchor_allowed_pos: + if 'a' not in self.options.get('tags', ['a']): + curr_pos = start_idx + 1 + continue + + result, skip = self.process_anchor(url, title_str, link_text) + else: + # anchor not allowed here/invalid markup + curr_pos = start_idx + 1 + continue + + if "smarty-pants" in self.md.extras: + result = result.replace('"', self.md._escape_table['"']) + + # allowed from curr_pos onwards, allowed from anchor_allowed_pos onwards. + # this means images can exist within `` tags but anchors can only come after the + # current anchor has been closed + curr_pos = start_idx + skip + anchor_allowed_pos = start_idx + len(result) + text = text[:start_idx] + result + text[url_end_idx:] + + return text + + def test(self, text): + return '(' in text or '[' in text + + # User facing extras # ---------------------------------------------------------- @@ -3085,6 +3232,48 @@ def test(self, text): return True +class _MarkdownFileLinksExtraOpts(_LinkProcessorExtraOpts, total=False): + '''Options for the `MarkdownFileLinks` extra''' + link_defs: bool + '''Whether to convert link definitions as well. Default: True''' + + +class MarkdownFileLinks(LinkProcessor): + ''' + Replace links to `.md` files with `.html` links + ''' + + name = 'markdown-file-links' + order = (Stage.LINKS,), (Stage.LINK_DEFS,) + options: _MarkdownFileLinksExtraOpts + + def __init__(self, md: Markdown, options: Optional[dict]): + # override LinkProcessor defaults + options = {'tags': ['a'], 'ref': False, **(options or {})} + super().__init__(md, options) + + def parse_inline_anchor_or_image(self, text: str, _link_text: str, start_idx: int): + result = super().parse_inline_anchor_or_image(text, _link_text, start_idx) + if not result or not result[1] or not result[1].endswith('.md'): + # return None for invalid markup, or links that don't end with '.md' + # so that we don't touch them, and other extras can process them freely + return + url = result[1].removesuffix('.md') + '.html' + return result[0], url, *result[2:] + + def run(self, text: str): + if Stage.LINKS > self.md.order > Stage.LINK_DEFS and self.options.get('link_defs', True): + # running just after link defs have been stripped + for key, url in self.md.urls.items(): + if url.endswith('.md'): + self.md.urls[key] = url.removesuffix('.md') + '.html' + + return super().run(text) + + def test(self, text): + return super().test(text) and '.md' in text + + class Mermaid(FencedCodeBlocks): name = 'mermaid' order = (FencedCodeBlocks,), () @@ -3583,6 +3772,7 @@ def test(self, text): Latex.register() LinkPatterns.register() MarkdownInHTML.register() +MarkdownFileLinks.register() MiddleWordEm.register() Mermaid.register() Numbering.register() diff --git a/test/testall.py b/test/testall.py index 1158f529..1cb7dfb9 100644 --- a/test/testall.py +++ b/test/testall.py @@ -17,7 +17,7 @@ def _python_ver_from_python(python): assert ' ' not in python o = os.popen('''%s -c "import sys; print(sys.version)"''' % python) ver_str = o.read().strip() - ver_bits = re.split(r"\.|[^\d]", ver_str, 2)[:2] + ver_bits = re.split(r"\.|[^\d]", ver_str, maxsplit=2)[:2] ver = tuple(map(int, ver_bits)) return ver diff --git a/test/tm-cases/fenced_code_blocks_issue426.html b/test/tm-cases/fenced_code_blocks_issue426.html index 66b1cb9c..ee3e8ae7 100644 --- a/test/tm-cases/fenced_code_blocks_issue426.html +++ b/test/tm-cases/fenced_code_blocks_issue426.html @@ -15,7 +15,7 @@

URL PARAMETERS IN THE TEMPLATE

  • ContextMixin defines the method get_context_data:

    -
    def get_context_data(self, **kwargs):
    +
    def get_context_data(self, **kwargs):
         kwargs.setdefault('view', self)
         if self.extra_context is not None:
             kwargs.update(self.extra_context)
    @@ -26,7 +26,7 @@ 

    URL PARAMETERS IN THE TEMPLATE

    So when overriding one must be careful to extends super's kwargs:

    -
    def get_context_data(self, **kwargs):
    +
    def get_context_data(self, **kwargs):
         kwargs = super().get_context_data(**kwargs)
         kwargs['page_title'] = "Documentation"
         return kwargs
    diff --git a/test/tm-cases/fenced_code_blocks_syntax_indentation.html b/test/tm-cases/fenced_code_blocks_syntax_indentation.html
    index f1a37817..eedd7c65 100644
    --- a/test/tm-cases/fenced_code_blocks_syntax_indentation.html
    +++ b/test/tm-cases/fenced_code_blocks_syntax_indentation.html
    @@ -1,5 +1,5 @@
     
    -
    def foo():
    +
    def foo():
         print "foo"
     
         print "bar"
    diff --git a/test/tm-cases/markdown_file_links.html b/test/tm-cases/markdown_file_links.html
    new file mode 100644
    index 00000000..916e2fea
    --- /dev/null
    +++ b/test/tm-cases/markdown_file_links.html
    @@ -0,0 +1,3 @@
    +

    This is a link to a markdown file

    + +

    This is a reference to a markdown file link

    diff --git a/test/tm-cases/markdown_file_links.opts b/test/tm-cases/markdown_file_links.opts new file mode 100644 index 00000000..e1d066da --- /dev/null +++ b/test/tm-cases/markdown_file_links.opts @@ -0,0 +1 @@ +{'extras': ['markdown-file-links']} \ No newline at end of file diff --git a/test/tm-cases/markdown_file_links.text b/test/tm-cases/markdown_file_links.text new file mode 100644 index 00000000..456340a9 --- /dev/null +++ b/test/tm-cases/markdown_file_links.text @@ -0,0 +1,6 @@ +[This is a link to a markdown file](./file.md) + +[This is a reference to a markdown file link][] + + +[This is a reference to a markdown file link]: ./something.md diff --git a/test/tm-cases/markdown_file_links_no_linkdefs.html b/test/tm-cases/markdown_file_links_no_linkdefs.html new file mode 100644 index 00000000..48f897df --- /dev/null +++ b/test/tm-cases/markdown_file_links_no_linkdefs.html @@ -0,0 +1 @@ +

    This is a reference to a markdown file link but link definition swapping is disabled

    diff --git a/test/tm-cases/markdown_file_links_no_linkdefs.opts b/test/tm-cases/markdown_file_links_no_linkdefs.opts new file mode 100644 index 00000000..c5ae83b5 --- /dev/null +++ b/test/tm-cases/markdown_file_links_no_linkdefs.opts @@ -0,0 +1 @@ +{'extras': {'markdown-file-links': {'link_defs': False}}} \ No newline at end of file diff --git a/test/tm-cases/markdown_file_links_no_linkdefs.text b/test/tm-cases/markdown_file_links_no_linkdefs.text new file mode 100644 index 00000000..6715929d --- /dev/null +++ b/test/tm-cases/markdown_file_links_no_linkdefs.text @@ -0,0 +1,4 @@ +[This is a reference to a markdown file link][] but link definition swapping is disabled + + +[This is a reference to a markdown file link]: ./something.md