From 912f4f95e5e4458718b8d45472c7798dd9e8991c Mon Sep 17 00:00:00 2001 From: JayJayArr Date: Tue, 15 Apr 2025 07:51:37 +0200 Subject: [PATCH] detect wikilinks, prevent plaintext extraction from links #1650 --- lychee-lib/src/extract/markdown.rs | 41 ++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/lychee-lib/src/extract/markdown.rs b/lychee-lib/src/extract/markdown.rs index ba3d8ed2b9..82a6cc48f4 100644 --- a/lychee-lib/src/extract/markdown.rs +++ b/lychee-lib/src/extract/markdown.rs @@ -10,7 +10,7 @@ use super::html::html5gum::{extract_html, extract_html_fragments}; /// Returns the default markdown extensions used by lychee. /// Sadly, `|` is not const for `Options` so we can't use a const global. fn md_extensions() -> Options { - Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH + Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH | Options::ENABLE_WIKILINKS } /// Extract unparsed URL strings from a Markdown string. @@ -18,6 +18,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec Vec Some(extract_raw_uri_from_plaintext(&dest_url)), // Wiki URL (`[[http://example.com]]`) - // This element is currently not matched and I'm not sure why. - // However, we keep it in here for future compatibility with - // markup5ever. LinkType::WikiLink { has_pothole: _ } => { + inside_link_block = true; Some(vec![RawUri { text: dest_url.to_string(), element: Some("a".to_string()), @@ -100,7 +99,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { - if inside_code_block && !include_verbatim { + if (inside_code_block && !include_verbatim) || inside_link_block { None } else { Some(extract_raw_uri_from_plaintext(&txt)) @@ -123,6 +122,12 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec { + inside_link_block = false; + None + } + // Silently skip over other events _ => None, }) @@ -391,13 +396,29 @@ $$ let markdown = r"[[https://example.com/destination]]"; let expected = vec![RawUri { text: "https://example.com/destination".to_string(), - // This should be a link element, but is currently matched as plaintext - element: None, - attribute: None, - // element: Some("a".to_string()), - // attribute: Some("href".to_string()), + element: Some("a".to_string()), + attribute: Some("href".to_string()), }]; let uris = extract_markdown(markdown, true); assert_eq!(uris, expected); } + + #[test] + fn test_multiple_wiki_links() { + let markdown = r"[[https://example.com/destination]][[https://example.com/source]]"; + let expected = vec![ + RawUri { + text: "https://example.com/destination".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }, + RawUri { + text: "https://example.com/source".to_string(), + element: Some("a".to_string()), + attribute: Some("href".to_string()), + }, + ]; + let uris = extract_markdown(markdown, true); + assert_eq!(uris, expected); + } }