Skip to content

detect wikilinks, prevent plaintext extraction from links #1679

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 31 additions & 10 deletions lychee-lib/src/extract/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@ use super::html::html5gum::{extract_html, extract_html_fragments};
/// Returns the default markdown extensions used by lychee.
/// Sadly, `|` is not const for `Options` so we can't use a const global.
fn md_extensions() -> Options {
Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH
Options::ENABLE_HEADING_ATTRIBUTES | Options::ENABLE_MATH | Options::ENABLE_WIKILINKS
}

/// Extract unparsed URL strings from a Markdown string.
pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUri> {
// In some cases it is undesirable to extract links from within code blocks,
// which is why we keep track of entries and exits while traversing the input.
let mut inside_code_block = false;
let mut inside_link_block = false;

let parser = TextMergeStream::new(Parser::new_ext(input, md_extensions()));
parser
Expand Down Expand Up @@ -62,10 +63,8 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
LinkType::Email =>
Some(extract_raw_uri_from_plaintext(&dest_url)),
// Wiki URL (`[[http://example.com]]`)
// This element is currently not matched and I'm not sure why.
// However, we keep it in here for future compatibility with
// markup5ever.
LinkType::WikiLink { has_pothole: _ } => {
inside_link_block = true;
Some(vec![RawUri {
text: dest_url.to_string(),
element: Some("a".to_string()),
Expand Down Expand Up @@ -100,7 +99,7 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr

// A text node.
Event::Text(txt) => {
if inside_code_block && !include_verbatim {
if (inside_code_block && !include_verbatim) || inside_link_block {
None
} else {
Some(extract_raw_uri_from_plaintext(&txt))
Expand All @@ -123,6 +122,12 @@ pub(crate) fn extract_markdown(input: &str, include_verbatim: bool) -> Vec<RawUr
}
}

// A detected link block.
Event::End(TagEnd::Link) => {
inside_link_block = false;
None
}

// Silently skip over other events
_ => None,
})
Expand Down Expand Up @@ -391,13 +396,29 @@ $$
let markdown = r"[[https://example.com/destination]]";
let expected = vec![RawUri {
text: "https://example.com/destination".to_string(),
// This should be a link element, but is currently matched as plaintext
element: None,
attribute: None,
// element: Some("a".to_string()),
// attribute: Some("href".to_string()),
element: Some("a".to_string()),
attribute: Some("href".to_string()),
}];
let uris = extract_markdown(markdown, true);
assert_eq!(uris, expected);
}

// Two back-to-back wiki links must each surface as a proper `a`/`href`
// link element, with nothing leaking through as plaintext between them.
#[test]
fn test_multiple_wiki_links() {
    let input = r"[[https://example.com/destination]][[https://example.com/source]]";

    // Build the expected link list from the two destination URLs.
    let expected: Vec<RawUri> = [
        "https://example.com/destination",
        "https://example.com/source",
    ]
    .iter()
    .map(|url| RawUri {
        text: (*url).to_string(),
        element: Some("a".to_string()),
        attribute: Some("href".to_string()),
    })
    .collect();

    assert_eq!(extract_markdown(input, true), expected);
}
}
Loading