diff --git a/README.md b/README.md index de55c411dd..81516c8692 100644 --- a/README.md +++ b/README.md @@ -217,6 +217,7 @@ FLAGS: --exclude-private Exclude private IP address ranges from checking --glob-ignore-case Ignore case when expanding filesystem path glob inputs --help Prints help information + --html Treat the input as HTML --include-verbatim Find links in verbatim sections like `pre`- and `code` blocks -i, --insecure Proceed for server connections considered insecure (invalid TLS) -n, --no-progress Do not show progress bar. @@ -273,9 +274,9 @@ ARGS: ### Ignoring links You can exclude links from getting checked by specifying regex patterns -with `--exclude` (e.g. `--exclude example\.(com|org)`). +with `--exclude` (e.g. `--exclude example\.(com|org)`). If a file named `.lycheeignore` exists in the current working directory, its -contents are excluded as well. The file allows you to list multiple regular +contents are excluded as well. The file allows you to list multiple regular expressions for exclusion (one pattern per line). ### Caching diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index 597674bb85..a67631a038 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -87,6 +87,9 @@ include_verbatim = false # Ignore case of paths when matching glob patterns. glob_ignore_case = false +# Treat the input as HTML. +html = false + # Exclude URLs from checking (supports regex). 
exclude = [ '.*\.github.com\.*' ] diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 661e202212..79f45b6a5d 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -2,8 +2,8 @@ use crate::parse::{parse_base, parse_statuscodes}; use anyhow::{anyhow, Context, Error, Result}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, - DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, + Base, FileType, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, + DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; @@ -122,6 +122,11 @@ impl LycheeOptions { // but we'd get no access to `glob_ignore_case`. /// Get parsed inputs from options. pub(crate) fn inputs(&self) -> Result> { + let file_type_hint = if self.config.html { + Some(FileType::Html) + } else { + None + }; let excluded = if self.config.exclude_path.is_empty() { None } else { @@ -129,7 +134,14 @@ impl LycheeOptions { }; self.raw_inputs .iter() - .map(|s| Input::new(s, None, self.config.glob_ignore_case, excluded.clone())) + .map(|s| { + Input::new( + s, + file_type_hint, + self.config.glob_ignore_case, + excluded.clone(), + ) + }) .collect::>() .context("Cannot parse inputs from arguments") } @@ -319,6 +331,11 @@ pub(crate) struct Config { #[serde(default)] pub(crate) glob_ignore_case: bool, + /// Treat the input as HTML + #[structopt(long)] + #[serde(default)] + pub(crate) html: bool, + /// Output file of status report #[structopt(short, long, parse(from_os_str))] #[serde(default)] @@ -393,6 +410,7 @@ impl Config { skip_missing: false; include_verbatim: false; glob_ignore_case: false; + html: false; output: None; require_https: false; } diff --git a/lychee-lib/src/types/input.rs b/lychee-lib/src/types/input.rs index 7281f63fac..a11909ede7 100644 --- a/lychee-lib/src/types/input.rs +++ 
b/lychee-lib/src/types/input.rs @@ -3,7 +3,7 @@ use crate::{helpers, ErrorKind, Result}; use async_stream::try_stream; use futures::stream::Stream; use glob::glob_with; -use jwalk::WalkDir; +use jwalk::WalkDirGeneric; use reqwest::Url; use serde::Serialize; use shellexpand::tilde; @@ -198,7 +198,7 @@ impl Input { } InputSource::FsPath(ref path) => { if path.is_dir() { - for entry in WalkDir::new(path).skip_hidden(true) + for entry in WalkDirGeneric::<((usize), (Option))>::new(path).skip_hidden(true) .process_read_dir(move |_, _, _, children| { children.retain(|child| { let entry = match child.as_ref() { @@ -224,19 +224,24 @@ impl Input { } return valid_extension(&entry.path()); }); + children.first_mut().map(|child| { + if let Ok(entry) = child { + entry.client_state = self.file_type_hint; + } + }); }) { let entry = entry?; if entry.file_type().is_dir() { continue; } - let content = Self::path_content(entry.path()).await?; + let content = Self::path_content(entry.path(), entry.client_state).await?; yield content } } else { if self.is_excluded_path(path) { return (); } - let content = Self::path_content(path).await; + let content = Self::path_content(path, self.file_type_hint).await; match content { Err(_) if skip_missing => (), Err(e) => Err(e)?, @@ -301,7 +306,7 @@ impl Input { if self.is_excluded_path(&path) { continue; } - let content: InputContent = Self::path_content(&path).await?; + let content: InputContent = Self::path_content(&path, self.file_type_hint).await?; yield content; } Err(e) => eprintln!("{e:?}"), @@ -325,13 +330,19 @@ impl Input { /// Will return `Err` if file contents can't be read pub async fn path_content + AsRef + Clone>( path: P, + file_type_hint: Option, ) -> Result { let path = path.into(); let content = tokio::fs::read_to_string(&path) .await .map_err(|e| ErrorKind::ReadFileInput(e, path.clone()))?; + let file_type = if file_type_hint.is_none() { + FileType::from(&path) + } else { + file_type_hint.unwrap_or_default() + }; let 
input_content = InputContent { - file_type: FileType::from(&path), + file_type, source: InputSource::FsPath(path), content, }; diff --git a/lychee.example.toml b/lychee.example.toml index 88425d8821..a86ed6595a 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -86,6 +86,9 @@ include_verbatim = false # Ignore case of paths when matching glob patterns. glob_ignore_case = false +# Treat the input as HTML. +html = false + # Exclude URLs from checking (supports regex). exclude = [ '.*\.github.com\.*' ]