From d8dca7d39cca277d0b61a65f2dc8f33149b73138 Mon Sep 17 00:00:00 2001 From: Keming Date: Fri, 11 Apr 2025 19:09:09 +0800 Subject: [PATCH] feat: detect website fragments Signed-off-by: Keming --- lychee-lib/Cargo.toml | 2 +- lychee-lib/src/checker/website.rs | 48 +++++++++++++++++++++++++++---- lychee-lib/src/client.rs | 1 + 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index b61e42c4cc..c4d46b7317 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -53,6 +53,7 @@ secrecy = "0.10.3" serde = { version = "1.0.219", features = ["derive"] } serde_with = "3.12.0" shellexpand = "3.1.0" +tempfile = "3.19.1" thiserror = "2.0.12" tokio = { version = "1.44.1", features = ["full"] } toml = "0.8.20" @@ -65,7 +66,6 @@ features = ["runtime-tokio"] [dev-dependencies] doc-comment = "0.3.3" -tempfile = "3.19.1" wiremock = "0.6.3" serde_json = "1.0.140" rstest = "0.25.0" diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 62cadaff11..ff00677d75 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -3,13 +3,15 @@ use crate::{ quirks::Quirks, retry::RetryExt, types::uri::github::GithubUri, + utils::fragment_checker::FragmentChecker, BasicAuthCredentials, ErrorKind, Status, Uri, }; use async_trait::async_trait; -use http::StatusCode; +use http::{Method, StatusCode}; use octocrab::Octocrab; -use reqwest::Request; -use std::{collections::HashSet, time::Duration}; +use reqwest::{Request, Response}; +use std::{collections::HashSet, io::Write, time::Duration}; +use tempfile::NamedTempFile; #[derive(Debug, Clone)] pub(crate) struct WebsiteChecker { @@ -41,11 +43,19 @@ pub(crate) struct WebsiteChecker { /// /// This would treat unencrypted links as errors when HTTPS is available. require_https: bool, + + /// Whether to check the existence of fragments in the response HTML files. + /// + /// Fragments are only checked for successful `GET` requests; any other request method (e.g. `HEAD`) skips the check. 
+ include_fragments: bool, + + /// Utility for performing fragment checks in HTML files. + fragment_checker: FragmentChecker, } impl WebsiteChecker { #[allow(clippy::too_many_arguments)] - pub(crate) const fn new( + pub(crate) fn new( method: reqwest::Method, retry_wait_time: Duration, max_retries: u64, @@ -54,6 +64,7 @@ impl WebsiteChecker { github_client: Option<Octocrab>, require_https: bool, plugin_request_chain: RequestChain, + include_fragments: bool, ) -> Self { Self { method, @@ -64,6 +75,8 @@ impl WebsiteChecker { retry_wait_time, accepted, require_https, + include_fragments, + fragment_checker: FragmentChecker::new(), } } @@ -87,12 +100,37 @@ impl WebsiteChecker { /// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest). async fn check_default(&self, request: Request) -> Status { + let method = request.method().clone(); match self.reqwest_client.execute(request).await { - Ok(ref response) => Status::new(response, self.accepted.clone()), + Ok(response) => { + let mut status = Status::new(&response, self.accepted.clone()); + if self.include_fragments && status.is_success() && method == Method::GET { + status = self.check_html_fragment(status, response).await; + } + status + } Err(e) => e.into(), } } + async fn check_html_fragment(&self, status: Status, response: Response) -> Status { + let url = response.url().clone(); + match response.text().await { + Ok(text) => { + let mut file = NamedTempFile::with_suffix(".html").unwrap(); + if let Err(e) = file.write_all(text.as_bytes()) { + return Status::Error(ErrorKind::ReadFileInput(e, file.path().to_path_buf())); + } + return match self.fragment_checker.check(file.path(), &url).await { + Ok(true) => status, + Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.clone().into())), + Err(e) => Status::Error(e), + }; + } + Err(e) => Status::Error(ErrorKind::ReadResponseBody(e)), + } + } + /// Checks the given URI of a website. 
/// /// # Errors diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index f9566f06e4..442b50416d 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -392,6 +392,7 @@ impl ClientBuilder { github_client, self.require_https, self.plugin_request_chain, + self.include_fragments, ); Ok(Client {