Skip to content

feat: detect website fragments #1675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
May 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fixtures/fragments/file.html
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
<a href="#in-the-end">doesn't exist</a><br>
<a href="#">To the top</a><br>
<a href="#top">To the top alt</a><br>
<a href="https://github.com/lycheeverse/lychee#user-content-table-of-contents">To the lychee readme license fragment.</a>
</section>
</body>
</html>
2 changes: 2 additions & 0 deletions fixtures/fragments/file1.md
Original file line number Diff line number Diff line change
Expand Up @@ -63,3 +63,5 @@ without related HTML element. Browser will scroll to the top of the page.
[Alternative link to top of file2](file2.md#top)

##### Lets wear a hat: être

A link to the non-existing fragment: [try](https://github.com/lycheeverse/lychee#non-existent-anchor).
12 changes: 9 additions & 3 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1837,10 +1837,16 @@ mod cli {
))
.stderr(contains("fixtures/fragments/file.html#top"))
.stderr(contains("fixtures/fragments/file2.md#top"))
.stdout(contains("25 Total"))
.stdout(contains("21 OK"))
.stderr(contains(
"https://github.com/lycheeverse/lychee#user-content-table-of-contents",
))
.stderr(contains(
"https://github.com/lycheeverse/lychee#non-existent-anchor",
))
.stdout(contains("27 Total"))
.stdout(contains("22 OK"))
// 5 failures because of missing fragments
.stdout(contains("4 Errors"));
.stdout(contains("5 Errors"));
}

#[test]
Expand Down
17 changes: 13 additions & 4 deletions lychee-lib/src/checker/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ use http::StatusCode;
use log::warn;
use std::path::{Path, PathBuf};

use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
use crate::{
utils::fragment_checker::{FragmentChecker, FragmentInput},
Base, ErrorKind, Status, Uri,
};

/// A utility for checking the existence and validity of file-based URIs.
///
Expand Down Expand Up @@ -167,9 +170,15 @@ impl FileChecker {
///
/// Returns a `Status` indicating the result of the fragment check.
async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
match FragmentInput::from_path(path).await {
Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
Err(err) => {
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
}
},
Err(err) => {
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
Expand Down
51 changes: 47 additions & 4 deletions lychee-lib/src/checker/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ use crate::{
quirks::Quirks,
retry::RetryExt,
types::uri::github::GithubUri,
utils::fragment_checker::{FragmentChecker, FragmentInput},
BasicAuthCredentials, ErrorKind, Status, Uri,
};
use async_trait::async_trait;
use http::StatusCode;
use http::{Method, StatusCode};
use octocrab::Octocrab;
use reqwest::Request;
use reqwest::{Request, Response};
use std::{collections::HashSet, time::Duration};

#[derive(Debug, Clone)]
Expand Down Expand Up @@ -41,11 +42,19 @@ pub(crate) struct WebsiteChecker {
///
/// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool,

/// Whether to check the existence of fragments in the response HTML files.
///
/// Will be disabled if the request method is `HEAD`.
include_fragments: bool,

/// Utility for performing fragment checks in HTML files.
fragment_checker: FragmentChecker,
}

impl WebsiteChecker {
#[allow(clippy::too_many_arguments)]
pub(crate) const fn new(
pub(crate) fn new(
method: reqwest::Method,
retry_wait_time: Duration,
max_retries: u64,
Expand All @@ -54,6 +63,7 @@ impl WebsiteChecker {
github_client: Option<Octocrab>,
require_https: bool,
plugin_request_chain: RequestChain,
include_fragments: bool,
) -> Self {
Self {
method,
Expand All @@ -64,6 +74,8 @@ impl WebsiteChecker {
retry_wait_time,
accepted,
require_https,
include_fragments,
fragment_checker: FragmentChecker::new(),
}
}

Expand All @@ -87,12 +99,43 @@ impl WebsiteChecker {

/// Check a URI using [reqwest](https://github.com/seanmonstar/reqwest).
async fn check_default(&self, request: Request) -> Status {
let method = request.method().clone();
match self.reqwest_client.execute(request).await {
Ok(ref response) => Status::new(response, self.accepted.clone()),
Ok(response) => {
let mut status = Status::new(&response, self.accepted.clone());
if self.include_fragments && status.is_success() && method == Method::GET {
status = self.check_html_fragment(status, response).await;
}
status
}
Err(e) => e.into(),
}
}

async fn check_html_fragment(&self, status: Status, response: Response) -> Status {
let url = response.url().clone();
match response.text().await {
Ok(text) => {
match self
.fragment_checker
.check(
FragmentInput {
content: text,
file_type: crate::FileType::Html,
},
&url,
)
.await
{
Ok(true) => status,
Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.clone().into())),
Err(e) => Status::Error(e),
}
}
Err(e) => Status::Error(ErrorKind::ReadResponseBody(e)),
}
}

/// Checks the given URI of a website.
///
/// # Errors
Expand Down
18 changes: 12 additions & 6 deletions lychee-lib/src/client.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,11 +29,10 @@ use typed_builder::TypedBuilder;

use crate::{
chain::RequestChain,
checker::file::FileChecker,
checker::{mail::MailChecker, website::WebsiteChecker},
checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker},
filter::{Excludes, Filter, Includes},
remap::Remaps,
utils::fragment_checker::FragmentChecker,
utils::fragment_checker::{FragmentChecker, FragmentInput},
Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};

Expand Down Expand Up @@ -399,6 +398,7 @@ impl ClientBuilder {
github_client,
self.require_https,
self.plugin_request_chain,
self.include_fragments,
);

Ok(Client {
Expand Down Expand Up @@ -539,9 +539,15 @@ impl Client {

/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
match FragmentInput::from_path(path).await {
Ok(input) => match self.fragment_checker.check(input, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
Err(err) => {
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
}
},
Err(err) => {
warn!("Skipping fragment check due to the following error: {err}");
Status::Ok(StatusCode::OK)
Expand Down
21 changes: 17 additions & 4 deletions lychee-lib/src/utils/fragment_checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,20 @@ use percent_encoding::percent_decode_str;
use tokio::{fs, sync::Mutex};
use url::Url;

/// Holds the content and file type of the fragment input.
///
/// This is the value handed to the fragment checker's `check` method.
pub(crate) struct FragmentInput {
/// The document text from which fragments are extracted.
pub content: String,
/// The kind of document held in `content`; it selects which fragment
/// extractor is applied to the text.
pub file_type: FileType,
}

impl FragmentInput {
    /// Builds a [`FragmentInput`] from the file at `path`.
    ///
    /// The file is read into a `String`, and the file type is derived from
    /// `path` via `FileType::from`.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read into a string.
    pub(crate) async fn from_path(path: &Path) -> Result<Self> {
        Ok(Self {
            content: fs::read_to_string(path).await?,
            file_type: FileType::from(path),
        })
    }
}

/// Holds a cache of fragments for a given URL.
///
/// Fragments, also known as anchors, are used to link to a specific
Expand All @@ -37,14 +51,14 @@ impl FragmentChecker {
}
}

/// Checks if the given path contains the given fragment.
/// Checks if the given [`FragmentInput`] contains the given fragment.
///
/// Returns false if the link has a fragment which is neither empty nor "top"
/// and the input is a Markdown or HTML document that doesn't contain the given fragment.
/// (Empty # and #top fragments are always valid, triggering the browser to scroll to top.)
///
/// In all other cases, returns true.
pub(crate) async fn check(&self, path: &Path, url: &Url) -> Result<bool> {
pub(crate) async fn check(&self, input: FragmentInput, url: &Url) -> Result<bool> {
let Some(fragment) = url.fragment() else {
return Ok(true);
};
Expand All @@ -54,7 +68,7 @@ impl FragmentChecker {
let mut fragment_decoded = percent_decode_str(fragment).decode_utf8()?;
let url_without_frag = Self::remove_fragment(url.clone());

let file_type = FileType::from(path);
let FragmentInput { content, file_type } = input;
let extractor = match file_type {
FileType::Markdown => extract_markdown_fragments,
FileType::Html => extract_html_fragments,
Expand All @@ -65,7 +79,6 @@ impl FragmentChecker {
}
match self.cache.lock().await.entry(url_without_frag) {
Entry::Vacant(entry) => {
let content = fs::read_to_string(path).await?;
let file_frags = extractor(&content);
let contains_fragment =
file_frags.contains(fragment) || file_frags.contains(&fragment_decoded as &str);
Expand Down