From 85ea4b5ee2155f6d3a73d08dfdf51b8994674153 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 7 Aug 2024 23:12:38 +0200
Subject: [PATCH 01/14] Add test for relative links
---
lychee-bin/tests/cli.rs | 42 +++++++++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index 2d23b9d9d0..bfefc34068 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -1667,4 +1667,46 @@ mod cli {
.success()
.stdout(contains("0 Errors"));
}
+
+ /// Test relative paths
+ ///
+ /// Imagine a web server hosting a site with the following structure:
+ /// root
+ /// └── test
+ /// ├── index.html
+ /// └── next.html
+ ///
+ /// where `root/test/index.html` contains `next`
+ /// When checking the link in `root/test/index.html` we should be able to
+ /// resolve the relative path to `root/test/next.html`
+ ///
+ /// Note that the relative path is not resolved to the root of the server
+ /// but relative to the file that contains the link.
+ #[tokio::test]
+ async fn test_resolve_relative_paths_in_subfolder() -> Result<()> {
+ let mock_server = wiremock::MockServer::start().await;
+
+ let body = r#"next"#;
+ wiremock::Mock::given(wiremock::matchers::method("GET"))
+ .and(wiremock::matchers::path("/test/index.html"))
+ .respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body))
+ .mount(&mock_server)
+ .await;
+
+ wiremock::Mock::given(wiremock::matchers::method("GET"))
+ .and(wiremock::matchers::path("/test/next.html"))
+ .respond_with(wiremock::ResponseTemplate::new(200))
+ .mount(&mock_server)
+ .await;
+
+ let mut cmd = main_command();
+ cmd.arg("--verbose")
+ .arg(format!("{}/test/index.html", mock_server.uri()))
+ .assert()
+ .success()
+ .stdout(contains("1 Total"))
+ .stdout(contains("0 Errors"));
+
+ Ok(())
+ }
}
From d17c133d9d9612300178c95acf6d812fb3c99e1a Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sun, 18 Aug 2024 01:55:03 +0200
Subject: [PATCH 02/14] Fix relative links
---
lychee-lib/src/collector.rs | 82 ++++++++++++---
lychee-lib/src/types/base.rs | 21 ++--
lychee-lib/src/utils/request.rs | 172 ++++++++++++++++++++++++--------
3 files changed, 209 insertions(+), 66 deletions(-)
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index 341bb21dba..8fb530db0a 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -1,3 +1,4 @@
+use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Base, Input, Request, Result,
@@ -99,26 +100,31 @@ impl Collector {
///
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links(self, inputs: Vec) -> impl Stream> {
- let base = self.base;
+ let skip_missing_inputs = self.skip_missing_inputs;
+ let skip_hidden = self.skip_hidden;
+ let skip_ignored = self.skip_ignored;
+ let global_base = self.base;
stream::iter(inputs)
- .par_then_unordered(None, move |input| async move {
- input.get_contents(
- self.skip_missing_inputs,
- self.skip_hidden,
- self.skip_ignored,
- )
+ .par_then_unordered(None, move |input| {
+ let global_base = global_base.clone();
+ async move {
+ let input_base = match &input.source {
+ InputSource::RemoteUrl(url) => Some(Base::try_from(url.as_str()).unwrap()),
+ _ => None,
+ };
+ let base = input_base.or(global_base);
+ input
+ .get_contents(skip_missing_inputs, skip_hidden, skip_ignored)
+ .map(move |content| (content, base.clone()))
+ }
})
.flatten()
- .par_then_unordered(None, move |content| {
- // send to parallel worker
- let base = base.clone();
+ .par_then_unordered(None, move |(content, base)| {
let basic_auth_extractor = self.basic_auth_extractor.clone();
async move {
let content = content?;
-
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec = extractor.extract(&content);
-
let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
@@ -426,4 +432,56 @@ mod tests {
assert_eq!(links, expected_links);
}
+
+ #[tokio::test]
+ async fn test_multiple_remote_urls() {
+ let mock_server_1 = mock_server!(
+ StatusCode::OK,
+ set_body_string(r#"Link"#)
+ );
+ let mock_server_2 = mock_server!(
+ StatusCode::OK,
+ set_body_string(r#"Link"#)
+ );
+
+ let inputs = vec![
+ Input {
+ source: InputSource::RemoteUrl(Box::new(
+ Url::parse(&format!(
+ "{}/foo/index.html",
+ mock_server_1.uri().trim_end_matches('/')
+ ))
+ .unwrap(),
+ )),
+ file_type_hint: Some(FileType::Html),
+ excluded_paths: None,
+ },
+ Input {
+ source: InputSource::RemoteUrl(Box::new(
+ Url::parse(&format!(
+ "{}/bar/index.html",
+ mock_server_2.uri().trim_end_matches('/')
+ ))
+ .unwrap(),
+ )),
+ file_type_hint: Some(FileType::Html),
+ excluded_paths: None,
+ },
+ ];
+
+ let links = collect(inputs, None).await;
+
+ let expected_links = HashSet::from_iter([
+ website(&format!(
+ "{}/foo/relative.html",
+ mock_server_1.uri().trim_end_matches('/')
+ )),
+ website(&format!(
+ "{}/bar/relative.html",
+ mock_server_2.uri().trim_end_matches('/')
+ )),
+ ]);
+
+ assert_eq!(links, expected_links);
+ }
}
diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs
index 73a40978f5..a745995756 100644
--- a/lychee-lib/src/types/base.rs
+++ b/lychee-lib/src/types/base.rs
@@ -36,18 +36,17 @@ impl Base {
}
}
- pub(crate) fn from_source(source: &InputSource) -> Option {
+ pub(crate) fn from_source(source: &InputSource) -> Option {
match &source {
InputSource::RemoteUrl(url) => {
- // TODO: This should be refactored.
- // Cases like https://user:pass@example.com are not handled
- // We can probably use the original URL and just replace the
- // path component in the caller of this function
- if let Some(port) = url.port() {
- Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok()
- } else {
- Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
- }
+ // Create a new URL with just the scheme, host, and port
+ let mut base_url = url.clone();
+ base_url.set_path("");
+ base_url.set_query(None);
+ base_url.set_fragment(None);
+
+ // We keep the username and password intact
+ Some(Base::Remote(*base_url))
}
// other inputs do not have a URL to extract a base
_ => None,
@@ -123,7 +122,7 @@ mod test_base {
let url = Url::parse(url).unwrap();
let source = InputSource::RemoteUrl(Box::new(url.clone()));
let base = Base::from_source(&source);
- let expected = Url::parse(expected).unwrap();
+ let expected = Base::Remote(Url::parse(expected).unwrap());
assert_eq!(base, Some(expected));
}
}
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index 70f82ce1d8..cfe4be11dd 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -28,7 +28,9 @@ pub(crate) fn create(
base: &Option,
extractor: &Option,
) -> Result> {
- let base_url = Base::from_source(&input_content.source);
+ let base_url = base
+ .clone()
+ .or_else(|| Base::from_source(&input_content.source));
let requests: Result>> = uris
.into_iter()
@@ -38,30 +40,16 @@ pub(crate) fn create(
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();
- // Truncate the source in case it gets too long Ideally we should
- // avoid the initial String allocation for `source` altogether
+ // Truncate the source in case it gets too long
let source = match &input_content.source {
InputSource::String(s) => {
InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect())
}
- // Cloning is cheap here
c => c.clone(),
};
if let Ok(uri) = Uri::try_from(raw_uri) {
let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
- } else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) {
- let uri = Uri { url };
- let credentials = credentials(extractor, &uri);
-
Ok(Some(Request::new(
uri,
source,
@@ -69,6 +57,21 @@ pub(crate) fn create(
attribute,
credentials,
)))
+ } else if let Some(base) = &base_url {
+ match base.join(&text) {
+ Some(url) => {
+ let uri = Uri { url };
+ let credentials = credentials(extractor, &uri);
+ Ok(Some(Request::new(
+ uri,
+ source,
+ element,
+ attribute,
+ credentials,
+ )))
+ }
+ None => Ok(None),
+ }
} else if let InputSource::FsPath(root) = &input_content.source {
let path = if is_anchor {
match root.file_name() {
@@ -81,11 +84,9 @@ pub(crate) fn create(
} else {
text
};
-
- if let Some(url) = create_uri_from_path(root, &path, base)? {
+ if let Some(url) = create_uri_from_path(root, &path, &base_url)? {
let uri = Uri { url };
let credentials = credentials(extractor, &uri);
-
Ok(Some(Request::new(
uri,
source,
@@ -94,24 +95,7 @@ pub(crate) fn create(
credentials,
)))
} else {
- // In case we cannot create a URI from a path but we didn't receive an error,
- // it means that some preconditions were not met, e.g. the `base_url` wasn't set.
- Ok(None)
- }
- } else if let Some(url) = construct_url(&base_url, &text) {
- if base.is_some() {
Ok(None)
- } else {
- let uri = Uri { url: url? };
- let credentials = credentials(extractor, &uri);
-
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
}
} else {
info!("Handling of `{}` not implemented yet", text);
@@ -119,17 +103,11 @@ pub(crate) fn create(
}
})
.collect();
+
let requests: Vec = requests?.into_iter().flatten().collect();
Ok(HashSet::from_iter(requests))
}
-fn construct_url(base: &Option, text: &str) -> Option> {
- base.as_ref().map(|base| {
- base.join(text)
- .map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}")))
- })
-}
-
fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result
+
+
\ No newline at end of file
diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs
index b03fc35e2c..68bf3ca7ca 100644
--- a/lychee-bin/src/client.rs
+++ b/lychee-bin/src/client.rs
@@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) -
ClientBuilder::builder()
.remaps(remaps)
+ .base(cfg.base.clone())
.includes(includes)
.excludes(excludes)
.exclude_all_private(cfg.exclude_all_private)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index bfefc34068..c53d6c940a 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -266,13 +266,17 @@ mod cli {
#[test]
fn test_resolve_paths() {
+ // TODO:
+ // 1. Check the diff to see if the absolute file rewrite hack can be removed / is still in use
+ // 2. refactor the code to clean up base path handling
+
let mut cmd = main_command();
- let offline_dir = fixtures_path().join("offline");
+ let dir = fixtures_path().join("resolve_paths");
cmd.arg("--offline")
.arg("--base")
- .arg(&offline_dir)
- .arg(offline_dir.join("index.html"))
+ .arg(&dir)
+ .arg(dir.join("index.html"))
.env_clear()
.assert()
.success()
@@ -1216,8 +1220,9 @@ mod cli {
Ok(())
}
- /// If base-dir is not set, don't throw an error in case we encounter
- /// an absolute local link within a file (e.g. `/about`).
+ /// If `base-dir` is not set, don't throw an error in case we encounter
+ /// an absolute local link (e.g. `/about`) within a file.
+ /// Instead, simply ignore the link.
#[test]
fn test_ignore_absolute_local_links_without_base() -> Result<()> {
let mut cmd = main_command();
@@ -1409,9 +1414,7 @@ mod cli {
.arg("./NOT-A-REAL-TEST-FIXTURE.md")
.assert()
.failure()
- .stderr(contains(
- "Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md",
- ));
+ .stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md"));
Ok(())
}
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index 49c652a52e..d5d102aca0 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -13,7 +13,12 @@
clippy::default_trait_access,
clippy::used_underscore_binding
)]
-use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
+use std::{
+ collections::HashSet,
+ path::{Path, PathBuf},
+ sync::Arc,
+ time::Duration,
+};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
@@ -37,7 +42,7 @@ use crate::{
remap::Remaps,
types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
- ErrorKind, Request, Response, Result, Status, Uri,
+ Base, ErrorKind, Request, Response, Result, Status, Uri,
};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
@@ -248,6 +253,12 @@ pub struct ClientBuilder {
/// Response timeout per request in seconds.
timeout: Option,
+ /// Base for resolving paths.
+ ///
+ /// E.g. if the base is `/home/user/` and the path is `file.txt`, the
+ /// resolved path would be `/home/user/file.txt`.
+ base: Option,
+
/// Initial time between retries of failed requests.
///
/// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`].
@@ -387,6 +398,7 @@ impl ClientBuilder {
reqwest_client,
github_client,
remaps: self.remaps,
+ base: self.base,
fallback_extensions: self.fallback_extensions,
filter,
max_retries: self.max_retries,
@@ -416,6 +428,9 @@ pub struct Client {
/// Optional remapping rules for URIs matching pattern.
remaps: Option,
+ /// Optional base for resolving paths.
+ base: Option,
+
/// Automatically append file extensions to `file://` URIs as needed
fallback_extensions: Vec,
@@ -463,7 +478,7 @@ impl Client {
///
/// Returns an `Err` if:
/// - `request` does not represent a valid URI.
- /// - Encrypted connection for a HTTP URL is available but unused. (Only
+ /// - Encrypted connection for a HTTP URL is available but unused. (Only
/// checked when `Client::require_https` is `true`.)
#[allow(clippy::missing_panics_doc)]
pub async fn check(&self, request: T) -> Result
@@ -493,22 +508,72 @@ impl Client {
return Ok(Response::new(uri.clone(), Status::Excluded, source));
}
- let default_chain: RequestChain = Chain::new(vec![
- Box::::default(),
- Box::new(credentials),
- Box::new(Checker::new(
- self.retry_wait_time,
- self.max_retries,
- self.reqwest_client.clone(),
- self.accepted.clone(),
- )),
- ]);
-
let status = match uri.scheme() {
- _ if uri.is_file() => self.check_file(uri).await,
+ _ if uri.is_file() => {
+ // If base is set and the path is absolute, prepend the base
+ // to the path.
+ if let Some(ref base) = self.base {
+ let Ok(path) = uri.url.to_file_path() else {
+ return Err(ErrorKind::InvalidFilePath(uri.clone()).into());
+ };
+
+ if path.is_absolute() {
+ match base {
+ Base::Local(ref base_path) => {
+ // Ensure base_path is absolute
+ let absolute_base_path = if base_path.is_relative() {
+ std::env::current_dir()
+ .unwrap_or_else(|_| PathBuf::new())
+ .join(base_path)
+ } else {
+ base_path.to_path_buf()
+ };
+
+ // Optionally strip the leading slash from the path (if it exists)
+ let stripped = path.strip_prefix("/").unwrap_or(&path);
+ let resolved = absolute_base_path.join(stripped);
+
+ // Create a new URI with the resolved path
+ let mut uri = uri.clone();
+ uri.url = Url::from_file_path(&resolved).unwrap();
+
+ if resolved.exists() {
+ return Ok(Response::new(
+ uri.clone(),
+ Status::Ok(StatusCode::OK),
+ source,
+ ));
+ }
+
+ return Ok(Response::new(
+ uri.clone(),
+ Status::Error(ErrorKind::InvalidFilePath(uri.clone())),
+ source,
+ ));
+ }
+ Base::Remote(_) => (),
+ }
+ }
+ }
+
+ self.check_file(uri).await
+ }
_ if uri.is_mail() => self.check_mail(uri).await,
_ if uri.is_tel() => Status::Excluded,
- _ => self.check_website(uri, default_chain).await?,
+ _ => {
+ let default_chain: RequestChain = Chain::new(vec![
+ Box::::default(),
+ Box::new(credentials),
+ Box::new(Checker::new(
+ self.retry_wait_time,
+ self.max_retries,
+ self.reqwest_client.clone(),
+ self.accepted.clone(),
+ )),
+ ]);
+
+ self.check_website(uri, default_chain).await?
+ }
};
Ok(Response::new(uri.clone(), status, source))
@@ -690,6 +755,7 @@ impl Client {
/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
+ println!("Checking fragment for {:?}", path);
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index c59718a3be..83e20a5e2c 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -26,14 +26,22 @@ fn create_request(
source: &InputSource,
base: &Option,
extractor: &Option,
-) -> Result {
- let uri = try_parse_into_uri(&raw_uri, source, base)?;
+) -> Result
> {
+ let Some(uri) = try_parse_into_uri(&raw_uri, source, base)? else {
+ return Ok(None);
+ };
let source = truncate_source(source);
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();
let credentials = extract_credentials(extractor, &uri);
- Ok(Request::new(uri, source, element, attribute, credentials))
+ Ok(Some(Request::new(
+ uri,
+ source,
+ element,
+ attribute,
+ credentials,
+ )))
}
/// Try to parse the raw URI into a `Uri`.
@@ -47,7 +55,11 @@ fn create_request(
/// to create a valid URI.
/// - If a URI cannot be created from the file path.
/// - If the source is not a file path (i.e. the URI type is not supported).
-fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result {
+fn try_parse_into_uri(
+ raw_uri: &RawUri,
+ source: &InputSource,
+ base: &Option,
+) -> Result
> {
let text = raw_uri.text.clone();
let uri = match Uri::try_from(raw_uri.clone()) {
Ok(uri) => uri,
@@ -57,12 +69,20 @@ fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option return Err(ErrorKind::InvalidBaseJoin(text.clone())),
},
None => match source {
- InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?,
+ InputSource::FsPath(root) => {
+ // If absolute link (`/`) and `base` is not set,
+ // simply ignore the link as it's not resolvable.
+ if text.starts_with('/') && base.is_none() {
+ return Ok(None);
+ }
+
+ create_uri_from_file_path(root, &text, base)?
+ }
_ => return Err(ErrorKind::UnsupportedUriType(text)),
},
},
};
- Ok(uri)
+ Ok(Some(uri))
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
@@ -115,6 +135,9 @@ fn truncate_source(source: &InputSource) -> InputSource {
/// Create requests out of the collected URLs.
/// Only keeps "valid" URLs. This filters out anchors for example.
+///
+/// If a URLs is ignored (because of the current settings),
+/// it will not be added to the `HashSet`.
pub(crate) fn create(
uris: Vec,
input_content: &InputContent,
@@ -126,9 +149,15 @@ pub(crate) fn create(
.or_else(|| Base::from_source(&input_content.source));
let requests: Result> = uris
.into_iter()
- .map(|raw_uri| create_request(raw_uri, &input_content.source, &base, extractor))
+ .filter_map(|raw_uri| {
+ match create_request(raw_uri, &input_content.source, &base, extractor) {
+ Ok(Some(request)) => Some(Ok(request)),
+ Ok(None) => None,
+ Err(e) => Some(Err(e)),
+ }
+ })
.collect();
- Ok(HashSet::from_iter(requests?))
+ requests.map(HashSet::from_iter)
}
/// Create a URI from a path
@@ -147,6 +176,7 @@ fn resolve_and_create_url(
dest_path: &str,
base_uri: &Option,
) -> Result {
+ println!("resolve_and_create_url src_path: {:?}", src_path);
let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
// Decode the destination path to avoid double-encoding
@@ -301,7 +331,7 @@ mod tests {
assert_eq!(
actual,
- Request::new(
+ Some(Request::new(
Uri {
url: Url::from_file_path("/tmp/lychee/file.html").unwrap()
},
@@ -309,7 +339,7 @@ mod tests {
None,
None,
None,
- )
+ ))
);
}
@@ -329,7 +359,7 @@ mod tests {
assert_eq!(
actual,
- Request::new(
+ Some(Request::new(
Uri {
url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap()
},
@@ -337,7 +367,7 @@ mod tests {
None,
None,
None,
- )
+ ))
);
}
@@ -352,7 +382,10 @@ mod tests {
let raw_uri = RawUri::from("relative.html");
let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
- assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
+ assert_eq!(
+ uri.unwrap().url.as_str(),
+ "file:///tmp/lychee/relative.html"
+ );
}
#[test]
@@ -366,6 +399,9 @@ mod tests {
let raw_uri = RawUri::from("absolute.html");
let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
- assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
+ assert_eq!(
+ uri.unwrap().url.as_str(),
+ "file:///tmp/lychee/absolute.html"
+ );
}
}
From e520599da7371330fb156dc98efe5b9e888e1902 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 14 Oct 2024 01:06:23 +0200
Subject: [PATCH 06/14] lint
---
lychee-lib/src/client.rs | 5 ++---
lychee-lib/src/utils/request.rs | 15 +++++++--------
2 files changed, 9 insertions(+), 11 deletions(-)
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index d5d102aca0..e85c48c1ef 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -514,7 +514,7 @@ impl Client {
// to the path.
if let Some(ref base) = self.base {
let Ok(path) = uri.url.to_file_path() else {
- return Err(ErrorKind::InvalidFilePath(uri.clone()).into());
+ return Err(ErrorKind::InvalidFilePath(uri.clone()));
};
if path.is_absolute() {
@@ -526,7 +526,7 @@ impl Client {
.unwrap_or_else(|_| PathBuf::new())
.join(base_path)
} else {
- base_path.to_path_buf()
+ base_path.clone()
};
// Optionally strip the leading slash from the path (if it exists)
@@ -755,7 +755,6 @@ impl Client {
/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
- println!("Checking fragment for {:?}", path);
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index 83e20a5e2c..e46862e8ba 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -22,12 +22,12 @@ fn extract_credentials(
/// Create a request from a raw URI.
fn create_request(
- raw_uri: RawUri,
+ raw_uri: &RawUri,
source: &InputSource,
base: &Option,
extractor: &Option,
) -> Result
> {
- let Some(uri) = try_parse_into_uri(&raw_uri, source, base)? else {
+ let Some(uri) = try_parse_into_uri(raw_uri, source, base)? else {
return Ok(None);
};
let source = truncate_source(source);
@@ -94,7 +94,7 @@ pub(crate) fn is_anchor(text: &str) -> bool {
///
fn create_uri_from_file_path(
- file_path: &PathBuf,
+ file_path: &Path,
link_text: &str,
base: &Option,
) -> Result {
@@ -103,7 +103,7 @@ fn create_uri_from_file_path(
let file_name = file_path
.file_name()
.and_then(|name| name.to_str())
- .ok_or_else(|| ErrorKind::InvalidFile(file_path.clone()))?;
+ .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?;
format!("{file_name}{link_text}")
} else {
@@ -150,7 +150,7 @@ pub(crate) fn create(
let requests: Result> = uris
.into_iter()
.filter_map(|raw_uri| {
- match create_request(raw_uri, &input_content.source, &base, extractor) {
+ match create_request(&raw_uri, &input_content.source, &base, extractor) {
Ok(Some(request)) => Some(Ok(request)),
Ok(None) => None,
Err(e) => Some(Err(e)),
@@ -176,7 +176,6 @@ fn resolve_and_create_url(
dest_path: &str,
base_uri: &Option,
) -> Result {
- println!("resolve_and_create_url src_path: {:?}", src_path);
let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path);
// Decode the destination path to avoid double-encoding
@@ -327,7 +326,7 @@ mod tests {
let input_source = InputSource::FsPath(PathBuf::from("page.html"));
let actual =
- create_request(RawUri::from("file.html"), &input_source, &base, &None).unwrap();
+ create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap();
assert_eq!(
actual,
@@ -350,7 +349,7 @@ mod tests {
// Use an absolute path that's outside the base directory
let actual = create_request(
- RawUri::from("/usr/local/share/doc/example.html"),
+ &RawUri::from("/usr/local/share/doc/example.html"),
&input_source,
&base,
&None,
From c8c9db25e37c37b2738c85a30a851a48a681a3ea Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 14 Oct 2024 01:11:26 +0200
Subject: [PATCH 07/14] update test
---
lychee-bin/tests/cli.rs | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index c53d6c940a..c215f6f966 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -280,8 +280,8 @@ mod cli {
.env_clear()
.assert()
.success()
- .stdout(contains("4 Total"))
- .stdout(contains("4 OK"));
+ .stdout(contains("3 Total"))
+ .stdout(contains("3 OK"));
}
#[test]
From a55e4f39a9cabbe60be0e5091f4947db6f416eb2 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 14 Oct 2024 01:35:50 +0200
Subject: [PATCH 08/14] Remove absolute file path hack
---
lychee-bin/tests/cli.rs | 4 +-
lychee-lib/src/utils/request.rs | 70 ++++++++++++---------------------
2 files changed, 27 insertions(+), 47 deletions(-)
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index c215f6f966..9dbae37d34 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -266,9 +266,7 @@ mod cli {
#[test]
fn test_resolve_paths() {
- // TODO:
- // 1. Check the diff to see if the absolute file rewrite hack can be removed / is still in use
- // 2. refactor the code to clean up base path handling
+ // TODO: Refactor the code to clean up base path handling
let mut cmd = main_command();
let dir = fixtures_path().join("resolve_paths");
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index e46862e8ba..f2b77e01e7 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -1,3 +1,4 @@
+use log::warn;
use percent_encoding::percent_decode_str;
use reqwest::Url;
use std::{
@@ -26,22 +27,14 @@ fn create_request(
source: &InputSource,
base: &Option,
extractor: &Option,
-) -> Result
> {
- let Some(uri) = try_parse_into_uri(raw_uri, source, base)? else {
- return Ok(None);
- };
+) -> Result {
+ let uri = try_parse_into_uri(raw_uri, source, base)?;
let source = truncate_source(source);
let element = raw_uri.element.clone();
let attribute = raw_uri.attribute.clone();
let credentials = extract_credentials(extractor, &uri);
- Ok(Some(Request::new(
- uri,
- source,
- element,
- attribute,
- credentials,
- )))
+ Ok(Request::new(uri, source, element, attribute, credentials))
}
/// Try to parse the raw URI into a `Uri`.
@@ -55,11 +48,7 @@ fn create_request(
/// to create a valid URI.
/// - If a URI cannot be created from the file path.
/// - If the source is not a file path (i.e. the URI type is not supported).
-fn try_parse_into_uri(
- raw_uri: &RawUri,
- source: &InputSource,
- base: &Option,
-) -> Result
> {
+fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result {
let text = raw_uri.text.clone();
let uri = match Uri::try_from(raw_uri.clone()) {
Ok(uri) => uri,
@@ -69,20 +58,12 @@ fn try_parse_into_uri(
None => return Err(ErrorKind::InvalidBaseJoin(text.clone())),
},
None => match source {
- InputSource::FsPath(root) => {
- // If absolute link (`/`) and `base` is not set,
- // simply ignore the link as it's not resolvable.
- if text.starts_with('/') && base.is_none() {
- return Ok(None);
- }
-
- create_uri_from_file_path(root, &text, base)?
- }
+ InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?,
_ => return Err(ErrorKind::UnsupportedUriType(text)),
},
},
};
- Ok(Some(uri))
+ Ok(uri)
}
// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs
@@ -92,7 +73,11 @@ pub(crate) fn is_anchor(text: &str) -> bool {
/// Create a URI from a file path
///
-
+/// # Errors
+///
+/// - If the link text is an anchor and the file name cannot be extracted from the file path.
+/// - If the path cannot be resolved.
+/// - If the resolved path cannot be converted to a URL.
fn create_uri_from_file_path(
file_path: &Path,
link_text: &str,
@@ -147,17 +132,20 @@ pub(crate) fn create(
let base = base
.clone()
.or_else(|| Base::from_source(&input_content.source));
- let requests: Result> = uris
+
+ let requests: HashSet = uris
.into_iter()
.filter_map(|raw_uri| {
match create_request(&raw_uri, &input_content.source, &base, extractor) {
- Ok(Some(request)) => Some(Ok(request)),
- Ok(None) => None,
- Err(e) => Some(Err(e)),
+ Ok(request) => Some(request),
+ Err(e) => {
+ warn!("Error creating request: {:?}", e);
+ None
+ }
}
})
.collect();
- requests.map(HashSet::from_iter)
+ Ok(requests)
}
/// Create a URI from a path
@@ -330,7 +318,7 @@ mod tests {
assert_eq!(
actual,
- Some(Request::new(
+ Request::new(
Uri {
url: Url::from_file_path("/tmp/lychee/file.html").unwrap()
},
@@ -338,7 +326,7 @@ mod tests {
None,
None,
None,
- ))
+ )
);
}
@@ -358,7 +346,7 @@ mod tests {
assert_eq!(
actual,
- Some(Request::new(
+ Request::new(
Uri {
url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap()
},
@@ -366,7 +354,7 @@ mod tests {
None,
None,
None,
- ))
+ )
);
}
@@ -381,10 +369,7 @@ mod tests {
let raw_uri = RawUri::from("relative.html");
let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
- assert_eq!(
- uri.unwrap().url.as_str(),
- "file:///tmp/lychee/relative.html"
- );
+ assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html");
}
#[test]
@@ -398,9 +383,6 @@ mod tests {
let raw_uri = RawUri::from("absolute.html");
let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap();
- assert_eq!(
- uri.unwrap().url.as_str(),
- "file:///tmp/lychee/absolute.html"
- );
+ assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html");
}
}
From eedaff26ef9d97ffee1221ba9cb6bbb506475bb2 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Mon, 14 Oct 2024 01:58:10 +0200
Subject: [PATCH 09/14] fix lints
---
lychee-lib/src/collector.rs | 2 +-
lychee-lib/src/utils/request.rs | 20 +++++++++-----------
2 files changed, 10 insertions(+), 12 deletions(-)
diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs
index fd55626398..955bdd24e7 100644
--- a/lychee-lib/src/collector.rs
+++ b/lychee-lib/src/collector.rs
@@ -124,7 +124,7 @@ impl Collector {
let content = content?;
let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec = extractor.extract(&content);
- let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
+ let requests = request::create(uris, &content, &base, &basic_auth_extractor);
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
})
diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs
index f2b77e01e7..7867e50f09 100644
--- a/lychee-lib/src/utils/request.rs
+++ b/lychee-lib/src/utils/request.rs
@@ -128,13 +128,12 @@ pub(crate) fn create(
input_content: &InputContent,
base: &Option,
extractor: &Option,
-) -> Result> {
+) -> HashSet {
let base = base
.clone()
.or_else(|| Base::from_source(&input_content.source));
- let requests: HashSet = uris
- .into_iter()
+ uris.into_iter()
.filter_map(|raw_uri| {
match create_request(&raw_uri, &input_content.source, &base, extractor) {
Ok(request) => Some(request),
@@ -144,8 +143,7 @@ pub(crate) fn create(
}
}
})
- .collect();
- Ok(requests)
+ .collect()
}
/// Create a URI from a path
@@ -218,7 +216,7 @@ mod tests {
);
let uris = vec![RawUri::from("relative.html")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
@@ -235,7 +233,7 @@ mod tests {
);
let uris = vec![RawUri::from("https://another.com/page")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
@@ -252,7 +250,7 @@ mod tests {
);
let uris = vec![RawUri::from("/root-relative")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
@@ -269,7 +267,7 @@ mod tests {
);
let uris = vec![RawUri::from("../parent")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
@@ -283,7 +281,7 @@ mod tests {
let input = create_input(r##"Fragment Link"##, FileType::Html);
let uris = vec![RawUri::from("#fragment")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
@@ -300,7 +298,7 @@ mod tests {
);
let uris = vec![RawUri::from("https://example.com/page")];
- let requests = create(uris, &input, &base, &None).unwrap();
+ let requests = create(uris, &input, &base, &None);
assert_eq!(requests.len(), 1);
assert!(requests
From 779c457c32ecb7c824b9199122643e9cc3977daa Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 16 Oct 2024 21:48:30 +0200
Subject: [PATCH 10/14] refactor
---
lychee-lib/src/checker/file.rs | 111 +++++++++++++++++
lychee-lib/src/checker/mod.rs | 7 ++
.../src/{checker.rs => checker/website.rs} | 2 +-
lychee-lib/src/client.rs | 115 +++---------------
4 files changed, 135 insertions(+), 100 deletions(-)
create mode 100644 lychee-lib/src/checker/file.rs
create mode 100644 lychee-lib/src/checker/mod.rs
rename lychee-lib/src/{checker.rs => checker/website.rs} (99%)
diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs
new file mode 100644
index 0000000000..d0f1f6c3a2
--- /dev/null
+++ b/lychee-lib/src/checker/file.rs
@@ -0,0 +1,111 @@
+use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
+use http::StatusCode;
+use log::warn;
+use std::path::{Path, PathBuf};
+
+#[derive(Debug, Clone)]
+pub(crate) struct FileChecker {
+ base: Option,
+ fallback_extensions: Vec,
+ include_fragments: bool,
+ fragment_checker: FragmentChecker,
+}
+
+impl FileChecker {
+ pub(crate) fn new(
+ base: Option,
+ fallback_extensions: Vec,
+ include_fragments: bool,
+ ) -> Self {
+ Self {
+ base,
+ fallback_extensions,
+ include_fragments,
+ fragment_checker: FragmentChecker::new(),
+ }
+ }
+
+ pub(crate) async fn check(&self, uri: &Uri) -> Status {
+ let Ok(path) = uri.url.to_file_path() else {
+ return ErrorKind::InvalidFilePath(uri.clone()).into();
+ };
+
+ if path.is_absolute() {
+ let resolved_path = self.resolve_absolute_path(&path);
+ return self.check_resolved_path(&resolved_path, uri).await;
+ }
+
+ self.check_path(&path, uri).await
+ }
+
+ async fn check_resolved_path(&self, path: &Path, uri: &Uri) -> Status {
+ if path.exists() {
+ if self.include_fragments {
+ self.check_fragment(path, uri).await
+ } else {
+ Status::Ok(StatusCode::OK)
+ }
+ } else {
+ ErrorKind::InvalidFilePath(uri.clone()).into()
+ }
+ }
+
+ async fn check_path(&self, path: &Path, uri: &Uri) -> Status {
+ if path.exists() {
+ return self.check_existing_path(path, uri).await;
+ }
+
+ if path.extension().is_some() {
+ return ErrorKind::InvalidFilePath(uri.clone()).into();
+ }
+
+ self.check_with_fallback_extensions(path, uri).await
+ }
+
+ async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status {
+ if self.include_fragments {
+ self.check_fragment(path, uri).await
+ } else {
+ Status::Ok(StatusCode::OK)
+ }
+ }
+
+ async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
+ let mut path_buf = path.to_path_buf();
+ for ext in &self.fallback_extensions {
+ path_buf.set_extension(ext);
+ if path_buf.exists() {
+ return self.check_existing_path(&path_buf, uri).await;
+ }
+ }
+ ErrorKind::InvalidFilePath(uri.clone()).into()
+ }
+
+ fn resolve_absolute_path(&self, path: &Path) -> PathBuf {
+ if let Some(Base::Local(base_path)) = &self.base {
+ let absolute_base_path = if base_path.is_relative() {
+ std::env::current_dir()
+ .unwrap_or_else(|_| PathBuf::new())
+ .join(base_path)
+ } else {
+ base_path.to_path_buf()
+ };
+
+ let stripped = path.strip_prefix("/").unwrap_or(path);
+ absolute_base_path.join(stripped)
+ } else {
+ path.to_path_buf()
+ }
+ }
+
+ async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
+ match self.fragment_checker.check(path, &uri.url).await {
+ Ok(true) => Status::Ok(StatusCode::OK),
+ Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(),
+ Err(err) => {
+ warn!("Skipping fragment check due to the following error: {err}");
+ Status::Ok(StatusCode::OK)
+ }
+ }
+ }
+}
diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs
new file mode 100644
index 0000000000..eaf206e21a
--- /dev/null
+++ b/lychee-lib/src/checker/mod.rs
@@ -0,0 +1,7 @@
+//! Checker Module
+//!
+//! This module contains all checkers, which are responsible for checking the status of a URL.
+//! Each checker implements [Handler](crate::chain::Handler).
+
+pub(crate) mod file;
+pub(crate) mod website;
diff --git a/lychee-lib/src/checker.rs b/lychee-lib/src/checker/website.rs
similarity index 99%
rename from lychee-lib/src/checker.rs
rename to lychee-lib/src/checker/website.rs
index 428551ad2b..f2cffd4298 100644
--- a/lychee-lib/src/checker.rs
+++ b/lychee-lib/src/checker/website.rs
@@ -77,4 +77,4 @@ impl Handler for Checker {
async fn handle(&mut self, input: Request) -> ChainResult {
ChainResult::Done(self.retry_request(input).await)
}
-}
+}
\ No newline at end of file
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index e85c48c1ef..c78d9f863f 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -13,12 +13,7 @@
clippy::default_trait_access,
clippy::used_underscore_binding
)]
-use std::{
- collections::HashSet,
- path::{Path, PathBuf},
- sync::Arc,
- time::Duration,
-};
+use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
@@ -36,7 +31,8 @@ use typed_builder::TypedBuilder;
use crate::{
chain::{Chain, ClientRequestChains, RequestChain},
- checker::Checker,
+ checker::file::FileChecker,
+ checker::website::Checker,
filter::{Excludes, Filter, Includes},
quirks::Quirks,
remap::Remaps,
@@ -398,16 +394,18 @@ impl ClientBuilder {
reqwest_client,
github_client,
remaps: self.remaps,
- base: self.base,
- fallback_extensions: self.fallback_extensions,
filter,
max_retries: self.max_retries,
retry_wait_time: self.retry_wait_time,
method: self.method,
accepted: self.accepted,
require_https: self.require_https,
- include_fragments: self.include_fragments,
fragment_checker: FragmentChecker::new(),
+ file_checker: FileChecker::new(
+ self.base,
+ self.fallback_extensions,
+ self.include_fragments,
+ ),
plugin_request_chain: self.plugin_request_chain,
})
}
@@ -428,12 +426,6 @@ pub struct Client {
/// Optional remapping rules for URIs matching pattern.
remaps: Option,
- /// Optional base for resolving paths.
- base: Option,
-
- /// Automatically append file extensions to `file://` URIs as needed
- fallback_extensions: Vec,
-
/// Rules to decided whether each link should be checked or ignored.
filter: Filter,
@@ -459,13 +451,12 @@ pub struct Client {
/// This would treat unencrypted links as errors when HTTPS is available.
require_https: bool,
- /// Enable the checking of fragments in links.
- include_fragments: bool,
-
/// Caches Fragments
fragment_checker: FragmentChecker,
plugin_request_chain: RequestChain,
+
+ file_checker: FileChecker,
}
impl Client {
@@ -509,55 +500,7 @@ impl Client {
}
let status = match uri.scheme() {
- _ if uri.is_file() => {
- // If base is set and the path is absolute, prepend the base
- // to the path.
- if let Some(ref base) = self.base {
- let Ok(path) = uri.url.to_file_path() else {
- return Err(ErrorKind::InvalidFilePath(uri.clone()));
- };
-
- if path.is_absolute() {
- match base {
- Base::Local(ref base_path) => {
- // Ensure base_path is absolute
- let absolute_base_path = if base_path.is_relative() {
- std::env::current_dir()
- .unwrap_or_else(|_| PathBuf::new())
- .join(base_path)
- } else {
- base_path.clone()
- };
-
- // Optionally strip the leading slash from the path (if it exists)
- let stripped = path.strip_prefix("/").unwrap_or(&path);
- let resolved = absolute_base_path.join(stripped);
-
- // Create a new URI with the resolved path
- let mut uri = uri.clone();
- uri.url = Url::from_file_path(&resolved).unwrap();
-
- if resolved.exists() {
- return Ok(Response::new(
- uri.clone(),
- Status::Ok(StatusCode::OK),
- source,
- ));
- }
-
- return Ok(Response::new(
- uri.clone(),
- Status::Error(ErrorKind::InvalidFilePath(uri.clone())),
- source,
- ));
- }
- Base::Remote(_) => (),
- }
- }
- }
-
- self.check_file(uri).await
- }
+ _ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
_ if uri.is_tel() => Status::Excluded,
_ => {
@@ -579,6 +522,11 @@ impl Client {
Ok(Response::new(uri.clone(), status, source))
}
+ /// Check a single file using the file checker.
+ pub async fn check_file(&self, uri: &Uri) -> Status {
+ self.file_checker.check(uri).await
+ }
+
/// Remap `uri` using the client-defined remapping rules.
///
/// # Errors
@@ -722,37 +670,6 @@ impl Client {
Status::Ok(StatusCode::OK)
}
- /// Check a `file` URI.
- pub async fn check_file(&self, uri: &Uri) -> Status {
- let Ok(path) = uri.url.to_file_path() else {
- return ErrorKind::InvalidFilePath(uri.clone()).into();
- };
-
- if path.exists() {
- if self.include_fragments {
- return self.check_fragment(&path, uri).await;
- }
- return Status::Ok(StatusCode::OK);
- }
-
- if path.extension().is_some() {
- return ErrorKind::InvalidFilePath(uri.clone()).into();
- }
-
- // if the path has no file extension, try to append some
- let mut path_buf = path.clone();
- for ext in &self.fallback_extensions {
- path_buf.set_extension(ext);
- if path_buf.exists() {
- if self.include_fragments {
- return self.check_fragment(&path_buf, uri).await;
- }
- return Status::Ok(StatusCode::OK);
- }
- }
- ErrorKind::InvalidFilePath(uri.clone()).into()
- }
-
/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
From 2363e0496cae33fbe39b65dbbcaa3acb5eca8e31 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Wed, 16 Oct 2024 21:58:22 +0200
Subject: [PATCH 11/14] refactor
---
lychee-bin/src/formatters/response/color.rs | 14 +++-
lychee-bin/tests/cli.rs | 2 -
lychee-lib/src/checker/file.rs | 73 +++++++++---------
lychee-lib/src/checker/mail.rs | 53 +++++++++++++
lychee-lib/src/checker/mod.rs | 2 +-
lychee-lib/src/checker/website.rs | 8 +-
lychee-lib/src/client.rs | 83 ++++++++-------------
7 files changed, 139 insertions(+), 96 deletions(-)
create mode 100644 lychee-lib/src/checker/mail.rs
diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs
index 0a84f2a393..9aa12df40f 100644
--- a/lychee-bin/src/formatters/response/color.rs
+++ b/lychee-bin/src/formatters/response/color.rs
@@ -65,13 +65,19 @@ mod tests {
}
}
+ #[cfg(test)]
+ /// Helper function to strip ANSI color codes for tests
+ fn strip_ansi_codes(s: &str) -> String {
+ console::strip_ansi_codes(s).to_string()
+ }
+
#[test]
fn test_format_response_with_ok_status() {
let formatter = ColorFormatter;
let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com");
assert_eq!(
- formatter.format_response(&body),
- "\u{1b}[38;5;2m\u{1b}[1m [200]\u{1b}[0m https://example.com/"
+ strip_ansi_codes(&formatter.format_response(&body)),
+ " [200] https://example.com/"
);
}
@@ -83,8 +89,8 @@ mod tests {
"https://example.com/404",
);
assert_eq!(
- formatter.format_response(&body),
- "\u{1b}[38;5;197m [ERROR]\u{1b}[0m https://example.com/404"
+ strip_ansi_codes(&formatter.format_response(&body)),
+ " [ERROR] https://example.com/404"
);
}
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index 9dbae37d34..261b77b0d7 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -266,8 +266,6 @@ mod cli {
#[test]
fn test_resolve_paths() {
- // TODO: Refactor the code to clean up base path handling
-
let mut cmd = main_command();
let dir = fixtures_path().join("resolve_paths");
diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs
index d0f1f6c3a2..f972bee09a 100644
--- a/lychee-lib/src/checker/file.rs
+++ b/lychee-lib/src/checker/file.rs
@@ -1,13 +1,26 @@
-use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
use http::StatusCode;
use log::warn;
use std::path::{Path, PathBuf};
+use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri};
+
+/// A utility for checking the existence and validity of file-based URIs.
+///
+/// `FileChecker` is responsible for resolving and validating file paths,
+/// handling both absolute and relative paths. It supports base path resolution,
+/// fallback extensions for files without extensions, and optional fragment checking.
+///
+/// This creates a `FileChecker` with a base path, fallback extensions for HTML files,
+/// and fragment checking enabled.
#[derive(Debug, Clone)]
pub(crate) struct FileChecker {
+ /// An optional base path or URL used for resolving relative paths.
base: Option,
+ /// A list of file extensions to try if the original path doesn't exist.
fallback_extensions: Vec,
+ /// Whether to check for the existence of fragments (e.g., #section-id) in HTML files.
include_fragments: bool,
+ /// A utility for performing fragment checks in HTML files.
fragment_checker: FragmentChecker,
}
@@ -30,23 +43,28 @@ impl FileChecker {
return ErrorKind::InvalidFilePath(uri.clone()).into();
};
- if path.is_absolute() {
- let resolved_path = self.resolve_absolute_path(&path);
- return self.check_resolved_path(&resolved_path, uri).await;
- }
-
- self.check_path(&path, uri).await
+ let resolved_path = self.resolve_path(&path);
+ self.check_path(&resolved_path, uri).await
}
- async fn check_resolved_path(&self, path: &Path, uri: &Uri) -> Status {
- if path.exists() {
- if self.include_fragments {
- self.check_fragment(path, uri).await
+ fn resolve_path(&self, path: &Path) -> PathBuf {
+ if let Some(Base::Local(base_path)) = &self.base {
+ if path.is_absolute() {
+ let absolute_base_path = if base_path.is_relative() {
+ std::env::current_dir()
+ .unwrap_or_else(|_| PathBuf::new())
+ .join(base_path)
+ } else {
+ base_path.clone()
+ };
+
+ let stripped = path.strip_prefix("/").unwrap_or(path);
+ absolute_base_path.join(stripped)
} else {
- Status::Ok(StatusCode::OK)
+ base_path.join(path)
}
} else {
- ErrorKind::InvalidFilePath(uri.clone()).into()
+ path.to_path_buf()
}
}
@@ -55,10 +73,6 @@ impl FileChecker {
return self.check_existing_path(path, uri).await;
}
- if path.extension().is_some() {
- return ErrorKind::InvalidFilePath(uri.clone()).into();
- }
-
self.check_with_fallback_extensions(path, uri).await
}
@@ -72,30 +86,21 @@ impl FileChecker {
async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
let mut path_buf = path.to_path_buf();
+
+ // If the path already has an extension, try it first
+ if path_buf.extension().is_some() && path_buf.exists() {
+ return self.check_existing_path(&path_buf, uri).await;
+ }
+
+ // Try fallback extensions
for ext in &self.fallback_extensions {
path_buf.set_extension(ext);
if path_buf.exists() {
return self.check_existing_path(&path_buf, uri).await;
}
}
- ErrorKind::InvalidFilePath(uri.clone()).into()
- }
-
- fn resolve_absolute_path(&self, path: &Path) -> PathBuf {
- if let Some(Base::Local(base_path)) = &self.base {
- let absolute_base_path = if base_path.is_relative() {
- std::env::current_dir()
- .unwrap_or_else(|_| PathBuf::new())
- .join(base_path)
- } else {
- base_path.to_path_buf()
- };
- let stripped = path.strip_prefix("/").unwrap_or(path);
- absolute_base_path.join(stripped)
- } else {
- path.to_path_buf()
- }
+ ErrorKind::InvalidFilePath(uri.clone()).into()
}
async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs
new file mode 100644
index 0000000000..1c3954568a
--- /dev/null
+++ b/lychee-lib/src/checker/mail.rs
@@ -0,0 +1,53 @@
+use crate::{ErrorKind, Status, Uri};
+use http::StatusCode;
+
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
+
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use crate::types::mail;
+
+/// A utility for checking the validity of email addresses.
+///
+/// `EmailChecker` is responsible for validating email addresses,
+/// optionally performing reachability checks when the appropriate
+/// features are enabled.
+#[derive(Debug, Clone)]
+pub(crate) struct MailChecker {}
+
+impl MailChecker {
+ /// Creates a new `EmailChecker`.
+ pub(crate) const fn new() -> Self {
+ Self {}
+ }
+
+ /// Check a mail address, or equivalently a `mailto` URI.
+ ///
+ /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
+ /// which are ignored by this check. They are not part of the mail address
+ /// and instead passed to a mail client.
+ pub(crate) async fn check_mail(&self, uri: &Uri) -> Status {
+ #[cfg(all(feature = "email-check", feature = "native-tls"))]
+ {
+ self.perform_email_check(uri).await
+ }
+
+ #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
+ {
+ Status::Excluded
+ }
+ }
+
+ #[cfg(all(feature = "email-check", feature = "native-tls"))]
+ async fn perform_email_check(&self, uri: &Uri) -> Status {
+ let address = uri.url.path().to_string();
+ let input = CheckEmailInput::new(address);
+ let result = &(check_email(&input).await);
+
+ if let Reachable::Invalid = result.is_reachable {
+ ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
+ } else {
+ Status::Ok(StatusCode::OK)
+ }
+ }
+}
diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs
index eaf206e21a..bfbef9de51 100644
--- a/lychee-lib/src/checker/mod.rs
+++ b/lychee-lib/src/checker/mod.rs
@@ -1,7 +1,7 @@
//! Checker Module
//!
//! This module contains all checkers, which are responsible for checking the status of a URL.
-//! Each checker implements [Handler](crate::chain::Handler).
pub(crate) mod file;
+pub(crate) mod mail;
pub(crate) mod website;
diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs
index f2cffd4298..668cf827e2 100644
--- a/lychee-lib/src/checker/website.rs
+++ b/lychee-lib/src/checker/website.rs
@@ -9,14 +9,14 @@ use reqwest::Request;
use std::{collections::HashSet, time::Duration};
#[derive(Debug, Clone)]
-pub(crate) struct Checker {
+pub(crate) struct WebsiteChecker {
retry_wait_time: Duration,
max_retries: u64,
reqwest_client: reqwest::Client,
accepted: Option>,
}
-impl Checker {
+impl WebsiteChecker {
pub(crate) const fn new(
retry_wait_time: Duration,
max_retries: u64,
@@ -73,8 +73,8 @@ fn clone_unwrap(request: &Request) -> Request {
}
#[async_trait]
-impl Handler for Checker {
+impl Handler for WebsiteChecker {
async fn handle(&mut self, input: Request) -> ChainResult {
ChainResult::Done(self.retry_request(input).await)
}
-}
\ No newline at end of file
+}
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index c78d9f863f..7f59c2ce39 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -15,8 +15,6 @@
)]
use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
-#[cfg(all(feature = "email-check", feature = "native-tls"))]
-use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
use http::{
header::{HeaderMap, HeaderValue},
StatusCode,
@@ -32,18 +30,15 @@ use typed_builder::TypedBuilder;
use crate::{
chain::{Chain, ClientRequestChains, RequestChain},
checker::file::FileChecker,
- checker::website::Checker,
+ checker::{mail::MailChecker, website::WebsiteChecker},
filter::{Excludes, Filter, Includes},
quirks::Quirks,
remap::Remaps,
types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
- Base, ErrorKind, Request, Response, Result, Status, Uri,
+ Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};
-#[cfg(all(feature = "email-check", feature = "native-tls"))]
-use crate::types::mail;
-
/// Default number of redirects before a request is deemed as failed, 5.
pub const DEFAULT_MAX_REDIRECTS: usize = 5;
/// Default number of retries before a request is deemed as failed, 3.
@@ -401,6 +396,7 @@ impl ClientBuilder {
accepted: self.accepted,
require_https: self.require_https,
fragment_checker: FragmentChecker::new(),
+ email_checker: MailChecker::new(),
file_checker: FileChecker::new(
self.base,
self.fallback_extensions,
@@ -457,6 +453,8 @@ pub struct Client {
plugin_request_chain: RequestChain,
file_checker: FileChecker,
+
+ email_checker: MailChecker,
}
impl Client {
@@ -500,23 +498,11 @@ impl Client {
}
let status = match uri.scheme() {
+ // We don't check tel: URIs
+ _ if uri.is_tel() => Status::Excluded,
_ if uri.is_file() => self.check_file(uri).await,
_ if uri.is_mail() => self.check_mail(uri).await,
- _ if uri.is_tel() => Status::Excluded,
- _ => {
- let default_chain: RequestChain = Chain::new(vec![
- Box::::default(),
- Box::new(credentials),
- Box::new(Checker::new(
- self.retry_wait_time,
- self.max_retries,
- self.reqwest_client.clone(),
- self.accepted.clone(),
- )),
- ]);
-
- self.check_website(uri, default_chain).await?
- }
+ _ => self.check_website(uri, credentials).await?,
};
Ok(Response::new(uri.clone(), status, source))
@@ -554,7 +540,22 @@ impl Client {
/// - The request failed.
/// - The response status code is not accepted.
/// - The URI cannot be converted to HTTPS.
- pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result {
+ pub async fn check_website(
+ &self,
+ uri: &Uri,
+ credentials: Option,
+ ) -> Result {
+ let default_chain: RequestChain = Chain::new(vec![
+ Box::::default(),
+ Box::new(credentials),
+ Box::new(WebsiteChecker::new(
+ self.retry_wait_time,
+ self.max_retries,
+ self.reqwest_client.clone(),
+ self.accepted.clone(),
+ )),
+ ]);
+
match self.check_website_inner(uri, &default_chain).await {
Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
if self
@@ -577,6 +578,8 @@ impl Client {
///
/// Unsupported schemes will be ignored
///
+ /// Note: we use `inner` to improve compile times by avoiding monomorphization
+ ///
/// # Errors
///
/// This returns an `Err` if
@@ -617,7 +620,7 @@ impl Client {
// Pull out the heavy machinery in case of a failed normal request.
// This could be a GitHub URL and we ran into the rate limiter.
- // TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023)
+ // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
if status.is_success() {
return status;
@@ -670,6 +673,11 @@ impl Client {
Status::Ok(StatusCode::OK)
}
+ /// Checks a `mailto` URI.
+ pub async fn check_mail(&self, uri: &Uri) -> Status {
+ self.email_checker.check_mail(uri).await
+ }
+
/// Checks a `file` URI's fragment.
pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
@@ -681,33 +689,6 @@ impl Client {
}
}
}
-
- /// Check a mail address, or equivalently a `mailto` URI.
- ///
- /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
- /// which are ignored by this check. The are not part of the mail address
- /// and instead passed to a mail client.
- #[cfg(all(feature = "email-check", feature = "native-tls"))]
- pub async fn check_mail(&self, uri: &Uri) -> Status {
- let address = uri.url.path().to_string();
- let input = CheckEmailInput::new(address);
- let result = &(check_email(&input).await);
-
- if let Reachable::Invalid = result.is_reachable {
- ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into()
- } else {
- Status::Ok(StatusCode::OK)
- }
- }
-
- /// Check a mail address, or equivalently a `mailto` URI.
- ///
- /// This implementation simply excludes all email addresses.
- #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
- #[allow(clippy::unused_async)]
- pub async fn check_mail(&self, _uri: &Uri) -> Status {
- Status::Excluded
- }
}
// Check if the given `Url` would cause `reqwest` to panic.
From 0f9bd5cbb099ac8ee6e0b09e6d5e1c3a2c4f5937 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 17 Oct 2024 02:04:49 +0200
Subject: [PATCH 12/14] refactor and fix tests
---
lychee-bin/src/commands/check.rs | 10 +-
lychee-bin/tests/cli.rs | 6 +-
lychee-lib/src/checker/website.rs | 160 +++++++++++++++++++++-
lychee-lib/src/client.rs | 211 ++++--------------------------
4 files changed, 191 insertions(+), 196 deletions(-)
diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs
index fd654998c0..2ffc0ed372 100644
--- a/lychee-bin/src/commands/check.rs
+++ b/lychee-bin/src/commands/check.rs
@@ -429,14 +429,12 @@ mod tests {
#[tokio::test]
async fn test_invalid_url() {
- // Run a normal request with an invalid Url
let client = ClientBuilder::builder().build().client().unwrap();
- let request = Request::try_from("http://\"").unwrap();
- let response = check_url(&client, request).await;
- assert!(response.status().is_error());
+ let uri = Uri::try_from("http://\"").unwrap();
+ let response = client.check_website(&uri, None).await.unwrap();
assert!(matches!(
- response.status(),
- Status::Error(ErrorKind::InvalidURI(_))
+ response,
+ Status::Unsupported(ErrorKind::BuildRequestClient(_))
));
}
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index 261b77b0d7..fcb7530cb7 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -944,13 +944,17 @@ mod cli {
// check content of cache file
let data = fs::read_to_string(&cache_file)?;
+
+ if data.is_empty() {
+ println!("Cache file is empty!");
+ }
+
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));
// clear the cache file
fs::remove_file(&cache_file)?;
-
Ok(())
}
diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs
index 668cf827e2..447de3a8a4 100644
--- a/lychee-lib/src/checker/website.rs
+++ b/lychee-lib/src/checker/website.rs
@@ -1,33 +1,69 @@
use crate::{
- chain::{ChainResult, Handler},
+ chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
+ quirks::Quirks,
retry::RetryExt,
- Status,
+ types::uri::github::GithubUri,
+ BasicAuthCredentials, ErrorKind, Status, Uri,
};
use async_trait::async_trait;
use http::StatusCode;
+use octocrab::Octocrab;
use reqwest::Request;
use std::{collections::HashSet, time::Duration};
#[derive(Debug, Clone)]
pub(crate) struct WebsiteChecker {
- retry_wait_time: Duration,
- max_retries: u64,
+ /// Request method used for making requests.
+ method: reqwest::Method,
+
+ /// The HTTP client used for requests.
reqwest_client: reqwest::Client,
+
+ /// GitHub client used for requests.
+ github_client: Option,
+
+ /// The chain of plugins to be executed on each request.
+ plugin_request_chain: RequestChain,
+
+ /// Maximum number of retries per request before returning an error.
+ max_retries: u64,
+
+ /// Initial wait time between retries of failed requests. This doubles after
+ /// each failure.
+ retry_wait_time: Duration,
+
+ /// Set of accepted return codes / status codes.
+ ///
+ /// Unmatched return codes/ status codes are deemed as errors.
accepted: Option>,
+
+ /// Requires using HTTPS when it's available.
+ ///
+ /// This would treat unencrypted links as errors when HTTPS is available.
+ require_https: bool,
}
impl WebsiteChecker {
+ #[allow(clippy::too_many_arguments)]
pub(crate) const fn new(
+ method: reqwest::Method,
retry_wait_time: Duration,
max_retries: u64,
reqwest_client: reqwest::Client,
accepted: Option>,
+ github_client: Option,
+ require_https: bool,
+ plugin_request_chain: RequestChain,
) -> Self {
Self {
- retry_wait_time,
- max_retries,
+ method,
reqwest_client,
+ github_client,
+ plugin_request_chain,
+ max_retries,
+ retry_wait_time,
accepted,
+ require_https,
}
}
@@ -36,7 +72,6 @@ impl WebsiteChecker {
pub(crate) async fn retry_request(&self, request: Request) -> Status {
let mut retries: u64 = 0;
let mut wait_time = self.retry_wait_time;
-
let mut status = self.check_default(clone_unwrap(&request)).await;
while retries < self.max_retries {
if status.is_success() || !status.should_retry() {
@@ -57,6 +92,117 @@ impl WebsiteChecker {
Err(e) => e.into(),
}
}
+
+ /// Checks the given URI of a website.
+ ///
+ /// # Errors
+ ///
+ /// This returns an `Err` if
+ /// - The URI is invalid.
+ /// - The request failed.
+ /// - The response status code is not accepted.
+ /// - The URI cannot be converted to HTTPS.
+ pub(crate) async fn check_website(
+ &self,
+ uri: &Uri,
+ credentials: Option,
+ ) -> Result {
+ let default_chain: RequestChain = Chain::new(vec![
+ Box::::default(),
+ Box::new(credentials),
+ Box::new(self.clone()),
+ ]);
+
+ match self.check_website_inner(uri, &default_chain).await {
+ Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
+ if self
+ .check_website_inner(&uri.to_https()?, &default_chain)
+ .await
+ .is_success()
+ {
+ Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
+ } else {
+ Ok(Status::Ok(code))
+ }
+ }
+ s => Ok(s),
+ }
+ }
+
+ /// Checks the given URI of a website.
+ ///
+ /// Unsupported schemes will be ignored
+ ///
+ /// Note: we use `inner` to improve compile times by avoiding monomorphization
+ ///
+ /// # Errors
+ ///
+ /// This returns an `Err` if
+ /// - The URI is invalid.
+ /// - The request failed.
+ /// - The response status code is not accepted.
+ async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
+ let request = self
+ .reqwest_client
+ .request(self.method.clone(), uri.as_str())
+ .build();
+
+ let request = match request {
+ Ok(r) => r,
+ Err(e) => return e.into(),
+ };
+
+ let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
+ .traverse(request)
+ .await;
+
+ self.handle_github(status, uri).await
+ }
+
+ // Pull out the heavy machinery in case of a failed normal request.
+ // This could be a GitHub URL and we ran into the rate limiter.
+ // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
+ async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
+ if status.is_success() {
+ return status;
+ }
+
+ if let Ok(github_uri) = GithubUri::try_from(uri) {
+ let status = self.check_github(github_uri).await;
+ if status.is_success() {
+ return status;
+ }
+ }
+
+ status
+ }
+
+ /// Check a `uri` hosted on `GitHub` via the GitHub API.
+ ///
+ /// # Caveats
+ ///
+ /// Files inside private repositories won't get checked and instead would
+ /// be reported as valid if the repository itself is reachable through the
+ /// API.
+ ///
+ /// A better approach would be to download the file through the API or
+ /// clone the repo, but we chose the pragmatic approach.
+ async fn check_github(&self, uri: GithubUri) -> Status {
+ let Some(client) = &self.github_client else {
+ return ErrorKind::MissingGitHubToken.into();
+ };
+ let repo = match client.repos(&uri.owner, &uri.repo).get().await {
+ Ok(repo) => repo,
+ Err(e) => return ErrorKind::GithubRequest(e).into(),
+ };
+ if let Some(true) = repo.private {
+ return Status::Ok(StatusCode::OK);
+ } else if let Some(endpoint) = uri.endpoint {
+ return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
+ .into();
+ }
+ Status::Ok(StatusCode::OK)
+ }
}
/// Clones a `reqwest::Request`.
diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs
index 7f59c2ce39..cbeab51e1f 100644
--- a/lychee-lib/src/client.rs
+++ b/lychee-lib/src/client.rs
@@ -22,19 +22,17 @@ use http::{
use log::{debug, warn};
use octocrab::Octocrab;
use regex::RegexSet;
-use reqwest::{header, redirect, Url};
+use reqwest::{header, redirect};
use reqwest_cookie_store::CookieStoreMutex;
use secrecy::{ExposeSecret, SecretString};
use typed_builder::TypedBuilder;
use crate::{
- chain::{Chain, ClientRequestChains, RequestChain},
+ chain::RequestChain,
checker::file::FileChecker,
checker::{mail::MailChecker, website::WebsiteChecker},
filter::{Excludes, Filter, Includes},
- quirks::Quirks,
remap::Remaps,
- types::uri::github::GithubUri,
utils::fragment_checker::FragmentChecker,
Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri,
};
@@ -385,24 +383,28 @@ impl ClientBuilder {
include_mail: self.include_mail,
};
- Ok(Client {
+ let website_checker = WebsiteChecker::new(
+ self.method,
+ self.retry_wait_time,
+ self.max_retries,
reqwest_client,
+ self.accepted,
github_client,
+ self.require_https,
+ self.plugin_request_chain,
+ );
+
+ Ok(Client {
remaps: self.remaps,
filter,
- max_retries: self.max_retries,
- retry_wait_time: self.retry_wait_time,
- method: self.method,
- accepted: self.accepted,
- require_https: self.require_https,
- fragment_checker: FragmentChecker::new(),
email_checker: MailChecker::new(),
+ website_checker,
file_checker: FileChecker::new(
self.base,
self.fallback_extensions,
self.include_fragments,
),
- plugin_request_chain: self.plugin_request_chain,
+ fragment_checker: FragmentChecker::new(),
})
}
}
@@ -413,48 +415,23 @@ impl ClientBuilder {
/// options.
#[derive(Debug, Clone)]
pub struct Client {
- /// Underlying `reqwest` client instance that handles the HTTP requests.
- reqwest_client: reqwest::Client,
-
- /// Optional GitHub client that handles communications with GitHub.
- github_client: Option,
-
/// Optional remapping rules for URIs matching pattern.
remaps: Option,
/// Rules to decided whether each link should be checked or ignored.
filter: Filter,
- /// Maximum number of retries per request before returning an error.
- max_retries: u64,
-
- /// Initial wait time between retries of failed requests. This doubles after
- /// each failure.
- retry_wait_time: Duration,
+ /// A checker for website URLs.
+ website_checker: WebsiteChecker,
- /// HTTP method used for requests, e.g. `GET` or `HEAD`.
- ///
- /// The same method will be used for all links.
- method: reqwest::Method,
-
- /// Set of accepted return codes / status codes.
- ///
- /// Unmatched return codes/ status codes are deemed as errors.
- accepted: Option>,
+ /// A checker for file URLs.
+ file_checker: FileChecker,
- /// Requires using HTTPS when it's available.
- ///
- /// This would treat unencrypted links as errors when HTTPS is available.
- require_https: bool,
+ /// A checker for email URLs.
+ email_checker: MailChecker,
/// Caches Fragments
fragment_checker: FragmentChecker,
-
- plugin_request_chain: RequestChain,
-
- file_checker: FileChecker,
-
- email_checker: MailChecker,
}
impl Client {
@@ -545,132 +522,7 @@ impl Client {
uri: &Uri,
credentials: Option,
) -> Result {
- let default_chain: RequestChain = Chain::new(vec![
- Box::::default(),
- Box::new(credentials),
- Box::new(WebsiteChecker::new(
- self.retry_wait_time,
- self.max_retries,
- self.reqwest_client.clone(),
- self.accepted.clone(),
- )),
- ]);
-
- match self.check_website_inner(uri, &default_chain).await {
- Status::Ok(code) if self.require_https && uri.scheme() == "http" => {
- if self
- .check_website_inner(&uri.to_https()?, &default_chain)
- .await
- .is_success()
- {
- Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?)))
- } else {
- // HTTPS is not available for this URI,
- // so the original HTTP URL is fine.
- Ok(Status::Ok(code))
- }
- }
- s => Ok(s),
- }
- }
-
- /// Checks the given URI of a website.
- ///
- /// Unsupported schemes will be ignored
- ///
- /// Note: we use `inner` to improve compile times by avoiding monomorphization
- ///
- /// # Errors
- ///
- /// This returns an `Err` if
- /// - The URI is invalid.
- /// - The request failed.
- /// - The response status code is not accepted.
- pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status {
- // Workaround for upstream reqwest panic
- if validate_url(&uri.url) {
- if matches!(uri.scheme(), "http" | "https") {
- // This is a truly invalid URI with a known scheme.
- // If we pass that to reqwest it would panic.
- return Status::Error(ErrorKind::InvalidURI(uri.clone()));
- }
- // This is merely a URI with a scheme that is not supported by
- // reqwest yet. It would be safe to pass that to reqwest and it
- // wouldn't panic, but it's also unnecessary, because it would
- // simply return an error.
- return Status::Unsupported(ErrorKind::InvalidURI(uri.clone()));
- }
-
- let request = self
- .reqwest_client
- .request(self.method.clone(), uri.as_str())
- .build();
-
- let request = match request {
- Ok(r) => r,
- Err(e) => return e.into(),
- };
-
- let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain])
- .traverse(request)
- .await;
-
- self.handle_github(status, uri).await
- }
-
- // Pull out the heavy machinery in case of a failed normal request.
- // This could be a GitHub URL and we ran into the rate limiter.
- // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023)
- async fn handle_github(&self, status: Status, uri: &Uri) -> Status {
- if status.is_success() {
- return status;
- }
-
- if let Ok(github_uri) = GithubUri::try_from(uri) {
- let status = self.check_github(github_uri).await;
- // Only return GitHub status in case of success
- // Otherwise return the original error, which has more information
- if status.is_success() {
- return status;
- }
- }
-
- status
- }
-
- /// Check a `uri` hosted on `GitHub` via the GitHub API.
- ///
- /// # Caveats
- ///
- /// Files inside private repositories won't get checked and instead would
- /// be reported as valid if the repository itself is reachable through the
- /// API.
- ///
- /// A better approach would be to download the file through the API or
- /// clone the repo, but we chose the pragmatic approach.
- async fn check_github(&self, uri: GithubUri) -> Status {
- let Some(client) = &self.github_client else {
- return ErrorKind::MissingGitHubToken.into();
- };
- let repo = match client.repos(&uri.owner, &uri.repo).get().await {
- Ok(repo) => repo,
- Err(e) => return ErrorKind::GithubRequest(e).into(),
- };
- if let Some(true) = repo.private {
- // The private repo exists. Assume a given endpoint exists as well
- // (e.g. `issues` in `github.com/org/private/issues`). This is not
- // always the case but simplifies the check.
- return Status::Ok(StatusCode::OK);
- } else if let Some(endpoint) = uri.endpoint {
- // The URI returned a non-200 status code from a normal request and
- // now we find that this public repo is reachable through the API,
- // so that must mean the full URI (which includes the additional
- // endpoint) must be invalid.
- return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo))
- .into();
- }
- // Found public repo without endpoint
- Status::Ok(StatusCode::OK)
+ self.website_checker.check_website(uri, credentials).await
}
/// Checks a `mailto` URI.
@@ -691,16 +543,6 @@ impl Client {
}
}
-// Check if the given `Url` would cause `reqwest` to panic.
-// This is a workaround for https://github.com/lycheeverse/lychee/issues/539
-// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399
-// got merged.
-// It is exactly the same check that reqwest runs internally, but unfortunately
-// it `unwrap`s (and panics!) instead of returning an error, which we could handle.
-fn validate_url(url: &Url) -> bool {
- http::Uri::try_from(url.as_str()).is_err()
-}
-
/// A shorthand function to check a single URI.
///
/// This provides the simplest link check utility without having to create a
@@ -740,7 +582,7 @@ mod tests {
chain::{ChainResult, Handler, RequestChain},
mock_server,
test_utils::get_mock_client_response,
- Request, Status, Uri,
+ ErrorKind, Request, Status, Uri,
};
#[tokio::test]
@@ -989,9 +831,14 @@ mod tests {
#[tokio::test]
async fn test_avoid_reqwest_panic() {
let client = ClientBuilder::builder().build().client().unwrap();
- // This request will fail, but it won't panic
+ // This request will result in an Unsupported status, but it won't panic
let res = client.check("http://\"").await.unwrap();
- assert!(res.status().is_error());
+
+ assert!(matches!(
+ res.status(),
+ Status::Unsupported(ErrorKind::BuildRequestClient(_))
+ ));
+ assert!(res.status().is_unsupported());
}
#[tokio::test]
From 384b15f340d2ba009a6c5765a1fddb9d297c3df5 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Thu, 17 Oct 2024 18:03:59 +0200
Subject: [PATCH 13/14] fix lint
---
lychee-lib/src/checker/file.rs | 89 +++++++++++++++++++++++++++++-----
lychee-lib/src/checker/mail.rs | 22 +++++----
2 files changed, 89 insertions(+), 22 deletions(-)
diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs
index f972bee09a..0060bfbde0 100644
--- a/lychee-lib/src/checker/file.rs
+++ b/lychee-lib/src/checker/file.rs
@@ -6,25 +6,29 @@ use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, U
/// A utility for checking the existence and validity of file-based URIs.
///
-/// `FileChecker` is responsible for resolving and validating file paths,
-/// handling both absolute and relative paths. It supports base path resolution,
-/// fallback extensions for files without extensions, and optional fragment checking.
-///
-/// This creates a `FileChecker` with a base path, fallback extensions for HTML files,
-/// and fragment checking enabled.
+/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths.
+/// It supports base path resolution, fallback extensions for files without extensions,
+/// and optional fragment checking for HTML files.
#[derive(Debug, Clone)]
pub(crate) struct FileChecker {
- /// An optional base path or URL used for resolving relative paths.
+ /// Base path or URL used for resolving relative paths.
base: Option,
- /// A list of file extensions to try if the original path doesn't exist.
+ /// List of file extensions to try if the original path doesn't exist.
fallback_extensions: Vec,
- /// Whether to check for the existence of fragments (e.g., #section-id) in HTML files.
+ /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files.
include_fragments: bool,
- /// A utility for performing fragment checks in HTML files.
+ /// Utility for performing fragment checks in HTML files.
fragment_checker: FragmentChecker,
}
impl FileChecker {
+ /// Creates a new `FileChecker` with the given configuration.
+ ///
+ /// # Arguments
+ ///
+ /// * `base` - Optional base path or URL for resolving relative paths.
+ /// * `fallback_extensions` - List of extensions to try if the original file is not found.
+ /// * `include_fragments` - Whether to check for fragment existence in HTML files.
pub(crate) fn new(
base: Option,
fallback_extensions: Vec,
@@ -38,6 +42,18 @@ impl FileChecker {
}
}
+ /// Checks the given file URI for existence and validity.
+ ///
+ /// This method resolves the URI to a file path, checks if the file exists,
+ /// and optionally checks for the existence of fragments in HTML files.
+ ///
+ /// # Arguments
+ ///
+ /// * `uri` - The URI to check.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
pub(crate) async fn check(&self, uri: &Uri) -> Status {
let Ok(path) = uri.url.to_file_path() else {
return ErrorKind::InvalidFilePath(uri.clone()).into();
@@ -47,13 +63,20 @@ impl FileChecker {
self.check_path(&resolved_path, uri).await
}
+ /// Resolves the given path using the base path, if one is set.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to resolve.
+ ///
+ /// # Returns
+ ///
+ /// Returns the resolved path as a `PathBuf`.
fn resolve_path(&self, path: &Path) -> PathBuf {
if let Some(Base::Local(base_path)) = &self.base {
if path.is_absolute() {
let absolute_base_path = if base_path.is_relative() {
- std::env::current_dir()
- .unwrap_or_else(|_| PathBuf::new())
- .join(base_path)
+ std::env::current_dir().unwrap_or_default().join(base_path)
} else {
base_path.clone()
};
@@ -68,6 +91,16 @@ impl FileChecker {
}
}
+ /// Checks if the given path exists and performs additional checks if necessary.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
async fn check_path(&self, path: &Path, uri: &Uri) -> Status {
if path.exists() {
return self.check_existing_path(path, uri).await;
@@ -76,6 +109,16 @@ impl FileChecker {
self.check_with_fallback_extensions(path, uri).await
}
+ /// Checks an existing path, optionally verifying fragments for HTML files.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status {
if self.include_fragments {
self.check_fragment(path, uri).await
@@ -84,6 +127,16 @@ impl FileChecker {
}
}
+ /// Attempts to find a file by trying different extensions specified in `fallback_extensions`.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The original path to check.
+ /// * `uri` - The original URI, used for error reporting.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the check.
async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status {
let mut path_buf = path.to_path_buf();
@@ -103,6 +156,16 @@ impl FileChecker {
ErrorKind::InvalidFilePath(uri.clone()).into()
}
+ /// Checks for the existence of a fragment in an HTML file.
+ ///
+ /// # Arguments
+ ///
+ /// * `path` - The path to the HTML file.
+ /// * `uri` - The original URI, containing the fragment to check.
+ ///
+ /// # Returns
+ ///
+ /// Returns a `Status` indicating the result of the fragment check.
async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status {
match self.fragment_checker.check(path, &uri.url).await {
Ok(true) => Status::Ok(StatusCode::OK),
diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs
index 1c3954568a..e7dcef7b8d 100644
--- a/lychee-lib/src/checker/mail.rs
+++ b/lychee-lib/src/checker/mail.rs
@@ -1,6 +1,11 @@
-use crate::{ErrorKind, Status, Uri};
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
use http::StatusCode;
+#[cfg(all(feature = "email-check", feature = "native-tls"))]
+use crate::ErrorKind;
+
+use crate::{Status, Uri};
+
#[cfg(all(feature = "email-check", feature = "native-tls"))]
use check_if_email_exists::{check_email, CheckEmailInput, Reachable};
@@ -26,16 +31,15 @@ impl MailChecker {
/// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`),
/// which are ignored by this check. They are not part of the mail address
/// and instead passed to a mail client.
+ #[cfg(all(feature = "email-check", feature = "native-tls"))]
pub(crate) async fn check_mail(&self, uri: &Uri) -> Status {
- #[cfg(all(feature = "email-check", feature = "native-tls"))]
- {
- self.perform_email_check(uri).await
- }
+ self.perform_email_check(uri).await
+ }
- #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
- {
- Status::Excluded
- }
+ /// Ignore the mail check if the `email-check` and `native-tls` features are not enabled.
+ #[cfg(not(all(feature = "email-check", feature = "native-tls")))]
+ pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status {
+ Status::Excluded
}
#[cfg(all(feature = "email-check", feature = "native-tls"))]
From 76e406805f633c3c0e75419e611c2b667ada5d70 Mon Sep 17 00:00:00 2001
From: Matthias
Date: Sat, 26 Oct 2024 03:46:34 +0200
Subject: [PATCH 14/14] Refactor error handling in WebsiteChecker
---
lychee-lib/src/checker/website.rs | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs
index 447de3a8a4..62cadaff11 100644
--- a/lychee-lib/src/checker/website.rs
+++ b/lychee-lib/src/checker/website.rs
@@ -193,7 +193,7 @@ impl WebsiteChecker {
};
let repo = match client.repos(&uri.owner, &uri.repo).get().await {
Ok(repo) => repo,
- Err(e) => return ErrorKind::GithubRequest(e).into(),
+ Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(),
};
if let Some(true) = repo.private {
return Status::Ok(StatusCode::OK);