From 85ea4b5ee2155f6d3a73d08dfdf51b8994674153 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 7 Aug 2024 23:12:38 +0200 Subject: [PATCH 01/14] Add test for relative links --- lychee-bin/tests/cli.rs | 42 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 2d23b9d9d0..bfefc34068 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -1667,4 +1667,46 @@ mod cli { .success() .stdout(contains("0 Errors")); } + + /// Test relative paths + /// + /// Imagine a web server hosting a site with the following structure: + /// root + /// └── test + /// ├── index.html + /// └── next.html + /// + /// where `root/test/index.html` contains `next` + /// When checking the link in `root/test/index.html` we should be able to + /// resolve the relative path to `root/test/next.html` + /// + /// Note that the relative path is not resolved to the root of the server + /// but relative to the file that contains the link. + #[tokio::test] + async fn test_resolve_relative_paths_in_subfolder() -> Result<()> { + let mock_server = wiremock::MockServer::start().await; + + let body = r#"next"#; + wiremock::Mock::given(wiremock::matchers::method("GET")) + .and(wiremock::matchers::path("/test/index.html")) + .respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body)) + .mount(&mock_server) + .await; + + wiremock::Mock::given(wiremock::matchers::method("GET")) + .and(wiremock::matchers::path("/test/next.html")) + .respond_with(wiremock::ResponseTemplate::new(200)) + .mount(&mock_server) + .await; + + let mut cmd = main_command(); + cmd.arg("--verbose") + .arg(format!("{}/test/index.html", mock_server.uri())) + .assert() + .success() + .stdout(contains("1 Total")) + .stdout(contains("0 Errors")); + + Ok(()) + } } From d17c133d9d9612300178c95acf6d812fb3c99e1a Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 18 Aug 2024 01:55:03 +0200 Subject: [PATCH 02/14] Fix relative links --- lychee-lib/src/collector.rs | 82 ++++++++++++--- lychee-lib/src/types/base.rs | 21 ++-- lychee-lib/src/utils/request.rs | 172 ++++++++++++++++++++++++-------- 3 files changed, 209 insertions(+), 66 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 341bb21dba..8fb530db0a 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -1,3 +1,4 @@ +use crate::InputSource; use crate::{ basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request, Base, Input, Request, Result, @@ -99,26 +100,31 @@ impl Collector { /// /// Will return `Err` if links cannot be extracted from an input pub fn collect_links(self, inputs: Vec) -> impl Stream> { - let base = self.base; + let skip_missing_inputs = self.skip_missing_inputs; + let skip_hidden = self.skip_hidden; + let skip_ignored = self.skip_ignored; + let global_base = self.base; stream::iter(inputs) - .par_then_unordered(None, move |input| async move { - input.get_contents( - self.skip_missing_inputs, - self.skip_hidden, - self.skip_ignored, - ) + .par_then_unordered(None, move |input| { + let global_base = global_base.clone(); + async move { + let input_base = match &input.source { + InputSource::RemoteUrl(url) => Some(Base::try_from(url.as_str()).unwrap()), + _ => None, + }; + let base = input_base.or(global_base); + input + .get_contents(skip_missing_inputs, skip_hidden, skip_ignored) + .map(move |content| (content, base.clone())) + } }) .flatten() - .par_then_unordered(None, move |content| { - // send to parallel worker - let base = base.clone(); + .par_then_unordered(None, move |(content, base)| { let basic_auth_extractor = self.basic_auth_extractor.clone(); async move { let content = content?; - let extractor = Extractor::new(self.use_html5ever, self.include_verbatim); let uris: Vec = extractor.extract(&content); - let requests = request::create(uris, &content, &base, &basic_auth_extractor)?; Result::Ok(stream::iter(requests.into_iter().map(Ok))) } @@ -426,4 +432,56 @@ mod tests { assert_eq!(links, expected_links); } + + #[tokio::test] + async fn test_multiple_remote_urls() { + let mock_server_1 = mock_server!( + StatusCode::OK, + set_body_string(r#"Link"#) + ); + let mock_server_2 = mock_server!( + StatusCode::OK, + set_body_string(r#"Link"#) + ); + + let inputs = vec![ + Input { + source: InputSource::RemoteUrl(Box::new( + Url::parse(&format!( + "{}/foo/index.html", + mock_server_1.uri().trim_end_matches('/') + )) + .unwrap(), + )), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }, + Input { + source: InputSource::RemoteUrl(Box::new( + Url::parse(&format!( + "{}/bar/index.html", + mock_server_2.uri().trim_end_matches('/') + )) + .unwrap(), + )), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }, + ]; + + let links = collect(inputs, None).await; + + let expected_links = HashSet::from_iter([ + website(&format!( + "{}/foo/relative.html", + mock_server_1.uri().trim_end_matches('/') + )), + website(&format!( + "{}/bar/relative.html", + mock_server_2.uri().trim_end_matches('/') + )), + ]); + + assert_eq!(links, expected_links); + } } diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index 73a40978f5..a745995756 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -36,18 +36,17 @@ impl Base { } } - pub(crate) fn from_source(source: &InputSource) -> Option { + pub(crate) fn from_source(source: &InputSource) -> Option { match &source { InputSource::RemoteUrl(url) => { - // TODO: This should be refactored. - // Cases like https://user:pass@example.com are not handled - // We can probably use the original URL and just replace the - // path component in the caller of this function - if let Some(port) = url.port() { - Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok() - } else { - Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok() - } + // Create a new URL with just the scheme, host, and port + let mut base_url = url.clone(); + base_url.set_path(""); + base_url.set_query(None); + base_url.set_fragment(None); + + // We keep the username and password intact + Some(Base::Remote(*base_url)) } // other inputs do not have a URL to extract a base _ => None, @@ -123,7 +122,7 @@ mod test_base { let url = Url::parse(url).unwrap(); let source = InputSource::RemoteUrl(Box::new(url.clone())); let base = Base::from_source(&source); - let expected = Url::parse(expected).unwrap(); + let expected = Base::Remote(Url::parse(expected).unwrap()); assert_eq!(base, Some(expected)); } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 70f82ce1d8..cfe4be11dd 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -28,7 +28,9 @@ pub(crate) fn create( base: &Option, extractor: &Option, ) -> Result> { - let base_url = Base::from_source(&input_content.source); + let base_url = base + .clone() + .or_else(|| Base::from_source(&input_content.source)); let requests: Result>> = uris .into_iter() @@ -38,30 +40,16 @@ pub(crate) fn create( let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); - // Truncate the source in case it gets too long Ideally we should - // avoid the initial String allocation for `source` altogether + // Truncate the source in case it gets too long let source = match &input_content.source { InputSource::String(s) => { InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) } - // Cloning is cheap here c => c.clone(), }; if let Ok(uri) = Uri::try_from(raw_uri) { let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else if let Some(url) = base.as_ref().and_then(|u| u.join(&text)) { - let uri = Uri { url }; - let credentials = credentials(extractor, &uri); - Ok(Some(Request::new( uri, source, @@ -69,6 +57,21 @@ pub(crate) fn create( attribute, credentials, ))) + } else if let Some(base) = &base_url { + match base.join(&text) { + Some(url) => { + let uri = Uri { url }; + let credentials = credentials(extractor, &uri); + Ok(Some(Request::new( + uri, + source, + element, + attribute, + credentials, + ))) + } + None => Ok(None), + } } else if let InputSource::FsPath(root) = &input_content.source { let path = if is_anchor { match root.file_name() { @@ -81,11 +84,9 @@ pub(crate) fn create( } else { text }; - - if let Some(url) = create_uri_from_path(root, &path, base)? { + if let Some(url) = create_uri_from_path(root, &path, &base_url)? { let uri = Uri { url }; let credentials = credentials(extractor, &uri); - Ok(Some(Request::new( uri, source, @@ -94,24 +95,7 @@ pub(crate) fn create( credentials, ))) } else { - // In case we cannot create a URI from a path but we didn't receive an error, - // it means that some preconditions were not met, e.g. the `base_url` wasn't set. - Ok(None) - } - } else if let Some(url) = construct_url(&base_url, &text) { - if base.is_some() { Ok(None) - } else { - let uri = Uri { url: url? }; - let credentials = credentials(extractor, &uri); - - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) } } else { info!("Handling of `{}` not implemented yet", text); @@ -119,17 +103,11 @@ pub(crate) fn create( } }) .collect(); + let requests: Vec = requests?.into_iter().flatten().collect(); Ok(HashSet::from_iter(requests)) } -fn construct_url(base: &Option, text: &str) -> Option> { - base.as_ref().map(|base| { - base.join(text) - .map_err(|e| ErrorKind::ParseUrl(e, format!("{base}{text}"))) - }) -} - fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { let (dst, frag) = url::remove_get_params_and_separate_fragment(dst); // Avoid double-encoding already encoded destination paths by removing any @@ -158,6 +136,7 @@ fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result InputContent { + InputContent { + content: content.to_string(), + file_type, + source: InputSource::String(content.to_string()), + } + } + + #[test] + fn test_relative_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Relative Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("relative.html")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/path/relative.html")); + } + + #[test] + fn test_absolute_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("https://another.com/page")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://another.com/page")); + } + + #[test] + fn test_root_relative_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Root Relative Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("/root-relative")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/root-relative")); + } + + #[test] + fn test_parent_directory_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input( + r#"Parent Directory Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("../parent")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/parent")); + } + + #[test] + fn test_fragment_url_resolution() { + let base = Some(Base::try_from("https://example.com/path/page.html").unwrap()); + let input = create_input(r##"Fragment Link"##, FileType::Html); + + let uris = vec![RawUri::from("#fragment")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/path/page.html#fragment")); + } + + #[test] + fn test_no_base_url_resolution() { + let base = None; + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let uris = vec![RawUri::from("https://example.com/page")]; + let requests = create(uris, &input, &base, &None).unwrap(); + + assert_eq!(requests.len(), 1); + assert!(requests + .iter() + .any(|r| r.uri.url.as_str() == "https://example.com/page")); + } } From fafeb32a5b5b8dda2e6816a42a93698f40ccc1dc Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 23 Aug 2024 18:03:04 +0200 Subject: [PATCH 03/14] Create relative-links-v2 with current changes --- lychee-bin/src/formatters/response/color.rs | 4 +- lychee-lib/src/collector.rs | 31 +- lychee-lib/src/test_utils.rs | 10 + lychee-lib/src/types/base.rs | 15 +- lychee-lib/src/types/error.rs | 24 +- lychee-lib/src/types/status_code/selector.rs | 2 +- lychee-lib/src/types/uri/raw.rs | 20 -- lychee-lib/src/types/uri/valid.rs | 5 + lychee-lib/src/utils/request.rs | 323 +++++++++++++------ 9 files changed, 305 insertions(+), 129 deletions(-) diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs index 137c70a6fb..0a84f2a393 100644 --- a/lychee-bin/src/formatters/response/color.rs +++ b/lychee-bin/src/formatters/response/color.rs @@ -71,7 +71,7 @@ mod tests { let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com"); assert_eq!( formatter.format_response(&body), - " [200] https://example.com/" + "\u{1b}[38;5;2m\u{1b}[1m [200]\u{1b}[0m https://example.com/" ); } @@ -84,7 +84,7 @@ mod tests { ); assert_eq!( formatter.format_response(&body), - " [ERROR] https://example.com/404" + "\u{1b}[38;5;197m [ERROR]\u{1b}[0m https://example.com/404" ); } diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 8fb530db0a..08bf1cdbfe 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -143,7 +143,7 @@ mod tests { use super::*; use crate::{ mock_server, - test_utils::{load_fixture, mail, website}, + test_utils::{load_fixture, mail, path, website}, types::{FileType, Input, InputSource}, Result, Uri, }; @@ -484,4 +484,33 @@ mod tests { assert_eq!(links, expected_links); } + + #[tokio::test] + async fn test_file_path_with_base() { + let base = Base::try_from("/path/to/root").unwrap(); + assert_eq!(base, Base::Local("/path/to/root".into())); + + let input = Input { + source: InputSource::String( + r#" + Index + About + Another + "# + .into(), + ), + file_type_hint: Some(FileType::Html), + excluded_paths: None, + }; + + let links = collect(vec![input], Some(base)).await; + + let expected_links = HashSet::from_iter([ + path("/path/to/root/index.html"), + path("/path/to/root/about.html"), + path("/another.html"), + ]); + + assert_eq!(links, expected_links); + } } diff --git a/lychee-lib/src/test_utils.rs b/lychee-lib/src/test_utils.rs index 99fc6aa2de..ff7be8e8c6 100644 --- a/lychee-lib/src/test_utils.rs +++ b/lychee-lib/src/test_utils.rs @@ -39,6 +39,16 @@ pub(crate) fn website(url: &str) -> Uri { Uri::from(Url::parse(url).expect("Expected valid Website URI")) } +/// Helper method to convert a `std::path::Path `into a URI with the `file` scheme +/// +/// # Panic +/// +/// This panics if the given path is not absolute, so it should only be used for +/// testing +pub(crate) fn path>(path: P) -> Uri { + Uri::from(Url::from_file_path(path.as_ref()).expect("Expected valid File URI")) +} + /// Creates a mail URI from a string pub(crate) fn mail(address: &str) -> Uri { if address.starts_with("mailto:") { diff --git a/lychee-lib/src/types/base.rs b/lychee-lib/src/types/base.rs index a745995756..b7b76c7e5b 100644 --- a/lychee-lib/src/types/base.rs +++ b/lychee-lib/src/types/base.rs @@ -23,7 +23,10 @@ impl Base { pub(crate) fn join(&self, link: &str) -> Option { match self { Self::Remote(url) => url.join(link).ok(), - Self::Local(_) => None, + Self::Local(path) => { + let full_path = path.join(link); + Url::from_file_path(full_path).ok() + } } } @@ -100,6 +103,16 @@ mod test_base { assert!(Base::try_from("data:text/plain,Hello?World#").is_err()); } + #[test] + fn test_valid_local_path_string_as_base() -> Result<()> { + let cases = vec!["/tmp/lychee", "/tmp/lychee/", "tmp/lychee/"]; + + for case in cases { + assert_eq!(Base::try_from(case)?, Base::Local(PathBuf::from(case))); + } + Ok(()) + } + #[test] fn test_valid_local() -> Result<()> { let dir = tempfile::tempdir().unwrap(); diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 286332fb5e..8019f3b718 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -86,12 +86,24 @@ pub enum ErrorKind { #[error("Error with base dir `{0}` : {1}")] InvalidBase(String, String), + /// Cannot join the given text with the base URL + #[error("Cannot join '{0}' with the base URL")] + InvalidBaseJoin(String), + + /// Cannot convert the given path to a URI + #[error("Cannot convert path '{0}' to a URI")] + InvalidPathToUri(String), + + /// The given URI type is not supported + #[error("Unsupported URI type: '{0}'")] + UnsupportedUriType(String), + /// The given input can not be parsed into a valid URI remapping #[error("Error remapping URL: `{0}`")] InvalidUrlRemap(String), /// The given path does not resolve to a valid file - #[error("Cannot find local file {0}")] + #[error("Invalid file path: {0}")] InvalidFile(PathBuf), /// Error while traversing an input directory @@ -243,6 +255,13 @@ impl PartialEq for ErrorKind { } (Self::Cookies(e1), Self::Cookies(e2)) => e1 == e2, (Self::InvalidFile(p1), Self::InvalidFile(p2)) => p1 == p2, + (Self::InvalidFilePath(u1), Self::InvalidFilePath(u2)) => u1 == u2, + (Self::InvalidFragment(u1), Self::InvalidFragment(u2)) => u1 == u2, + (Self::InvalidUrlFromPath(p1), Self::InvalidUrlFromPath(p2)) => p1 == p2, + (Self::InvalidBase(b1, e1), Self::InvalidBase(b2, e2)) => b1 == b2 && e1 == e2, + (Self::InvalidUrlRemap(r1), Self::InvalidUrlRemap(r2)) => r1 == r2, + (Self::EmptyUrl, Self::EmptyUrl) => true, + _ => false, } } @@ -278,6 +297,9 @@ impl Hash for ErrorKind { Self::UnreachableEmailAddress(u, ..) => u.hash(state), Self::InsecureURL(u, ..) => u.hash(state), Self::InvalidBase(base, e) => (base, e).hash(state), + Self::InvalidBaseJoin(s) => s.hash(state), + Self::InvalidPathToUri(s) => s.hash(state), + Self::UnsupportedUriType(s) => s.hash(state), Self::InvalidUrlRemap(remap) => (remap).hash(state), Self::InvalidHeader(e) => e.to_string().hash(state), Self::InvalidGlobPattern(e) => e.to_string().hash(state), diff --git a/lychee-lib/src/types/status_code/selector.rs b/lychee-lib/src/types/status_code/selector.rs index 060e097227..c93b662c1f 100644 --- a/lychee-lib/src/types/status_code/selector.rs +++ b/lychee-lib/src/types/status_code/selector.rs @@ -5,7 +5,7 @@ use thiserror::Error; use crate::{types::accept::AcceptRange, AcceptRangeError}; -#[derive(Debug, Error)] +#[derive(Debug, Error, PartialEq)] pub enum StatusCodeSelectorError { #[error("invalid/empty input")] InvalidInput, diff --git a/lychee-lib/src/types/uri/raw.rs b/lychee-lib/src/types/uri/raw.rs index f78b23cd6d..3ad51f2cf8 100644 --- a/lychee-lib/src/types/uri/raw.rs +++ b/lychee-lib/src/types/uri/raw.rs @@ -19,12 +19,6 @@ pub struct RawUri { pub attribute: Option, } -impl RawUri { - // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs - pub(crate) fn is_anchor(&self) -> bool { - self.text.starts_with('#') - } -} impl Display for RawUri { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{} (Attribute: {:?})", self.text, self.attribute) @@ -40,17 +34,3 @@ impl From<&str> for RawUri { } } } - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_is_anchor() { - let raw_uri = RawUri::from("#anchor"); - assert!(raw_uri.is_anchor()); - - let raw_uri = RawUri::from("notan#anchor"); - assert!(!raw_uri.is_anchor()); - } -} diff --git a/lychee-lib/src/types/uri/valid.rs b/lychee-lib/src/types/uri/valid.rs index 573f685295..522acc4849 100644 --- a/lychee-lib/src/types/uri/valid.rs +++ b/lychee-lib/src/types/uri/valid.rs @@ -378,4 +378,9 @@ mod tests { website("https://example.com") ); } + + #[test] + fn test_file_uri() { + assert!(Uri::try_from("file:///path/to/file").unwrap().is_file()); + } } diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index cfe4be11dd..c59718a3be 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,4 +1,3 @@ -use log::info; use percent_encoding::percent_decode_str; use reqwest::Url; use std::{ @@ -13,13 +12,107 @@ use crate::{ Base, BasicAuthCredentials, ErrorKind, Request, Result, Uri, }; -const MAX_TRUNCATED_STR_LEN: usize = 100; - /// Extract basic auth credentials for a given URL. -fn credentials(extractor: &Option, uri: &Uri) -> Option { +fn extract_credentials( + extractor: &Option, + uri: &Uri, +) -> Option { extractor.as_ref().and_then(|ext| ext.matches(uri)) } +/// Create a request from a raw URI. +fn create_request( + raw_uri: RawUri, + source: &InputSource, + base: &Option, + extractor: &Option, +) -> Result { + let uri = try_parse_into_uri(&raw_uri, source, base)?; + let source = truncate_source(source); + let element = raw_uri.element.clone(); + let attribute = raw_uri.attribute.clone(); + let credentials = extract_credentials(extractor, &uri); + + Ok(Request::new(uri, source, element, attribute, credentials)) +} + +/// Try to parse the raw URI into a `Uri`. +/// +/// If the raw URI is not a valid URI, create a URI by joining the base URL with the text. +/// If the base URL is not available, create a URI from the file path. +/// +/// # Errors +/// +/// - If the text (the unparsed URI represented as a `String`) cannot be joined with the base +/// to create a valid URI. +/// - If a URI cannot be created from the file path. +/// - If the source is not a file path (i.e. the URI type is not supported). +fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result { + let text = raw_uri.text.clone(); + let uri = match Uri::try_from(raw_uri.clone()) { + Ok(uri) => uri, + Err(_) => match base { + Some(base_url) => match base_url.join(&text) { + Some(url) => Uri { url }, + None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), + }, + None => match source { + InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, + _ => return Err(ErrorKind::UnsupportedUriType(text)), + }, + }, + }; + Ok(uri) +} + +// Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs +pub(crate) fn is_anchor(text: &str) -> bool { + text.starts_with('#') +} + +/// Create a URI from a file path +/// + +fn create_uri_from_file_path( + file_path: &PathBuf, + link_text: &str, + base: &Option, +) -> Result { + let target_path = if is_anchor(link_text) { + // For anchors, we need to append the anchor to the file name. + let file_name = file_path + .file_name() + .and_then(|name| name.to_str()) + .ok_or_else(|| ErrorKind::InvalidFile(file_path.clone()))?; + + format!("{file_name}{link_text}") + } else { + link_text.to_string() + }; + let Ok(constructed_url) = resolve_and_create_url(file_path, &target_path, base) else { + return Err(ErrorKind::InvalidPathToUri(target_path)); + }; + Ok(Uri { + url: constructed_url, + }) +} + +/// Truncate the source in case it gets too long +/// +/// This is only needed for string inputs. +/// For other inputs, the source is simply a "label" (an enum variant). +// TODO: This would not be necessary if we used `Cow` for the source. +fn truncate_source(source: &InputSource) -> InputSource { + const MAX_TRUNCATED_STR_LEN: usize = 100; + + match source { + InputSource::String(s) => { + InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) + } + other => other.clone(), + } +} + /// Create requests out of the collected URLs. /// Only keeps "valid" URLs. This filters out anchors for example. pub(crate) fn create( @@ -28,109 +121,49 @@ pub(crate) fn create( base: &Option, extractor: &Option, ) -> Result> { - let base_url = base + let base = base .clone() .or_else(|| Base::from_source(&input_content.source)); - - let requests: Result>> = uris + let requests: Result> = uris .into_iter() - .map(|raw_uri| { - let is_anchor = raw_uri.is_anchor(); - let text = raw_uri.text.clone(); - let element = raw_uri.element.clone(); - let attribute = raw_uri.attribute.clone(); - - // Truncate the source in case it gets too long - let source = match &input_content.source { - InputSource::String(s) => { - InputSource::String(s.chars().take(MAX_TRUNCATED_STR_LEN).collect()) - } - c => c.clone(), - }; - - if let Ok(uri) = Uri::try_from(raw_uri) { - let credentials = credentials(extractor, &uri); - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else if let Some(base) = &base_url { - match base.join(&text) { - Some(url) => { - let uri = Uri { url }; - let credentials = credentials(extractor, &uri); - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } - None => Ok(None), - } - } else if let InputSource::FsPath(root) = &input_content.source { - let path = if is_anchor { - match root.file_name() { - Some(file_name) => match file_name.to_str() { - Some(valid_str) => valid_str.to_string() + &text, - None => return Err(ErrorKind::InvalidFile(root.clone())), - }, - None => return Err(ErrorKind::InvalidFile(root.clone())), - } - } else { - text - }; - if let Some(url) = create_uri_from_path(root, &path, &base_url)? { - let uri = Uri { url }; - let credentials = credentials(extractor, &uri); - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) - } else { - Ok(None) - } - } else { - info!("Handling of `{}` not implemented yet", text); - Ok(None) - } - }) + .map(|raw_uri| create_request(raw_uri, &input_content.source, &base, extractor)) .collect(); - - let requests: Vec = requests?.into_iter().flatten().collect(); - Ok(HashSet::from_iter(requests)) + Ok(HashSet::from_iter(requests?)) } -fn create_uri_from_path(src: &Path, dst: &str, base: &Option) -> Result> { - let (dst, frag) = url::remove_get_params_and_separate_fragment(dst); - // Avoid double-encoding already encoded destination paths by removing any - // potential encoding (e.g. `web%20site` becomes `web site`). - // That's because Url::from_file_path will encode the full URL in the end. - // This behavior cannot be configured. - // See https://github.com/lycheeverse/lychee/pull/262#issuecomment-915245411 - // TODO: This is not a perfect solution. - // Ideally, only `src` and `base` should be URL encoded (as is done by - // `from_file_path` at the moment) while `dst` gets left untouched and simply - // appended to the end. - let decoded = percent_decode_str(dst).decode_utf8()?; - let resolved = path::resolve(src, &PathBuf::from(&*decoded), base)?; - match resolved { - Some(path) => Url::from_file_path(&path) - .map(|mut url| { - url.set_fragment(frag); - url - }) - .map(Some) - .map_err(|_e| ErrorKind::InvalidUrlFromPath(path)), - None => Ok(None), - } +/// Create a URI from a path +/// +/// `src_path` is the path of the source file. +/// `dest_path` is the path being linked to. +/// The optional `base_uri` specifies the base URI to resolve the destination path against. +/// +/// # Errors +/// +/// - If the percent-decoded destination path cannot be decoded as UTF-8. +/// - The path cannot be resolved +/// - The resolved path cannot be converted to a URL. +fn resolve_and_create_url( + src_path: &Path, + dest_path: &str, + base_uri: &Option, +) -> Result { + let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); + + // Decode the destination path to avoid double-encoding + // This addresses the issue mentioned in the original comment about double-encoding + let decoded_dest = percent_decode_str(dest_path).decode_utf8()?; + + let Ok(Some(resolved_path)) = path::resolve(src_path, &PathBuf::from(&*decoded_dest), base_uri) + else { + return Err(ErrorKind::InvalidPathToUri(decoded_dest.to_string())); + }; + + let Ok(mut url) = Url::from_file_path(&resolved_path) else { + return Err(ErrorKind::InvalidUrlFromPath(resolved_path.clone())); + }; + + url.set_fragment(fragment); + Ok(url) } #[cfg(test)] @@ -138,11 +171,17 @@ mod tests { use super::*; use crate::types::FileType; + #[test] + fn test_is_anchor() { + assert!(is_anchor("#anchor")); + assert!(!is_anchor("notan#anchor")); + } + #[test] fn test_create_uri_from_path() { let result = - create_uri_from_path(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap(); - assert_eq!(result.unwrap().as_str(), "file:///test+encoding"); + resolve_and_create_url(&PathBuf::from("/README.md"), "test+encoding", &None).unwrap(); + assert_eq!(result.as_str(), "file:///test+encoding"); } fn create_input(content: &str, file_type: FileType) -> InputContent { @@ -251,4 +290,82 @@ mod tests { .iter() .any(|r| r.uri.url.as_str() == "https://example.com/page")); } + + #[test] + fn test_create_request_from_relative_file_path() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input_source = InputSource::FsPath(PathBuf::from("page.html")); + + let actual = + create_request(RawUri::from("file.html"), &input_source, &base, &None).unwrap(); + + assert_eq!( + actual, + Request::new( + Uri { + url: Url::from_file_path("/tmp/lychee/file.html").unwrap() + }, + input_source, + None, + None, + None, + ) + ); + } + + #[test] + fn test_create_request_from_absolute_file_path() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input_source = InputSource::FsPath(PathBuf::from("/tmp/lychee/page.html")); + + // Use an absolute path that's outside the base directory + let actual = create_request( + RawUri::from("/usr/local/share/doc/example.html"), + &input_source, + &base, + &None, + ) + .unwrap(); + + assert_eq!( + actual, + Request::new( + Uri { + url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap() + }, + input_source, + None, + None, + None, + ) + ); + } + + #[test] + fn test_parse_relative_path_into_uri() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input = create_input( + r#"Relative Link"#, + FileType::Html, + ); + + let raw_uri = RawUri::from("relative.html"); + let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); + + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); + } + + #[test] + fn test_parse_absolute_path_into_uri() { + let base = Some(Base::Local(PathBuf::from("/tmp/lychee"))); + let input = create_input( + r#"Absolute Link"#, + FileType::Html, + ); + + let raw_uri = RawUri::from("absolute.html"); + let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); + + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); + } } From 681427132da2518d458e331c05a3e77eb6680dec Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 24 Aug 2024 00:40:11 +0200 Subject: [PATCH 04/14] cleanup --- lychee-lib/src/collector.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index 08bf1cdbfe..fd55626398 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -106,13 +106,12 @@ impl Collector { let global_base = self.base; stream::iter(inputs) .par_then_unordered(None, move |input| { - let global_base = global_base.clone(); + let default_base = global_base.clone(); async move { - let input_base = match &input.source { - InputSource::RemoteUrl(url) => Some(Base::try_from(url.as_str()).unwrap()), - _ => None, + let base = match &input.source { + InputSource::RemoteUrl(url) => Base::try_from(url.as_str()).ok(), + _ => default_base, }; - let base = input_base.or(global_base); input .get_contents(skip_missing_inputs, skip_hidden, skip_ignored) .map(move |content| (content, base.clone())) From e5db194351f5fb7f0e6169263e357e57530186fc Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 8 Oct 2024 15:59:54 +0200 Subject: [PATCH 05/14] Add about and index HTML files for resolve paths feature. This should work now, but it's hacked together like crazy and needs a lot of refactoring before we can merge this. Just pushing this upstream so that I don't forget about it. --- fixtures/resolve_paths/about/index.html | 8 ++ .../resolve_paths/another page/index.html | 0 fixtures/resolve_paths/index.html | 21 ++++ lychee-bin/src/client.rs | 1 + lychee-bin/tests/cli.rs | 19 ++-- lychee-lib/src/client.rs | 98 ++++++++++++++++--- lychee-lib/src/utils/request.rs | 64 +++++++++--- 7 files changed, 173 insertions(+), 38 deletions(-) create mode 100644 fixtures/resolve_paths/about/index.html create mode 100644 fixtures/resolve_paths/another page/index.html create mode 100644 fixtures/resolve_paths/index.html diff --git a/fixtures/resolve_paths/about/index.html b/fixtures/resolve_paths/about/index.html new file mode 100644 index 0000000000..3141b661a0 --- /dev/null +++ b/fixtures/resolve_paths/about/index.html @@ -0,0 +1,8 @@ + + + About + + +

About

+ + diff --git a/fixtures/resolve_paths/another page/index.html b/fixtures/resolve_paths/another page/index.html new file mode 100644 index 0000000000..e69de29bb2 diff --git a/fixtures/resolve_paths/index.html b/fixtures/resolve_paths/index.html new file mode 100644 index 0000000000..79d285a684 --- /dev/null +++ b/fixtures/resolve_paths/index.html @@ -0,0 +1,21 @@ + + + Index + + +

Index Title

+

+

+

+ + \ No newline at end of file diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index b03fc35e2c..68bf3ca7ca 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -55,6 +55,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - ClientBuilder::builder() .remaps(remaps) + .base(cfg.base.clone()) .includes(includes) .excludes(excludes) .exclude_all_private(cfg.exclude_all_private) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index bfefc34068..c53d6c940a 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -266,13 +266,17 @@ mod cli { #[test] fn test_resolve_paths() { + // TODO: + // 1. Check the diff to see if the absolute file rewrite hack can be removed / is still in use + // 2. refactor the code to clean up base path handling + let mut cmd = main_command(); - let offline_dir = fixtures_path().join("offline"); + let dir = fixtures_path().join("resolve_paths"); cmd.arg("--offline") .arg("--base") - .arg(&offline_dir) - .arg(offline_dir.join("index.html")) + .arg(&dir) + .arg(dir.join("index.html")) .env_clear() .assert() .success() @@ -1216,8 +1220,9 @@ mod cli { Ok(()) } - /// If base-dir is not set, don't throw an error in case we encounter - /// an absolute local link within a file (e.g. `/about`). + /// If `base-dir` is not set, don't throw an error in case we encounter + /// an absolute local link (e.g. `/about`) within a file. + /// Instead, simply ignore the link. #[test] fn test_ignore_absolute_local_links_without_base() -> Result<()> { let mut cmd = main_command(); @@ -1409,9 +1414,7 @@ mod cli { .arg("./NOT-A-REAL-TEST-FIXTURE.md") .assert() .failure() - .stderr(contains( - "Cannot find local file ./NOT-A-REAL-TEST-FIXTURE.md", - )); + .stderr(contains("Invalid file path: ./NOT-A-REAL-TEST-FIXTURE.md")); Ok(()) } diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 49c652a52e..d5d102aca0 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -13,7 +13,12 @@ clippy::default_trait_access, clippy::used_underscore_binding )] -use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; +use std::{ + collections::HashSet, + path::{Path, PathBuf}, + sync::Arc, + time::Duration, +}; #[cfg(all(feature = "email-check", feature = "native-tls"))] use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; @@ -37,7 +42,7 @@ use crate::{ remap::Remaps, types::uri::github::GithubUri, utils::fragment_checker::FragmentChecker, - ErrorKind, Request, Response, Result, Status, Uri, + Base, ErrorKind, Request, Response, Result, Status, Uri, }; #[cfg(all(feature = "email-check", feature = "native-tls"))] @@ -248,6 +253,12 @@ pub struct ClientBuilder { /// Response timeout per request in seconds. timeout: Option, + /// Base for resolving paths. + /// + /// E.g. if the base is `/home/user/` and the path is `file.txt`, the + /// resolved path would be `/home/user/file.txt`. + base: Option, + /// Initial time between retries of failed requests. /// /// Defaults to [`DEFAULT_RETRY_WAIT_TIME_SECS`]. @@ -387,6 +398,7 @@ impl ClientBuilder { reqwest_client, github_client, remaps: self.remaps, + base: self.base, fallback_extensions: self.fallback_extensions, filter, max_retries: self.max_retries, @@ -416,6 +428,9 @@ pub struct Client { /// Optional remapping rules for URIs matching pattern. remaps: Option, + /// Optional base for resolving paths. + base: Option, + /// Automatically append file extensions to `file://` URIs as needed fallback_extensions: Vec, @@ -463,7 +478,7 @@ impl Client { /// /// Returns an `Err` if: /// - `request` does not represent a valid URI. - /// - Encrypted connection for a HTTP URL is available but unused. (Only + /// - Encrypted connection for a HTTP URL is available but unused. (Only /// checked when `Client::require_https` is `true`.) #[allow(clippy::missing_panics_doc)] pub async fn check(&self, request: T) -> Result @@ -493,22 +508,72 @@ impl Client { return Ok(Response::new(uri.clone(), Status::Excluded, source)); } - let default_chain: RequestChain = Chain::new(vec![ - Box::::default(), - Box::new(credentials), - Box::new(Checker::new( - self.retry_wait_time, - self.max_retries, - self.reqwest_client.clone(), - self.accepted.clone(), - )), - ]); - let status = match uri.scheme() { - _ if uri.is_file() => self.check_file(uri).await, + _ if uri.is_file() => { + // If base is set and the path is absolute, prepend the base + // to the path. + if let Some(ref base) = self.base { + let Ok(path) = uri.url.to_file_path() else { + return Err(ErrorKind::InvalidFilePath(uri.clone()).into()); + }; + + if path.is_absolute() { + match base { + Base::Local(ref base_path) => { + // Ensure base_path is absolute + let absolute_base_path = if base_path.is_relative() { + std::env::current_dir() + .unwrap_or_else(|_| PathBuf::new()) + .join(base_path) + } else { + base_path.to_path_buf() + }; + + // Optionally strip the leading slash from the path (if it exists) + let stripped = path.strip_prefix("/").unwrap_or(&path); + let resolved = absolute_base_path.join(stripped); + + // Create a new URI with the resolved path + let mut uri = uri.clone(); + uri.url = Url::from_file_path(&resolved).unwrap(); + + if resolved.exists() { + return Ok(Response::new( + uri.clone(), + Status::Ok(StatusCode::OK), + source, + )); + } + + return Ok(Response::new( + uri.clone(), + Status::Error(ErrorKind::InvalidFilePath(uri.clone())), + source, + )); + } + Base::Remote(_) => (), + } + } + } + + self.check_file(uri).await + } _ if uri.is_mail() => self.check_mail(uri).await, _ if uri.is_tel() => Status::Excluded, - _ => self.check_website(uri, default_chain).await?, + _ => { + let default_chain: RequestChain = Chain::new(vec![ + Box::::default(), + Box::new(credentials), + Box::new(Checker::new( + self.retry_wait_time, + self.max_retries, + self.reqwest_client.clone(), + self.accepted.clone(), + )), + ]); + + self.check_website(uri, default_chain).await? + } }; Ok(Response::new(uri.clone(), status, source)) @@ -690,6 +755,7 @@ impl Client { /// Checks a `file` URI's fragment. pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { + println!("Checking fragment for {:?}", path); match self.fragment_checker.check(path, &uri.url).await { Ok(true) => Status::Ok(StatusCode::OK), Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index c59718a3be..83e20a5e2c 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -26,14 +26,22 @@ fn create_request( source: &InputSource, base: &Option, extractor: &Option, -) -> Result { - let uri = try_parse_into_uri(&raw_uri, source, base)?; +) -> Result> { + let Some(uri) = try_parse_into_uri(&raw_uri, source, base)? else { + return Ok(None); + }; let source = truncate_source(source); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); let credentials = extract_credentials(extractor, &uri); - Ok(Request::new(uri, source, element, attribute, credentials)) + Ok(Some(Request::new( + uri, + source, + element, + attribute, + credentials, + ))) } /// Try to parse the raw URI into a `Uri`. @@ -47,7 +55,11 @@ fn create_request( /// to create a valid URI. /// - If a URI cannot be created from the file path. /// - If the source is not a file path (i.e. the URI type is not supported). -fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result { +fn try_parse_into_uri( + raw_uri: &RawUri, + source: &InputSource, + base: &Option, +) -> Result> { let text = raw_uri.text.clone(); let uri = match Uri::try_from(raw_uri.clone()) { Ok(uri) => uri, @@ -57,12 +69,20 @@ fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option return Err(ErrorKind::InvalidBaseJoin(text.clone())), }, None => match source { - InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, + InputSource::FsPath(root) => { + // If absolute link (`/`) and `base` is not set, + // simply ignore the link as it's not resolvable. + if text.starts_with('/') && base.is_none() { + return Ok(None); + } + + create_uri_from_file_path(root, &text, base)? + } _ => return Err(ErrorKind::UnsupportedUriType(text)), }, }, }; - Ok(uri) + Ok(Some(uri)) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -115,6 +135,9 @@ fn truncate_source(source: &InputSource) -> InputSource { /// Create requests out of the collected URLs. /// Only keeps "valid" URLs. This filters out anchors for example. +/// +/// If a URLs is ignored (because of the current settings), +/// it will not be added to the `HashSet`. pub(crate) fn create( uris: Vec, input_content: &InputContent, @@ -126,9 +149,15 @@ pub(crate) fn create( .or_else(|| Base::from_source(&input_content.source)); let requests: Result> = uris .into_iter() - .map(|raw_uri| create_request(raw_uri, &input_content.source, &base, extractor)) + .filter_map(|raw_uri| { + match create_request(raw_uri, &input_content.source, &base, extractor) { + Ok(Some(request)) => Some(Ok(request)), + Ok(None) => None, + Err(e) => Some(Err(e)), + } + }) .collect(); - Ok(HashSet::from_iter(requests?)) + requests.map(HashSet::from_iter) } /// Create a URI from a path @@ -147,6 +176,7 @@ fn resolve_and_create_url( dest_path: &str, base_uri: &Option, ) -> Result { + println!("resolve_and_create_url src_path: {:?}", src_path); let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); // Decode the destination path to avoid double-encoding @@ -301,7 +331,7 @@ mod tests { assert_eq!( actual, - Request::new( + Some(Request::new( Uri { url: Url::from_file_path("/tmp/lychee/file.html").unwrap() }, @@ -309,7 +339,7 @@ mod tests { None, None, None, - ) + )) ); } @@ -329,7 +359,7 @@ mod tests { assert_eq!( actual, - Request::new( + Some(Request::new( Uri { url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap() }, @@ -337,7 +367,7 @@ mod tests { None, None, None, - ) + )) ); } @@ -352,7 +382,10 @@ mod tests { let raw_uri = RawUri::from("relative.html"); let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); - assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); + assert_eq!( + uri.unwrap().url.as_str(), + "file:///tmp/lychee/relative.html" + ); } #[test] @@ -366,6 +399,9 @@ mod tests { let raw_uri = RawUri::from("absolute.html"); let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); - assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); + assert_eq!( + uri.unwrap().url.as_str(), + "file:///tmp/lychee/absolute.html" + ); } } From e520599da7371330fb156dc98efe5b9e888e1902 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 01:06:23 +0200 Subject: [PATCH 06/14] lint --- lychee-lib/src/client.rs | 5 ++--- lychee-lib/src/utils/request.rs | 15 +++++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index d5d102aca0..e85c48c1ef 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -514,7 +514,7 @@ impl Client { // to the path. if let Some(ref base) = self.base { let Ok(path) = uri.url.to_file_path() else { - return Err(ErrorKind::InvalidFilePath(uri.clone()).into()); + return Err(ErrorKind::InvalidFilePath(uri.clone())); }; if path.is_absolute() { @@ -526,7 +526,7 @@ impl Client { .unwrap_or_else(|_| PathBuf::new()) .join(base_path) } else { - base_path.to_path_buf() + base_path.clone() }; // Optionally strip the leading slash from the path (if it exists) @@ -755,7 +755,6 @@ impl Client { /// Checks a `file` URI's fragment. pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { - println!("Checking fragment for {:?}", path); match self.fragment_checker.check(path, &uri.url).await { Ok(true) => Status::Ok(StatusCode::OK), Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index 83e20a5e2c..e46862e8ba 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -22,12 +22,12 @@ fn extract_credentials( /// Create a request from a raw URI. fn create_request( - raw_uri: RawUri, + raw_uri: &RawUri, source: &InputSource, base: &Option, extractor: &Option, ) -> Result> { - let Some(uri) = try_parse_into_uri(&raw_uri, source, base)? else { + let Some(uri) = try_parse_into_uri(raw_uri, source, base)? else { return Ok(None); }; let source = truncate_source(source); @@ -94,7 +94,7 @@ pub(crate) fn is_anchor(text: &str) -> bool { /// fn create_uri_from_file_path( - file_path: &PathBuf, + file_path: &Path, link_text: &str, base: &Option, ) -> Result { @@ -103,7 +103,7 @@ fn create_uri_from_file_path( let file_name = file_path .file_name() .and_then(|name| name.to_str()) - .ok_or_else(|| ErrorKind::InvalidFile(file_path.clone()))?; + .ok_or_else(|| ErrorKind::InvalidFile(file_path.to_path_buf()))?; format!("{file_name}{link_text}") } else { @@ -150,7 +150,7 @@ pub(crate) fn create( let requests: Result> = uris .into_iter() .filter_map(|raw_uri| { - match create_request(raw_uri, &input_content.source, &base, extractor) { + match create_request(&raw_uri, &input_content.source, &base, extractor) { Ok(Some(request)) => Some(Ok(request)), Ok(None) => None, Err(e) => Some(Err(e)), @@ -176,7 +176,6 @@ fn resolve_and_create_url( dest_path: &str, base_uri: &Option, ) -> Result { - println!("resolve_and_create_url src_path: {:?}", src_path); let (dest_path, fragment) = url::remove_get_params_and_separate_fragment(dest_path); // Decode the destination path to avoid double-encoding @@ -327,7 +326,7 @@ mod tests { let input_source = InputSource::FsPath(PathBuf::from("page.html")); let actual = - create_request(RawUri::from("file.html"), &input_source, &base, &None).unwrap(); + create_request(&RawUri::from("file.html"), &input_source, &base, &None).unwrap(); assert_eq!( actual, @@ -350,7 +349,7 @@ mod tests { // Use an absolute path that's outside the base directory let actual = create_request( - RawUri::from("/usr/local/share/doc/example.html"), + &RawUri::from("/usr/local/share/doc/example.html"), &input_source, &base, &None, From c8c9db25e37c37b2738c85a30a851a48a681a3ea Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 01:11:26 +0200 Subject: [PATCH 07/14] update test --- lychee-bin/tests/cli.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index c53d6c940a..c215f6f966 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -280,8 +280,8 @@ mod cli { .env_clear() .assert() .success() - .stdout(contains("4 Total")) - .stdout(contains("4 OK")); + .stdout(contains("3 Total")) + .stdout(contains("3 OK")); } #[test] From a55e4f39a9cabbe60be0e5091f4947db6f416eb2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 01:35:50 +0200 Subject: [PATCH 08/14] Remove absolute file path hack --- lychee-bin/tests/cli.rs | 4 +- lychee-lib/src/utils/request.rs | 70 ++++++++++++--------------------- 2 files changed, 27 insertions(+), 47 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index c215f6f966..9dbae37d34 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -266,9 +266,7 @@ mod cli { #[test] fn test_resolve_paths() { - // TODO: - // 1. Check the diff to see if the absolute file rewrite hack can be removed / is still in use - // 2. refactor the code to clean up base path handling + // TODO: Refactor the code to clean up base path handling let mut cmd = main_command(); let dir = fixtures_path().join("resolve_paths"); diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index e46862e8ba..f2b77e01e7 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -1,3 +1,4 @@ +use log::warn; use percent_encoding::percent_decode_str; use reqwest::Url; use std::{ @@ -26,22 +27,14 @@ fn create_request( source: &InputSource, base: &Option, extractor: &Option, -) -> Result> { - let Some(uri) = try_parse_into_uri(raw_uri, source, base)? else { - return Ok(None); - }; +) -> Result { + let uri = try_parse_into_uri(raw_uri, source, base)?; let source = truncate_source(source); let element = raw_uri.element.clone(); let attribute = raw_uri.attribute.clone(); let credentials = extract_credentials(extractor, &uri); - Ok(Some(Request::new( - uri, - source, - element, - attribute, - credentials, - ))) + Ok(Request::new(uri, source, element, attribute, credentials)) } /// Try to parse the raw URI into a `Uri`. @@ -55,11 +48,7 @@ fn create_request( /// to create a valid URI. /// - If a URI cannot be created from the file path. /// - If the source is not a file path (i.e. the URI type is not supported). -fn try_parse_into_uri( - raw_uri: &RawUri, - source: &InputSource, - base: &Option, -) -> Result> { +fn try_parse_into_uri(raw_uri: &RawUri, source: &InputSource, base: &Option) -> Result { let text = raw_uri.text.clone(); let uri = match Uri::try_from(raw_uri.clone()) { Ok(uri) => uri, @@ -69,20 +58,12 @@ fn try_parse_into_uri( None => return Err(ErrorKind::InvalidBaseJoin(text.clone())), }, None => match source { - InputSource::FsPath(root) => { - // If absolute link (`/`) and `base` is not set, - // simply ignore the link as it's not resolvable. - if text.starts_with('/') && base.is_none() { - return Ok(None); - } - - create_uri_from_file_path(root, &text, base)? - } + InputSource::FsPath(root) => create_uri_from_file_path(root, &text, base)?, _ => return Err(ErrorKind::UnsupportedUriType(text)), }, }, }; - Ok(Some(uri)) + Ok(uri) } // Taken from https://github.com/getzola/zola/blob/master/components/link_checker/src/lib.rs @@ -92,7 +73,11 @@ pub(crate) fn is_anchor(text: &str) -> bool { /// Create a URI from a file path /// - +/// # Errors +/// +/// - If the link text is an anchor and the file name cannot be extracted from the file path. +/// - If the path cannot be resolved. +/// - If the resolved path cannot be converted to a URL. fn create_uri_from_file_path( file_path: &Path, link_text: &str, @@ -147,17 +132,20 @@ pub(crate) fn create( let base = base .clone() .or_else(|| Base::from_source(&input_content.source)); - let requests: Result> = uris + + let requests: HashSet = uris .into_iter() .filter_map(|raw_uri| { match create_request(&raw_uri, &input_content.source, &base, extractor) { - Ok(Some(request)) => Some(Ok(request)), - Ok(None) => None, - Err(e) => Some(Err(e)), + Ok(request) => Some(request), + Err(e) => { + warn!("Error creating request: {:?}", e); + None + } } }) .collect(); - requests.map(HashSet::from_iter) + Ok(requests) } /// Create a URI from a path @@ -330,7 +318,7 @@ mod tests { assert_eq!( actual, - Some(Request::new( + Request::new( Uri { url: Url::from_file_path("/tmp/lychee/file.html").unwrap() }, @@ -338,7 +326,7 @@ mod tests { None, None, None, - )) + ) ); } @@ -358,7 +346,7 @@ mod tests { assert_eq!( actual, - Some(Request::new( + Request::new( Uri { url: Url::from_file_path("/usr/local/share/doc/example.html").unwrap() }, @@ -366,7 +354,7 @@ mod tests { None, None, None, - )) + ) ); } @@ -381,10 +369,7 @@ mod tests { let raw_uri = RawUri::from("relative.html"); let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); - assert_eq!( - uri.unwrap().url.as_str(), - "file:///tmp/lychee/relative.html" - ); + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/relative.html"); } #[test] @@ -398,9 +383,6 @@ mod tests { let raw_uri = RawUri::from("absolute.html"); let uri = try_parse_into_uri(&raw_uri, &input.source, &base).unwrap(); - assert_eq!( - uri.unwrap().url.as_str(), - "file:///tmp/lychee/absolute.html" - ); + assert_eq!(uri.url.as_str(), "file:///tmp/lychee/absolute.html"); } } From eedaff26ef9d97ffee1221ba9cb6bbb506475bb2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 14 Oct 2024 01:58:10 +0200 Subject: [PATCH 09/14] fix lints --- lychee-lib/src/collector.rs | 2 +- lychee-lib/src/utils/request.rs | 20 +++++++++----------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/lychee-lib/src/collector.rs b/lychee-lib/src/collector.rs index fd55626398..955bdd24e7 100644 --- a/lychee-lib/src/collector.rs +++ b/lychee-lib/src/collector.rs @@ -124,7 +124,7 @@ impl Collector { let content = content?; let extractor = Extractor::new(self.use_html5ever, self.include_verbatim); let uris: Vec = extractor.extract(&content); - let requests = request::create(uris, &content, &base, &basic_auth_extractor)?; + let requests = request::create(uris, &content, &base, &basic_auth_extractor); Result::Ok(stream::iter(requests.into_iter().map(Ok))) } }) diff --git a/lychee-lib/src/utils/request.rs b/lychee-lib/src/utils/request.rs index f2b77e01e7..7867e50f09 100644 --- a/lychee-lib/src/utils/request.rs +++ b/lychee-lib/src/utils/request.rs @@ -128,13 +128,12 @@ pub(crate) fn create( input_content: &InputContent, base: &Option, extractor: &Option, -) -> Result> { +) -> HashSet { let base = base .clone() .or_else(|| Base::from_source(&input_content.source)); - let requests: HashSet = uris - .into_iter() + uris.into_iter() .filter_map(|raw_uri| { match create_request(&raw_uri, &input_content.source, &base, extractor) { Ok(request) => Some(request), @@ -144,8 +143,7 @@ pub(crate) fn create( } } }) - .collect(); - Ok(requests) + .collect() } /// Create a URI from a path @@ -218,7 +216,7 @@ mod tests { ); let uris = vec![RawUri::from("relative.html")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -235,7 +233,7 @@ mod tests { ); let uris = vec![RawUri::from("https://another.com/page")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -252,7 +250,7 @@ mod tests { ); let uris = vec![RawUri::from("/root-relative")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -269,7 +267,7 @@ mod tests { ); let uris = vec![RawUri::from("../parent")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -283,7 +281,7 @@ mod tests { let input = create_input(r##"Fragment Link"##, FileType::Html); let uris = vec![RawUri::from("#fragment")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests @@ -300,7 +298,7 @@ mod tests { ); let uris = vec![RawUri::from("https://example.com/page")]; - let requests = create(uris, &input, &base, &None).unwrap(); + let requests = create(uris, &input, &base, &None); assert_eq!(requests.len(), 1); assert!(requests From 779c457c32ecb7c824b9199122643e9cc3977daa Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 16 Oct 2024 21:48:30 +0200 Subject: [PATCH 10/14] refactor --- lychee-lib/src/checker/file.rs | 111 +++++++++++++++++ lychee-lib/src/checker/mod.rs | 7 ++ .../src/{checker.rs => checker/website.rs} | 2 +- lychee-lib/src/client.rs | 115 +++--------------- 4 files changed, 135 insertions(+), 100 deletions(-) create mode 100644 lychee-lib/src/checker/file.rs create mode 100644 lychee-lib/src/checker/mod.rs rename lychee-lib/src/{checker.rs => checker/website.rs} (99%) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs new file mode 100644 index 0000000000..d0f1f6c3a2 --- /dev/null +++ b/lychee-lib/src/checker/file.rs @@ -0,0 +1,111 @@ +use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; +use http::StatusCode; +use log::warn; +use std::path::{Path, PathBuf}; + +#[derive(Debug, Clone)] +pub(crate) struct FileChecker { + base: Option, + fallback_extensions: Vec, + include_fragments: bool, + fragment_checker: FragmentChecker, +} + +impl FileChecker { + pub(crate) fn new( + base: Option, + fallback_extensions: Vec, + include_fragments: bool, + ) -> Self { + Self { + base, + fallback_extensions, + include_fragments, + fragment_checker: FragmentChecker::new(), + } + } + + pub(crate) async fn check(&self, uri: &Uri) -> Status { + let Ok(path) = uri.url.to_file_path() else { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + }; + + if path.is_absolute() { + let resolved_path = self.resolve_absolute_path(&path); + return self.check_resolved_path(&resolved_path, uri).await; + } + + self.check_path(&path, uri).await + } + + async fn check_resolved_path(&self, path: &Path, uri: &Uri) -> Status { + if path.exists() { + if self.include_fragments { + self.check_fragment(path, uri).await + } else { + Status::Ok(StatusCode::OK) + } + } else { + ErrorKind::InvalidFilePath(uri.clone()).into() + } + } + + async fn check_path(&self, path: &Path, uri: &Uri) -> Status { + if path.exists() { + return self.check_existing_path(path, uri).await; + } + + if path.extension().is_some() { + return ErrorKind::InvalidFilePath(uri.clone()).into(); + } + + self.check_with_fallback_extensions(path, uri).await + } + + async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status { + if self.include_fragments { + self.check_fragment(path, uri).await + } else { + Status::Ok(StatusCode::OK) + } + } + + async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status { + let mut path_buf = path.to_path_buf(); + for ext in &self.fallback_extensions { + path_buf.set_extension(ext); + if path_buf.exists() { + return self.check_existing_path(&path_buf, uri).await; + } + } + ErrorKind::InvalidFilePath(uri.clone()).into() + } + + fn resolve_absolute_path(&self, path: &Path) -> PathBuf { + if let Some(Base::Local(base_path)) = &self.base { + let absolute_base_path = if base_path.is_relative() { + std::env::current_dir() + .unwrap_or_else(|_| PathBuf::new()) + .join(base_path) + } else { + base_path.to_path_buf() + }; + + let stripped = path.strip_prefix("/").unwrap_or(path); + absolute_base_path.join(stripped) + } else { + path.to_path_buf() + } + } + + async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { + match self.fragment_checker.check(path, &uri.url).await { + Ok(true) => Status::Ok(StatusCode::OK), + Ok(false) => ErrorKind::InvalidFragment(uri.clone()).into(), + Err(err) => { + warn!("Skipping fragment check due to the following error: {err}"); + Status::Ok(StatusCode::OK) + } + } + } +} diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs new file mode 100644 index 0000000000..eaf206e21a --- /dev/null +++ b/lychee-lib/src/checker/mod.rs @@ -0,0 +1,7 @@ +//! Checker Module +//! +//! This module contains all checkers, which are responsible for checking the status of a URL. +//! Each checker implements [Handler](crate::chain::Handler). + +pub(crate) mod file; +pub(crate) mod website; diff --git a/lychee-lib/src/checker.rs b/lychee-lib/src/checker/website.rs similarity index 99% rename from lychee-lib/src/checker.rs rename to lychee-lib/src/checker/website.rs index 428551ad2b..f2cffd4298 100644 --- a/lychee-lib/src/checker.rs +++ b/lychee-lib/src/checker/website.rs @@ -77,4 +77,4 @@ impl Handler for Checker { async fn handle(&mut self, input: Request) -> ChainResult { ChainResult::Done(self.retry_request(input).await) } -} +} \ No newline at end of file diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index e85c48c1ef..c78d9f863f 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -13,12 +13,7 @@ clippy::default_trait_access, clippy::used_underscore_binding )] -use std::{ - collections::HashSet, - path::{Path, PathBuf}, - sync::Arc, - time::Duration, -}; +use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; #[cfg(all(feature = "email-check", feature = "native-tls"))] use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; @@ -36,7 +31,8 @@ use typed_builder::TypedBuilder; use crate::{ chain::{Chain, ClientRequestChains, RequestChain}, - checker::Checker, + checker::file::FileChecker, + checker::website::Checker, filter::{Excludes, Filter, Includes}, quirks::Quirks, remap::Remaps, @@ -398,16 +394,18 @@ impl ClientBuilder { reqwest_client, github_client, remaps: self.remaps, - base: self.base, - fallback_extensions: self.fallback_extensions, filter, max_retries: self.max_retries, retry_wait_time: self.retry_wait_time, method: self.method, accepted: self.accepted, require_https: self.require_https, - include_fragments: self.include_fragments, fragment_checker: FragmentChecker::new(), + file_checker: FileChecker::new( + self.base, + self.fallback_extensions, + self.include_fragments, + ), plugin_request_chain: self.plugin_request_chain, }) } @@ -428,12 +426,6 @@ pub struct Client { /// Optional remapping rules for URIs matching pattern. remaps: Option, - /// Optional base for resolving paths. - base: Option, - - /// Automatically append file extensions to `file://` URIs as needed - fallback_extensions: Vec, - /// Rules to decided whether each link should be checked or ignored. filter: Filter, @@ -459,13 +451,12 @@ pub struct Client { /// This would treat unencrypted links as errors when HTTPS is available. require_https: bool, - /// Enable the checking of fragments in links. - include_fragments: bool, - /// Caches Fragments fragment_checker: FragmentChecker, plugin_request_chain: RequestChain, + + file_checker: FileChecker, } impl Client { @@ -509,55 +500,7 @@ impl Client { } let status = match uri.scheme() { - _ if uri.is_file() => { - // If base is set and the path is absolute, prepend the base - // to the path. - if let Some(ref base) = self.base { - let Ok(path) = uri.url.to_file_path() else { - return Err(ErrorKind::InvalidFilePath(uri.clone())); - }; - - if path.is_absolute() { - match base { - Base::Local(ref base_path) => { - // Ensure base_path is absolute - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir() - .unwrap_or_else(|_| PathBuf::new()) - .join(base_path) - } else { - base_path.clone() - }; - - // Optionally strip the leading slash from the path (if it exists) - let stripped = path.strip_prefix("/").unwrap_or(&path); - let resolved = absolute_base_path.join(stripped); - - // Create a new URI with the resolved path - let mut uri = uri.clone(); - uri.url = Url::from_file_path(&resolved).unwrap(); - - if resolved.exists() { - return Ok(Response::new( - uri.clone(), - Status::Ok(StatusCode::OK), - source, - )); - } - - return Ok(Response::new( - uri.clone(), - Status::Error(ErrorKind::InvalidFilePath(uri.clone())), - source, - )); - } - Base::Remote(_) => (), - } - } - } - - self.check_file(uri).await - } + _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, _ if uri.is_tel() => Status::Excluded, _ => { @@ -579,6 +522,11 @@ impl Client { Ok(Response::new(uri.clone(), status, source)) } + /// Check a single file using the file checker. + pub async fn check_file(&self, uri: &Uri) -> Status { + self.file_checker.check(uri).await + } + /// Remap `uri` using the client-defined remapping rules. /// /// # Errors @@ -722,37 +670,6 @@ impl Client { Status::Ok(StatusCode::OK) } - /// Check a `file` URI. - pub async fn check_file(&self, uri: &Uri) -> Status { - let Ok(path) = uri.url.to_file_path() else { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - }; - - if path.exists() { - if self.include_fragments { - return self.check_fragment(&path, uri).await; - } - return Status::Ok(StatusCode::OK); - } - - if path.extension().is_some() { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - } - - // if the path has no file extension, try to append some - let mut path_buf = path.clone(); - for ext in &self.fallback_extensions { - path_buf.set_extension(ext); - if path_buf.exists() { - if self.include_fragments { - return self.check_fragment(&path_buf, uri).await; - } - return Status::Ok(StatusCode::OK); - } - } - ErrorKind::InvalidFilePath(uri.clone()).into() - } - /// Checks a `file` URI's fragment. pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { match self.fragment_checker.check(path, &uri.url).await { From 2363e0496cae33fbe39b65dbbcaa3acb5eca8e31 Mon Sep 17 00:00:00 2001 From: Matthias Date: Wed, 16 Oct 2024 21:58:22 +0200 Subject: [PATCH 11/14] refactor --- lychee-bin/src/formatters/response/color.rs | 14 +++- lychee-bin/tests/cli.rs | 2 - lychee-lib/src/checker/file.rs | 73 +++++++++--------- lychee-lib/src/checker/mail.rs | 53 +++++++++++++ lychee-lib/src/checker/mod.rs | 2 +- lychee-lib/src/checker/website.rs | 8 +- lychee-lib/src/client.rs | 83 ++++++++------------- 7 files changed, 139 insertions(+), 96 deletions(-) create mode 100644 lychee-lib/src/checker/mail.rs diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs index 0a84f2a393..9aa12df40f 100644 --- a/lychee-bin/src/formatters/response/color.rs +++ b/lychee-bin/src/formatters/response/color.rs @@ -65,13 +65,19 @@ mod tests { } } + #[cfg(test)] + /// Helper function to strip ANSI color codes for tests + fn strip_ansi_codes(s: &str) -> String { + console::strip_ansi_codes(s).to_string() + } + #[test] fn test_format_response_with_ok_status() { let formatter = ColorFormatter; let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com"); assert_eq!( - formatter.format_response(&body), - "\u{1b}[38;5;2m\u{1b}[1m [200]\u{1b}[0m https://example.com/" + strip_ansi_codes(&formatter.format_response(&body)), + " [200] https://example.com/" ); } @@ -83,8 +89,8 @@ mod tests { "https://example.com/404", ); assert_eq!( - formatter.format_response(&body), - "\u{1b}[38;5;197m [ERROR]\u{1b}[0m https://example.com/404" + strip_ansi_codes(&formatter.format_response(&body)), + " [ERROR] https://example.com/404" ); } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 9dbae37d34..261b77b0d7 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -266,8 +266,6 @@ mod cli { #[test] fn test_resolve_paths() { - // TODO: Refactor the code to clean up base path handling - let mut cmd = main_command(); let dir = fixtures_path().join("resolve_paths"); diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index d0f1f6c3a2..f972bee09a 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -1,13 +1,26 @@ -use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; use http::StatusCode; use log::warn; use std::path::{Path, PathBuf}; +use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; + +/// A utility for checking the existence and validity of file-based URIs. +/// +/// `FileChecker` is responsible for resolving and validating file paths, +/// handling both absolute and relative paths. It supports base path resolution, +/// fallback extensions for files without extensions, and optional fragment checking. +/// +/// This creates a `FileChecker` with a base path, fallback extensions for HTML files, +/// and fragment checking enabled. #[derive(Debug, Clone)] pub(crate) struct FileChecker { + /// An optional base path or URL used for resolving relative paths. base: Option, + /// A list of file extensions to try if the original path doesn't exist. fallback_extensions: Vec, + /// Whether to check for the existence of fragments (e.g., #section-id) in HTML files. include_fragments: bool, + /// A utility for performing fragment checks in HTML files. fragment_checker: FragmentChecker, } @@ -30,23 +43,28 @@ impl FileChecker { return ErrorKind::InvalidFilePath(uri.clone()).into(); }; - if path.is_absolute() { - let resolved_path = self.resolve_absolute_path(&path); - return self.check_resolved_path(&resolved_path, uri).await; - } - - self.check_path(&path, uri).await + let resolved_path = self.resolve_path(&path); + self.check_path(&resolved_path, uri).await } - async fn check_resolved_path(&self, path: &Path, uri: &Uri) -> Status { - if path.exists() { - if self.include_fragments { - self.check_fragment(path, uri).await + fn resolve_path(&self, path: &Path) -> PathBuf { + if let Some(Base::Local(base_path)) = &self.base { + if path.is_absolute() { + let absolute_base_path = if base_path.is_relative() { + std::env::current_dir() + .unwrap_or_else(|_| PathBuf::new()) + .join(base_path) + } else { + base_path.clone() + }; + + let stripped = path.strip_prefix("/").unwrap_or(path); + absolute_base_path.join(stripped) } else { - Status::Ok(StatusCode::OK) + base_path.join(path) } } else { - ErrorKind::InvalidFilePath(uri.clone()).into() + path.to_path_buf() } } @@ -55,10 +73,6 @@ impl FileChecker { return self.check_existing_path(path, uri).await; } - if path.extension().is_some() { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - } - self.check_with_fallback_extensions(path, uri).await } @@ -72,30 +86,21 @@ impl FileChecker { async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status { let mut path_buf = path.to_path_buf(); + + // If the path already has an extension, try it first + if path_buf.extension().is_some() && path_buf.exists() { + return self.check_existing_path(&path_buf, uri).await; + } + + // Try fallback extensions for ext in &self.fallback_extensions { path_buf.set_extension(ext); if path_buf.exists() { return self.check_existing_path(&path_buf, uri).await; } } - ErrorKind::InvalidFilePath(uri.clone()).into() - } - - fn resolve_absolute_path(&self, path: &Path) -> PathBuf { - if let Some(Base::Local(base_path)) = &self.base { - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir() - .unwrap_or_else(|_| PathBuf::new()) - .join(base_path) - } else { - base_path.to_path_buf() - }; - let stripped = path.strip_prefix("/").unwrap_or(path); - absolute_base_path.join(stripped) - } else { - path.to_path_buf() - } + ErrorKind::InvalidFilePath(uri.clone()).into() } async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs new file mode 100644 index 0000000000..1c3954568a --- /dev/null +++ b/lychee-lib/src/checker/mail.rs @@ -0,0 +1,53 @@ +use crate::{ErrorKind, Status, Uri}; +use http::StatusCode; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use crate::types::mail; + +/// A utility for checking the validity of email addresses. +/// +/// `EmailChecker` is responsible for validating email addresses, +/// optionally performing reachability checks when the appropriate +/// features are enabled. +#[derive(Debug, Clone)] +pub(crate) struct MailChecker {} + +impl MailChecker { + /// Creates a new `EmailChecker`. + pub(crate) const fn new() -> Self { + Self {} + } + + /// Check a mail address, or equivalently a `mailto` URI. + /// + /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), + /// which are ignored by this check. They are not part of the mail address + /// and instead passed to a mail client. + pub(crate) async fn check_mail(&self, uri: &Uri) -> Status { + #[cfg(all(feature = "email-check", feature = "native-tls"))] + { + self.perform_email_check(uri).await + } + + #[cfg(not(all(feature = "email-check", feature = "native-tls")))] + { + Status::Excluded + } + } + + #[cfg(all(feature = "email-check", feature = "native-tls"))] + async fn perform_email_check(&self, uri: &Uri) -> Status { + let address = uri.url.path().to_string(); + let input = CheckEmailInput::new(address); + let result = &(check_email(&input).await); + + if let Reachable::Invalid = result.is_reachable { + ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() + } else { + Status::Ok(StatusCode::OK) + } + } +} diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs index eaf206e21a..bfbef9de51 100644 --- a/lychee-lib/src/checker/mod.rs +++ b/lychee-lib/src/checker/mod.rs @@ -1,7 +1,7 @@ //! Checker Module //! //! This module contains all checkers, which are responsible for checking the status of a URL. -//! Each checker implements [Handler](crate::chain::Handler). pub(crate) mod file; +pub(crate) mod mail; pub(crate) mod website; diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index f2cffd4298..668cf827e2 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -9,14 +9,14 @@ use reqwest::Request; use std::{collections::HashSet, time::Duration}; #[derive(Debug, Clone)] -pub(crate) struct Checker { +pub(crate) struct WebsiteChecker { retry_wait_time: Duration, max_retries: u64, reqwest_client: reqwest::Client, accepted: Option>, } -impl Checker { +impl WebsiteChecker { pub(crate) const fn new( retry_wait_time: Duration, max_retries: u64, @@ -73,8 +73,8 @@ fn clone_unwrap(request: &Request) -> Request { } #[async_trait] -impl Handler for Checker { +impl Handler for WebsiteChecker { async fn handle(&mut self, input: Request) -> ChainResult { ChainResult::Done(self.retry_request(input).await) } -} \ No newline at end of file +} diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index c78d9f863f..7f59c2ce39 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -15,8 +15,6 @@ )] use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; use http::{ header::{HeaderMap, HeaderValue}, StatusCode, @@ -32,18 +30,15 @@ use typed_builder::TypedBuilder; use crate::{ chain::{Chain, ClientRequestChains, RequestChain}, checker::file::FileChecker, - checker::website::Checker, + checker::{mail::MailChecker, website::WebsiteChecker}, filter::{Excludes, Filter, Includes}, quirks::Quirks, remap::Remaps, types::uri::github::GithubUri, utils::fragment_checker::FragmentChecker, - Base, ErrorKind, Request, Response, Result, Status, Uri, + Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use crate::types::mail; - /// Default number of redirects before a request is deemed as failed, 5. pub const DEFAULT_MAX_REDIRECTS: usize = 5; /// Default number of retries before a request is deemed as failed, 3. @@ -401,6 +396,7 @@ impl ClientBuilder { accepted: self.accepted, require_https: self.require_https, fragment_checker: FragmentChecker::new(), + email_checker: MailChecker::new(), file_checker: FileChecker::new( self.base, self.fallback_extensions, @@ -457,6 +453,8 @@ pub struct Client { plugin_request_chain: RequestChain, file_checker: FileChecker, + + email_checker: MailChecker, } impl Client { @@ -500,23 +498,11 @@ impl Client { } let status = match uri.scheme() { + // We don't check tel: URIs + _ if uri.is_tel() => Status::Excluded, _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, - _ if uri.is_tel() => Status::Excluded, - _ => { - let default_chain: RequestChain = Chain::new(vec![ - Box::::default(), - Box::new(credentials), - Box::new(Checker::new( - self.retry_wait_time, - self.max_retries, - self.reqwest_client.clone(), - self.accepted.clone(), - )), - ]); - - self.check_website(uri, default_chain).await? - } + _ => self.check_website(uri, credentials).await?, }; Ok(Response::new(uri.clone(), status, source)) @@ -554,7 +540,22 @@ impl Client { /// - The request failed. /// - The response status code is not accepted. /// - The URI cannot be converted to HTTPS. - pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result { + pub async fn check_website( + &self, + uri: &Uri, + credentials: Option, + ) -> Result { + let default_chain: RequestChain = Chain::new(vec![ + Box::::default(), + Box::new(credentials), + Box::new(WebsiteChecker::new( + self.retry_wait_time, + self.max_retries, + self.reqwest_client.clone(), + self.accepted.clone(), + )), + ]); + match self.check_website_inner(uri, &default_chain).await { Status::Ok(code) if self.require_https && uri.scheme() == "http" => { if self @@ -577,6 +578,8 @@ impl Client { /// /// Unsupported schemes will be ignored /// + /// Note: we use `inner` to improve compile times by avoiding monomorphization + /// /// # Errors /// /// This returns an `Err` if @@ -617,7 +620,7 @@ impl Client { // Pull out the heavy machinery in case of a failed normal request. // This could be a GitHub URL and we ran into the rate limiter. - // TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023) + // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023) async fn handle_github(&self, status: Status, uri: &Uri) -> Status { if status.is_success() { return status; @@ -670,6 +673,11 @@ impl Client { Status::Ok(StatusCode::OK) } + /// Checks a `mailto` URI. + pub async fn check_mail(&self, uri: &Uri) -> Status { + self.email_checker.check_mail(uri).await + } + /// Checks a `file` URI's fragment. pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { match self.fragment_checker.check(path, &uri.url).await { @@ -681,33 +689,6 @@ impl Client { } } } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), - /// which are ignored by this check. The are not part of the mail address - /// and instead passed to a mail client. - #[cfg(all(feature = "email-check", feature = "native-tls"))] - pub async fn check_mail(&self, uri: &Uri) -> Status { - let address = uri.url.path().to_string(); - let input = CheckEmailInput::new(address); - let result = &(check_email(&input).await); - - if let Reachable::Invalid = result.is_reachable { - ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() - } else { - Status::Ok(StatusCode::OK) - } - } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// This implementation simply excludes all email addresses. - #[cfg(not(all(feature = "email-check", feature = "native-tls")))] - #[allow(clippy::unused_async)] - pub async fn check_mail(&self, _uri: &Uri) -> Status { - Status::Excluded - } } // Check if the given `Url` would cause `reqwest` to panic. From 0f9bd5cbb099ac8ee6e0b09e6d5e1c3a2c4f5937 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 17 Oct 2024 02:04:49 +0200 Subject: [PATCH 12/14] refactor and fix tests --- lychee-bin/src/commands/check.rs | 10 +- lychee-bin/tests/cli.rs | 6 +- lychee-lib/src/checker/website.rs | 160 +++++++++++++++++++++- lychee-lib/src/client.rs | 211 ++++-------------------------- 4 files changed, 191 insertions(+), 196 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index fd654998c0..2ffc0ed372 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -429,14 +429,12 @@ mod tests { #[tokio::test] async fn test_invalid_url() { - // Run a normal request with an invalid Url let client = ClientBuilder::builder().build().client().unwrap(); - let request = Request::try_from("http://\"").unwrap(); - let response = check_url(&client, request).await; - assert!(response.status().is_error()); + let uri = Uri::try_from("http://\"").unwrap(); + let response = client.check_website(&uri, None).await.unwrap(); assert!(matches!( - response.status(), - Status::Error(ErrorKind::InvalidURI(_)) + response, + Status::Unsupported(ErrorKind::BuildRequestClient(_)) )); } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 261b77b0d7..fcb7530cb7 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -944,13 +944,17 @@ mod cli { // check content of cache file let data = fs::read_to_string(&cache_file)?; + + if data.is_empty() { + println!("Cache file is empty!"); + } + assert!(data.contains(&format!("{}/,200", mock_server_ok.uri()))); assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri()))); assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri()))); // clear the cache file fs::remove_file(&cache_file)?; - Ok(()) } diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 668cf827e2..447de3a8a4 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -1,33 +1,69 @@ use crate::{ - chain::{ChainResult, Handler}, + chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain}, + quirks::Quirks, retry::RetryExt, - Status, + types::uri::github::GithubUri, + BasicAuthCredentials, ErrorKind, Status, Uri, }; use async_trait::async_trait; use http::StatusCode; +use octocrab::Octocrab; use reqwest::Request; use std::{collections::HashSet, time::Duration}; #[derive(Debug, Clone)] pub(crate) struct WebsiteChecker { - retry_wait_time: Duration, - max_retries: u64, + /// Request method used for making requests. + method: reqwest::Method, + + /// The HTTP client used for requests. reqwest_client: reqwest::Client, + + /// GitHub client used for requests. + github_client: Option, + + /// The chain of plugins to be executed on each request. + plugin_request_chain: RequestChain, + + /// Maximum number of retries per request before returning an error. + max_retries: u64, + + /// Initial wait time between retries of failed requests. This doubles after + /// each failure. + retry_wait_time: Duration, + + /// Set of accepted return codes / status codes. + /// + /// Unmatched return codes/ status codes are deemed as errors. accepted: Option>, + + /// Requires using HTTPS when it's available. + /// + /// This would treat unencrypted links as errors when HTTPS is available. + require_https: bool, } impl WebsiteChecker { + #[allow(clippy::too_many_arguments)] pub(crate) const fn new( + method: reqwest::Method, retry_wait_time: Duration, max_retries: u64, reqwest_client: reqwest::Client, accepted: Option>, + github_client: Option, + require_https: bool, + plugin_request_chain: RequestChain, ) -> Self { Self { - retry_wait_time, - max_retries, + method, reqwest_client, + github_client, + plugin_request_chain, + max_retries, + retry_wait_time, accepted, + require_https, } } @@ -36,7 +72,6 @@ impl WebsiteChecker { pub(crate) async fn retry_request(&self, request: Request) -> Status { let mut retries: u64 = 0; let mut wait_time = self.retry_wait_time; - let mut status = self.check_default(clone_unwrap(&request)).await; while retries < self.max_retries { if status.is_success() || !status.should_retry() { @@ -57,6 +92,117 @@ impl WebsiteChecker { Err(e) => e.into(), } } + + /// Checks the given URI of a website. + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The URI is invalid. + /// - The request failed. + /// - The response status code is not accepted. + /// - The URI cannot be converted to HTTPS. + pub(crate) async fn check_website( + &self, + uri: &Uri, + credentials: Option, + ) -> Result { + let default_chain: RequestChain = Chain::new(vec![ + Box::::default(), + Box::new(credentials), + Box::new(self.clone()), + ]); + + match self.check_website_inner(uri, &default_chain).await { + Status::Ok(code) if self.require_https && uri.scheme() == "http" => { + if self + .check_website_inner(&uri.to_https()?, &default_chain) + .await + .is_success() + { + Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?))) + } else { + Ok(Status::Ok(code)) + } + } + s => Ok(s), + } + } + + /// Checks the given URI of a website. + /// + /// Unsupported schemes will be ignored + /// + /// Note: we use `inner` to improve compile times by avoiding monomorphization + /// + /// # Errors + /// + /// This returns an `Err` if + /// - The URI is invalid. + /// - The request failed. + /// - The response status code is not accepted. + async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status { + let request = self + .reqwest_client + .request(self.method.clone(), uri.as_str()) + .build(); + + let request = match request { + Ok(r) => r, + Err(e) => return e.into(), + }; + + let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain]) + .traverse(request) + .await; + + self.handle_github(status, uri).await + } + + // Pull out the heavy machinery in case of a failed normal request. + // This could be a GitHub URL and we ran into the rate limiter. + // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023) + async fn handle_github(&self, status: Status, uri: &Uri) -> Status { + if status.is_success() { + return status; + } + + if let Ok(github_uri) = GithubUri::try_from(uri) { + let status = self.check_github(github_uri).await; + if status.is_success() { + return status; + } + } + + status + } + + /// Check a `uri` hosted on `GitHub` via the GitHub API. + /// + /// # Caveats + /// + /// Files inside private repositories won't get checked and instead would + /// be reported as valid if the repository itself is reachable through the + /// API. + /// + /// A better approach would be to download the file through the API or + /// clone the repo, but we chose the pragmatic approach. + async fn check_github(&self, uri: GithubUri) -> Status { + let Some(client) = &self.github_client else { + return ErrorKind::MissingGitHubToken.into(); + }; + let repo = match client.repos(&uri.owner, &uri.repo).get().await { + Ok(repo) => repo, + Err(e) => return ErrorKind::GithubRequest(e).into(), + }; + if let Some(true) = repo.private { + return Status::Ok(StatusCode::OK); + } else if let Some(endpoint) = uri.endpoint { + return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo)) + .into(); + } + Status::Ok(StatusCode::OK) + } } /// Clones a `reqwest::Request`. diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 7f59c2ce39..cbeab51e1f 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -22,19 +22,17 @@ use http::{ use log::{debug, warn}; use octocrab::Octocrab; use regex::RegexSet; -use reqwest::{header, redirect, Url}; +use reqwest::{header, redirect}; use reqwest_cookie_store::CookieStoreMutex; use secrecy::{ExposeSecret, SecretString}; use typed_builder::TypedBuilder; use crate::{ - chain::{Chain, ClientRequestChains, RequestChain}, + chain::RequestChain, checker::file::FileChecker, checker::{mail::MailChecker, website::WebsiteChecker}, filter::{Excludes, Filter, Includes}, - quirks::Quirks, remap::Remaps, - types::uri::github::GithubUri, utils::fragment_checker::FragmentChecker, Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; @@ -385,24 +383,28 @@ impl ClientBuilder { include_mail: self.include_mail, }; - Ok(Client { + let website_checker = WebsiteChecker::new( + self.method, + self.retry_wait_time, + self.max_retries, reqwest_client, + self.accepted, github_client, + self.require_https, + self.plugin_request_chain, + ); + + Ok(Client { remaps: self.remaps, filter, - max_retries: self.max_retries, - retry_wait_time: self.retry_wait_time, - method: self.method, - accepted: self.accepted, - require_https: self.require_https, - fragment_checker: FragmentChecker::new(), email_checker: MailChecker::new(), + website_checker, file_checker: FileChecker::new( self.base, self.fallback_extensions, self.include_fragments, ), - plugin_request_chain: self.plugin_request_chain, + fragment_checker: FragmentChecker::new(), }) } } @@ -413,48 +415,23 @@ impl ClientBuilder { /// options. #[derive(Debug, Clone)] pub struct Client { - /// Underlying `reqwest` client instance that handles the HTTP requests. - reqwest_client: reqwest::Client, - - /// Optional GitHub client that handles communications with GitHub. - github_client: Option, - /// Optional remapping rules for URIs matching pattern. remaps: Option, /// Rules to decided whether each link should be checked or ignored. filter: Filter, - /// Maximum number of retries per request before returning an error. - max_retries: u64, - - /// Initial wait time between retries of failed requests. This doubles after - /// each failure. - retry_wait_time: Duration, + /// A checker for website URLs. + website_checker: WebsiteChecker, - /// HTTP method used for requests, e.g. `GET` or `HEAD`. - /// - /// The same method will be used for all links. - method: reqwest::Method, - - /// Set of accepted return codes / status codes. - /// - /// Unmatched return codes/ status codes are deemed as errors. - accepted: Option>, + /// A checker for file URLs. + file_checker: FileChecker, - /// Requires using HTTPS when it's available. - /// - /// This would treat unencrypted links as errors when HTTPS is available. - require_https: bool, + /// A checker for email URLs. + email_checker: MailChecker, /// Caches Fragments fragment_checker: FragmentChecker, - - plugin_request_chain: RequestChain, - - file_checker: FileChecker, - - email_checker: MailChecker, } impl Client { @@ -545,132 +522,7 @@ impl Client { uri: &Uri, credentials: Option, ) -> Result { - let default_chain: RequestChain = Chain::new(vec![ - Box::::default(), - Box::new(credentials), - Box::new(WebsiteChecker::new( - self.retry_wait_time, - self.max_retries, - self.reqwest_client.clone(), - self.accepted.clone(), - )), - ]); - - match self.check_website_inner(uri, &default_chain).await { - Status::Ok(code) if self.require_https && uri.scheme() == "http" => { - if self - .check_website_inner(&uri.to_https()?, &default_chain) - .await - .is_success() - { - Ok(Status::Error(ErrorKind::InsecureURL(uri.to_https()?))) - } else { - // HTTPS is not available for this URI, - // so the original HTTP URL is fine. - Ok(Status::Ok(code)) - } - } - s => Ok(s), - } - } - - /// Checks the given URI of a website. - /// - /// Unsupported schemes will be ignored - /// - /// Note: we use `inner` to improve compile times by avoiding monomorphization - /// - /// # Errors - /// - /// This returns an `Err` if - /// - The URI is invalid. - /// - The request failed. - /// - The response status code is not accepted. - pub async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status { - // Workaround for upstream reqwest panic - if validate_url(&uri.url) { - if matches!(uri.scheme(), "http" | "https") { - // This is a truly invalid URI with a known scheme. - // If we pass that to reqwest it would panic. - return Status::Error(ErrorKind::InvalidURI(uri.clone())); - } - // This is merely a URI with a scheme that is not supported by - // reqwest yet. It would be safe to pass that to reqwest and it - // wouldn't panic, but it's also unnecessary, because it would - // simply return an error. - return Status::Unsupported(ErrorKind::InvalidURI(uri.clone())); - } - - let request = self - .reqwest_client - .request(self.method.clone(), uri.as_str()) - .build(); - - let request = match request { - Ok(r) => r, - Err(e) => return e.into(), - }; - - let status = ClientRequestChains::new(vec![&self.plugin_request_chain, default_chain]) - .traverse(request) - .await; - - self.handle_github(status, uri).await - } - - // Pull out the heavy machinery in case of a failed normal request. - // This could be a GitHub URL and we ran into the rate limiter. - // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023) - async fn handle_github(&self, status: Status, uri: &Uri) -> Status { - if status.is_success() { - return status; - } - - if let Ok(github_uri) = GithubUri::try_from(uri) { - let status = self.check_github(github_uri).await; - // Only return GitHub status in case of success - // Otherwise return the original error, which has more information - if status.is_success() { - return status; - } - } - - status - } - - /// Check a `uri` hosted on `GitHub` via the GitHub API. - /// - /// # Caveats - /// - /// Files inside private repositories won't get checked and instead would - /// be reported as valid if the repository itself is reachable through the - /// API. - /// - /// A better approach would be to download the file through the API or - /// clone the repo, but we chose the pragmatic approach. - async fn check_github(&self, uri: GithubUri) -> Status { - let Some(client) = &self.github_client else { - return ErrorKind::MissingGitHubToken.into(); - }; - let repo = match client.repos(&uri.owner, &uri.repo).get().await { - Ok(repo) => repo, - Err(e) => return ErrorKind::GithubRequest(e).into(), - }; - if let Some(true) = repo.private { - // The private repo exists. Assume a given endpoint exists as well - // (e.g. `issues` in `github.com/org/private/issues`). This is not - // always the case but simplifies the check. - return Status::Ok(StatusCode::OK); - } else if let Some(endpoint) = uri.endpoint { - // The URI returned a non-200 status code from a normal request and - // now we find that this public repo is reachable through the API, - // so that must mean the full URI (which includes the additional - // endpoint) must be invalid. - return ErrorKind::InvalidGithubUrl(format!("{}/{}/{endpoint}", uri.owner, uri.repo)) - .into(); - } - // Found public repo without endpoint - Status::Ok(StatusCode::OK) + self.website_checker.check_website(uri, credentials).await } /// Checks a `mailto` URI. @@ -691,16 +543,6 @@ impl Client { } } -// Check if the given `Url` would cause `reqwest` to panic. -// This is a workaround for https://github.com/lycheeverse/lychee/issues/539 -// and can be removed once https://github.com/seanmonstar/reqwest/pull/1399 -// got merged. -// It is exactly the same check that reqwest runs internally, but unfortunately -// it `unwrap`s (and panics!) instead of returning an error, which we could handle. -fn validate_url(url: &Url) -> bool { - http::Uri::try_from(url.as_str()).is_err() -} - /// A shorthand function to check a single URI. /// /// This provides the simplest link check utility without having to create a @@ -740,7 +582,7 @@ mod tests { chain::{ChainResult, Handler, RequestChain}, mock_server, test_utils::get_mock_client_response, - Request, Status, Uri, + ErrorKind, Request, Status, Uri, }; #[tokio::test] @@ -989,9 +831,14 @@ mod tests { #[tokio::test] async fn test_avoid_reqwest_panic() { let client = ClientBuilder::builder().build().client().unwrap(); - // This request will fail, but it won't panic + // This request will result in an Unsupported status, but it won't panic let res = client.check("http://\"").await.unwrap(); - assert!(res.status().is_error()); + + assert!(matches!( + res.status(), + Status::Unsupported(ErrorKind::BuildRequestClient(_)) + )); + assert!(res.status().is_unsupported()); } #[tokio::test] From 384b15f340d2ba009a6c5765a1fddb9d297c3df5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 17 Oct 2024 18:03:59 +0200 Subject: [PATCH 13/14] fix lint --- lychee-lib/src/checker/file.rs | 89 +++++++++++++++++++++++++++++----- lychee-lib/src/checker/mail.rs | 22 +++++---- 2 files changed, 89 insertions(+), 22 deletions(-) diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index f972bee09a..0060bfbde0 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -6,25 +6,29 @@ use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, U /// A utility for checking the existence and validity of file-based URIs. /// -/// `FileChecker` is responsible for resolving and validating file paths, -/// handling both absolute and relative paths. It supports base path resolution, -/// fallback extensions for files without extensions, and optional fragment checking. -/// -/// This creates a `FileChecker` with a base path, fallback extensions for HTML files, -/// and fragment checking enabled. +/// `FileChecker` resolves and validates file paths, handling both absolute and relative paths. +/// It supports base path resolution, fallback extensions for files without extensions, +/// and optional fragment checking for HTML files. #[derive(Debug, Clone)] pub(crate) struct FileChecker { - /// An optional base path or URL used for resolving relative paths. + /// Base path or URL used for resolving relative paths. base: Option, - /// A list of file extensions to try if the original path doesn't exist. + /// List of file extensions to try if the original path doesn't exist. fallback_extensions: Vec, - /// Whether to check for the existence of fragments (e.g., #section-id) in HTML files. + /// Whether to check for the existence of fragments (e.g., `#section-id`) in HTML files. include_fragments: bool, - /// A utility for performing fragment checks in HTML files. + /// Utility for performing fragment checks in HTML files. fragment_checker: FragmentChecker, } impl FileChecker { + /// Creates a new `FileChecker` with the given configuration. + /// + /// # Arguments + /// + /// * `base` - Optional base path or URL for resolving relative paths. + /// * `fallback_extensions` - List of extensions to try if the original file is not found. + /// * `include_fragments` - Whether to check for fragment existence in HTML files. pub(crate) fn new( base: Option, fallback_extensions: Vec, @@ -38,6 +42,18 @@ impl FileChecker { } } + /// Checks the given file URI for existence and validity. + /// + /// This method resolves the URI to a file path, checks if the file exists, + /// and optionally checks for the existence of fragments in HTML files. + /// + /// # Arguments + /// + /// * `uri` - The URI to check. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. pub(crate) async fn check(&self, uri: &Uri) -> Status { let Ok(path) = uri.url.to_file_path() else { return ErrorKind::InvalidFilePath(uri.clone()).into(); @@ -47,13 +63,20 @@ impl FileChecker { self.check_path(&resolved_path, uri).await } + /// Resolves the given path using the base path, if one is set. + /// + /// # Arguments + /// + /// * `path` - The path to resolve. + /// + /// # Returns + /// + /// Returns the resolved path as a `PathBuf`. fn resolve_path(&self, path: &Path) -> PathBuf { if let Some(Base::Local(base_path)) = &self.base { if path.is_absolute() { let absolute_base_path = if base_path.is_relative() { - std::env::current_dir() - .unwrap_or_else(|_| PathBuf::new()) - .join(base_path) + std::env::current_dir().unwrap_or_default().join(base_path) } else { base_path.clone() }; @@ -68,6 +91,16 @@ impl FileChecker { } } + /// Checks if the given path exists and performs additional checks if necessary. + /// + /// # Arguments + /// + /// * `path` - The path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. async fn check_path(&self, path: &Path, uri: &Uri) -> Status { if path.exists() { return self.check_existing_path(path, uri).await; @@ -76,6 +109,16 @@ impl FileChecker { self.check_with_fallback_extensions(path, uri).await } + /// Checks an existing path, optionally verifying fragments for HTML files. + /// + /// # Arguments + /// + /// * `path` - The path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. async fn check_existing_path(&self, path: &Path, uri: &Uri) -> Status { if self.include_fragments { self.check_fragment(path, uri).await @@ -84,6 +127,16 @@ impl FileChecker { } } + /// Attempts to find a file by trying different extensions specified in `fallback_extensions`. + /// + /// # Arguments + /// + /// * `path` - The original path to check. + /// * `uri` - The original URI, used for error reporting. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the check. async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status { let mut path_buf = path.to_path_buf(); @@ -103,6 +156,16 @@ impl FileChecker { ErrorKind::InvalidFilePath(uri.clone()).into() } + /// Checks for the existence of a fragment in an HTML file. + /// + /// # Arguments + /// + /// * `path` - The path to the HTML file. + /// * `uri` - The original URI, containing the fragment to check. + /// + /// # Returns + /// + /// Returns a `Status` indicating the result of the fragment check. async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { match self.fragment_checker.check(path, &uri.url).await { Ok(true) => Status::Ok(StatusCode::OK), diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs index 1c3954568a..e7dcef7b8d 100644 --- a/lychee-lib/src/checker/mail.rs +++ b/lychee-lib/src/checker/mail.rs @@ -1,6 +1,11 @@ -use crate::{ErrorKind, Status, Uri}; +#[cfg(all(feature = "email-check", feature = "native-tls"))] use http::StatusCode; +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use crate::ErrorKind; + +use crate::{Status, Uri}; + #[cfg(all(feature = "email-check", feature = "native-tls"))] use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; @@ -26,16 +31,15 @@ impl MailChecker { /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), /// which are ignored by this check. They are not part of the mail address /// and instead passed to a mail client. + #[cfg(all(feature = "email-check", feature = "native-tls"))] pub(crate) async fn check_mail(&self, uri: &Uri) -> Status { - #[cfg(all(feature = "email-check", feature = "native-tls"))] - { - self.perform_email_check(uri).await - } + self.perform_email_check(uri).await + } - #[cfg(not(all(feature = "email-check", feature = "native-tls")))] - { - Status::Excluded - } + /// Ignore the mail check if the `email-check` and `native-tls` features are not enabled. + #[cfg(not(all(feature = "email-check", feature = "native-tls")))] + pub(crate) async fn check_mail(&self, _uri: &Uri) -> Status { + Status::Excluded } #[cfg(all(feature = "email-check", feature = "native-tls"))] From 76e406805f633c3c0e75419e611c2b667ada5d70 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sat, 26 Oct 2024 03:46:34 +0200 Subject: [PATCH 14/14] Refactor error handling in WebsiteChecker --- lychee-lib/src/checker/website.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 447de3a8a4..62cadaff11 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -193,7 +193,7 @@ impl WebsiteChecker { }; let repo = match client.repos(&uri.owner, &uri.repo).get().await { Ok(repo) => repo, - Err(e) => return ErrorKind::GithubRequest(e).into(), + Err(e) => return ErrorKind::GithubRequest(Box::new(e)).into(), }; if let Some(true) = repo.private { return Status::Ok(StatusCode::OK);