diff --git a/lychee-bin/src/formatters/response/color.rs b/lychee-bin/src/formatters/response/color.rs index 0a84f2a393..9aa12df40f 100644 --- a/lychee-bin/src/formatters/response/color.rs +++ b/lychee-bin/src/formatters/response/color.rs @@ -65,13 +65,19 @@ mod tests { } } + #[cfg(test)] + /// Helper function to strip ANSI color codes for tests + fn strip_ansi_codes(s: &str) -> String { + console::strip_ansi_codes(s).to_string() + } + #[test] fn test_format_response_with_ok_status() { let formatter = ColorFormatter; let body = mock_response_body(Status::Ok(StatusCode::OK), "https://example.com"); assert_eq!( - formatter.format_response(&body), - "\u{1b}[38;5;2m\u{1b}[1m [200]\u{1b}[0m https://example.com/" + strip_ansi_codes(&formatter.format_response(&body)), + " [200] https://example.com/" ); } @@ -83,8 +89,8 @@ mod tests { "https://example.com/404", ); assert_eq!( - formatter.format_response(&body), - "\u{1b}[38;5;197m [ERROR]\u{1b}[0m https://example.com/404" + strip_ansi_codes(&formatter.format_response(&body)), + " [ERROR] https://example.com/404" ); } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 9dbae37d34..261b77b0d7 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -266,8 +266,6 @@ mod cli { #[test] fn test_resolve_paths() { - // TODO: Refactor the code to clean up base path handling - let mut cmd = main_command(); let dir = fixtures_path().join("resolve_paths"); diff --git a/lychee-lib/src/checker/file.rs b/lychee-lib/src/checker/file.rs index d0f1f6c3a2..f972bee09a 100644 --- a/lychee-lib/src/checker/file.rs +++ b/lychee-lib/src/checker/file.rs @@ -1,13 +1,26 @@ -use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; use http::StatusCode; use log::warn; use std::path::{Path, PathBuf}; +use crate::{utils::fragment_checker::FragmentChecker, Base, ErrorKind, Status, Uri}; + +/// A utility for checking the existence and validity of file-based URIs. 
+/// +/// `FileChecker` is responsible for resolving and validating file paths, +/// handling both absolute and relative paths. It supports base path resolution, +/// fallback extensions for paths that do not exist as given, and optional fragment checking. +/// +/// A typical `FileChecker` is configured with a base path, fallback extensions for HTML files, +/// and fragment checking enabled. #[derive(Debug, Clone)] pub(crate) struct FileChecker { + /// An optional base path or URL used for resolving relative paths. base: Option, + /// A list of file extensions to try if the original path doesn't exist. fallback_extensions: Vec, + /// Whether to check for the existence of fragments (e.g., #section-id) in HTML files. include_fragments: bool, + /// A utility for performing fragment checks in HTML files. fragment_checker: FragmentChecker, } @@ -30,23 +43,28 @@ impl FileChecker { return ErrorKind::InvalidFilePath(uri.clone()).into(); }; - if path.is_absolute() { - let resolved_path = self.resolve_absolute_path(&path); - return self.check_resolved_path(&resolved_path, uri).await; - } - - self.check_path(&path, uri).await + let resolved_path = self.resolve_path(&path); + self.check_path(&resolved_path, uri).await } - async fn check_resolved_path(&self, path: &Path, uri: &Uri) -> Status { - if path.exists() { - if self.include_fragments { - self.check_fragment(path, uri).await + fn resolve_path(&self, path: &Path) -> PathBuf { + if let Some(Base::Local(base_path)) = &self.base { + if path.is_absolute() { + let absolute_base_path = if base_path.is_relative() { + std::env::current_dir() + .unwrap_or_else(|_| PathBuf::new()) + .join(base_path) + } else { + base_path.clone() + }; + + let stripped = path.strip_prefix("/").unwrap_or(path); + absolute_base_path.join(stripped) } else { - Status::Ok(StatusCode::OK) + base_path.join(path) } } else { - ErrorKind::InvalidFilePath(uri.clone()).into() + path.to_path_buf() } } @@ -55,10 +73,6 @@ impl FileChecker { return self.check_existing_path(path, 
uri).await; } - if path.extension().is_some() { - return ErrorKind::InvalidFilePath(uri.clone()).into(); - } - self.check_with_fallback_extensions(path, uri).await } @@ -72,30 +86,21 @@ impl FileChecker { async fn check_with_fallback_extensions(&self, path: &Path, uri: &Uri) -> Status { let mut path_buf = path.to_path_buf(); + + // If the path already has an extension, try it first + if path_buf.extension().is_some() && path_buf.exists() { + return self.check_existing_path(&path_buf, uri).await; + } + + // Try fallback extensions for ext in &self.fallback_extensions { path_buf.set_extension(ext); if path_buf.exists() { return self.check_existing_path(&path_buf, uri).await; } } - ErrorKind::InvalidFilePath(uri.clone()).into() - } - - fn resolve_absolute_path(&self, path: &Path) -> PathBuf { - if let Some(Base::Local(base_path)) = &self.base { - let absolute_base_path = if base_path.is_relative() { - std::env::current_dir() - .unwrap_or_else(|_| PathBuf::new()) - .join(base_path) - } else { - base_path.to_path_buf() - }; - let stripped = path.strip_prefix("/").unwrap_or(path); - absolute_base_path.join(stripped) - } else { - path.to_path_buf() - } + ErrorKind::InvalidFilePath(uri.clone()).into() } async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { diff --git a/lychee-lib/src/checker/mail.rs b/lychee-lib/src/checker/mail.rs new file mode 100644 index 0000000000..1c3954568a --- /dev/null +++ b/lychee-lib/src/checker/mail.rs @@ -0,0 +1,53 @@ +use crate::{ErrorKind, Status, Uri}; +use http::StatusCode; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; + +#[cfg(all(feature = "email-check", feature = "native-tls"))] +use crate::types::mail; + +/// A utility for checking the validity of email addresses. +/// +/// `MailChecker` is responsible for validating email addresses, +/// optionally performing reachability checks when the appropriate +/// features are enabled. 
+#[derive(Debug, Clone)] +pub(crate) struct MailChecker {} + +impl MailChecker { + /// Creates a new `MailChecker`. + pub(crate) const fn new() -> Self { + Self {} + } + + /// Check a mail address, or equivalently a `mailto` URI. + /// + /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), + /// which are ignored by this check. They are not part of the mail address + /// and are instead passed to a mail client. + pub(crate) async fn check_mail(&self, uri: &Uri) -> Status { + #[cfg(all(feature = "email-check", feature = "native-tls"))] + { + self.perform_email_check(uri).await + } + + #[cfg(not(all(feature = "email-check", feature = "native-tls")))] + { + Status::Excluded + } + } + + #[cfg(all(feature = "email-check", feature = "native-tls"))] + async fn perform_email_check(&self, uri: &Uri) -> Status { + let address = uri.url.path().to_string(); + let input = CheckEmailInput::new(address); + let result = &(check_email(&input).await); + + if let Reachable::Invalid = result.is_reachable { + ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() + } else { + Status::Ok(StatusCode::OK) + } + } +} diff --git a/lychee-lib/src/checker/mod.rs b/lychee-lib/src/checker/mod.rs index eaf206e21a..bfbef9de51 100644 --- a/lychee-lib/src/checker/mod.rs +++ b/lychee-lib/src/checker/mod.rs @@ -1,7 +1,7 @@ //! Checker Module //! //! This module contains all checkers, which are responsible for checking the status of a URL. -//! Each checker implements [Handler](crate::chain::Handler). 
pub(crate) mod file; +pub(crate) mod mail; pub(crate) mod website; diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index f2cffd4298..668cf827e2 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -9,14 +9,14 @@ use reqwest::Request; use std::{collections::HashSet, time::Duration}; #[derive(Debug, Clone)] -pub(crate) struct Checker { +pub(crate) struct WebsiteChecker { retry_wait_time: Duration, max_retries: u64, reqwest_client: reqwest::Client, accepted: Option>, } -impl Checker { +impl WebsiteChecker { pub(crate) const fn new( retry_wait_time: Duration, max_retries: u64, @@ -73,8 +73,8 @@ fn clone_unwrap(request: &Request) -> Request { } #[async_trait] -impl Handler for Checker { +impl Handler for WebsiteChecker { async fn handle(&mut self, input: Request) -> ChainResult { ChainResult::Done(self.retry_request(input).await) } -} \ No newline at end of file +} diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index c78d9f863f..7f59c2ce39 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -15,8 +15,6 @@ )] use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use check_if_email_exists::{check_email, CheckEmailInput, Reachable}; use http::{ header::{HeaderMap, HeaderValue}, StatusCode, @@ -32,18 +30,15 @@ use typed_builder::TypedBuilder; use crate::{ chain::{Chain, ClientRequestChains, RequestChain}, checker::file::FileChecker, - checker::website::Checker, + checker::{mail::MailChecker, website::WebsiteChecker}, filter::{Excludes, Filter, Includes}, quirks::Quirks, remap::Remaps, types::uri::github::GithubUri, utils::fragment_checker::FragmentChecker, - Base, ErrorKind, Request, Response, Result, Status, Uri, + Base, BasicAuthCredentials, ErrorKind, Request, Response, Result, Status, Uri, }; -#[cfg(all(feature = "email-check", feature = "native-tls"))] -use crate::types::mail; - 
/// Default number of redirects before a request is deemed as failed, 5. pub const DEFAULT_MAX_REDIRECTS: usize = 5; /// Default number of retries before a request is deemed as failed, 3. @@ -401,6 +396,7 @@ impl ClientBuilder { accepted: self.accepted, require_https: self.require_https, fragment_checker: FragmentChecker::new(), + email_checker: MailChecker::new(), file_checker: FileChecker::new( self.base, self.fallback_extensions, @@ -457,6 +453,8 @@ pub struct Client { plugin_request_chain: RequestChain, file_checker: FileChecker, + + email_checker: MailChecker, } impl Client { @@ -500,23 +498,11 @@ impl Client { } let status = match uri.scheme() { + // We don't check tel: URIs + _ if uri.is_tel() => Status::Excluded, _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, - _ if uri.is_tel() => Status::Excluded, - _ => { - let default_chain: RequestChain = Chain::new(vec![ - Box::::default(), - Box::new(credentials), - Box::new(Checker::new( - self.retry_wait_time, - self.max_retries, - self.reqwest_client.clone(), - self.accepted.clone(), - )), - ]); - - self.check_website(uri, default_chain).await? - } + _ => self.check_website(uri, credentials).await?, }; Ok(Response::new(uri.clone(), status, source)) @@ -554,7 +540,22 @@ impl Client { /// - The request failed. /// - The response status code is not accepted. /// - The URI cannot be converted to HTTPS. 
- pub async fn check_website(&self, uri: &Uri, default_chain: RequestChain) -> Result { + pub async fn check_website( + &self, + uri: &Uri, + credentials: Option, + ) -> Result { + let default_chain: RequestChain = Chain::new(vec![ + Box::::default(), + Box::new(credentials), + Box::new(WebsiteChecker::new( + self.retry_wait_time, + self.max_retries, + self.reqwest_client.clone(), + self.accepted.clone(), + )), + ]); + match self.check_website_inner(uri, &default_chain).await { Status::Ok(code) if self.require_https && uri.scheme() == "http" => { if self @@ -577,6 +578,8 @@ impl Client { /// /// Unsupported schemes will be ignored /// + /// Note: we use `inner` to improve compile times by avoiding monomorphization + /// /// # Errors /// /// This returns an `Err` if @@ -617,7 +620,7 @@ impl Client { // Pull out the heavy machinery in case of a failed normal request. // This could be a GitHub URL and we ran into the rate limiter. - // TODO: We should first try to parse the URI as GitHub URI first (Lucius, Jan 2023) + // TODO: We should try to parse the URI as GitHub URI first (Lucius, Jan 2023) async fn handle_github(&self, status: Status, uri: &Uri) -> Status { if status.is_success() { return status; @@ -670,6 +673,11 @@ impl Client { Status::Ok(StatusCode::OK) } + /// Checks a `mailto` URI. + pub async fn check_mail(&self, uri: &Uri) -> Status { + self.email_checker.check_mail(uri).await + } + /// Checks a `file` URI's fragment. pub async fn check_fragment(&self, path: &Path, uri: &Uri) -> Status { match self.fragment_checker.check(path, &uri.url).await { @@ -681,33 +689,6 @@ impl Client { } } } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// URIs may contain query parameters (e.g. `contact@example.com?subject="Hello"`), - /// which are ignored by this check. The are not part of the mail address - /// and instead passed to a mail client. 
- #[cfg(all(feature = "email-check", feature = "native-tls"))] - pub async fn check_mail(&self, uri: &Uri) -> Status { - let address = uri.url.path().to_string(); - let input = CheckEmailInput::new(address); - let result = &(check_email(&input).await); - - if let Reachable::Invalid = result.is_reachable { - ErrorKind::UnreachableEmailAddress(uri.clone(), mail::error_from_output(result)).into() - } else { - Status::Ok(StatusCode::OK) - } - } - - /// Check a mail address, or equivalently a `mailto` URI. - /// - /// This implementation simply excludes all email addresses. - #[cfg(not(all(feature = "email-check", feature = "native-tls")))] - #[allow(clippy::unused_async)] - pub async fn check_mail(&self, _uri: &Uri) -> Status { - Status::Excluded - } } // Check if the given `Url` would cause `reqwest` to panic.