Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow excluding cache based on status code #1403

Merged
merged 14 commits into from
Oct 14, 2024
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,22 @@ Options:

[default: 1d]

--cache-exclude-status <CACHE_EXCLUDE_STATUS>
A list of status codes that will be excluded from the cache

The following range syntax is supported: [start]..[=]end|code. Some valid
examples are:

- 429
- 500..=599
- ..500

Use "lychee --cache-exclude-status '429, 500..=502' <inputs>..." to provide a comma-separated
list of excluded status codes. This example will not cache results with a status code of 429, 500,
501 and 502.

[default: ]

--dump
Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked

Expand Down
94 changes: 89 additions & 5 deletions lychee-bin/src/commands/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use reqwest::Url;
use tokio::sync::mpsc;
use tokio_stream::wrappers::ReceiverStream;

use lychee_lib::{Client, ErrorKind, Request, Response};
use lychee_lib::{Client, ErrorKind, Request, Response, Uri};
use lychee_lib::{InputSource, Result};
use lychee_lib::{ResponseBody, Status};

Expand Down Expand Up @@ -46,6 +46,7 @@ where

let client = params.client;
let cache = params.cache;
let cache_exclude_status = params.cfg.cache_exclude_status.into_set();
let accept = params.cfg.accept.into_set();

let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info {
Expand All @@ -61,6 +62,7 @@ where
max_concurrency,
client,
cache,
cache_exclude_status,
accept,
));

Expand Down Expand Up @@ -219,14 +221,22 @@ async fn request_channel_task(
max_concurrency: usize,
client: Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
accept: HashSet<u16>,
) {
StreamExt::for_each_concurrent(
ReceiverStream::new(recv_req),
max_concurrency,
|request: Result<Request>| async {
let request = request.expect("cannot read request");
let response = handle(&client, cache.clone(), request, accept.clone()).await;
let response = handle(
&client,
cache.clone(),
cache_exclude_status.clone(),
request,
accept.clone(),
)
.await;

send_resp
.send(response)
Expand Down Expand Up @@ -260,6 +270,7 @@ async fn check_url(client: &Client, request: Request) -> Response {
async fn handle(
client: &Client,
cache: Arc<Cache>,
cache_exclude_status: HashSet<u16>,
request: Request,
accept: HashSet<u16>,
) -> Response {
Expand Down Expand Up @@ -287,16 +298,37 @@ async fn handle(
// benefit.
// - Skip caching unsupported URLs as they might be supported in a
// future run.
// - Skip caching excluded links; they might not be excluded in the next run
// - Skip caching excluded links; they might not be excluded in the next run.
// - Skip caching links for which the status code has been explicitly excluded from the cache.
let status = response.status();
if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
if ignore_cache(&uri, status, &cache_exclude_status) {
return response;
}

cache.insert(uri, status.into());
response
}

/// Decides whether a response must be kept out of the cache.
///
/// Returns `true` (meaning: do not cache) when any of the following holds:
/// - `uri` is a file URI,
/// - `status` is excluded, unsupported, or unknown,
/// - `status` carries a status code that is listed in `cache_exclude_status`.
///
/// A status without a status code can only be skipped through the first
/// two rules; the exclusion set is never consulted for it.
fn ignore_cache(uri: &Uri, status: &Status, cache_exclude_status: &HashSet<u16>) -> bool {
    // Rules that do not depend on the user-provided exclusion set.
    if uri.is_file() || status.is_excluded() || status.is_unsupported() || status.is_unknown() {
        return true;
    }

    // Only statuses that expose a code can match the exclusion set.
    match status.code() {
        Some(code) => cache_exclude_status.contains(&code.as_u16()),
        None => false,
    }
}

fn show_progress(
output: &mut dyn Write,
progress_bar: &Option<ProgressBar>,
Expand Down Expand Up @@ -344,8 +376,9 @@ fn get_failed_urls(stats: &mut ResponseStats) -> Vec<(InputSource, Url)> {
#[cfg(test)]
mod tests {
use crate::{formatters::get_response_formatter, options};
use http::StatusCode;
use log::info;
use lychee_lib::{CacheStatus, ClientBuilder, InputSource, Uri};
use lychee_lib::{CacheStatus, ClientBuilder, ErrorKind, InputSource, Uri};

use super::*;

Expand Down Expand Up @@ -406,4 +439,55 @@ mod tests {
Status::Error(ErrorKind::InvalidURI(_))
));
}

#[test]
fn test_cache_by_default() {
    // A successful response for a non-file URI is cached when nothing is excluded.
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Ok(StatusCode::OK);
    let no_exclusions: HashSet<u16> = HashSet::default();

    assert!(!ignore_cache(&uri, &status, &no_exclusions));
}

#[test]
fn test_cache_ignore_file_urls() {
    // File URIs bypass the cache even when the status itself is fine.
    let uri = Uri::try_from("file:///home").unwrap();
    let status = Status::Ok(StatusCode::OK);
    let no_exclusions: HashSet<u16> = HashSet::default();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

#[test]
fn test_cache_ignore_unsupported_status() {
    // An unsupported status bypasses the cache regardless of the exclusion set.
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Unsupported(ErrorKind::EmptyUrl);
    let no_exclusions: HashSet<u16> = HashSet::default();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

#[test]
fn test_cache_ignore_unknown_status() {
    // An unknown status code bypasses the cache regardless of the exclusion set.
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::UnknownStatusCode(StatusCode::IM_A_TEAPOT);
    let no_exclusions: HashSet<u16> = HashSet::default();

    assert!(ignore_cache(&uri, &status, &no_exclusions));
}

#[test]
fn test_cache_ignore_excluded_status() {
    // A status code listed in the exclusion set bypasses the cache,
    // even though the same response would be cached by default.
    let uri = Uri::try_from("https://[::1]").unwrap();
    let status = Status::Ok(StatusCode::OK);
    let exclude: HashSet<u16> = HashSet::from([StatusCode::OK.as_u16()]);

    assert!(ignore_cache(&uri, &status, &exclude));
}
}
45 changes: 39 additions & 6 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ use clap::builder::PossibleValuesParser;
use clap::{arg, builder::TypedValueParser, Parser};
use const_format::{concatcp, formatcp};
use lychee_lib::{
AcceptSelector, Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES,
DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
Base, BasicAuthSelector, Input, StatusCodeExcluder, StatusCodeSelector, DEFAULT_MAX_REDIRECTS,
DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT,
};
use secrecy::{ExposeSecret, SecretString};
use serde::Deserialize;
Expand Down Expand Up @@ -145,7 +145,8 @@ default_function! {
retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS;
method: String = DEFAULT_METHOD.to_string();
verbosity: Verbosity = Verbosity::default();
accept_selector: AcceptSelector = AcceptSelector::default();
cache_exclude_selector: StatusCodeExcluder = StatusCodeExcluder::new();
accept_selector: StatusCodeSelector = StatusCodeSelector::default();
}

// Macro for merging configuration values
Expand Down Expand Up @@ -231,6 +232,26 @@ pub(crate) struct Config {
#[serde(with = "humantime_serde")]
pub(crate) max_cache_age: Duration,

/// A list of status codes that will be excluded from the cache
#[arg(
long,
default_value_t,
long_help = "A list of status codes that will be ignored from the cache

The following accept range syntax is supported: [start]..[=]end|code. Some valid
examples are:

- 429
- 500..=599
- 500..

Use \"lychee --cache-exclude-status '429, 500..502' <inputs>...\" to provide a comma- separated
list of excluded status codes. This example will not cache results with a status code of 429, 500,
501 and 502."
)]
#[serde(default = "cache_exclude_selector")]
pub(crate) cache_exclude_status: StatusCodeExcluder,

/// Don't perform any link checking.
/// Instead, dump all the links extracted from inputs that would be checked
#[arg(long)]
Expand Down Expand Up @@ -394,7 +415,7 @@ separated list of accepted status codes. This example will accept 200, 201,
202, 203, 204, 429, and 500 as valid status codes."
)]
#[serde(default = "accept_selector")]
pub(crate) accept: AcceptSelector,
pub(crate) accept: StatusCodeSelector,

/// Enable the checking of fragments in links.
#[arg(long)]
Expand Down Expand Up @@ -509,6 +530,7 @@ impl Config {
max_retries: DEFAULT_MAX_RETRIES;
max_concurrency: DEFAULT_MAX_CONCURRENCY;
max_cache_age: humantime::parse_duration(DEFAULT_MAX_CACHE_AGE).unwrap();
cache_exclude_status: StatusCodeExcluder::default();
threads: None;
user_agent: DEFAULT_USER_AGENT;
insecure: false;
Expand Down Expand Up @@ -538,7 +560,7 @@ impl Config {
require_https: false;
cookie_jar: None;
include_fragments: false;
accept: AcceptSelector::default();
accept: StatusCodeSelector::default();
}

if self
Expand All @@ -564,7 +586,7 @@ mod tests {
#[test]
fn test_accept_status_codes() {
let toml = Config {
accept: AcceptSelector::from_str("200..=204, 429, 500").unwrap(),
accept: StatusCodeSelector::from_str("200..=204, 429, 500").unwrap(),
..Default::default()
};

Expand All @@ -577,4 +599,15 @@ mod tests {
assert!(cli.accept.contains(204));
assert!(!cli.accept.contains(205));
}

#[test]
fn test_default() {
    // The default configuration accepts 100..=103 and 200..=299 and
    // excludes no status code from the cache.
    let config = Config::default();
    let expected_accept =
        StatusCodeSelector::from_str("100..=103,200..=299").expect("no error");
    let expected_cache_exclude = StatusCodeExcluder::new();

    assert_eq!(config.accept, expected_accept);
    assert_eq!(config.cache_exclude_status, expected_cache_exclude);
}
}
59 changes: 59 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -895,6 +895,65 @@ mod cli {
Ok(())
}

// Checks that `--cache-exclude-status` keeps the listed status codes out of
// the cache file: with `204,429` excluded, only the 200 response is expected
// to show up in the cache after a run with `--cache`.
#[tokio::test]
async fn test_lycheecache_exclude_custom_status_codes() -> Result<()> {
let base_path = fixtures_path().join("cache");
let cache_file = base_path.join(LYCHEE_CACHE_FILE);

// Unconditionally remove cache file if it exists
// (the result is ignored on purpose: a missing file is not an error here)
let _ = fs::remove_file(&cache_file);

// One mock server per status code under test: 200, 204 and 429.
let mock_server_ok = mock_server!(StatusCode::OK);
let mock_server_no_content = mock_server!(StatusCode::NO_CONTENT);
let mock_server_too_many_requests = mock_server!(StatusCode::TOO_MANY_REQUESTS);

// Input file listing the three mock server URLs, one per line.
let dir = tempfile::tempdir()?;
let mut file = File::create(dir.path().join("c.md"))?;

writeln!(file, "{}", mock_server_ok.uri().as_str())?;
writeln!(file, "{}", mock_server_no_content.uri().as_str())?;
writeln!(file, "{}", mock_server_too_many_requests.uri().as_str())?;

// Run from the fixture directory with caching enabled and 204/429
// excluded from the cache.
let mut cmd = main_command();
let test_cmd = cmd
.current_dir(&base_path)
.arg(dir.path().join("c.md"))
.arg("--verbose")
.arg("--no-progress")
.arg("--cache")
.arg("--cache-exclude-status")
.arg("204,429");

assert!(
!cache_file.exists(),
"cache file should not exist before this test"
);

// Run with `--cache` while no cache file exists yet, so this run creates it.
// All three links are still expected in the output: 200 and 204 as
// successes, 429 as a failure.
test_cmd
.assert()
.stderr(contains(format!("[200] {}/\n", mock_server_ok.uri())))
.stderr(contains(format!(
"[204] {}/ | OK (204 No Content): No Content\n",
mock_server_no_content.uri()
)))
.stderr(contains(format!(
"[429] {}/ | Failed: Network error: Too Many Requests\n",
mock_server_too_many_requests.uri()
)));

// check content of cache file: entries are matched as "<uri>/,<status code>";
// only the 200 entry may be present, the excluded 204 and 429 must be absent
let data = fs::read_to_string(&cache_file)?;
assert!(data.contains(&format!("{}/,200", mock_server_ok.uri())));
assert!(!data.contains(&format!("{}/,204", mock_server_no_content.uri())));
assert!(!data.contains(&format!("{}/,429", mock_server_too_many_requests.uri())));

// clear the cache file
// NOTE(review): this is only reached when every assertion above passed;
// a failing run leaves the file behind until the removal at the top of the next run.
fs::remove_file(&cache_file)?;

Ok(())
}

#[tokio::test]
async fn test_lycheecache_accept_custom_status_codes() -> Result<()> {
let base_path = fixtures_path().join("cache_accept_custom_status_codes");
Expand Down
5 changes: 3 additions & 2 deletions lychee-lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ pub use crate::{
collector::Collector,
filter::{Excludes, Filter, Includes},
types::{
uri::valid::Uri, AcceptRange, AcceptRangeError, AcceptSelector, Base, BasicAuthCredentials,
uri::valid::Uri, AcceptRange, AcceptRangeError, Base, BasicAuthCredentials,
BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent,
InputSource, Request, Response, ResponseBody, Result, Status,
InputSource, Request, Response, ResponseBody, Result, Status, StatusCodeExcluder,
StatusCodeSelector,
},
};
2 changes: 0 additions & 2 deletions lychee-lib/src/types/accept/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
mod range;
mod selector;

pub use range::*;
pub use selector::*;
4 changes: 2 additions & 2 deletions lychee-lib/src/types/accept/range.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ use thiserror::Error;
static RANGE_PATTERN: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^([0-9]{3})?\.\.(=?)([0-9]{3})+$|^([0-9]{3})$").unwrap());

/// The [`AcceptRangeParseError`] indicates that the parsing process of an
/// [`AcceptRange`] from a string failed due to various underlying reasons.
/// Indicates that the parsing process of an [`AcceptRange`] from a string
/// failed due to various underlying reasons.
#[derive(Debug, Error, PartialEq)]
pub enum AcceptRangeError {
/// The string input didn't contain any range pattern.
Expand Down
Loading
Loading