Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle relative links #1481

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1569,4 +1569,46 @@ mod cli {
.success()
.stdout(contains("0 Errors"));
}

/// Test relative paths
///
/// Imagine a web server hosting a site with the following structure:
/// root
/// └── test
/// ├── index.html
/// └── next.html
///
/// where `root/test/index.html` contains `<a href="next.html">next</a>`
/// When checking the link in `root/test/index.html` we should be able to
/// resolve the relative path to `root/test/next.html`
///
/// Note that the relative path is not resolved to the root of the server
/// but relative to the file that contains the link.
#[tokio::test]
async fn test_resolve_relative_paths_in_subfolder() -> Result<()> {
let mock_server = wiremock::MockServer::start().await;

let body = r#"<a href="next.html">next</a>"#;
wiremock::Mock::given(wiremock::matchers::method("GET"))
.and(wiremock::matchers::path("/test/index.html"))
.respond_with(wiremock::ResponseTemplate::new(200).set_body_string(body))
.mount(&mock_server)
.await;

wiremock::Mock::given(wiremock::matchers::method("GET"))
.and(wiremock::matchers::path("/test/next.html"))
.respond_with(wiremock::ResponseTemplate::new(200))
.mount(&mock_server)
.await;

let mut cmd = main_command();
cmd.arg("--verbose")
.arg(format!("{}/test/index.html", mock_server.uri()))
.assert()
.success()
.stdout(contains("1 Total"))
.stdout(contains("0 Errors"));

Ok(())
}
}
76 changes: 68 additions & 8 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::InputSource;
use crate::{
basic_auth::BasicAuthExtractor, extract::Extractor, types::uri::raw::RawUri, utils::request,
Base, Input, Request, Result,
Expand Down Expand Up @@ -81,22 +82,29 @@ impl Collector {
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
let skip_missing_inputs = self.skip_missing_inputs;
let base = self.base;
stream::iter(inputs)
.par_then_unordered(None, move |input| async move {
input.get_contents(skip_missing_inputs)
.par_then_unordered(None, move |input| {
let base = match input.source {
InputSource::RemoteUrl(ref url) => {
// If the URL is a cannot-be-a-base URL (i.e. it's a relative URL),
// use `self.base` as the base URL.
Base::try_from(url.as_str()).ok().or(self.base.clone())
}
_ => self.base.clone(),
};
async move {
input
.get_contents(skip_missing_inputs)
.map(move |content| (content, base.clone()))
}
})
.flatten()
.par_then_unordered(None, move |content| {
// send to parallel worker
let base = base.clone();
.par_then_unordered(None, move |(content, base)| {
let basic_auth_extractor = self.basic_auth_extractor.clone();
async move {
let content = content?;

let extractor = Extractor::new(self.use_html5ever, self.include_verbatim);
let uris: Vec<RawUri> = extractor.extract(&content);

let requests = request::create(uris, &content, &base, &basic_auth_extractor)?;
Result::Ok(stream::iter(requests.into_iter().map(Ok)))
}
Expand Down Expand Up @@ -390,4 +398,56 @@ mod tests {

assert_eq!(links, expected_links);
}

#[tokio::test]
async fn test_multiple_remote_urls() {
let mock_server_1 = mock_server!(
StatusCode::OK,
set_body_string(r#"<a href="relative.html">Link</a>"#)
);
let mock_server_2 = mock_server!(
StatusCode::OK,
set_body_string(r#"<a href="relative.html">Link</a>"#)
);

let inputs = vec![
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&format!(
"{}/foo/index.html",
mock_server_1.uri().trim_end_matches('/')
))
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
},
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&format!(
"{}/bar/index.html",
mock_server_2.uri().trim_end_matches('/')
))
.unwrap(),
)),
file_type_hint: Some(FileType::Html),
excluded_paths: None,
},
];

let links = collect(inputs, None).await;

let expected_links = HashSet::from_iter([
website(&format!(
"{}/foo/relative.html",
mock_server_1.uri().trim_end_matches('/')
)),
website(&format!(
"{}/bar/relative.html",
mock_server_2.uri().trim_end_matches('/')
)),
]);

assert_eq!(links, expected_links);
}
}
33 changes: 22 additions & 11 deletions lychee-lib/src/types/base.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ use std::{convert::TryFrom, path::PathBuf};

use crate::{ErrorKind, InputSource};

/// Base URL or path for a document.
///
/// When encountering links without a full domain in a document,
/// the base determines where this resource can be found.
/// Both, local and remote targets are supported.
Expand All @@ -19,6 +21,13 @@ pub enum Base {

impl Base {
/// Join link with base url
///
/// This joins the given link with the base URL.
/// The result is a new URL if the base is a remote URL
/// where the link is joined with the base URL.
///
/// This is only possible if the base is a remote URL.
/// If the base is a local path, this will return None.
#[must_use]
pub(crate) fn join(&self, link: &str) -> Option<Url> {
match self {
Expand All @@ -36,18 +45,17 @@ impl Base {
}
}

pub(crate) fn from_source(source: &InputSource) -> Option<Url> {
pub(crate) fn from_source(source: &InputSource) -> Option<Base> {
match &source {
InputSource::RemoteUrl(url) => {
// TODO: This should be refactored.
// Cases like https://user:[email protected] are not handled
// We can probably use the original URL and just replace the
// path component in the caller of this function
if let Some(port) = url.port() {
Url::parse(&format!("{}://{}:{port}", url.scheme(), url.host_str()?)).ok()
} else {
Url::parse(&format!("{}://{}", url.scheme(), url.host_str()?)).ok()
}
// Create a new URL with just the scheme, host, and port
let mut base_url = url.clone();
base_url.set_path("");
base_url.set_query(None);
base_url.set_fragment(None);

// We keep the username and password intact
Some(Base::Remote(*base_url))
}
// other inputs do not have a URL to extract a base
_ => None,
Expand All @@ -58,6 +66,9 @@ impl Base {
impl TryFrom<&str> for Base {
type Error = ErrorKind;

/// Try to parse the given string as a URL or a local path
/// to be used as a base.
/// If the URL cannot be a base, an error is returned.
fn try_from(value: &str) -> Result<Self, Self::Error> {
if let Ok(url) = Url::parse(value) {
if url.cannot_be_a_base() {
Expand Down Expand Up @@ -123,7 +134,7 @@ mod test_base {
let url = Url::parse(url).unwrap();
let source = InputSource::RemoteUrl(Box::new(url.clone()));
let base = Base::from_source(&source);
let expected = Url::parse(expected).unwrap();
let expected = Base::Remote(Url::parse(expected).unwrap());
assert_eq!(base, Some(expected));
}
}
Expand Down
20 changes: 20 additions & 0 deletions lychee-lib/src/types/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,26 @@ impl Input {
}

/// Retrieve the contents from the input
///
/// This will return a stream of `InputContent` instances.
///
/// - If the input source is a remote URL, the stream will contain the
/// contents of the URL.
/// - If the input source is a glob pattern, the stream will contain the
/// contents of all matched files.
/// - If the input source is a directory, the stream will contain the
/// contents of all files in the directory (recursively).
/// This will skip excluded paths.
/// - If the input source is a file, the stream will contain the contents of
/// the file.
/// - If the input source is stdin, the stream will contain the contents of
/// the stdin.
/// - If the input source is a string, the stream will contain the contents
/// of the string.
///
/// If `skip_missing` is set to `true`, missing files will be skipped and no
/// error will be returned. The file type hint will be used to determine the
/// file type of the input if it is not already known.
///
/// # Errors
///
Expand Down
Loading
Loading