Skip to content

Commit

Permalink
Don't check Twitter URLs (#1147)
Browse files Browse the repository at this point in the history
Twitter is now completely locked down and requires
a login to read tweets. (Temporarily) skip checking all
Twitter URLs to avoid false positives.

For context:
zedeus/nitter#919
https://news.ycombinator.com/item?id=36540957
https://techcrunch.com/2023/06/30/twitter-now-requires-an-account-to-view-tweets/

Fixes #1108
  • Loading branch information
mre authored Jul 13, 2023
1 parent 8f4907c commit 40ba187
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 40 deletions.
8 changes: 5 additions & 3 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -241,8 +241,10 @@ mod cli {
}

#[test]
#[ignore = "Twitter quirk works locally but is flaky on Github (timeout)"]
fn test_twitter_quirk() {
// Exclude Twitter links because they require login to view tweets.
// https://techcrunch.com/2023/06/30/twitter-now-requires-an-account-to-view-tweets/
// https://github.com/zedeus/nitter/issues/919
fn test_ignored_hosts() {
let url = "https://twitter.com/zarfeblong/status/1339742840142872577";

main_command()
Expand All @@ -253,7 +255,7 @@ mod cli {
.assert()
.success()
.stdout(contains("1 Total"))
.stdout(contains("1 OK"));
.stdout(contains("1 Excluded"));
}

#[tokio::test]
Expand Down
23 changes: 23 additions & 0 deletions lychee-lib/src/filter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> =
// Variant compiled for test builds or when the `check_example_domains`
// feature is enabled: the set is initialized empty.
// NOTE(review): presumably this lets example domains be checked rather than
// skipped in those configurations — confirm against `is_example_domain`.
#[cfg(any(test, feature = "check_example_domains"))]
static EXAMPLE_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(HashSet::new);

/// Domains whose links cannot be checked and are therefore treated as
/// unsupported. Built lazily on first access.
static UNSUPPORTED_DOMAINS: Lazy<HashSet<&'static str>> = Lazy::new(|| {
    let domains = [
        // Twitter requires an account to view tweets
        // https://news.ycombinator.com/item?id=36540957
        "twitter.com",
    ];
    HashSet::from(domains)
});

/// Pre-defined exclusions for known false-positives
const FALSE_POSITIVE_PAT: &[&str] = &[
r"^https?://schemas.openxmlformats.org",
Expand Down Expand Up @@ -70,6 +78,20 @@ pub fn is_example_domain(uri: &Uri) -> bool {
res
}

#[inline]
#[must_use]
/// Check if the host belongs to a known unsupported domain.
///
/// Returns `true` when the domain of `uri` is one of the entries in
/// `UNSUPPORTED_DOMAINS` or a subdomain of one (e.g. `mobile.twitter.com`).
/// Returns `false` when `uri` has no domain.
pub fn is_unsupported_domain(uri: &Uri) -> bool {
    if let Some(domain) = uri.domain() {
        // `UNSUPPORTED_DOMAINS.contains(domain)` alone would miss subdomains,
        // while a bare `ends_with` would also match unrelated hosts that merely
        // share a suffix (e.g. `nottwitter.com`). Require either an exact match
        // or a `.` label separator directly in front of the unsupported domain.
        UNSUPPORTED_DOMAINS.iter().any(|&unsupported| {
            domain
                .strip_suffix(unsupported)
                .map_or(false, |prefix| prefix.is_empty() || prefix.ends_with('.'))
        })
    } else {
        false
    }
}

/// A generic URI filter
/// Used to decide if a given URI should be checked or skipped
#[allow(clippy::struct_excessive_bools)]
Expand Down Expand Up @@ -179,6 +201,7 @@ impl Filter {
|| self.is_host_excluded(uri)
|| self.is_scheme_excluded(uri)
|| is_example_domain(uri)
|| is_unsupported_domain(uri)
{
return true;
}
Expand Down
37 changes: 0 additions & 37 deletions lychee-lib/src/quirks/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@ use regex::Regex;
use reqwest::{Request, Url};
use std::collections::HashMap;

/// Matches URLs on `twitter.com`, with an optional `http(s)://` scheme and
/// an optional `www.` prefix.
static TWITTER_PATTERN: Lazy<Regex> =
    // The dot before `com` is escaped so it only matches a literal `.`
    // instead of any character.
    Lazy::new(|| Regex::new(r"^(https?://)?(www\.)?twitter\.com").unwrap());
/// Matches URLs on `crates.io`, with an optional `http(s)://` scheme and
/// an optional `www.` prefix.
static CRATES_PATTERN: Lazy<Regex> =
    // The dot before `io` is escaped so it only matches a literal `.`
    // instead of any character.
    Lazy::new(|| Regex::new(r"^(https?://)?(www\.)?crates\.io").unwrap());
static YOUTUBE_PATTERN: Lazy<Regex> =
Expand All @@ -33,13 +31,6 @@ pub(crate) struct Quirks {
impl Default for Quirks {
fn default() -> Self {
let quirks = vec![
Quirk {
pattern: &TWITTER_PATTERN,
rewrite: |mut request| {
request.url_mut().set_host(Some("nitter.net")).unwrap();
request
},
},
Quirk {
pattern: &CRATES_PATTERN,
rewrite: |mut request| {
Expand Down Expand Up @@ -118,34 +109,6 @@ mod tests {
}
}

/// Verifies that requests to Twitter URLs are rewritten to the same path and
/// query on `nitter.net`, keeping the original scheme.
#[test]
fn test_twitter_request() {
    // Pairs of (original URL, URL expected after applying the quirks).
    let cases = [
        (
            "https://twitter.com/search?q=rustlang",
            "https://nitter.net/search?q=rustlang",
        ),
        ("http://twitter.com/jack", "http://nitter.net/jack"),
        (
            "https://twitter.com/notifications",
            "https://nitter.net/notifications",
        ),
    ];

    for (original, rewritten) in cases {
        let request = Request::new(Method::GET, Url::parse(original).unwrap());
        let want = MockRequest::new(Method::GET, Url::parse(rewritten).unwrap());

        let got = MockRequest(Quirks::default().apply(request));

        assert_eq!(got, want);
    }
}

#[test]
fn test_cratesio_request() {
let url = Url::parse("https://crates.io/crates/lychee").unwrap();
Expand Down

0 comments on commit 40ba187

Please sign in to comment.