Skip to content

Commit

Permalink
Merge pull request #546 from squidowl/fix/unicode-urls
Browse files Browse the repository at this point in the history
Unicode URL Parsing
  • Loading branch information
tarkah committed Sep 10, 2024
2 parents fc8fbd4 + 2591e44 commit 5accdc7
Showing 1 changed file with 21 additions and 5 deletions.
26 changes: 21 additions & 5 deletions data/src/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use irc::proto;
use irc::proto::Command;
use itertools::Itertools;
use once_cell::sync::Lazy;
use regex::Regex;
use regex::{Regex, RegexBuilder};
use serde::{Deserialize, Deserializer, Serialize};
use url::Url;

Expand All @@ -22,13 +22,13 @@ use crate::{ctcp, Config, User};
// - https://datatracker.ietf.org/doc/html/rfc1738#section-5
// - https://www.ietf.org/rfc/rfc2396.txt

// Body of a regex character class (no surrounding brackets) listing the
// "unreserved" URL path characters: letters, digits, and the marks
// `- _ . ! ~ * ' ( )`.
// This diff view shows both versions of the constant: the first line is the
// ASCII-only form (`a-zA-Z0-9`), the second is its replacement, which uses the
// Unicode classes `\p{Letter}` / `\p{Number}` and writes the hyphen as `\-`.
const URL_PATH_UNRESERVED: &str = r#"a-zA-Z0-9-_.!~*'()"#;
const URL_PATH_UNRESERVED: &str = r#"\p{Letter}\p{Number}\-_.!~*'()"#;

// Body of a regex character class (no surrounding brackets) listing the
// "reserved" URL characters accepted inside a path: `; ? : @ & = + $ ,`.
// `/` is not part of this set; URL_PATH adds it separately.
const URL_PATH_RESERVED: &str = r#";?:@&=+$,"#;

// Complete regex character class matching a single URL path character:
// URL_PATH_UNRESERVED and URL_PATH_RESERVED plus `%`, `/`, and `#`, wrapped in
// `[` ... `]` and assembled at compile time with `concatcp!`.
const URL_PATH: &str = concatcp!(r#"["#, URL_PATH_UNRESERVED, URL_PATH_RESERVED, r#"%\/#]"#);

// Same set as URL_PATH_UNRESERVED with `.`, `!`, and `)` left out.
// NOTE(review): presumably meant for the last character of a URL, so that
// trailing sentence punctuation is not pulled into the link — confirm against
// how URL_PATH_EXC_PUNC is used in URL_REGEX.
// This diff view shows both versions of the constant: the first line is the
// ASCII-only form, the second is its replacement using `\p{Letter}` /
// `\p{Number}` with the hyphen written as `\-`.
const URL_PATH_UNRESERVED_EXC_PUNC: &str = r#"a-zA-Z0-9-_~*'("#;
const URL_PATH_UNRESERVED_EXC_PUNC: &str = r#"\p{Letter}\p{Number}\-_~*'("#;

// Same set as URL_PATH_RESERVED with `;`, `?`, `:`, and `,` left out, leaving
// `@ & = + $`.
// NOTE(review): presumably the punctuation-free counterpart used for the final
// character of a URL — confirm against URL_PATH_EXC_PUNC / URL_REGEX.
const URL_PATH_RESERVED_EXC_PUNC: &str = r#"@&=+$"#;

Expand All @@ -40,8 +40,8 @@ const URL_PATH_EXC_PUNC: &str = concatcp!(
);

static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
Regex::new(concatcp!(
r#"(?i)(((https?|ircs?):\/\/|www\.)[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,63}\b"#,
RegexBuilder::new(concatcp!(
r#"(?i)(((https?|ircs?):\/\/|www\.)[\p{Letter}\p{Number}\-@:%._+~#=]{1,256}\.[\p{Letter}\p{Number}()]{1,63}\b"#,
r#"("#,
URL_PATH,
r#"*"#,
Expand All @@ -50,6 +50,8 @@ static URL_REGEX: Lazy<Regex> = Lazy::new(|| {
URL_PATH_EXC_PUNC,
r#"?)|halloy:\/\/[^ ]*)"#
))
.size_limit(15728640) // 1.5x default size_limit
.build()
.unwrap()
});

Expand Down Expand Up @@ -1014,6 +1016,20 @@ mod test {
Fragment::Text(").".into()),
],
),
(
"https://www.reddit.com/r/witze/comments/1fcoz5a/ein_vampir_auf_einem_tandem_gerät_in_eine/",
vec![Fragment::Url(
"https://www.reddit.com/r/witze/comments/1fcoz5a/ein_vampir_auf_einem_tandem_gerät_in_eine/"
.parse()
.unwrap()
)],
),
(
"http://öbb.at",
vec![
Fragment::Url("http://öbb.at".parse().unwrap()),
],
),
];

for (text, expected) in tests {
Expand Down

0 comments on commit 5accdc7

Please sign in to comment.