diff --git a/charabia/Cargo.toml b/charabia/Cargo.toml
index 6c3bdeb..6dd4e1c 100644
--- a/charabia/Cargo.toml
+++ b/charabia/Cargo.toml
@@ -37,7 +37,7 @@ litemap = "0.6.1"
 zerovec = "0.9.3"
 
 [features]
-default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer"]
+default = ["chinese", "hebrew", "japanese", "thai", "korean", "greek", "latin-camelcase", "latin-snakecase", "khmer", "vietnamese"]
 
 # allow chinese specialized tokenization
 chinese = ["dep:pinyin", "dep:jieba-rs"]
@@ -65,6 +65,9 @@ latin-camelcase = ["dep:finl_unicode"]
 
 khmer = []
 
+# allow vietnamese specialized tokenization
+vietnamese = []
+
 # allow splitting snake_case latin words
 latin-snakecase = ["dep:finl_unicode"]
 
diff --git a/charabia/src/normalizer/mod.rs b/charabia/src/normalizer/mod.rs
index 4fe13de..404346b 100644
--- a/charabia/src/normalizer/mod.rs
+++ b/charabia/src/normalizer/mod.rs
@@ -15,6 +15,8 @@ pub use self::japanese::JapaneseNormalizer;
 pub use self::lowercase::LowercaseNormalizer;
 use self::nonspacing_mark::NonspacingMarkNormalizer;
 use self::quote::QuoteNormalizer;
+#[cfg(feature = "vietnamese")]
+pub use self::vietnamese::VietnameseNormalizer;
 
 use crate::segmenter::SegmentedTokenIter;
 use crate::Token;
@@ -31,6 +33,8 @@ mod japanese;
 mod lowercase;
 mod nonspacing_mark;
 mod quote;
+#[cfg(feature = "vietnamese")]
+mod vietnamese;
 
 /// List of [`Normalizer`]s used by [`Normalize::normalize`] that are not considered lossy.
 pub static NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
@@ -54,6 +58,8 @@ pub static LOSSY_NORMALIZERS: Lazy<Vec<Box<dyn Normalizer>>> = Lazy::new(|| {
         Box::new(GreekNormalizer),
         Box::new(ArabicNormalizer),
         Box::new(NonspacingMarkNormalizer),
+        #[cfg(feature = "vietnamese")]
+        Box::new(VietnameseNormalizer),
     ]
 });
diff --git a/charabia/src/normalizer/vietnamese.rs b/charabia/src/normalizer/vietnamese.rs
new file mode 100644
index 0000000..31e0836
--- /dev/null
+++ b/charabia/src/normalizer/vietnamese.rs
@@ -0,0 +1,22 @@
+use super::{CharNormalizer, CharOrStr};
+use crate::Script;
+use crate::Token;
+
+pub struct VietnameseNormalizer;
+
+impl CharNormalizer for VietnameseNormalizer {
+    fn normalize_char(&self, c: char) -> Option<CharOrStr> {
+        match c {
+            'Ð' | 'Đ' | 'đ' => Some("d".to_string().into()), // not only Vietnamese: several other European languages also use these letters
+            _ => None,
+        }
+    }
+
+    fn should_normalize(&self, token: &Token) -> bool {
+        token.script == Script::Latin && token.lemma.chars().any(is_should_normalize)
+    }
+}
+
+fn is_should_normalize(c: char) -> bool {
+    matches!(c, 'Ð' | 'Đ' | 'đ')
+}
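
For reviewers who want to see the new normalizer in action, here is a minimal, hypothetical usage sketch that is not part of the PR. It assumes the crate is built with its default features (which now include `vietnamese`) and uses the crate's top-level `Tokenize` API; the exact lemmas also depend on the other normalizers in the pipeline (lowercasing, compatibility decomposition, nonspacing-mark removal).

```rust
use charabia::Tokenize;

fn main() {
    // 'Đ' (U+0110) and 'đ' (U+0111) carry the stroke in the base character,
    // so the nonspacing-mark normalizer cannot strip it the way it strips
    // combining diacritics; the new VietnameseNormalizer maps them to "d".
    let orig = "Đà Nẵng đẹp";

    for token in orig.tokenize() {
        // With the full default pipeline, the word lemmas are expected to
        // come out as plain ASCII, e.g. "da", "nang", "dep" (separator
        // tokens such as the spaces are yielded as well).
        println!("{:?}", token.lemma());
    }
}
```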