Skip to content

Commit

Permalink
Add multilang in tokenizer manager.
Browse files Browse the repository at this point in the history
  • Loading branch information
fmassot committed Jul 17, 2023
1 parent c7475e0 commit da127ae
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,10 @@ impl TokenizerConfig {
pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
let mut text_analyzer_builder = match &self.tokenizer_type {
TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
// Note(fmassot): `multilang` is currently an "all-in-one" tokenizer with default
// filters. Static filters allow better performance, which is a requirement
// for the `happy-plazza` project. We may want to revisit that later.
#[cfg(feature = "multilang")]
TokenizerType::Multilang => TextAnalyzer::builder(MultiLangTokenizer::default())
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.dynamic(),
TokenizerType::Multilang => {
TextAnalyzer::builder(MultiLangTokenizer::default()).dynamic()
}
TokenizerType::Ngram(options) => {
let tokenizer =
NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only)
Expand Down
8 changes: 7 additions & 1 deletion quickwit/quickwit-query/src/tokenizers/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,13 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
.build(),
);
#[cfg(feature = "multilang")]
tokenizer_manager.register("multilang", MultiLangTokenizer::default());
tokenizer_manager.register(
"multilang_default",
TextAnalyzer::builder(MultiLangTokenizer::default())
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.build(),
);
tokenizer_manager
}

Expand Down

0 comments on commit da127ae

Please sign in to comment.