From 0278dbdfad709a77340d7822e0b8934d0fa35ff9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Massot?=
Date: Mon, 17 Jul 2023 22:26:56 +0900
Subject: [PATCH] Add 'source_code_default' tokenizer with preset filters.

---
 .../src/default_doc_mapper/tokenizer_entry.rs |  4 +++-
 quickwit/quickwit-query/src/tokenizers.rs     | 13 +++++++------
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
index 14b5aa7eea4..57a86c21169 100644
--- a/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
+++ b/quickwit/quickwit-doc-mapper/src/default_doc_mapper/tokenizer_entry.rs
@@ -18,7 +18,7 @@
 // along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 use anyhow::Context;
-use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
+use quickwit_query::{CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH};
 use serde::{Deserialize, Serialize};
 use tantivy::tokenizer::{
     AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
@@ -49,6 +49,7 @@ impl TokenizerConfig {
     pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
         let mut text_analyzer_builder = match &self.tokenizer_type {
             TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
+            TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(),
             TokenizerType::Ngram(options) => {
                 let tokenizer =
                     NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only)
@@ -121,6 +122,7 @@ impl TokenFilterType {
 #[serde(tag = "type", rename_all = "snake_case")]
 pub enum TokenizerType {
     Simple,
+    SourceCode,
     Ngram(NgramTokenizerOption),
     Regex(RegexTokenizerOption),
 }
diff --git a/quickwit/quickwit-query/src/tokenizers.rs b/quickwit/quickwit-query/src/tokenizers.rs
index 19b0b25ed74..c9764c4c05a 100644
--- a/quickwit/quickwit-query/src/tokenizers.rs
+++ b/quickwit/quickwit-query/src/tokenizers.rs
@@ -22,8 +22,8 @@ use std::str::CharIndices;
 
 use once_cell::sync::Lazy;
 use tantivy::tokenizer::{
-    LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer,
-    TokenizerManager,
+    AsciiFoldingFilter, LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, Token,
+    TokenStream, Tokenizer, TokenizerManager,
 };
 
 pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;
@@ -60,10 +60,11 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
             .build(),
     );
     tokenizer_manager.register(
-        "source_code",
+        "source_code_default",
         TextAnalyzer::builder(CodeTokenizer::default())
             .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
             .filter(LowerCaser)
+            .filter(AsciiFoldingFilter)
             .build(),
     );
 
@@ -596,14 +597,14 @@ mod tests {
     #[test]
     fn test_code_tokenizer_in_tokenizer_manager() {
         let mut code_tokenizer = create_default_quickwit_tokenizer_manager()
-            .get("source_code")
+            .get("source_code_default")
             .unwrap();
-        let mut token_stream = code_tokenizer.token_stream("PigCaféFactory2");
+        let mut token_stream = code_tokenizer.token_stream("PigCafeFactory2");
         let mut tokens = Vec::new();
         while let Some(token) = token_stream.next() {
             tokens.push(token.text.to_string());
         }
-        assert_eq!(tokens, vec!["pig", "café", "factory", "2"])
+        assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
     }
 
     #[test]
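
For reference, below is a minimal sketch of exercising the renamed "source_code_default" analyzer from outside the test module, mirroring the patch's own test. The `use` path assumes `create_default_quickwit_tokenizer_manager` is re-exported from the `quickwit_query` crate root; the actual re-export path may differ.

use quickwit_query::create_default_quickwit_tokenizer_manager;

fn main() {
    // Look up the analyzer under its new registry name.
    let mut analyzer = create_default_quickwit_tokenizer_manager()
        .get("source_code_default")
        .expect("`source_code_default` should be registered");
    // CodeTokenizer splits on case and character-class boundaries; the
    // LowerCaser and newly added AsciiFoldingFilter then normalize tokens,
    // so the accented input below folds to plain ASCII.
    let mut token_stream = analyzer.token_stream("PigCaféFactory2");
    while let Some(token) = token_stream.next() {
        println!("{}", token.text); // pig, cafe, factory, 2
    }
}

Note the design choice the patch encodes: because AsciiFoldingFilter now runs after LowerCaser, "café" and "cafe" index to the same token, which is why the updated test asserts on the folded forms.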