Skip to content

Commit

Permalink
Add 'source_code_default' tokenizer with preset filters.
Browse files Browse the repository at this point in the history
  • Loading branch information
fmassot committed Jul 17, 2023
1 parent 9586c5b commit 0278dbd
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
// along with this program. If not, see <http://www.gnu.org/licenses/>.

use anyhow::Context;
use quickwit_query::DEFAULT_REMOVE_TOKEN_LENGTH;
use quickwit_query::{CodeTokenizer, DEFAULT_REMOVE_TOKEN_LENGTH};
use serde::{Deserialize, Serialize};
use tantivy::tokenizer::{
AsciiFoldingFilter, LowerCaser, NgramTokenizer, RegexTokenizer, RemoveLongFilter,
Expand Down Expand Up @@ -49,6 +49,7 @@ impl TokenizerConfig {
pub fn text_analyzer(&self) -> anyhow::Result<TextAnalyzer> {
let mut text_analyzer_builder = match &self.tokenizer_type {
TokenizerType::Simple => TextAnalyzer::builder(SimpleTokenizer::default()).dynamic(),
TokenizerType::SourceCode => TextAnalyzer::builder(CodeTokenizer::default()).dynamic(),
TokenizerType::Ngram(options) => {
let tokenizer =
NgramTokenizer::new(options.min_gram, options.max_gram, options.prefix_only)
Expand Down Expand Up @@ -121,6 +122,7 @@ impl TokenFilterType {
/// The tokenizer families a `TokenizerConfig` can select.
///
/// With `#[serde(tag = "type", rename_all = "snake_case")]`, a config picks a
/// variant through a `type` field whose value is the snake_case variant name:
/// `simple`, `source_code`, `ngram`, or `regex`.
#[serde(tag = "type", rename_all = "snake_case")]
pub enum TokenizerType {
/// Plain splitting via tantivy's `SimpleTokenizer` (no extra options).
Simple,
/// Code-aware tokenization via `CodeTokenizer`; per the tests it splits on
/// case and digit boundaries (e.g. "PigCafeFactory2" -> pig/cafe/factory/2).
SourceCode,
/// N-gram tokenization; `NgramTokenizerOption` carries min/max gram sizes
/// and the `prefix_only` flag.
Ngram(NgramTokenizerOption),
/// Regex-based tokenization configured by `RegexTokenizerOption`
/// (presumably backed by tantivy's `RegexTokenizer` — imported above).
Regex(RegexTokenizerOption),
}
Expand Down
13 changes: 7 additions & 6 deletions quickwit/quickwit-query/src/tokenizers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ use std::str::CharIndices;

use once_cell::sync::Lazy;
use tantivy::tokenizer::{
LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, Token, TokenStream, Tokenizer,
TokenizerManager,
AsciiFoldingFilter, LowerCaser, RawTokenizer, RemoveLongFilter, TextAnalyzer, Token,
TokenStream, Tokenizer, TokenizerManager,
};

pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;
Expand Down Expand Up @@ -60,10 +60,11 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
.build(),
);
tokenizer_manager.register(
"source_code",
"source_code_default",
TextAnalyzer::builder(CodeTokenizer::default())
.filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
.filter(LowerCaser)
.filter(AsciiFoldingFilter)
.build(),
);

Expand Down Expand Up @@ -596,14 +597,14 @@ mod tests {
#[test]
fn test_code_tokenizer_in_tokenizer_manager() {
let mut code_tokenizer = create_default_quickwit_tokenizer_manager()
.get("source_code")
.get("source_code_default")
.unwrap();
let mut token_stream = code_tokenizer.token_stream("PigCaféFactory2");
let mut token_stream = code_tokenizer.token_stream("PigCafeFactory2");
let mut tokens = Vec::new();
while let Some(token) = token_stream.next() {
tokens.push(token.text.to_string());
}
assert_eq!(tokens, vec!["pig", "café", "factory", "2"])
assert_eq!(tokens, vec!["pig", "cafe", "factory", "2"])
}

#[test]
Expand Down

0 comments on commit 0278dbd

Please sign in to comment.