wildcard casing (#5251)

* fix handling of range in search-plan

* fix reported casing of raw_lowercase

and switch normalizer to a raw tokenizer

* add test
trinity-1686a authored Jul 29, 2024
1 parent 8dc5622 commit 36dea46
Showing 4 changed files with 80 additions and 4 deletions.
55 changes: 55 additions & 0 deletions quickwit/quickwit-query/src/query_ast/wildcard_query.rs
@@ -198,3 +198,58 @@ impl BuildTantivyAst for WildcardQuery {
        Ok(phrase_prefix_query.into())
    }
}
+
+#[cfg(test)]
+mod tests {
+    use tantivy::schema::{TextFieldIndexing, TextOptions};
+
+    use super::*;
+    use crate::create_default_quickwit_tokenizer_manager;
+
+    #[test]
+    fn test_extract_term_for_wildcard() {
+        let query = WildcardQuery {
+            field: "my_field".to_string(),
+            value: "MyString Wh1ch a nOrMal Tokenizer would cut*".to_string(),
+        };
+        let tokenizer_manager = create_default_quickwit_tokenizer_manager();
+        for tokenizer in ["raw", "whitespace"] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("my_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, term) = query
+                .extract_prefix_term(&schema, &tokenizer_manager)
+                .unwrap();
+            let value = term.value();
+            let text = value.as_str().unwrap();
+            assert_eq!(text, query.value.trim_end_matches('*'));
+        }
+
+        for tokenizer in [
+            "raw_lowercase",
+            "lowercase",
+            "default",
+            "en_stem",
+            "chinese_compatible",
+            "source_code_default",
+            "source_code_with_hex",
+        ] {
+            let mut schema_builder = TantivySchema::builder();
+            let text_options = TextOptions::default()
+                .set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
+            schema_builder.add_text_field("my_field", text_options);
+            let schema = schema_builder.build();
+
+            let (_field, term) = query
+                .extract_prefix_term(&schema, &tokenizer_manager)
+                .unwrap();
+
+            let value = term.value();
+            let text = value.as_str().unwrap();
+            assert_eq!(text, &query.value.trim_end_matches('*').to_lowercase());
+        }
+    }
+}
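The first loop covers tokenizers that preserve case (`raw`, `whitespace`), where the extracted prefix term must equal the query value with the trailing `*` trimmed; the second covers every lowercasing tokenizer, where the prefix must additionally be case-folded. A minimal, self-contained sketch of that invariant (plain Rust, no Quickwit types; `expected_prefix` is a hypothetical helper, not Quickwit API):

```rust
// Sketch of the invariant the test pins down, assuming only that wildcard
// prefix extraction trims the trailing '*' and applies the field tokenizer's
// case folding. `expected_prefix` is a hypothetical helper.
fn expected_prefix(value: &str, tokenizer_lowercases: bool) -> String {
    let prefix = value.trim_end_matches('*');
    if tokenizer_lowercases {
        prefix.to_lowercase()
    } else {
        prefix.to_string()
    }
}

fn main() {
    let value = "MyString Wh1ch a nOrMal Tokenizer would cut*";
    // Case-preserving tokenizers ("raw", "whitespace") keep the casing.
    assert_eq!(
        expected_prefix(value, false),
        "MyString Wh1ch a nOrMal Tokenizer would cut"
    );
    // Lowercasing tokenizers ("raw_lowercase", "default", ...) fold it.
    assert_eq!(
        expected_prefix(value, true),
        "mystring wh1ch a normal tokenizer would cut"
    );
}
```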
4 changes: 2 additions & 2 deletions quickwit/quickwit-query/src/tokenizers/mod.rs
@@ -50,7 +50,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
        .filter(LowerCaser)
        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
        .build();
-    tokenizer_manager.register("raw_lowercase", raw_tokenizer, false);
+    tokenizer_manager.register("raw_lowercase", raw_tokenizer, true);

    let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
        .filter(LowerCaser)
@@ -77,7 +77,7 @@ pub fn create_default_quickwit_tokenizer_manager() -> TokenizerManager {
        .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
        .filter(LowerCaser)
        .build();
-    tokenizer_manager.register("chinese_compatible", chinese_tokenizer, false);
+    tokenizer_manager.register("chinese_compatible", chinese_tokenizer, true);
    tokenizer_manager.register(
        "source_code_default",
        TextAnalyzer::builder(CodeTokenizer::default())
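Here the last argument of `register` reports whether the analyzer lowercases its input; `raw_lowercase` and `chinese_compatible` both run a `LowerCaser` filter yet were registered as case-preserving, so prefix terms built for wildcard queries kept their original casing and could miss tokens that were lowercased at indexing time. A self-contained toy model of how that flag drives normalizer selection (types and names are illustrative, not Quickwit's):

```rust
use std::collections::HashMap;

// A toy stand-in for the tokenizer manager's bookkeeping; shapes are
// illustrative, not Quickwit's actual types.
struct MiniManager {
    lowercases: HashMap<&'static str, bool>,
}

impl MiniManager {
    fn new() -> Self {
        MiniManager { lowercases: HashMap::new() }
    }

    // `does_lowercasing` mirrors the third argument of `register` in the diff.
    fn register(&mut self, name: &'static str, does_lowercasing: bool) {
        self.lowercases.insert(name, does_lowercasing);
    }

    // Mirrors `get_normalizer`: a case-folding tokenizer gets the raw
    // lowercasing normalizer, a case-preserving one gets the raw normalizer.
    fn normalizer_for(&self, name: &str) -> Option<&'static str> {
        let use_lowercaser = *self.lowercases.get(name)?;
        Some(if use_lowercaser { "raw_lowercase" } else { "raw" })
    }
}

fn main() {
    let mut manager = MiniManager::new();
    manager.register("raw", false);
    // Before this commit the two lines below passed `false`, so wildcard
    // prefixes on these fields kept their original casing even though the
    // indexed tokens were lowercased.
    manager.register("raw_lowercase", true);
    manager.register("chinese_compatible", true);

    assert_eq!(manager.normalizer_for("raw"), Some("raw"));
    assert_eq!(manager.normalizer_for("raw_lowercase"), Some("raw_lowercase"));
}
```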
8 changes: 7 additions & 1 deletion quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs
@@ -29,6 +29,7 @@ use crate::DEFAULT_REMOVE_TOKEN_LENGTH;

const RAW_TOKENIZER_NAME: &str = "raw";
const LOWERCASE_TOKENIZER_NAME: &str = "lowercase";
+const RAW_LOWERCASE_TOKENIZER_NAME: &str = "raw_lowercase";

#[derive(Clone)]
pub struct TokenizerManager {
@@ -50,6 +51,11 @@ impl TokenizerManager {
            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
            .build();
        this.register(RAW_TOKENIZER_NAME, raw_tokenizer, false);
+        let raw_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
+            .filter(LowerCaser)
+            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
+            .build();
+        this.register(RAW_LOWERCASE_TOKENIZER_NAME, raw_tokenizer, true);
        let lower_case_tokenizer = TextAnalyzer::builder(RawTokenizer::default())
            .filter(LowerCaser)
            .filter(RemoveLongFilter::limit(DEFAULT_REMOVE_TOKEN_LENGTH))
@@ -83,7 +89,7 @@ impl TokenizerManager {
            .get(tokenizer_name)
            .copied()?;
        let analyzer = if use_lowercaser {
-            LOWERCASE_TOKENIZER_NAME
+            RAW_LOWERCASE_TOKENIZER_NAME
        } else {
            RAW_TOKENIZER_NAME
        };
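With the dedicated analyzer registered above, `get_normalizer` now hands back `raw_lowercase` (a raw tokenizer followed by `LowerCaser`) for case-folding tokenizers instead of the general `lowercase` analyzer, which is the "switch normalizer to a raw tokenizer" part of the commit message, the point of a raw-tokenizer normalizer presumably being that it emits the whole input as a single token rather than splitting it.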
17 changes: 16 additions & 1 deletion quickwit/quickwit-search/src/root.rs
@@ -1207,6 +1207,11 @@ pub async fn search_plan(
        .terms_grouped_by_field
        .values()
        .map(|terms: &HashMap<tantivy::Term, bool>| terms.len())
-        .sum::<usize>();
+        .sum::<usize>()
+        + warmup_info
+            .term_ranges_grouped_by_field
+            .values()
+            .map(|terms: &HashMap<_, bool>| terms.len())
+            .sum::<usize>();
    let position_query_count = warmup_info
        .terms_grouped_by_field
@@ -1217,7 +1222,17 @@
            .filter(|load_position| **load_position)
            .count()
        })
-        .sum();
+        .sum::<usize>()
+        + warmup_info
+            .term_ranges_grouped_by_field
+            .values()
+            .map(|terms: &HashMap<_, bool>| {
+                terms
+                    .values()
+                    .filter(|load_position| **load_position)
+                    .count()
+            })
+            .sum::<usize>();
    Ok(SearchPlanResponse {
        result: serde_json::to_string(&SearchPlanResponseRest {
            quickwit_ast: request_metadata.query_ast_resolved,
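`search_plan` previously sized the warmup from `terms_grouped_by_field` alone, so term ranges (produced by wildcard and range queries) were missing from both reported counts; each sum now also folds in `term_ranges_grouped_by_field`. A runnable sketch of the corrected accounting, with simplified key types standing in for `tantivy::Term` and term ranges:

```rust
use std::collections::HashMap;
use std::ops::Range;

// Sketch of the fixed accounting (types simplified): exact terms and term
// ranges both contribute to the term-query count, and to the position count
// when their flag requests positions.
fn count_queries(
    terms: &HashMap<&'static str, HashMap<String, bool>>,
    ranges: &HashMap<&'static str, HashMap<Range<u64>, bool>>,
) -> (usize, usize) {
    let term_query_count = terms.values().map(|t| t.len()).sum::<usize>()
        + ranges.values().map(|t| t.len()).sum::<usize>();
    let position_query_count = terms
        .values()
        .map(|t| t.values().filter(|load| **load).count())
        .sum::<usize>()
        + ranges
            .values()
            .map(|t| t.values().filter(|load| **load).count())
            .sum::<usize>();
    (term_query_count, position_query_count)
}

fn main() {
    let terms = HashMap::from([("body", HashMap::from([("hello".to_string(), true)]))]);
    let ranges = HashMap::from([("body", HashMap::from([(0u64..10, false)]))]);
    // One term (with positions) plus one range (without):
    // 2 term queries, 1 position query.
    assert_eq!(count_queries(&terms, &ranges), (2, 1));
}
```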
