-
Notifications
You must be signed in to change notification settings - Fork 330
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
use correct tokenizer when building termsetquery #4983
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -20,9 +20,10 @@ | |||||||||||||||||||||||||||||||
use std::collections::HashMap; | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
use serde::{Deserialize, Serialize}; | ||||||||||||||||||||||||||||||||
use tantivy::schema::Schema as TantivySchema; | ||||||||||||||||||||||||||||||||
use tantivy::schema::{FieldType, Schema as TantivySchema}; | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
use super::{BuildTantivyAst, QueryAst}; | ||||||||||||||||||||||||||||||||
use crate::query_ast::utils::DYNAMIC_FIELD_NAME; | ||||||||||||||||||||||||||||||||
use crate::query_ast::{FullTextParams, TantivyQueryAst}; | ||||||||||||||||||||||||||||||||
use crate::tokenizers::TokenizerManager; | ||||||||||||||||||||||||||||||||
use crate::{BooleanOperand, InvalidQuery}; | ||||||||||||||||||||||||||||||||
|
@@ -42,26 +43,43 @@ impl From<TermQuery> for QueryAst { | |||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nitpick fn get_field(field_path: &str, schema: &'a TantivySchema) -> Option<Field> {
if let (field, _field_path) = schema.find_field(&self.field) {
return Some(field);
}
schema.get_field(DYNAMIC_FIELD_NAME).ok()
} |
||||||||||||||||||||||||||||||||
impl TermQuery { | ||||||||||||||||||||||||||||||||
#[cfg(test)] | ||||||||||||||||||||||||||||||||
pub fn from_field_value(field: impl ToString, value: impl ToString) -> Self { | ||||||||||||||||||||||||||||||||
Self { | ||||||||||||||||||||||||||||||||
field: field.to_string(), | ||||||||||||||||||||||||||||||||
value: value.to_string(), | ||||||||||||||||||||||||||||||||
fn get_tokenizer<'a>(&self, schema: &'a TantivySchema) -> Option<&'a str> { | ||||||||||||||||||||||||||||||||
let field = schema | ||||||||||||||||||||||||||||||||
.find_field(&self.field) | ||||||||||||||||||||||||||||||||
.or_else(|| schema.find_field(DYNAMIC_FIELD_NAME))? | ||||||||||||||||||||||||||||||||
.0; | ||||||||||||||||||||||||||||||||
Comment on lines
+47
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||||||||||
match schema.get_field_entry(field).field_type() { | ||||||||||||||||||||||||||||||||
FieldType::Str(text_options) => Some(text_options.get_indexing_options()?.tokenizer()), | ||||||||||||||||||||||||||||||||
FieldType::JsonObject(json_options) => { | ||||||||||||||||||||||||||||||||
Some(json_options.get_text_indexing_options()?.tokenizer()) | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
_ => None, | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
impl BuildTantivyAst for TermQuery { | ||||||||||||||||||||||||||||||||
fn build_tantivy_ast_impl( | ||||||||||||||||||||||||||||||||
pub(crate) fn ast_for_term_extraction( | ||||||||||||||||||||||||||||||||
&self, | ||||||||||||||||||||||||||||||||
schema: &TantivySchema, | ||||||||||||||||||||||||||||||||
tokenizer_manager: &TokenizerManager, | ||||||||||||||||||||||||||||||||
_search_fields: &[String], | ||||||||||||||||||||||||||||||||
_with_validation: bool, | ||||||||||||||||||||||||||||||||
) -> Result<TantivyQueryAst, InvalidQuery> { | ||||||||||||||||||||||||||||||||
self.build_ast( | ||||||||||||||||||||||||||||||||
schema, | ||||||||||||||||||||||||||||||||
tokenizer_manager, | ||||||||||||||||||||||||||||||||
self.get_tokenizer(schema) | ||||||||||||||||||||||||||||||||
.or(Some("raw")) | ||||||||||||||||||||||||||||||||
.map(ToString::to_string), | ||||||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||||||
Comment on lines
+65
to
+71
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
fn build_ast( | ||||||||||||||||||||||||||||||||
&self, | ||||||||||||||||||||||||||||||||
schema: &TantivySchema, | ||||||||||||||||||||||||||||||||
tokenizer_manager: &TokenizerManager, | ||||||||||||||||||||||||||||||||
tokenizer: Option<String>, | ||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is never called with |
||||||||||||||||||||||||||||||||
) -> Result<TantivyQueryAst, InvalidQuery> { | ||||||||||||||||||||||||||||||||
let full_text_params = FullTextParams { | ||||||||||||||||||||||||||||||||
fulmicoton marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||||||||||||||||||||||||||
tokenizer: Some("raw".to_string()), | ||||||||||||||||||||||||||||||||
// The parameter below won't matter, since we will have only one term | ||||||||||||||||||||||||||||||||
tokenizer, | ||||||||||||||||||||||||||||||||
// This parameter should be an Or to handle integers on json fields | ||||||||||||||||||||||||||||||||
mode: BooleanOperand::Or.into(), | ||||||||||||||||||||||||||||||||
zero_terms_query: Default::default(), | ||||||||||||||||||||||||||||||||
}; | ||||||||||||||||||||||||||||||||
|
@@ -73,6 +91,26 @@ impl BuildTantivyAst for TermQuery { | |||||||||||||||||||||||||||||||
tokenizer_manager, | ||||||||||||||||||||||||||||||||
) | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
#[cfg(test)] | ||||||||||||||||||||||||||||||||
pub fn from_field_value(field: impl ToString, value: impl ToString) -> Self { | ||||||||||||||||||||||||||||||||
Self { | ||||||||||||||||||||||||||||||||
field: field.to_string(), | ||||||||||||||||||||||||||||||||
value: value.to_string(), | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
impl BuildTantivyAst for TermQuery { | ||||||||||||||||||||||||||||||||
fn build_tantivy_ast_impl( | ||||||||||||||||||||||||||||||||
&self, | ||||||||||||||||||||||||||||||||
schema: &TantivySchema, | ||||||||||||||||||||||||||||||||
tokenizer_manager: &TokenizerManager, | ||||||||||||||||||||||||||||||||
_search_fields: &[String], | ||||||||||||||||||||||||||||||||
_with_validation: bool, | ||||||||||||||||||||||||||||||||
) -> Result<TantivyQueryAst, InvalidQuery> { | ||||||||||||||||||||||||||||||||
self.build_ast(schema, tokenizer_manager, Some("raw".to_string())) | ||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||
// Private struct used for serialization. | ||||||||||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,7 +20,7 @@ | |
use std::collections::{BTreeSet, HashMap, HashSet}; | ||
|
||
use serde::{Deserialize, Serialize}; | ||
use tantivy::schema::Schema as TantivySchema; | ||
use tantivy::schema::{IndexRecordOption, Schema as TantivySchema, Type}; | ||
use tantivy::Term; | ||
|
||
use crate::query_ast::{BuildTantivyAst, QueryAst, TantivyQueryAst, TermQuery}; | ||
|
@@ -36,13 +36,20 @@ pub struct TermSetQuery { | |
pub terms_per_field: HashMap<String, BTreeSet<String>>, | ||
} | ||
|
||
fn is_term_str(term: &Term) -> bool { | ||
let val = term.value(); | ||
let typ = val.json_path_type().unwrap_or_else(|| val.typ()); | ||
typ == Type::Str | ||
} | ||
|
||
impl TermSetQuery { | ||
fn make_term_iterator( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function has a very non trivial purpose and a retrun type. It needs a comment. |
||
&self, | ||
schema: &TantivySchema, | ||
tokenizer_manager: &TokenizerManager, | ||
) -> Result<HashSet<Term>, InvalidQuery> { | ||
let mut terms: HashSet<Term> = HashSet::default(); | ||
) -> Result<(HashSet<Term>, Vec<Vec<Term>>), InvalidQuery> { | ||
let mut all_terms: HashSet<Term> = HashSet::default(); | ||
let mut intersections: Vec<Vec<Term>> = Vec::new(); | ||
for (full_path, values) in &self.terms_per_field { | ||
for value in values { | ||
// Mapping a text (field, value) is non-trival: | ||
|
@@ -56,15 +63,34 @@ impl TermSetQuery { | |
field: full_path.to_string(), | ||
value: value.to_string(), | ||
}; | ||
let ast = | ||
term_query.build_tantivy_ast_call(schema, tokenizer_manager, &[], false)?; | ||
let ast = term_query.ast_for_term_extraction(schema, tokenizer_manager)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's not build a
|
||
let tantivy_query: Box<dyn crate::TantivyQuery> = ast.simplify().into(); | ||
let mut terms = Vec::new(); | ||
tantivy_query.query_terms(&mut |term, _| { | ||
terms.insert(term.clone()); | ||
terms.push(term.clone()); | ||
}); | ||
|
||
let str_term_count = terms.iter().filter(|term| is_term_str(term)).count(); | ||
if str_term_count <= 1 { | ||
for term in terms { | ||
all_terms.insert(term); | ||
} | ||
} else { | ||
// we have a string, and it got split into multiple tokens, so we want an | ||
// intersection of them | ||
let mut phrase = Vec::with_capacity(terms.len()); | ||
for term in terms { | ||
if is_term_str(&term) { | ||
phrase.push(term); | ||
} else { | ||
all_terms.insert(term); | ||
} | ||
} | ||
intersections.push(phrase); | ||
} | ||
Comment on lines
+73
to
+90
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic is rather odd here. You implicitly rely on the fact that you know that the query ast is a disjunction over different field: e.g. If you want to take that kind of shortcuts (after having considered other solutions), add comments to explain why it works. Ideally, assert that the query is indeed what you think it is. |
||
} | ||
} | ||
Ok(terms) | ||
Ok((all_terms, intersections)) | ||
} | ||
} | ||
|
||
|
@@ -76,9 +102,26 @@ impl BuildTantivyAst for TermSetQuery { | |
_search_fields: &[String], | ||
_with_validation: bool, | ||
) -> Result<TantivyQueryAst, InvalidQuery> { | ||
let terms_it = self.make_term_iterator(schema, tokenizer_manager)?; | ||
let term_set_query = tantivy::query::TermSetQuery::new(terms_it); | ||
Ok(term_set_query.into()) | ||
use tantivy::query::{BooleanQuery, Query, TermQuery, TermSetQuery}; | ||
|
||
let (terms_it, intersections) = self.make_term_iterator(schema, tokenizer_manager)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we probably need a comment here. |
||
let term_set_query = TermSetQuery::new(terms_it); | ||
if intersections.is_empty() { | ||
Ok(term_set_query.into()) | ||
} else { | ||
let mut sub_queries: Vec<Box<dyn Query>> = Vec::with_capacity(intersections.len() + 1); | ||
sub_queries.push(Box::new(term_set_query)); | ||
for intersection in intersections { | ||
let terms = intersection | ||
.into_iter() | ||
.map(|term| { | ||
Box::new(TermQuery::new(term, IndexRecordOption::Basic)) as Box<dyn Query> | ||
}) | ||
.collect(); | ||
sub_queries.push(Box::new(BooleanQuery::intersection(terms))); | ||
} | ||
Ok(BooleanQuery::union(sub_queries).into()) | ||
} | ||
} | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
endpoint: dyn/search | ||
params: | ||
query: "my_field:CaSe" | ||
expected: | ||
num_hits: 3 | ||
--- | ||
endpoint: dyn/search | ||
params: | ||
query: "my_field:IN [CaSe]" | ||
expected: | ||
--- | ||
# we want lower-case to hit only lower-case, not all lower and all case | ||
endpoint: dyn/search | ||
params: | ||
query: "my_field:IN [mixed lower-case]" | ||
expected: | ||
num_hits: 2 |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,3 +4,6 @@ endpoint: indexes/simple | |
--- | ||
method: DELETE | ||
endpoint: indexes/tagged | ||
--- | ||
method: DELETE | ||
endpoint: indexes/dyn |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
term_query.rs
should not have been modified at all.This PR is obfuscating what it does with logic that only concerns term_set_query.
The only thing being factorized is
build_ast
as far as I could tell, and it is definitely not worth it.