Skip to content

Commit

Permalink
enable str fast field range queries (#5324)
Browse files Browse the repository at this point in the history
* enable str fast field range queries

enable range queries on str based fast field queries
update tantivy

* or queries, add normalization test case

* fix test
  • Loading branch information
PSeitz authored Aug 20, 2024
1 parent 53f2125 commit c59be63
Show file tree
Hide file tree
Showing 5 changed files with 192 additions and 95 deletions.
18 changes: 9 additions & 9 deletions quickwit/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion quickwit/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ quickwit-serve = { path = "quickwit-serve" }
quickwit-storage = { path = "quickwit-storage" }
quickwit-telemetry = { path = "quickwit-telemetry" }

tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3d1c4b3", default-features = false, features = [
tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "c71ec80", default-features = false, features = [
"lz4-compression",
"mmap",
"quickwit",
Expand Down
205 changes: 120 additions & 85 deletions quickwit/quickwit-query/src/query_ast/range_query.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,12 @@ use std::ops::Bound;

use serde::{Deserialize, Serialize};
use tantivy::fastfield::FastValue;
use tantivy::query::{EmptyQuery, RangeQuery as TantivyRangeQuery};
use tantivy::query::FastFieldRangeQuery;
use tantivy::schema::Schema as TantivySchema;
use tantivy::tokenizer::TextAnalyzer;
use tantivy::{DateTime, Term};

use super::tantivy_query_ast::TantivyBoolQuery;
use super::QueryAst;
use crate::json_literal::InterpretUserInput;
use crate::query_ast::tantivy_query_ast::TantivyQueryAst;
Expand Down Expand Up @@ -80,11 +82,41 @@ impl From<RangeQuery> for QueryAst {
}
}

fn term_with_fastval<T: FastValue>(term: &Term, val: T) -> Term {
let mut term = term.clone();
term.append_type_and_fast_value(val);
term
}

fn query_from_fast_val_range<T: FastValue>(
empty_term: &Term,
range: (Bound<T>, Bound<T>),
) -> FastFieldRangeQuery {
let (lower_bound, upper_bound) = range;
FastFieldRangeQuery::new(
lower_bound.map(|val| term_with_fastval(empty_term, val)),
upper_bound.map(|val| term_with_fastval(empty_term, val)),
)
}

fn get_normalized_text(normalizer: &mut Option<TextAnalyzer>, text: &str) -> String {
if let Some(normalizer) = normalizer {
let mut token_stream = normalizer.token_stream(text);
let mut tokens = Vec::new();
token_stream.process(&mut |token| {
tokens.push(token.text.clone());
});
tokens[0].to_string()
} else {
text.to_string()
}
}

impl BuildTantivyAst for RangeQuery {
fn build_tantivy_ast_impl(
&self,
schema: &TantivySchema,
_tokenizer_manager: &TokenizerManager,
tokenizer_manager: &TokenizerManager,
_search_fields: &[String],
_with_validation: bool,
) -> Result<TantivyQueryAst, InvalidQuery> {
Expand All @@ -97,16 +129,28 @@ impl BuildTantivyAst for RangeQuery {
)));
}
Ok(match field_entry.field_type() {
tantivy::schema::FieldType::Str(_) => {
return Err(InvalidQuery::RangeQueryNotSupportedForField {
value_type: "str",
field_name: field_entry.name().to_string(),
});
tantivy::schema::FieldType::Str(options) => {
let mut normalizer = options
.get_fast_field_tokenizer_name()
.and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name));

let (lower_bound, upper_bound) =
convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?;

FastFieldRangeQuery::new(
lower_bound.map(|text| {
Term::from_field_text(field, &get_normalized_text(&mut normalizer, text))
}),
upper_bound.map(|text| {
Term::from_field_text(field, &get_normalized_text(&mut normalizer, text))
}),
)
.into()
}
tantivy::schema::FieldType::U64(_) => {
let (lower_bound, upper_bound) =
convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?;
TantivyRangeQuery::new(
FastFieldRangeQuery::new(
lower_bound.map(|val| Term::from_field_u64(field, val)),
upper_bound.map(|val| Term::from_field_u64(field, val)),
)
Expand All @@ -115,7 +159,7 @@ impl BuildTantivyAst for RangeQuery {
tantivy::schema::FieldType::I64(_) => {
let (lower_bound, upper_bound) =
convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?;
TantivyRangeQuery::new(
FastFieldRangeQuery::new(
lower_bound.map(|val| Term::from_field_i64(field, val)),
upper_bound.map(|val| Term::from_field_i64(field, val)),
)
Expand All @@ -124,7 +168,7 @@ impl BuildTantivyAst for RangeQuery {
tantivy::schema::FieldType::F64(_) => {
let (lower_bound, upper_bound) =
convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?;
TantivyRangeQuery::new(
FastFieldRangeQuery::new(
lower_bound.map(|val| Term::from_field_f64(field, val)),
upper_bound.map(|val| Term::from_field_f64(field, val)),
)
Expand All @@ -143,7 +187,7 @@ impl BuildTantivyAst for RangeQuery {
|date: &DateTime| date.truncate(date_options.get_precision());
let lower_bound = map_bound(&lower_bound, truncate_datetime);
let upper_bound = map_bound(&upper_bound, truncate_datetime);
TantivyRangeQuery::new(
FastFieldRangeQuery::new(
lower_bound.map(|val| Term::from_field_date(field, val)),
upper_bound.map(|val| Term::from_field_date(field, val)),
)
Expand All @@ -156,48 +200,71 @@ impl BuildTantivyAst for RangeQuery {
});
}
tantivy::schema::FieldType::Bytes(_) => todo!(),
tantivy::schema::FieldType::JsonObject(opt) => {
tantivy::schema::FieldType::JsonObject(options) => {
let mut sub_queries: Vec<TantivyQueryAst> = Vec::new();
let empty_term =
Term::from_field_json_path(field, json_path, opt.is_expand_dots_enabled());
fn term_with_fastval<T: FastValue>(term: &Term, val: T) -> Term {
let mut term = term.clone();
term.append_type_and_fast_value(val);
term
}
fn query_from_fast_val_range<T: FastValue>(
empty_term: &Term,
range: (Bound<T>, Bound<T>),
) -> TantivyRangeQuery {
TantivyRangeQuery::new(
range.0.map(|val| term_with_fastval(empty_term, val)),
range.1.map(|val| term_with_fastval(empty_term, val)),
)
}
Term::from_field_json_path(field, json_path, options.is_expand_dots_enabled());
// Try to convert the bounds into numerical values in following order i64, u64,
// f64. Tantivy will convert to the correct numerical type of the column if it
// doesn't match.
let bounds_range: Option<(Bound<i64>, Bound<i64>)> =
let bounds_range_i64: Option<(Bound<i64>, Bound<i64>)> =
convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound));
if let Some(range) = bounds_range {
return Ok(query_from_fast_val_range(&empty_term, range).into());
}
let bounds_range: Option<(Bound<u64>, Bound<u64>)> =
let bounds_range_u64: Option<(Bound<u64>, Bound<u64>)> =
convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound));
let bounds_range_f64: Option<(Bound<f64>, Bound<f64>)> =
convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound));
if let Some(range) = bounds_range {
return Ok(query_from_fast_val_range(&empty_term, range).into());
if let Some(range) = bounds_range_i64 {
sub_queries.push(query_from_fast_val_range(&empty_term, range).into());
} else if let Some(range) = bounds_range_u64 {
sub_queries.push(query_from_fast_val_range(&empty_term, range).into());
} else if let Some(range) = bounds_range_f64 {
sub_queries.push(query_from_fast_val_range(&empty_term, range).into());
}
let bounds_range: Option<(Bound<f64>, Bound<f64>)> =

let mut normalizer = options
.get_fast_field_tokenizer_name()
.and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name));

let bounds_range_str: Option<(Bound<&str>, Bound<&str>)> =
convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound));
if let Some(range) = bounds_range {
return Ok(query_from_fast_val_range(&empty_term, range).into());
if let Some(range) = bounds_range_str {
let str_query = FastFieldRangeQuery::new(
range.0.map(|val| {
let val = get_normalized_text(&mut normalizer, val);
let mut term = empty_term.clone();
term.append_type_and_str(&val);
term
}),
range.1.map(|val| {
let val = get_normalized_text(&mut normalizer, val);
let mut term = empty_term.clone();
term.append_type_and_str(&val);
term
}),
)
.into();
sub_queries.push(str_query);
}
if sub_queries.is_empty() {
return Err(InvalidQuery::InvalidBoundary {
expected_value_type: "i64, u64, f64, str",
field_name: field_entry.name().to_string(),
});
}
if sub_queries.len() == 1 {
return Ok(sub_queries.pop().unwrap());
}
// TODO add support for str query
return Ok(EmptyQuery.into());

let bool_query = TantivyBoolQuery {
should: sub_queries,
..Default::default()
};
bool_query.into()
}
tantivy::schema::FieldType::IpAddr(_) => {
let (lower_bound, upper_bound) =
convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?;
TantivyRangeQuery::new(
FastFieldRangeQuery::new(
lower_bound.map(|val| Term::from_field_ip_addr(field, val)),
upper_bound.map(|val| Term::from_field_ip_addr(field, val)),
)
Expand Down Expand Up @@ -276,22 +343,22 @@ mod tests {
"my_i64_field",
JsonLiteral::String("1980".to_string()),
JsonLiteral::String("1989".to_string()),
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=I64, \
1980)), upper_bound: Included(Term(field=0, type=I64, 1989)) } }",
"FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, \
type=I64, 1980)), upper_bound: Included(Term(field=0, type=I64, 1989)) } }",
);
test_range_query_typed_field_util(
"my_u64_field",
JsonLiteral::String("1980".to_string()),
JsonLiteral::String("1989".to_string()),
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=1, type=U64, \
1980)), upper_bound: Included(Term(field=1, type=U64, 1989)) } }",
"FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=1, \
type=U64, 1980)), upper_bound: Included(Term(field=1, type=U64, 1989)) } }",
);
test_range_query_typed_field_util(
"my_f64_field",
JsonLiteral::String("1980".to_string()),
JsonLiteral::String("1989".to_string()),
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=2, type=F64, \
1980.0)), upper_bound: Included(Term(field=2, type=F64, 1989.0)) } }",
"FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=2, \
type=F64, 1980.0)), upper_bound: Included(Term(field=2, type=F64, 1989.0)) } }",
);
}

Expand Down Expand Up @@ -330,42 +397,6 @@ mod tests {
);
}

#[test]
fn test_range_query_field_unsupported_type_field() {
let schema = make_schema(false);
let range_query = RangeQuery {
field: "my_str_field".to_string(),
lower_bound: Bound::Included(JsonLiteral::String("1980".to_string())),
upper_bound: Bound::Included(JsonLiteral::String("1989".to_string())),
};
// with validation
let invalid_query: InvalidQuery = range_query
.build_tantivy_ast_call(
&schema,
&create_default_quickwit_tokenizer_manager(),
&[],
true,
)
.unwrap_err();
assert!(matches!(
invalid_query,
InvalidQuery::RangeQueryNotSupportedForField { .. }
));
// without validation
assert_eq!(
range_query
.build_tantivy_ast_call(
&schema,
&create_default_quickwit_tokenizer_manager(),
&[],
false
)
.unwrap()
.const_predicate(),
Some(MatchAllOrNone::MatchNone)
);
}

#[test]
fn test_range_dynamic() {
let range_query = RangeQuery {
Expand All @@ -384,9 +415,13 @@ mod tests {
.unwrap();
assert_eq!(
format!("{:?}", tantivy_ast),
"Leaf(RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=6, \
type=Json, path=hello, type=I64, 1980)), upper_bound: Included(Term(field=6, \
type=Json, path=hello, type=I64, 1989)) } })"
"Bool(TantivyBoolQuery { must: [], must_not: [], should: [Leaf(FastFieldRangeQuery { \
bounds: BoundsRange { lower_bound: Included(Term(field=6, type=Json, path=hello, \
type=I64, 1980)), upper_bound: Included(Term(field=6, type=Json, path=hello, \
type=I64, 1989)) } }), Leaf(FastFieldRangeQuery { bounds: BoundsRange { lower_bound: \
Included(Term(field=6, type=Json, path=hello, type=Str, \"1980\")), upper_bound: \
Included(Term(field=6, type=Json, path=hello, type=Str, \"1989\")) } })], filter: [] \
})"
);
}

Expand Down
Loading

0 comments on commit c59be63

Please sign in to comment.