From c59be636fc8ed9775c5d07e6b0ce2912382d9a47 Mon Sep 17 00:00:00 2001 From: PSeitz Date: Tue, 20 Aug 2024 13:21:50 +0200 Subject: [PATCH] enable str fast field range queries (#5324) * enable str fast field range queries enable range queries on str based fast field queries update tantivy * or queries, add normalization test case * fix test --- quickwit/Cargo.lock | 18 +- quickwit/Cargo.toml | 2 +- .../src/query_ast/range_query.rs | 205 ++++++++++-------- .../es_compatibility/0007-range_queries.yaml | 58 +++++ .../es_compatibility/_setup.quickwit.yaml | 4 + 5 files changed, 192 insertions(+), 95 deletions(-) diff --git a/quickwit/Cargo.lock b/quickwit/Cargo.lock index 75aad196108..a8d34e1406a 100644 --- a/quickwit/Cargo.lock +++ b/quickwit/Cargo.lock @@ -4733,7 +4733,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "stable_deref_trait", ] @@ -8159,7 +8159,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.23.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "aho-corasick", "arc-swap", @@ -8212,7 +8212,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.6.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "bitpacking", ] @@ -8220,7 +8220,7 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.3.0" -source = 
"git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "downcast-rs", "fastdivide", @@ -8235,7 +8235,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.7.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "async-trait", "byteorder", @@ -8258,7 +8258,7 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" version = "0.22.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "nom", ] @@ -8266,7 +8266,7 @@ dependencies = [ [[package]] name = "tantivy-sstable" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "tantivy-bitpacker", "tantivy-common", @@ -8277,7 +8277,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = "git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "murmurhash32", "rand_distr", @@ -8287,7 +8287,7 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" version = "0.3.0" -source = "git+https://github.com/quickwit-oss/tantivy/?rev=3d1c4b3#3d1c4b313a63a854214eea669a865837e146ee17" +source = 
"git+https://github.com/quickwit-oss/tantivy/?rev=c71ec80#c71ec8086d6563c4bb7e573182a26b280a3ac519" dependencies = [ "serde", ] diff --git a/quickwit/Cargo.toml b/quickwit/Cargo.toml index d1c9a4f5f49..51b3df6d541 100644 --- a/quickwit/Cargo.toml +++ b/quickwit/Cargo.toml @@ -325,7 +325,7 @@ quickwit-serve = { path = "quickwit-serve" } quickwit-storage = { path = "quickwit-storage" } quickwit-telemetry = { path = "quickwit-telemetry" } -tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "3d1c4b3", default-features = false, features = [ +tantivy = { git = "https://github.com/quickwit-oss/tantivy/", rev = "c71ec80", default-features = false, features = [ "lz4-compression", "mmap", "quickwit", diff --git a/quickwit/quickwit-query/src/query_ast/range_query.rs b/quickwit/quickwit-query/src/query_ast/range_query.rs index 2a6bcbeac3c..f8445d4bd8c 100644 --- a/quickwit/quickwit-query/src/query_ast/range_query.rs +++ b/quickwit/quickwit-query/src/query_ast/range_query.rs @@ -21,10 +21,12 @@ use std::ops::Bound; use serde::{Deserialize, Serialize}; use tantivy::fastfield::FastValue; -use tantivy::query::{EmptyQuery, RangeQuery as TantivyRangeQuery}; +use tantivy::query::FastFieldRangeQuery; use tantivy::schema::Schema as TantivySchema; +use tantivy::tokenizer::TextAnalyzer; use tantivy::{DateTime, Term}; +use super::tantivy_query_ast::TantivyBoolQuery; use super::QueryAst; use crate::json_literal::InterpretUserInput; use crate::query_ast::tantivy_query_ast::TantivyQueryAst; @@ -80,11 +82,41 @@ impl From for QueryAst { } } +fn term_with_fastval(term: &Term, val: T) -> Term { + let mut term = term.clone(); + term.append_type_and_fast_value(val); + term +} + +fn query_from_fast_val_range( + empty_term: &Term, + range: (Bound, Bound), +) -> FastFieldRangeQuery { + let (lower_bound, upper_bound) = range; + FastFieldRangeQuery::new( + lower_bound.map(|val| term_with_fastval(empty_term, val)), + upper_bound.map(|val| term_with_fastval(empty_term, val)), + ) +} + 
+fn get_normalized_text(normalizer: &mut Option, text: &str) -> String { + if let Some(normalizer) = normalizer { + let mut token_stream = normalizer.token_stream(text); + let mut tokens = Vec::new(); + token_stream.process(&mut |token| { + tokens.push(token.text.clone()); + }); + tokens[0].to_string() + } else { + text.to_string() + } +} + impl BuildTantivyAst for RangeQuery { fn build_tantivy_ast_impl( &self, schema: &TantivySchema, - _tokenizer_manager: &TokenizerManager, + tokenizer_manager: &TokenizerManager, _search_fields: &[String], _with_validation: bool, ) -> Result { @@ -97,16 +129,28 @@ impl BuildTantivyAst for RangeQuery { ))); } Ok(match field_entry.field_type() { - tantivy::schema::FieldType::Str(_) => { - return Err(InvalidQuery::RangeQueryNotSupportedForField { - value_type: "str", - field_name: field_entry.name().to_string(), - }); + tantivy::schema::FieldType::Str(options) => { + let mut normalizer = options + .get_fast_field_tokenizer_name() + .and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name)); + + let (lower_bound, upper_bound) = + convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; + + FastFieldRangeQuery::new( + lower_bound.map(|text| { + Term::from_field_text(field, &get_normalized_text(&mut normalizer, text)) + }), + upper_bound.map(|text| { + Term::from_field_text(field, &get_normalized_text(&mut normalizer, text)) + }), + ) + .into() } tantivy::schema::FieldType::U64(_) => { let (lower_bound, upper_bound) = convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; - TantivyRangeQuery::new( + FastFieldRangeQuery::new( lower_bound.map(|val| Term::from_field_u64(field, val)), upper_bound.map(|val| Term::from_field_u64(field, val)), ) @@ -115,7 +159,7 @@ impl BuildTantivyAst for RangeQuery { tantivy::schema::FieldType::I64(_) => { let (lower_bound, upper_bound) = convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; - TantivyRangeQuery::new( + 
FastFieldRangeQuery::new( lower_bound.map(|val| Term::from_field_i64(field, val)), upper_bound.map(|val| Term::from_field_i64(field, val)), ) @@ -124,7 +168,7 @@ impl BuildTantivyAst for RangeQuery { tantivy::schema::FieldType::F64(_) => { let (lower_bound, upper_bound) = convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; - TantivyRangeQuery::new( + FastFieldRangeQuery::new( lower_bound.map(|val| Term::from_field_f64(field, val)), upper_bound.map(|val| Term::from_field_f64(field, val)), ) @@ -143,7 +187,7 @@ impl BuildTantivyAst for RangeQuery { |date: &DateTime| date.truncate(date_options.get_precision()); let lower_bound = map_bound(&lower_bound, truncate_datetime); let upper_bound = map_bound(&upper_bound, truncate_datetime); - TantivyRangeQuery::new( + FastFieldRangeQuery::new( lower_bound.map(|val| Term::from_field_date(field, val)), upper_bound.map(|val| Term::from_field_date(field, val)), ) @@ -156,48 +200,71 @@ impl BuildTantivyAst for RangeQuery { }); } tantivy::schema::FieldType::Bytes(_) => todo!(), - tantivy::schema::FieldType::JsonObject(opt) => { + tantivy::schema::FieldType::JsonObject(options) => { + let mut sub_queries: Vec = Vec::new(); let empty_term = - Term::from_field_json_path(field, json_path, opt.is_expand_dots_enabled()); - fn term_with_fastval(term: &Term, val: T) -> Term { - let mut term = term.clone(); - term.append_type_and_fast_value(val); - term - } - fn query_from_fast_val_range( - empty_term: &Term, - range: (Bound, Bound), - ) -> TantivyRangeQuery { - TantivyRangeQuery::new( - range.0.map(|val| term_with_fastval(empty_term, val)), - range.1.map(|val| term_with_fastval(empty_term, val)), - ) - } + Term::from_field_json_path(field, json_path, options.is_expand_dots_enabled()); // Try to convert the bounds into numerical values in following order i64, u64, // f64. Tantivy will convert to the correct numerical type of the column if it // doesn't match. 
- let bounds_range: Option<(Bound, Bound)> = + let bounds_range_i64: Option<(Bound, Bound)> = convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound)); - if let Some(range) = bounds_range { - return Ok(query_from_fast_val_range(&empty_term, range).into()); - } - let bounds_range: Option<(Bound, Bound)> = + let bounds_range_u64: Option<(Bound, Bound)> = + convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound)); + let bounds_range_f64: Option<(Bound, Bound)> = convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound)); - if let Some(range) = bounds_range { - return Ok(query_from_fast_val_range(&empty_term, range).into()); + if let Some(range) = bounds_range_i64 { + sub_queries.push(query_from_fast_val_range(&empty_term, range).into()); + } else if let Some(range) = bounds_range_u64 { + sub_queries.push(query_from_fast_val_range(&empty_term, range).into()); + } else if let Some(range) = bounds_range_f64 { + sub_queries.push(query_from_fast_val_range(&empty_term, range).into()); } - let bounds_range: Option<(Bound, Bound)> = + + let mut normalizer = options + .get_fast_field_tokenizer_name() + .and_then(|tokenizer_name| tokenizer_manager.get_normalizer(tokenizer_name)); + + let bounds_range_str: Option<(Bound<&str>, Bound<&str>)> = convert_bound(&self.lower_bound).zip(convert_bound(&self.upper_bound)); - if let Some(range) = bounds_range { - return Ok(query_from_fast_val_range(&empty_term, range).into()); + if let Some(range) = bounds_range_str { + let str_query = FastFieldRangeQuery::new( + range.0.map(|val| { + let val = get_normalized_text(&mut normalizer, val); + let mut term = empty_term.clone(); + term.append_type_and_str(&val); + term + }), + range.1.map(|val| { + let val = get_normalized_text(&mut normalizer, val); + let mut term = empty_term.clone(); + term.append_type_and_str(&val); + term + }), + ) + .into(); + sub_queries.push(str_query); + } + if sub_queries.is_empty() { + return Err(InvalidQuery::InvalidBoundary { 
+ expected_value_type: "i64, u64, f64, str", + field_name: field_entry.name().to_string(), + }); + } + if sub_queries.len() == 1 { + return Ok(sub_queries.pop().unwrap()); } - // TODO add support for str query - return Ok(EmptyQuery.into()); + + let bool_query = TantivyBoolQuery { + should: sub_queries, + ..Default::default() + }; + bool_query.into() } tantivy::schema::FieldType::IpAddr(_) => { let (lower_bound, upper_bound) = convert_bounds(&self.lower_bound, &self.upper_bound, field_entry.name())?; - TantivyRangeQuery::new( + FastFieldRangeQuery::new( lower_bound.map(|val| Term::from_field_ip_addr(field, val)), upper_bound.map(|val| Term::from_field_ip_addr(field, val)), ) @@ -276,22 +343,22 @@ mod tests { "my_i64_field", JsonLiteral::String("1980".to_string()), JsonLiteral::String("1989".to_string()), - "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=I64, \ - 1980)), upper_bound: Included(Term(field=0, type=I64, 1989)) } }", + "FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, \ + type=I64, 1980)), upper_bound: Included(Term(field=0, type=I64, 1989)) } }", ); test_range_query_typed_field_util( "my_u64_field", JsonLiteral::String("1980".to_string()), JsonLiteral::String("1989".to_string()), - "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=1, type=U64, \ - 1980)), upper_bound: Included(Term(field=1, type=U64, 1989)) } }", + "FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=1, \ + type=U64, 1980)), upper_bound: Included(Term(field=1, type=U64, 1989)) } }", ); test_range_query_typed_field_util( "my_f64_field", JsonLiteral::String("1980".to_string()), JsonLiteral::String("1989".to_string()), - "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=2, type=F64, \ - 1980.0)), upper_bound: Included(Term(field=2, type=F64, 1989.0)) } }", + "FastFieldRangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=2, \ + type=F64, 1980.0)), 
upper_bound: Included(Term(field=2, type=F64, 1989.0)) } }", ); } @@ -330,42 +397,6 @@ mod tests { ); } - #[test] - fn test_range_query_field_unsupported_type_field() { - let schema = make_schema(false); - let range_query = RangeQuery { - field: "my_str_field".to_string(), - lower_bound: Bound::Included(JsonLiteral::String("1980".to_string())), - upper_bound: Bound::Included(JsonLiteral::String("1989".to_string())), - }; - // with validation - let invalid_query: InvalidQuery = range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - true, - ) - .unwrap_err(); - assert!(matches!( - invalid_query, - InvalidQuery::RangeQueryNotSupportedForField { .. } - )); - // without validation - assert_eq!( - range_query - .build_tantivy_ast_call( - &schema, - &create_default_quickwit_tokenizer_manager(), - &[], - false - ) - .unwrap() - .const_predicate(), - Some(MatchAllOrNone::MatchNone) - ); - } - #[test] fn test_range_dynamic() { let range_query = RangeQuery { @@ -384,9 +415,13 @@ mod tests { .unwrap(); assert_eq!( format!("{:?}", tantivy_ast), - "Leaf(RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=6, \ - type=Json, path=hello, type=I64, 1980)), upper_bound: Included(Term(field=6, \ - type=Json, path=hello, type=I64, 1989)) } })" + "Bool(TantivyBoolQuery { must: [], must_not: [], should: [Leaf(FastFieldRangeQuery { \ + bounds: BoundsRange { lower_bound: Included(Term(field=6, type=Json, path=hello, \ + type=I64, 1980)), upper_bound: Included(Term(field=6, type=Json, path=hello, \ + type=I64, 1989)) } }), Leaf(FastFieldRangeQuery { bounds: BoundsRange { lower_bound: \ + Included(Term(field=6, type=Json, path=hello, type=Str, \"1980\")), upper_bound: \ + Included(Term(field=6, type=Json, path=hello, type=Str, \"1989\")) } })], filter: [] \ + })" ); } diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml 
b/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml index c3c625395c4..5337325c229 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/0007-range_queries.yaml @@ -187,3 +187,61 @@ expected: total: value: 1 relation: "eq" +--- +# This field is not a JSON field and does not have fast field normalization. +# That means it is case sensitive +json: + query: + range: + repo.name: + gte: "h" + lte: "z" +expected: + hits: + total: + value: 62 + relation: "eq" +--- +# This field is a JSON field and has fast field normalization. +# That means it is case insensitive +json: + query: + range: + actor.login: + gte: "H" # should automatically be normalized + lte: "Z" +expected: + hits: + total: + value: 68 + relation: "eq" +--- +# This field is a JSON field and has fast field normalization. +# That means it is case insensitive +json: + query: + range: + actor.login: + gte: "h" # should automatically be normalized + lte: "z" +expected: + hits: + total: + value: 68 + relation: "eq" +--- +# This field is a JSON field and has fast field normalization. +# That means it is case insensitive +json: + query: + range: + actor.login: + gte: "H" # should automatically be normalized + lte: "Z" +expected: + hits: + total: + value: 68 + relation: "eq" + + diff --git a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml index bad0012b553..ec6e9f81a3d 100644 --- a/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/es_compatibility/_setup.quickwit.yaml @@ -36,6 +36,10 @@ json: timestamp_field: created_at mode: dynamic field_mappings: + - name: repo.name + type: text + fast: true + indexed: true - name: actor.id type: u64 fast: true