From c0fd8e5bed401a1b1217929d66a79440957b775e Mon Sep 17 00:00:00 2001 From: PSeitz Date: Thu, 13 Jul 2023 18:13:41 +0800 Subject: [PATCH] sort by two fast fields (#3614) * sort by two fast fields * SortByPair * remove invalid check * clippy * rename SortBy to SortByValue * simplify code * fix reverse dance * add python test sort 2 fields --- quickwit/quickwit-proto/build.rs | 2 + .../protos/quickwit/search_api.proto | 42 +- quickwit/quickwit-proto/src/lib.rs | 17 +- quickwit/quickwit-proto/src/quickwit.rs | 81 +-- .../quickwit-search/src/cluster_client.rs | 3 +- quickwit/quickwit-search/src/collector.rs | 515 +++++++++++++----- quickwit/quickwit-search/src/leaf.rs | 2 +- quickwit/quickwit-search/src/leaf_cache.rs | 6 +- quickwit/quickwit-search/src/root.rs | 73 ++- .../quickwit-search/src/search_stream/leaf.rs | 5 +- quickwit/quickwit-search/src/tests.rs | 147 ++++- .../src/elastic_search_api/rest_handler.rs | 25 +- .../src/search_api/rest_handler.rs | 25 +- .../quickwit-storage/src/storage_resolver.rs | 1 - .../sort_orders/0001-sort-elasticapi.yaml | 50 +- .../scenarii/sort_orders/_setup.quickwit.yaml | 2 + 16 files changed, 722 insertions(+), 274 deletions(-) diff --git a/quickwit/quickwit-proto/build.rs b/quickwit/quickwit-proto/build.rs index c92dafbaf2e..cf4a3a784c1 100644 --- a/quickwit/quickwit-proto/build.rs +++ b/quickwit/quickwit-proto/build.rs @@ -32,6 +32,8 @@ fn main() -> Result<(), Box> { .type_attribute(".", "#[derive(Serialize, Deserialize, utoipa::ToSchema)]") .type_attribute("IndexingTask", "#[derive(Eq, Hash)]") .type_attribute("SearchRequest", "#[derive(Eq, Hash)]") + .type_attribute("SortField", "#[derive(Eq, Hash)]") + .type_attribute("SortByValue", "#[derive(Ord, PartialOrd)]") .type_attribute("DeleteQuery", "#[serde(default)]") .field_attribute( "DeleteQuery.start_timestamp", diff --git a/quickwit/quickwit-proto/protos/quickwit/search_api.proto b/quickwit/quickwit-proto/protos/quickwit/search_api.proto index 2b5485c5e26..4f26b9a690b 100644 --- a/quickwit/quickwit-proto/protos/quickwit/search_api.proto +++ b/quickwit/quickwit-proto/protos/quickwit/search_api.proto @@ -95,21 +95,25 @@ message SearchRequest { // deprecated tag field reserved 8; - // Sort order - optional SortOrder sort_order = 9; + // deprecated `sort_order`` + reserved 9; - // Sort by fast field. If unset sort by docid - // sort_by_field can be: - // - a field name - // - _score - // - None, in which case the hits will be sorted by (SplitId, doc_id). - optional string sort_by_field = 10; + // deprecated `sort_by_field`` + reserved 10; // json serialized aggregation_request optional string aggregation_request = 11; // Fields to extract snippet on repeated string snippet_fields = 12; + + // Optional sort by one or more fields (limited to 2 at the moment). + repeated SortField sort_fields = 14; +} + +message SortField { + string field_name = 1; + SortOrder sort_order = 2; } enum SortOrder { @@ -226,16 +230,13 @@ message PartialHit { // - the split_id, // - the segment_ord, // - the doc id. - oneof sort_value { - uint64 u64 = 5; - int64 i64 = 6; - double f64 = 7; - bool boolean = 8; - } + // Deprecated reserved 1; // Room for eventual future sorted key types. - reserved 9 to 20; + reserved 12 to 20; + SortByValue sort_value = 10; + SortByValue sort_value2 = 11; string split_id = 2; @@ -247,6 +248,17 @@ message PartialHit { uint32 doc_id = 4; } +message SortByValue { + oneof sort_value { + uint64 u64 = 1; + int64 i64 = 2; + double f64 = 3; + bool boolean = 4; + } + // Room for eventual future sorted key types. + reserved 5 to 20; +} + message LeafSearchResponse { // Total number of documents matched by the query. uint64 num_hits = 1; diff --git a/quickwit/quickwit-proto/src/lib.rs b/quickwit/quickwit-proto/src/lib.rs index 69906ee9654..aa78ad53148 100644 --- a/quickwit/quickwit-proto/src/lib.rs +++ b/quickwit/quickwit-proto/src/lib.rs @@ -26,7 +26,7 @@ use ulid::Ulid; mod quickwit; mod quickwit_indexing_api; mod quickwit_metastore_api; -pub use partial_hit::SortValue; +pub use sort_by_value::SortValue; use std::cmp::Ordering; pub mod indexing_api { @@ -187,7 +187,6 @@ pub fn convert_to_grpc_result( } impl TryFrom for SearchRequest { - type Error = anyhow::Error; fn try_from(search_stream_req: SearchStreamRequest) -> Result { @@ -197,7 +196,7 @@ impl TryFrom for SearchRequest { snippet_fields: search_stream_req.snippet_fields, start_timestamp: search_stream_req.start_timestamp, end_timestamp: search_stream_req.end_timestamp, - .. Default::default() + ..Default::default() }) } } @@ -212,7 +211,7 @@ impl TryFrom for SearchRequest { query_ast: delete_query.query_ast, start_timestamp: delete_query.start_timestamp, end_timestamp: delete_query.end_timestamp, - .. Default::default() + ..Default::default() }) } } @@ -464,7 +463,17 @@ pub fn query_ast_from_user_text(user_text: &str, default_fields: Option for SortByValue { + fn from(sort_value: SortValue) -> Self { + SortByValue { + sort_value: Some(sort_value), + } + } +} +impl Copy for SortValue {} impl Eq for SortValue {} impl Ord for SortValue { diff --git a/quickwit/quickwit-proto/src/quickwit.rs b/quickwit/quickwit-proto/src/quickwit.rs index b8646a4f9ab..f8c7ffd444a 100644 --- a/quickwit/quickwit-proto/src/quickwit.rs +++ b/quickwit/quickwit-proto/src/quickwit.rs @@ -26,22 +26,25 @@ pub struct SearchRequest { /// The results with rank [start_offset..start_offset + max_hits) are returned. #[prost(uint64, tag = "7")] pub start_offset: u64, - /// Sort order - #[prost(enumeration = "SortOrder", optional, tag = "9")] - pub sort_order: ::core::option::Option, - /// Sort by fast field. If unset sort by docid - /// sort_by_field can be: - /// - a field name - /// - _score - /// - None, in which case the hits will be sorted by (SplitId, doc_id). - #[prost(string, optional, tag = "10")] - pub sort_by_field: ::core::option::Option<::prost::alloc::string::String>, /// json serialized aggregation_request #[prost(string, optional, tag = "11")] pub aggregation_request: ::core::option::Option<::prost::alloc::string::String>, /// Fields to extract snippet on #[prost(string, repeated, tag = "12")] pub snippet_fields: ::prost::alloc::vec::Vec<::prost::alloc::string::String>, + /// Optional sort by one or more fields (limited to 2 at the moment). + #[prost(message, repeated, tag = "14")] + pub sort_fields: ::prost::alloc::vec::Vec, +} +#[derive(Serialize, Deserialize, utoipa::ToSchema)] +#[derive(Eq, Hash)] +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SortField { + #[prost(string, tag = "1")] + pub field_name: ::prost::alloc::string::String, + #[prost(enumeration = "SortOrder", tag = "2")] + pub sort_order: i32, } #[derive(Serialize, Deserialize, utoipa::ToSchema)] #[allow(clippy::derive_partial_eq_without_eq)] @@ -164,10 +167,25 @@ pub struct Hit { /// Instead, it holds a document_uri which is enough information to /// go and fetch the actual document data, by performing a `get_doc(...)` /// request. +/// +/// Value of the sorting key for the given document. +/// +/// Quickwit only computes top-K of this sorting field. +/// If the user requested for a bottom-K of a given fast field, then quickwit simply +/// emits an decreasing mapping of this fast field. +/// +/// In case of a tie, quickwit uses the increasing order of +/// - the split_id, +/// - the segment_ord, +/// - the doc id. #[derive(Serialize, Deserialize, utoipa::ToSchema)] #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct PartialHit { + #[prost(message, optional, tag = "10")] + pub sort_value: ::core::option::Option, + #[prost(message, optional, tag = "11")] + pub sort_value2: ::core::option::Option, #[prost(string, tag = "2")] pub split_id: ::prost::alloc::string::String, /// (segment_ord, doc) form a tantivy DocAddress, which is sufficient to identify a document @@ -177,43 +195,28 @@ pub struct PartialHit { /// The DocId identifies a unique document at the scale of a tantivy segment. #[prost(uint32, tag = "4")] pub doc_id: u32, - /// Value of the sorting key for the given document. - /// - /// Quickwit only computes top-K of this sorting field. - /// If the user requested for a bottom-K of a given fast field, then quickwit simply - /// emits an decreasing mapping of this fast field. - /// - /// In case of a tie, quickwit uses the increasing order of - /// - the split_id, - /// - the segment_ord, - /// - the doc id. - #[prost(oneof = "partial_hit::SortValue", tags = "5, 6, 7, 8")] - pub sort_value: ::core::option::Option, } -/// Nested message and enum types in `PartialHit`. -pub mod partial_hit { - /// Value of the sorting key for the given document. - /// - /// Quickwit only computes top-K of this sorting field. - /// If the user requested for a bottom-K of a given fast field, then quickwit simply - /// emits an decreasing mapping of this fast field. - /// - /// In case of a tie, quickwit uses the increasing order of - /// - the split_id, - /// - the segment_ord, - /// - the doc id. +#[derive(Serialize, Deserialize, utoipa::ToSchema)] +#[derive(Ord, PartialOrd)] +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct SortByValue { + #[prost(oneof = "sort_by_value::SortValue", tags = "1, 2, 3, 4")] + pub sort_value: ::core::option::Option, +} +/// Nested message and enum types in `SortByValue`. +pub mod sort_by_value { #[derive(Serialize, Deserialize, utoipa::ToSchema)] - #[derive(Copy)] #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Oneof)] pub enum SortValue { - #[prost(uint64, tag = "5")] + #[prost(uint64, tag = "1")] U64(u64), - #[prost(int64, tag = "6")] + #[prost(int64, tag = "2")] I64(i64), - #[prost(double, tag = "7")] + #[prost(double, tag = "3")] F64(f64), - #[prost(bool, tag = "8")] + #[prost(bool, tag = "4")] Boolean(bool), } } diff --git a/quickwit/quickwit-search/src/cluster_client.rs b/quickwit/quickwit-search/src/cluster_client.rs index 37ffa35f37e..60ffd6fbe13 100644 --- a/quickwit/quickwit-search/src/cluster_client.rs +++ b/quickwit/quickwit-search/src/cluster_client.rs @@ -244,7 +244,8 @@ mod tests { fn mock_partial_hit(split_id: &str, sort_value: u64, doc_id: u32) -> PartialHit { PartialHit { - sort_value: Some(SortValue::U64(sort_value)), + sort_value: Some(SortValue::U64(sort_value).into()), + sort_value2: None, split_id: split_id.to_string(), segment_ord: 1, doc_id, diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 09addc29342..5200fcd8358 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -38,7 +38,7 @@ use crate::find_trace_ids_collector::{FindTraceIdsCollector, FindTraceIdsSegment use crate::GlobalDocAddress; #[derive(Clone, Debug)] -pub(crate) enum SortBy { +pub(crate) enum SortByComponent { DocId, FastField { field_name: String, @@ -48,13 +48,79 @@ pub(crate) enum SortBy { order: SortOrder, }, } - -impl SortBy { +impl From for SortByPair { + fn from(value: SortByComponent) -> Self { + Self { + first: value, + second: None, + } + } +} +#[derive(Clone)] +pub(crate) struct SortByPair { + first: SortByComponent, + second: Option, +} +impl SortByPair { + pub fn sort_orders(&self) -> (SortOrder, SortOrder) { + ( + self.first.sort_order(), + self.second + .as_ref() + .map(|sort_by| sort_by.sort_order()) + .unwrap_or(SortOrder::Desc), + ) + } +} +impl SortByComponent { + fn to_sorting_field_computer_component( + &self, + segment_reader: &SegmentReader, + ) -> tantivy::Result { + match self { + SortByComponent::DocId => Ok(SortingFieldComputerComponent::DocId), + SortByComponent::FastField { field_name, order } => { + let sort_column_opt: Option<(Column, ColumnType)> = + segment_reader.fast_fields().u64_lenient(field_name)?; + let (sort_column, column_type) = sort_column_opt.unwrap_or_else(|| { + ( + Column::build_empty_column(segment_reader.max_doc()), + ColumnType::U64, + ) + }); + let sort_field_type = SortFieldType::try_from(column_type)?; + Ok(SortingFieldComputerComponent::FastField { + sort_column, + sort_field_type, + order: *order, + }) + } + SortByComponent::Score { order } => { + Ok(SortingFieldComputerComponent::Score { order: *order }) + } + } + } + pub fn requires_scoring(&self) -> bool { + match self { + SortByComponent::DocId => false, + SortByComponent::FastField { .. } => false, + SortByComponent::Score { .. } => true, + } + } + pub fn add_fast_field(&self, set: &mut HashSet) { + if let SortByComponent::FastField { + field_name, + order: _, + } = self + { + set.insert(field_name.clone()); + } + } pub fn sort_order(&self) -> SortOrder { match self { - SortBy::DocId => SortOrder::Desc, - SortBy::FastField { order, .. } => *order, - SortBy::Score { order } => *order, + SortByComponent::DocId => SortOrder::Desc, + SortByComponent::FastField { order, .. } => *order, + SortByComponent::Score { order } => *order, } } } @@ -70,7 +136,7 @@ enum SortFieldType { /// The `SortingFieldComputer` can be seen as the specialization of `SortBy` applied to a specific /// `SegmentReader`. Its role is to compute the sorting field given a `DocId`. -enum SortingFieldComputer { +enum SortingFieldComputerComponent { /// If undefined, we simply sort by DocIds. DocId, FastField { @@ -82,31 +148,39 @@ enum SortingFieldComputer { order: SortOrder, }, } - -impl SortingFieldComputer { - fn recover_typed_sort_value(&self, sort_value: u64) -> SortValue { - match self { - SortingFieldComputer::DocId => SortValue::U64(sort_value), - SortingFieldComputer::FastField { - sort_field_type, - order, - .. - } => { - let sort_value = match order { - SortOrder::Asc => u64::MAX - sort_value, - SortOrder::Desc => sort_value, - }; - match sort_field_type { - SortFieldType::U64 => SortValue::U64(sort_value), - SortFieldType::I64 => SortValue::I64(i64::from_u64(sort_value)), - SortFieldType::F64 => SortValue::F64(f64::from_u64(sort_value)), - SortFieldType::DateTime => SortValue::I64(i64::from_u64(sort_value)), - SortFieldType::Bool => SortValue::Boolean(sort_value == 1u64), - } +impl SortingFieldComputerComponent { + fn recover_typed_sort_value(&self, sort_value: Option) -> Option { + let recover_from_fast_field = |sort_value: u64, order: SortOrder, sort_field_type| { + let sort_value = match order { + SortOrder::Asc => u64::MAX - sort_value, + SortOrder::Desc => sort_value, + }; + match sort_field_type { + SortFieldType::U64 => SortValue::U64(sort_value), + SortFieldType::I64 => SortValue::I64(i64::from_u64(sort_value)), + SortFieldType::F64 => SortValue::F64(f64::from_u64(sort_value)), + SortFieldType::DateTime => SortValue::I64(i64::from_u64(sort_value)), + SortFieldType::Bool => SortValue::Boolean(sort_value == 1u64), } - SortingFieldComputer::Score { .. } => { - SortValue::F64(MonotonicallyMappableToU64::from_u64(sort_value)) + }; + if let Some(sort_value) = sort_value { + match self { + SortingFieldComputerComponent::DocId => Some(SortValue::U64(sort_value)), + SortingFieldComputerComponent::FastField { + sort_column: _, + sort_field_type, + order, + } => Some(recover_from_fast_field( + sort_value, + *order, + *sort_field_type, + )), + SortingFieldComputerComponent::Score { order: _ } => Some(SortValue::F64( + MonotonicallyMappableToU64::from_u64(sort_value), + )), } + } else { + None } } @@ -121,32 +195,79 @@ impl SortingFieldComputer { /// Given the u64 value, it is possible to recover the original value using /// `Self::recover_typed_sort_value`. fn compute_u64_sort_value_opt(&self, doc_id: DocId, score: Score) -> Option { + let apply_sort = |order, field_val| match order { + SortOrder::Desc => field_val, + SortOrder::Asc => u64::MAX - field_val, + }; match self { - SortingFieldComputer::FastField { - sort_column: fast_field_reader, - order, - .. - } => { - let field_val = fast_field_reader.first(doc_id)?; - match order { - // Descending is our most common case. - SortOrder::Desc => Some(field_val), - // We get Ascending order by using a decreasing mapping over u64 as the - // sorting_field. - SortOrder::Asc => Some(u64::MAX - field_val), - } - } - SortingFieldComputer::DocId => Some(doc_id as u64), - SortingFieldComputer::Score { order } => { + SortingFieldComputerComponent::DocId => Some(doc_id as u64), + SortingFieldComputerComponent::FastField { + sort_column, order, .. + } => sort_column + .first(doc_id) + .map(|field_val| apply_sort(*order, field_val)), + SortingFieldComputerComponent::Score { order } => { let u64_score = (score as f64).to_u64(); - match order { - SortOrder::Desc => Some(u64_score), - SortOrder::Asc => Some(u64::MAX - u64_score), - } + let u64_score = apply_sort(*order, u64_score); + Some(u64_score) } } } } +impl From for SortingFieldComputerPair { + fn from(value: SortingFieldComputerComponent) -> Self { + Self { + first: value, + second: None, + } + } +} + +pub(crate) struct SortingFieldComputerPair { + first: SortingFieldComputerComponent, + second: Option, +} + +impl SortingFieldComputerPair { + fn recover_typed_sort_value( + &self, + sort_value1: Option, + sort_value2: Option, + ) -> (Option, Option) { + let first_sort = sort_value1 + .and_then(|sort_value1| self.first.recover_typed_sort_value(Some(sort_value1))); + let second_sort = sort_value2.and_then(|sort_value2| { + self.second + .as_ref() + .expect("no second sort field, but got second sort value") + .recover_typed_sort_value(Some(sort_value2)) + }); + (first_sort, second_sort) + } + + /// Returns the ranking key for the given element + /// + /// The functions return None if the sort key is a fast field, for which we have no value + /// for the given doc_id. + /// All value are mapped using a monotonic mapping. + /// If the sort ord is ascending, we use the mapping `val -> u64::MAX - val` to + /// artificially invert the order. + /// + /// Given the u64 value, it is possible to recover the original value using + /// `Self::recover_typed_sort_value`. + fn compute_u64_sort_value_opt( + &self, + doc_id: DocId, + score: Score, + ) -> (Option, Option) { + let first = self.first.compute_u64_sort_value_opt(doc_id, score); + let second = self + .second + .as_ref() + .and_then(|second| second.compute_u64_sort_value_opt(doc_id, score)); + (first, second) + } +} impl TryFrom for SortFieldType { type Error = tantivy::TantivyError; @@ -169,36 +290,27 @@ impl TryFrom for SortFieldType { /// Takes a user-defined sorting criteria and resolves it to a /// segment specific `SortFieldComputer`. fn resolve_sort_by( - sort_by: &SortBy, + sort_by: &SortByPair, segment_reader: &SegmentReader, -) -> tantivy::Result { - match sort_by { - SortBy::DocId => Ok(SortingFieldComputer::DocId), - SortBy::FastField { field_name, order } => { - let sort_column_opt: Option<(Column, ColumnType)> = - segment_reader.fast_fields().u64_lenient(field_name)?; - let (sort_column, column_type) = sort_column_opt.unwrap_or_else(|| { - ( - Column::build_empty_column(segment_reader.max_doc()), - ColumnType::U64, - ) - }); - let sort_field_type = SortFieldType::try_from(column_type)?; - Ok(SortingFieldComputer::FastField { - sort_column, - sort_field_type, - order: *order, - }) - } - SortBy::Score { order } => Ok(SortingFieldComputer::Score { order: *order }), - } +) -> tantivy::Result { + Ok(SortingFieldComputerPair { + first: sort_by + .first + .to_sorting_field_computer_component(segment_reader)?, + second: sort_by + .second + .as_ref() + .map(|first| first.to_sorting_field_computer_component(segment_reader)) + .transpose()?, + }) } /// PartialHitHeapItem order is the inverse of the natural order /// so that we actually have a min-heap. -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug)] struct PartialHitHeapItem { - sort_value_opt: Option, + sort_value_opt1: Option, + sort_value_opt2: Option, doc_id: DocId, } @@ -211,7 +323,8 @@ impl PartialOrd for PartialHitHeapItem { impl Ord for PartialHitHeapItem { #[inline] fn cmp(&self, other: &Self) -> Ordering { - let by_sorting_field = other.sort_value_opt.cmp(&self.sort_value_opt); + let by_sorting_field1 = other.sort_value_opt1.cmp(&self.sort_value_opt1); + let by_sorting_field2 = other.sort_value_opt2.cmp(&self.sort_value_opt2); let lazy_order_by_doc_id = || { self.doc_id @@ -220,7 +333,9 @@ impl Ord for PartialHitHeapItem { }; // In case of a tie on the feature, we sort by ascending `DocId`. - by_sorting_field.then_with(lazy_order_by_doc_id) + by_sorting_field1 + .then_with(|| by_sorting_field2) + .then_with(lazy_order_by_doc_id) } } @@ -241,7 +356,7 @@ enum AggregationSegmentCollectors { pub struct QuickwitSegmentCollector { num_hits: u64, split_id: String, - sort_by: SortingFieldComputer, + sort_by: SortingFieldComputerPair, hits: BinaryHeap, max_hits: usize, segment_ord: u32, @@ -257,17 +372,18 @@ impl QuickwitSegmentCollector { #[inline] fn collect_top_k(&mut self, doc_id: DocId, score: Score) { - let sorting_field_value_opt: Option = + let (sorting_field_value_opt1, sorting_field_value_opt2): (Option, Option) = self.sort_by.compute_u64_sort_value_opt(doc_id, score); if self.at_capacity() { - if let Some(sorting_field_value) = sorting_field_value_opt { + if let Some(sorting_field_value) = sorting_field_value_opt1 { if let Some(limit_sorting_field) = - self.hits.peek().and_then(|head| head.sort_value_opt) + self.hits.peek().and_then(|head| head.sort_value_opt1) { // In case of a tie, we keep the document with a lower `DocId`. if limit_sorting_field < sorting_field_value { if let Some(mut head) = self.hits.peek_mut() { - head.sort_value_opt = Some(sorting_field_value); + head.sort_value_opt1 = Some(sorting_field_value); + head.sort_value_opt2 = sorting_field_value_opt2; head.doc_id = doc_id; } } @@ -277,7 +393,8 @@ impl QuickwitSegmentCollector { // we have not reached capacity yet, so we can just push the // element. self.hits.push(PartialHitHeapItem { - sort_value_opt: sorting_field_value_opt, + sort_value_opt1: sorting_field_value_opt1, + sort_value_opt2: sorting_field_value_opt2, doc_id, }); } @@ -324,13 +441,21 @@ impl SegmentCollector for QuickwitSegmentCollector { .hits .into_sorted_vec() .into_iter() - .map(|hit| PartialHit { - sort_value: hit - .sort_value_opt - .map(|sort_value| sort_by.recover_typed_sort_value(sort_value)), - segment_ord, - doc_id: hit.doc_id, - split_id: split_id.clone(), + .map(|hit| { + let (sort_value1, sort_value2) = + sort_by.recover_typed_sort_value(hit.sort_value_opt1, hit.sort_value_opt2); + + PartialHit { + sort_value: Some(quickwit_proto::SortByValue { + sort_value: sort_value1, + }), + sort_value2: Some(quickwit_proto::SortByValue { + sort_value: sort_value2, + }), + segment_ord, + doc_id: hit.doc_id, + split_id: split_id.clone(), + } }) .collect(); @@ -391,7 +516,7 @@ pub(crate) struct QuickwitCollector { pub split_id: String, pub start_offset: usize, pub max_hits: usize, - pub sort_by: SortBy, + pub sort_by: SortByPair, timestamp_filter_builder_opt: Option, pub aggregation: Option, pub aggregation_limits: AggregationLimits, @@ -400,11 +525,9 @@ pub(crate) struct QuickwitCollector { impl QuickwitCollector { pub fn fast_field_names(&self) -> HashSet { let mut fast_field_names = HashSet::default(); - match &self.sort_by { - SortBy::DocId | SortBy::Score { .. } => {} - SortBy::FastField { field_name, .. } => { - fast_field_names.insert(field_name.clone()); - } + self.sort_by.first.add_fast_field(&mut fast_field_names); + if let Some(sort_by_second) = &self.sort_by.second { + sort_by_second.add_fast_field(&mut fast_field_names); } if let Some(aggregations) = &self.aggregation { fast_field_names.extend(aggregations.fast_field_names()); @@ -475,10 +598,13 @@ impl Collector for QuickwitCollector { // We do not need BM25 scoring in Quickwit if it is not opted-in. // By returning false, we inform tantivy that it does not need to decompress // term frequencies. - match self.sort_by { - SortBy::DocId | SortBy::FastField { .. } => false, - SortBy::Score { .. } => true, - } + self.sort_by.first.requires_scoring() + || self + .sort_by + .second + .as_ref() + .map(|sort_by| sort_by.requires_scoring()) + .unwrap_or(false) } fn merge_fruits( @@ -491,9 +617,14 @@ impl Collector for QuickwitCollector { // All leaves will return their top [0..max_hits) documents. // We compute the overall [0..start_offset + max_hits) documents ... let num_hits = self.start_offset + self.max_hits; - let sort_order = self.sort_by.sort_order(); - let mut merged_leaf_response = - merge_leaf_responses(&self.aggregation, segment_fruits?, sort_order, num_hits)?; + let (sort_order1, sort_order2) = self.sort_by.sort_orders(); + let mut merged_leaf_response = merge_leaf_responses( + &self.aggregation, + segment_fruits?, + sort_order1, + sort_order2, + num_hits, + )?; // ... and drop the first [..start_offsets) hits. merged_leaf_response .partial_hits @@ -515,7 +646,8 @@ fn map_error(err: postcard::Error) -> TantivyError { fn merge_leaf_responses( aggregations_opt: &Option, mut leaf_responses: Vec, - sort_order: SortOrder, + sort_order1: SortOrder, + sort_order2: SortOrder, max_hits: usize, ) -> tantivy::Result { // Optimization: No merging needed if there is only one result. @@ -586,8 +718,12 @@ fn merge_leaf_responses( .into_iter() .flat_map(|leaf_response| leaf_response.partial_hits) .collect(); - let top_k_partial_hits: Vec = - top_k_partial_hits(all_partial_hits.into_iter(), sort_order, max_hits); + let top_k_partial_hits: Vec = top_k_partial_hits( + all_partial_hits.into_iter(), + sort_order1, + sort_order2, + max_hits, + ); Ok(LeafSearchResponse { intermediate_aggregation_result: merged_intermediate_aggregation_result, num_hits, @@ -603,47 +739,97 @@ fn merge_leaf_responses( /// TODO we could possibly optimize the sort away (but I doubt it matters). fn top_k_partial_hits( partial_hits: impl Iterator, - sort_order: SortOrder, + sort_order1: SortOrder, + sort_order2: SortOrder, num_hits: usize, ) -> Vec { - match sort_order { - SortOrder::Asc => top_k(partial_hits.into_iter(), num_hits, |partial_hit| { - // This reverse dance is a little bit complicated. - // Note that `Option>` is very different from `Reverse>`. - // - // We do want the earlier: documents without any values should always get ranked after - // documents with a value, regardless of whether we use ascending or - // descending order. - let score = partial_hit.sort_value.map(Reverse); - let addr = GlobalDocAddress::from_partial_hit(partial_hit); - (score, Reverse(addr)) - }), - SortOrder::Desc => top_k(partial_hits.into_iter(), num_hits, |partial_hit| { - let addr = GlobalDocAddress::from_partial_hit(partial_hit); - (partial_hit.sort_value, addr) - }), - } -} - -pub(crate) fn sort_by_from_request(search_request: &SearchRequest) -> SortBy { - let sort_order = search_request - .sort_order - .and_then(SortOrder::from_i32) - .unwrap_or(SortOrder::Desc); - search_request - .sort_by_field - .as_ref() - .map(|field_name| { - if field_name == "_score" { - SortBy::Score { order: sort_order } - } else { - SortBy::FastField { - field_name: field_name.clone(), - order: sort_order, - } + let get_sort_values = |partial_hit: &PartialHit| { + ( + partial_hit + .sort_value + .and_then(|sort_value| sort_value.sort_value), + partial_hit + .sort_value2 + .and_then(|sort_value| sort_value.sort_value), + ) + }; + match (sort_order1, sort_order2) { + (SortOrder::Asc, SortOrder::Asc) => { + top_k(partial_hits.into_iter(), num_hits, |partial_hit| { + // This reverse dance is a little bit complicated. + // Note that `Option>` is very different from `Reverse>`. + // + // Since the value is Option>, we have to use and_then (in + // get_sort_values) to flatten it to Option, or else we would get + // `Option>`. + // + // We do want the earlier: documents without any values should always get ranked + // after documents with a value, regardless of whether we use + // ascending or descending order. + let (score, score2) = get_sort_values(partial_hit); + let score = score.map(Reverse); + let score2 = score2.map(Reverse); + let addr = GlobalDocAddress::from_partial_hit(partial_hit); + (score, score2, Reverse(addr)) + }) + } + (SortOrder::Asc, SortOrder::Desc) => { + top_k(partial_hits.into_iter(), num_hits, |partial_hit| { + let (score, score2) = get_sort_values(partial_hit); + let score = score.map(Reverse); + let addr = GlobalDocAddress::from_partial_hit(partial_hit); + (score, score2, Reverse(addr)) + }) + } + (SortOrder::Desc, SortOrder::Desc) => { + top_k(partial_hits.into_iter(), num_hits, |partial_hit| { + let addr = GlobalDocAddress::from_partial_hit(partial_hit); + let (score, score2) = get_sort_values(partial_hit); + (score, score2, addr) + }) + } + (SortOrder::Desc, SortOrder::Asc) => { + top_k(partial_hits.into_iter(), num_hits, |partial_hit| { + let (score, score2) = get_sort_values(partial_hit); + let score2 = score2.map(Reverse); + let addr = GlobalDocAddress::from_partial_hit(partial_hit); + (score, score2, addr) + }) + } + } +} + +pub(crate) fn sort_by_from_request(search_request: &SearchRequest) -> SortByPair { + let to_sort_by_component = |field_name: &str, order| { + if field_name == "_score" { + SortByComponent::Score { order } + } else { + SortByComponent::FastField { + field_name: field_name.to_string(), + order, } - }) - .unwrap_or(SortBy::DocId) + } + }; + + let num_sort_fields = search_request.sort_fields.len(); + if num_sort_fields == 0 { + SortByComponent::DocId.into() + } else if num_sort_fields == 1 { + let sort_field = &search_request.sort_fields[0]; + let order = SortOrder::from_i32(sort_field.sort_order).unwrap_or(SortOrder::Desc); + to_sort_by_component(&sort_field.field_name, order).into() + } else if num_sort_fields == 2 { + let sort_field1 = &search_request.sort_fields[0]; + let order1 = SortOrder::from_i32(sort_field1.sort_order).unwrap_or(SortOrder::Desc); + let sort_field2 = &search_request.sort_fields[1]; + let order2 = SortOrder::from_i32(sort_field2.sort_order).unwrap_or(SortOrder::Desc); + SortByPair { + first: to_sort_by_component(&sort_field1.field_name, order1), + second: Some(to_sort_by_component(&sort_field2.field_name, order2)), + } + } else { + panic!("Sort by more than 2 fields is not supported yet.") + } } /// Builds the QuickwitCollector, in function of the information that was requested by the user. @@ -708,19 +894,50 @@ mod tests { fn test_partial_hit_ordered_by_sorting_field() { let lesser_score = PartialHitHeapItem { doc_id: 1u32, - sort_value_opt: Some(1u64), + sort_value_opt1: Some(1u64), + sort_value_opt2: None, }; let higher_score = PartialHitHeapItem { - sort_value_opt: Some(2u64), + sort_value_opt1: Some(2u64), + sort_value_opt2: None, doc_id: 1u32, }; assert_eq!(lesser_score.cmp(&higher_score), Ordering::Greater); } + #[test] + fn test_partial_hit_ordered_by_sorting_field_2() { + let get_el = |val1, val2, docid| PartialHitHeapItem { + doc_id: docid, + sort_value_opt1: val1, + sort_value_opt2: val2, + }; + let mut data = vec![ + get_el(Some(1u64), None, 1u32), + get_el(Some(2u64), Some(2u64), 1u32), + get_el(Some(2u64), Some(1u64), 1u32), + get_el(Some(2u64), None, 1u32), + get_el(None, Some(1u64), 1u32), + get_el(None, None, 1u32), + ]; + data.sort(); + assert_eq!( + data, + vec![ + get_el(Some(2u64), Some(2u64), 1u32), + get_el(Some(2u64), Some(1u64), 1u32), + get_el(Some(2u64), None, 1u32), + get_el(Some(1u64), None, 1u32), + get_el(None, Some(1u64), 1u32), + get_el(None, None, 1u32), + ] + ); + } #[test] fn test_merge_partial_hits_no_tie() { let make_doc = |sort_value: u64| PartialHit { - sort_value: Some(SortValue::U64(sort_value)), + sort_value: Some(SortValue::U64(sort_value).into()), + sort_value2: None, split_id: "split1".to_string(), segment_ord: 0u32, doc_id: 0u32, @@ -729,6 +946,7 @@ mod tests { top_k_partial_hits( vec![make_doc(1u64), make_doc(3u64), make_doc(2u64),].into_iter(), SortOrder::Asc, + SortOrder::Asc, 2 ), vec![make_doc(1), make_doc(2)] @@ -738,7 +956,8 @@ mod tests { #[test] fn test_merge_partial_hits_with_tie() { let make_hit_given_split_id = |split_id: u64| PartialHit { - sort_value: Some(SortValue::U64(0u64)), + sort_value: Some(SortValue::U64(0u64).into()), + sort_value2: None, split_id: format!("split_{split_id}"), segment_ord: 0u32, doc_id: 0u32, @@ -752,6 +971,7 @@ mod tests { ] .into_iter(), SortOrder::Desc, + SortOrder::Desc, 2 ), &[make_hit_given_split_id(3), make_hit_given_split_id(2)] @@ -765,6 +985,7 @@ mod tests { ] .into_iter(), SortOrder::Asc, + SortOrder::Asc, 2 ), &[make_hit_given_split_id(1), make_hit_given_split_id(2)] diff --git a/quickwit/quickwit-search/src/leaf.rs b/quickwit/quickwit-search/src/leaf.rs index 658e6b34a0d..049a3da4951 100644 --- a/quickwit/quickwit-search/src/leaf.rs +++ b/quickwit/quickwit-search/src/leaf.rs @@ -395,7 +395,7 @@ async fn leaf_search_single_split( /// or applying date range when the range covers the entire split. fn rewrite_request(search_request: &mut SearchRequest, split: &SplitIdAndFooterOffsets) { if search_request.max_hits == 0 { - search_request.sort_by_field = None; + search_request.sort_fields = vec![]; } rewrite_start_end_time_bounds( &mut search_request.start_timestamp, diff --git a/quickwit/quickwit-search/src/leaf_cache.rs b/quickwit/quickwit-search/src/leaf_cache.rs index 876c8b5dd33..6e0055f1346 100644 --- a/quickwit/quickwit-search/src/leaf_cache.rs +++ b/quickwit/quickwit-search/src/leaf_cache.rs @@ -224,7 +224,8 @@ mod tests { partial_hits: vec![PartialHit { doc_id: 1, segment_ord: 0, - sort_value: Some(SortValue::U64(0u64)), + sort_value: Some(SortValue::U64(0u64).into()), + sort_value2: None, split_id: "split_1".to_string(), }], }; @@ -309,7 +310,8 @@ mod tests { partial_hits: vec![PartialHit { doc_id: 1, segment_ord: 0, - sort_value: Some(SortValue::U64(0)), + sort_value: Some(SortValue::U64(0).into()), + sort_value2: None, split_id: "split_1".to_string(), }], }; diff --git a/quickwit/quickwit-search/src/root.rs b/quickwit/quickwit-search/src/root.rs index e0eb17b4b27..4db74c90e52 100644 --- a/quickwit/quickwit-search/src/root.rs +++ b/quickwit/quickwit-search/src/root.rs @@ -28,7 +28,7 @@ use quickwit_metastore::{Metastore, SplitMetadata}; use quickwit_proto::{ FetchDocsRequest, FetchDocsResponse, Hit, LeafHit, LeafListTermsRequest, LeafListTermsResponse, LeafSearchRequest, LeafSearchResponse, ListTermsRequest, ListTermsResponse, PartialHit, - SearchRequest, SearchResponse, SplitIdAndFooterOffsets, + SearchRequest, SearchResponse, SortField, SplitIdAndFooterOffsets, }; use quickwit_query::query_ast::{ BoolQuery, QueryAst, QueryAstVisitor, RangeQuery, TermQuery, TermSetQuery, @@ -145,6 +145,22 @@ fn validate_requested_snippet_fields( Ok(()) } +fn validate_sort_by_fields(sort_fields: &[SortField], schema: &Schema) -> crate::Result<()> { + if sort_fields.is_empty() { + return Ok(()); + } + if sort_fields.len() > 2 { + return Err(SearchError::InvalidArgument(format!( + "Sort by field must be up to 2 fields {:?}.", + sort_fields + ))); + } + for sort in sort_fields { + validate_sort_by_field(&sort.field_name, schema)?; + } + + Ok(()) +} fn validate_sort_by_field(field_name: &str, schema: &Schema) -> crate::Result<()> { if field_name == "_score" { return Ok(()); @@ -178,9 +194,7 @@ pub(crate) fn validate_request( validate_requested_snippet_fields(&schema, &search_request.snippet_fields)?; - if let Some(sort_by_field) = &search_request.sort_by_field { - validate_sort_by_field(sort_by_field, &schema)?; - } + validate_sort_by_fields(&search_request.sort_fields, &schema)?; if let Some(agg) = search_request.aggregation_request.as_ref() { let _aggs: QuickwitAggregations = serde_json::from_str(agg).map_err(|_err| { @@ -815,7 +829,8 @@ mod tests { doc_id: u32, ) -> quickwit_proto::PartialHit { quickwit_proto::PartialHit { - sort_value: Some(SortValue::U64(sort_value)), + sort_value: Some(SortValue::U64(sort_value).into()), + sort_value2: None, split_id: split_id.to_string(), segment_ord: 1, doc_id, @@ -1076,7 +1091,9 @@ mod tests { max_hits: 10, ..Default::default() }; - search_request.set_sort_order(SortOrder::Asc); + if let Some(sort_field) = search_request.sort_fields.get_mut(0) { + sort_field.set_sort_order(SortOrder::Asc); + } let mut metastore = MockMetastore::new(); metastore .expect_index_metadata() @@ -1096,13 +1113,15 @@ mod tests { num_hits: 2, partial_hits: vec![ quickwit_proto::PartialHit { - sort_value: Some(SortValue::U64(2u64)), + sort_value: Some(SortValue::U64(2u64).into()), + sort_value2: None, split_id: "split1".to_string(), segment_ord: 0, doc_id: 0, }, quickwit_proto::PartialHit { sort_value: None, + sort_value2: None, split_id: "split1".to_string(), segment_ord: 0, doc_id: 1, @@ -1128,19 +1147,22 @@ mod tests { num_hits: 3, partial_hits: vec![ quickwit_proto::PartialHit { - sort_value: Some(SortValue::I64(-1i64)), + sort_value: Some(SortValue::I64(-1i64).into()), + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 1, }, quickwit_proto::PartialHit { - sort_value: Some(SortValue::I64(1i64)), + sort_value: Some(SortValue::I64(1i64).into()), + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 0, }, quickwit_proto::PartialHit { sort_value: None, + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 2, @@ -1181,7 +1203,8 @@ mod tests { split_id: "split2".to_string(), segment_ord: 0, doc_id: 1, - sort_value: Some(SortValue::I64(-1i64)), + sort_value: Some(SortValue::I64(-1i64).into()), + sort_value2: None, } ); assert_eq!( @@ -1190,7 +1213,8 @@ mod tests { split_id: "split2".to_string(), segment_ord: 0, doc_id: 0, - sort_value: Some(SortValue::I64(1i64)), + sort_value: Some(SortValue::I64(1i64).into()), + sort_value2: None, } ); assert_eq!( @@ -1199,7 +1223,8 @@ mod tests { split_id: "split1".to_string(), segment_ord: 0, doc_id: 0, - sort_value: Some(SortValue::U64(2u64)), + sort_value: Some(SortValue::U64(2u64).into()), + sort_value2: None, } ); assert_eq!( @@ -1209,6 +1234,7 @@ mod tests { segment_ord: 0, doc_id: 1, sort_value: None, + sort_value2: None, } ); assert_eq!( @@ -1218,6 +1244,7 @@ mod tests { segment_ord: 0, doc_id: 2, sort_value: None, + sort_value2: None, } ); Ok(()) @@ -1251,13 +1278,15 @@ mod tests { num_hits: 2, partial_hits: vec![ quickwit_proto::PartialHit { - sort_value: Some(SortValue::U64(2u64)), + sort_value: Some(SortValue::U64(2u64).into()), + sort_value2: None, split_id: "split1".to_string(), segment_ord: 0, doc_id: 0, }, quickwit_proto::PartialHit { sort_value: None, + sort_value2: None, split_id: "split1".to_string(), segment_ord: 0, doc_id: 1, @@ -1283,19 +1312,22 @@ mod tests { num_hits: 3, partial_hits: vec![ quickwit_proto::PartialHit { - sort_value: Some(SortValue::I64(1i64)), + sort_value: Some(SortValue::I64(1i64).into()), + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 0, }, quickwit_proto::PartialHit { - sort_value: Some(SortValue::I64(-1i64)), + sort_value: Some(SortValue::I64(-1i64).into()), + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 1, }, quickwit_proto::PartialHit { sort_value: None, + sort_value2: None, split_id: "split2".to_string(), segment_ord: 0, doc_id: 2, @@ -1336,7 +1368,8 @@ mod tests { split_id: "split1".to_string(), segment_ord: 0, doc_id: 0, - sort_value: Some(SortValue::U64(2u64)), + sort_value: Some(SortValue::U64(2u64).into()), + sort_value2: None, } ); assert_eq!( @@ -1345,7 +1378,8 @@ mod tests { split_id: "split2".to_string(), segment_ord: 0, doc_id: 0, - sort_value: Some(SortValue::I64(1i64)), + sort_value: Some(SortValue::I64(1i64).into()), + sort_value2: None, } ); assert_eq!( @@ -1354,7 +1388,8 @@ mod tests { split_id: "split2".to_string(), segment_ord: 0, doc_id: 1, - sort_value: Some(SortValue::I64(-1i64)), + sort_value: Some(SortValue::I64(-1i64).into()), + sort_value2: None, } ); assert_eq!( @@ -1364,6 +1399,7 @@ mod tests { segment_ord: 0, doc_id: 2, sort_value: None, + sort_value2: None, } ); assert_eq!( @@ -1373,6 +1409,7 @@ mod tests { segment_ord: 0, doc_id: 1, sort_value: None, + sort_value2: None, } ); Ok(()) diff --git a/quickwit/quickwit-search/src/search_stream/leaf.rs b/quickwit/quickwit-search/src/search_stream/leaf.rs index 4a676e9ec32..e75bdc68343 100644 --- a/quickwit/quickwit-search/src/search_stream/leaf.rs +++ b/quickwit/quickwit-search/src/search_stream/leaf.rs @@ -171,7 +171,10 @@ async fn leaf_search_stream_single_split( search_request.end_timestamp, ); - let requires_scoring = search_request.sort_by_field.as_deref() == Some("_score"); + let requires_scoring = search_request + .sort_fields + .iter() + .any(|sort| sort.field_name == "_score"); // TODO no test fail if this line get removed warmup_info.field_norms |= requires_scoring; diff --git a/quickwit/quickwit-search/src/tests.rs b/quickwit/quickwit-search/src/tests.rs index b247e10f687..bc7a861b27c 100644 --- a/quickwit/quickwit-search/src/tests.rs +++ b/quickwit/quickwit-search/src/tests.rs @@ -25,8 +25,8 @@ use quickwit_doc_mapper::DefaultDocMapper; use quickwit_indexing::TestSandbox; use quickwit_opentelemetry::otlp::TraceId; use quickwit_proto::{ - qast_helper, query_ast_from_user_text, LeafListTermsResponse, SearchRequest, SortOrder, - SortValue, + qast_helper, query_ast_from_user_text, LeafListTermsResponse, SearchRequest, SortByValue, + SortField, SortOrder, SortValue, }; use serde_json::{json, Value as JsonValue}; use tantivy::schema::Value as TantivyValue; @@ -378,8 +378,10 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { start_timestamp: Some(start_timestamp + 10), end_timestamp: Some(start_timestamp + 20), max_hits: 15, - sort_by_field: Some("ts".to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; let single_node_response = single_node_search( @@ -399,8 +401,10 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { query_ast: qast_helper("info", &["body"]), end_timestamp: Some(start_timestamp + 20), max_hits: 25, - sort_by_field: Some("ts".to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; let single_node_response = single_node_search( @@ -419,8 +423,10 @@ async fn test_single_node_filtering() -> anyhow::Result<()> { index_id: index_id.to_string(), query_ast: qast_helper("tag:foo AND info", &["body"]), max_hits: 25, - sort_by_field: Some("ts".to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: "ts".to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; let single_node_response = single_node_search( @@ -502,8 +508,10 @@ async fn single_node_search_sort_by_field( index_id: index_id.to_string(), query_ast: qast_helper("city", &["description"]), max_hits: 15, - sort_by_field: Some(sort_by_field.to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: sort_by_field.to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; @@ -585,8 +593,10 @@ async fn test_sort_bm25() { index_id: index_id.to_string(), query_ast: query_ast_json, max_hits: 1_000, - sort_by_field: Some("_score".to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: "_score".to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; let metastore = test_sandbox.metastore(); @@ -599,7 +609,10 @@ async fn test_sort_bm25() { .into_iter() .map(|hit| { let partial_hit = hit.partial_hit.unwrap(); - let Some(SortValue::F64(score)) = partial_hit.sort_value else { + let Some(SortByValue { + sort_value: Some(SortValue::F64(score)), + }) = partial_hit.sort_value + else { panic!() }; (score as f32, partial_hit.doc_id) @@ -672,8 +685,10 @@ async fn test_sort_by_static_and_dynamic_field() { index_id: index_id.to_string(), query_ast: query_ast_json, max_hits: 1_000, - sort_by_field: Some(sort_field.to_string()), - sort_order: Some(order as i32), + sort_fields: vec![SortField { + field_name: sort_field.to_string(), + sort_order: order as i32, + }], ..Default::default() }; let metastore = test_sandbox.metastore(); @@ -728,6 +743,102 @@ async fn test_sort_by_static_and_dynamic_field() { test_sandbox.assert_quit().await; } +#[tokio::test] +async fn test_sort_by_2_field() { + let index_id = "sort_by_dynamic_field".to_string(); + // In this test, we will try sorting docs by several fields. + // - static_u64 + // - dynamic_u64 + let doc_mapping_yaml = r#" + mode: dynamic + field_mappings: + - name: static_u64 + type: u64 + fast: true + dynamic_mapping: + fast: true + stored: true + "#; + let test_sandbox = TestSandbox::create(&index_id, doc_mapping_yaml, "{}", &[]) + .await + .unwrap(); + let docs = vec![ + // 0 + json!({"static_u64": 3u64, "dynamic_u64": 3u64}), + // 1 + json!({"static_u64": 3u64, "dynamic_u64": 2u64}), + // 2 + json!({}), + // 3 + json!({"dynamic_u64": 2u64}), + // 4 + json!({"static_u64": 4u64, "dynamic_u64": (i64::MAX as u64) + 1}), + ]; + test_sandbox.add_documents(docs).await.unwrap(); + let search_hits = + |sort_field1: &str, order1: SortOrder, sort_field2: &str, order2: SortOrder| { + let query_ast_json = serde_json::to_string(&QueryAst::MatchAll).unwrap(); + let search_request = SearchRequest { + index_id: index_id.to_string(), + query_ast: query_ast_json, + max_hits: 1_000, + sort_fields: vec![ + SortField { + field_name: sort_field1.to_string(), + sort_order: order1 as i32, + }, + SortField { + field_name: sort_field2.to_string(), + sort_order: order2 as i32, + }, + ], + ..Default::default() + }; + let metastore = test_sandbox.metastore(); + let storage_resolver = test_sandbox.storage_resolver(); + async move { + let search_resp = single_node_search(search_request, &*metastore, storage_resolver) + .await + .unwrap(); + assert_eq!(search_resp.num_hits, 5); + search_resp + .hits + .into_iter() + .map(|hit| { + let partial_hit = hit.partial_hit.unwrap(); + partial_hit.doc_id + }) + .collect::>() + } + }; + { + let ordered_docs: Vec = search_hits( + "static_u64", + SortOrder::Desc, + "dynamic_u64", + SortOrder::Desc, + ) + .await; + assert_eq!(&ordered_docs[..], &[4, 0, 1, 3, 2]); + } + { + let ordered_docs: Vec = + search_hits("static_u64", SortOrder::Desc, "dynamic_u64", SortOrder::Asc).await; + assert_eq!(&ordered_docs[..], &[4, 1, 0, 3, 2]); + } + { + let ordered_docs: Vec = + search_hits("static_u64", SortOrder::Asc, "dynamic_u64", SortOrder::Desc).await; + assert_eq!(&ordered_docs[..], &[0, 1, 4, 3, 2]); + } + { + let ordered_docs: Vec = + search_hits("static_u64", SortOrder::Asc, "dynamic_u64", SortOrder::Asc).await; + assert_eq!(&ordered_docs[..], &[1, 0, 4, 3, 2]); + } + test_sandbox.assert_quit().await; +} + #[tokio::test] async fn test_single_node_invalid_sorting_with_query() { let index_id = "single-node-invalid-sorting"; @@ -754,8 +865,10 @@ async fn test_single_node_invalid_sorting_with_query() { index_id: index_id.to_string(), query_ast: qast_helper("city", &["description"]), max_hits: 15, - sort_by_field: Some("description".to_string()), - sort_order: Some(SortOrder::Desc as i32), + sort_fields: vec![SortField { + field_name: "description".to_string(), + sort_order: SortOrder::Desc as i32, + }], ..Default::default() }; let single_node_response = single_node_search( diff --git a/quickwit/quickwit-serve/src/elastic_search_api/rest_handler.rs b/quickwit/quickwit-serve/src/elastic_search_api/rest_handler.rs index 2566127ee41..fd73395e33c 100644 --- a/quickwit/quickwit-serve/src/elastic_search_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/elastic_search_api/rest_handler.rs @@ -45,7 +45,6 @@ use super::model::{ ElasticSearchError, MultiSearchHeader, MultiSearchQueryParams, MultiSearchResponse, MultiSearchSingleResponse, SearchBody, SearchQueryParams, }; -use crate::elastic_search_api::model::SortField; use crate::format::BodyFormat; use crate::json_api_response::{make_json_api_response, ApiError, JsonApiResponse}; use crate::{with_arg, BuildInfo}; @@ -147,31 +146,29 @@ fn build_request_for_es_api( let max_hits = search_params.size.or(search_body.size).unwrap_or(10); let start_offset = search_params.from.or(search_body.from).unwrap_or(0); - let sort_fields: Vec = search_params + let sort_fields: Vec = search_params .sort_fields()? .or_else(|| search_body.sort.clone()) - .unwrap_or_default(); - - if sort_fields.len() >= 2 { + .unwrap_or_default() + .iter() + .map(|sort_field| quickwit_proto::SortField { + field_name: sort_field.field.to_string(), + sort_order: sort_field.order as i32, + }) + .collect(); + if sort_fields.len() >= 3 { return Err(ElasticSearchError::from(SearchError::InvalidArgument( - format!("Only one search field is supported at the moment. Got {sort_fields:?}"), + format!("Only up to two sort fields supported at the moment. Got {sort_fields:?}"), ))); } - let (sort_by_field, sort_order) = if let Some(sort_field) = sort_fields.into_iter().next() { - (Some(sort_field.field), Some(sort_field.order as i32)) - } else { - (None, None) - }; - Ok(quickwit_proto::SearchRequest { index_id, query_ast: serde_json::to_string(&query_ast).expect("Failed to serialize QueryAst"), max_hits, start_offset, aggregation_request, - sort_by_field, - sort_order, + sort_fields, ..Default::default() }) } diff --git a/quickwit/quickwit-serve/src/search_api/rest_handler.rs b/quickwit/quickwit-serve/src/search_api/rest_handler.rs index edd248c0d4f..8b35f062eff 100644 --- a/quickwit/quickwit-serve/src/search_api/rest_handler.rs +++ b/quickwit/quickwit-serve/src/search_api/rest_handler.rs @@ -170,15 +170,17 @@ pub struct SearchRequestQueryString { pub sort_by_field: Option, } -fn get_proto_search_by(search_request: &SearchRequestQueryString) -> (Option, Option) { - if let Some(sort_by_field) = &search_request.sort_by_field { - ( - Some(sort_by_field.order as i32), - Some(sort_by_field.field_name.clone()), - ) - } else { - (None, None) - } +fn get_proto_sort_by(search_request: &SearchRequestQueryString) -> Vec { + search_request + .sort_by_field + .as_ref() + .map(|sort_by| { + vec![quickwit_proto::SortField { + field_name: sort_by.field_name.to_string(), + sort_order: sort_by.order as i32, + }] + }) + .unwrap_or_default() } async fn search_endpoint( @@ -186,7 +188,7 @@ async fn search_endpoint( search_request: SearchRequestQueryString, search_service: &dyn SearchService, ) -> Result { - let (sort_order, sort_by_field) = get_proto_search_by(&search_request); + let sort_fields = get_proto_sort_by(&search_request); // The query ast below may still contain user input query. The actual // parsing of the user query will happen in the root service, and might require // the user of the docmapper default fields (which we do not have at this point). @@ -203,8 +205,7 @@ async fn search_endpoint( aggregation_request: search_request .aggs .map(|agg| serde_json::to_string(&agg).expect("could not serialize JsonValue")), - sort_order, - sort_by_field, + sort_fields, }; let search_response = search_service.root_search(search_request).await?; let search_response_rest = SearchResponseRest::try_from(search_response)?; diff --git a/quickwit/quickwit-storage/src/storage_resolver.rs b/quickwit/quickwit-storage/src/storage_resolver.rs index 97d1f77c80b..3afda9d34ef 100644 --- a/quickwit/quickwit-storage/src/storage_resolver.rs +++ b/quickwit/quickwit-storage/src/storage_resolver.rs @@ -149,7 +149,6 @@ impl StorageResolver { /// Returns a [`StorageResolver`] for testing purposes. Unlike /// [`StorageResolver::unconfigured`], this resolver does not return a singleton. - #[cfg(any(test, feature = "testsuite"))] pub fn ram_for_test() -> Self { use quickwit_config::RamStorageConfig; diff --git a/quickwit/rest-api-tests/scenarii/sort_orders/0001-sort-elasticapi.yaml b/quickwit/rest-api-tests/scenarii/sort_orders/0001-sort-elasticapi.yaml index 0a003af96ad..7684c075164 100644 --- a/quickwit/rest-api-tests/scenarii/sort_orders/0001-sort-elasticapi.yaml +++ b/quickwit/rest-api-tests/scenarii/sort_orders/0001-sort-elasticapi.yaml @@ -10,10 +10,12 @@ json: expected: hits: total: - value: 5 + value: 7 relation: "eq" hits: - fields: { "count": 15, "id": 2 } + - fields: { "count": 10, "id": 0 } + - fields: { "count": 10, "id": 2 } - fields: { "count": 10, "id": 1 } - fields: {"count": -2.5, "id": 4} - fields: { "id": 5 } @@ -28,11 +30,55 @@ json: expected: hits: total: - value: 5 + value: 7 relation: "eq" hits: - fields: {"count": -2.5, "id": 4} - fields: {"count": 10, "id": 1 } + - fields: {"count": 10, "id": 2 } + - fields: {"count": 10, "id": 0 } - fields: {"count": 15, "id": 2 } - fields: {"id": 3} - fields: {"id": 5} +--- +endpoint: _elastic/sortorder/_search +json: + query: + match_all: {} + sort: + - id: {"order" : "asc"} + - count: {"order" : "asc"} +expected: + hits: + total: + value: 7 + relation: "eq" + hits: + - fields: {"count": 10, "id": 0 } + - fields: {"count": 10, "id": 1 } + - fields: {"count": 10, "id": 2 } + - fields: {"count": 15, "id": 2 } + - fields: {"id": 3} + - fields: {"count": -2.5, "id": 4} + - fields: {"id": 5} +--- +endpoint: _elastic/sortorder/_search +json: + query: + match_all: {} + sort: + - count: {"order" : "desc"} + - id: {"order" : "desc"} +expected: + hits: + total: + value: 7 + relation: "eq" + hits: + - fields: {"count": 15, "id": 2 } + - fields: {"count": 10, "id": 2 } + - fields: {"count": 10, "id": 1 } + - fields: {"count": 10, "id": 0 } + - fields: {"count": -2.5, "id": 4} + - fields: {"id": 5} + - fields: {"id": 3} diff --git a/quickwit/rest-api-tests/scenarii/sort_orders/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/sort_orders/_setup.quickwit.yaml index b39a872029d..9022e4d8082 100644 --- a/quickwit/rest-api-tests/scenarii/sort_orders/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/sort_orders/_setup.quickwit.yaml @@ -22,6 +22,7 @@ params: commit: force ndjson: - {"count": 10, "id": 1} + - {"count": 10, "id": 2} - {"count": 15, "id": 2} - {"id": 3} --- @@ -31,6 +32,7 @@ endpoint: sortorder/ingest params: commit: force ndjson: + - {"count": 10, "id": 0} - {"count": -2.5, "id": 4} - {"id": 5}