From cd7213e28cefea5b359e2d862dbefa02b277bb19 Mon Sep 17 00:00:00 2001 From: Pascal Seitz Date: Mon, 26 Feb 2024 23:47:01 +0800 Subject: [PATCH] rename split_size to shard_size, update docs rename split_size to shard_size update docs remove variable propagation --- docs/reference/aggregation.md | 32 +++++++----- quickwit/quickwit-search/src/collector.rs | 51 ++++--------------- .../aggregations/0001-aggregations.yaml | 28 +++++++++- .../aggregations/_setup.quickwit.yaml | 2 +- 4 files changed, 56 insertions(+), 57 deletions(-) diff --git a/docs/reference/aggregation.md b/docs/reference/aggregation.md index f92e0838c86..332e36c85c5 100644 --- a/docs/reference/aggregation.md +++ b/docs/reference/aggregation.md @@ -500,23 +500,23 @@ Response #### Document count error In Quickwit, we have one segment per split. Therefore the results returned from a split, is equivalent to results returned from a segment. -To improve performance, results from one split are cut off at `split_size`. +To improve performance, results from one split are cut off at `shard_size`. When combining results of multiple splits, terms that don't make it in the top n of a result from a split increase the theoretical upper bound error by lowest term-count. -Even with a larger `split_size` value, doc_count values for a terms aggregation may be +Even with a larger `shard_size` value, doc_count values for a terms aggregation may be approximate. As a result, any sub-aggregations on the terms aggregation may also be approximate. `sum_other_doc_count` is the number of documents that didn’t make it into the the top size terms. If this is greater than 0, you can be sure that the terms agg had to throw away some buckets, either because they didn’t fit into `size` on the root node or they didn’t fit into -`split_size` on the leaf node. +`shard_size` on the leaf node. #### Per bucket document count error If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will include doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by each split. It’s the sum of the size of the largest bucket on each split that didn’t fit -into `split_size`. +into `shard_size`. #### Parameters @@ -530,24 +530,28 @@ Currently term aggregation only works on fast fields of type `text`, `f64`, `i64 By default, the top 10 terms with the most documents are returned. Larger values for size are more expensive. -###### **split_size** +###### **shard_size** -The get more accurate results, we fetch more than size from each segment/split. Aliases to `shard_size`. +To obtain more accurate results, we fetch more than the `size` from each segment/split. -Increasing this value is will increase the accuracy, but also the CPU/memory usage. -See the [`document count error`](#document-count-error) section for more information how this parameter impacts accuracy. +Increasing this value will enhance accuracy but will also increase CPU/memory usage. +Refer to the [`document count error`](#document-count-error) section for more information on how `shard_size` impacts accuracy. -`split_size` is the number of terms that are sent from a leaf result to the root node. -Example: If you have 100 splits and `split_size` is 1000, the root node will receive 100_000 terms to merge. -With an average cost of 50 bytes per term this requires up to 5MB of memory. -The behaviour here deviates from elasticsearch, since we don't have global ordinals. That means we need to send serialized terms to the root node. +`shard_size` represents the number of terms that are returned from one split. +For example, if there are 100 splits and `shard_size` is set to 1000, the root node may receive up to 100_000 terms to merge. +Assuming an average cost of 50 bytes per term, this would require up to 5MB of memory. +The actual number of terms sent to the root depends on the number of splits handled by one node and how the intermediate results can be merged (e.g., the cardinality of the terms). -Defaults to size * 10. +Note on differences between Quickwit and Elasticsearch: +* Unlike Elasticsearch, Quickwit does not use global ordinals, so serialized terms need to be sent to the root node. +* The concept of shards in Elasticsearch differs from splits in Quickwit. In Elasticsearch, a shard contains up to 200M documents and is a collection of segments. In contrast, a Quickwit split comprises a single segment, typically with 5M documents. Therefore, `shard_size` in Elasticsearch applies to a group of segments, whereas in Quickwit, it applies to a single segment. + +Defaults to `size * 10`. ###### **show_term_doc_count_error** If you set the show_term_doc_count_error parameter to true, the terms aggregation will include doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by each split. -It’s the sum of the size of the largest bucket on each split that didn’t fit into `split_size`. +It’s the sum of the size of the largest bucket on each split that didn’t fit into `shard_size`. Defaults to true when ordering by count desc. diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 3e82863af13..9a84e569c61 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -29,10 +29,7 @@ use quickwit_proto::search::{ SplitSearchError, }; use serde::Deserialize; -use tantivy::aggregation::agg_req::{ - get_fast_field_names, Aggregation, AggregationVariants, Aggregations, -}; -use tantivy::aggregation::bucket::TermsAggregation; +use tantivy::aggregation::agg_req::{get_fast_field_names, Aggregations}; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::aggregation::{AggregationLimits, AggregationSegmentCollector}; use tantivy::collector::{Collector, SegmentCollector}; @@ -451,22 +448,6 @@ pub enum QuickwitAggregations { TantivyAggregations(Aggregations), } -fn visit_term_aggregations(aggs: &mut Aggregations, cb: &mut F) -where F: FnMut(&mut TermsAggregation) { - visit_aggregations_mut(aggs, &mut |agg| { - if let AggregationVariants::Terms(terms_agg) = &mut agg.agg { - cb(terms_agg); - } - }); -} -fn visit_aggregations_mut(aggs: &mut Aggregations, cb: &mut F) -where F: FnMut(&mut Aggregation) { - for agg in aggs.values_mut() { - cb(agg); - visit_aggregations_mut(&mut agg.sub_aggregation, cb); - } -} - impl QuickwitAggregations { fn fast_field_names(&self) -> HashSet { match self { @@ -630,26 +611,16 @@ impl Collector for QuickwitCollector { Box::new(collector.for_segment(0, segment_reader)?), )) } - Some(QuickwitAggregations::TantivyAggregations(aggs)) => { - let mut aggs = aggs.clone(); - visit_term_aggregations(&mut aggs, &mut |terms_agg| { - // Forward split_size (also shard_size) parameter to segment_size, as - // this is the same in context of Quickwit - if let Some(split_size) = &terms_agg.split_size { - terms_agg.segment_size = Some(*split_size); - } - }); - Some( - AggregationSegmentCollectors::TantivyAggregationSegmentCollector( - AggregationSegmentCollector::from_agg_req_and_reader( - &aggs, - segment_reader, - segment_ord, - &self.aggregation_limits, - )?, - ), - ) - } + Some(QuickwitAggregations::TantivyAggregations(aggs)) => Some( + AggregationSegmentCollectors::TantivyAggregationSegmentCollector( + AggregationSegmentCollector::from_agg_req_and_reader( + aggs, + segment_reader, + segment_ord, + &self.aggregation_limits, + )?, + ), + ), None => None, }; let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?; diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml index 6d6ad84116a..afdb8143ddf 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml @@ -155,10 +155,34 @@ expected: key: "Fritz" sum_other_doc_count: 8 doc_count_error_upper_bound: 2 - --- # Test term aggs with shard_size -# shard_size is an alius to split_size +# segment_size is an alias to shard_size +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: { match_all: {} } + aggs: + names: + terms: + field: "name" + size: 1 + segment_size: 1 +expected: + aggregations: + names: + buckets: + # There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the + # root node, because it gets cut off due to the split_size parameter set to 1. + # We also get doc_count_error_upper_bound: 2, which signals that the result is approximate. + - doc_count: 2 + key: "Fritz" + sum_other_doc_count: 8 + doc_count_error_upper_bound: 2 +--- +# Test term aggs with shard_size method: [GET] engines: - quickwit diff --git a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml index 13f9067627c..6d44e41b7dd 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml @@ -54,7 +54,7 @@ num_retries: 10 params: commit: force ndjson: - - {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]} + - {"name": "Albert", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]} - {"name": "Fred", "response": 100, "id": 3, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]} - {"name": "Manfred", "response": 120, "id": 13, "date": "2015-01-11T12:10:30Z", "host": "192.168.0.11", "tags": ["nice"]} - {"name": "Horst", "id": 2, "date": "2015-01-01T11:11:30Z", "host": "192.168.0.10", "tags": ["nice", "cool"]}