diff --git a/docs/reference/aggregation.md b/docs/reference/aggregation.md index ad7c36817dc..f92e0838c86 100644 --- a/docs/reference/aggregation.md +++ b/docs/reference/aggregation.md @@ -532,15 +532,22 @@ By default, the top 10 terms with the most documents are returned. Larger values ###### **split_size** -The get more accurate results, we fetch more than size from each segment/split. +To get more accurate results, we fetch more than size from each segment/split. Alias of `shard_size`. + Increasing this value will increase the accuracy, but also the CPU/memory usage. +See the [`document count error`](#document-count-error) section for more information on how this parameter impacts accuracy. + +`split_size` is the number of terms that are sent from a leaf result to the root node. +Example: If you have 100 splits and `split_size` is 1000, the root node will receive 100_000 terms to merge. +With an average cost of 50 bytes per term this requires up to 5MB of memory. +The behaviour here deviates from elasticsearch, since we don't have global ordinals. That means we need to send serialized terms to the root node. -Defaults to size * 1.5 + 10. +Defaults to size * 10. ###### **show_term_doc_count_error** If you set the show_term_doc_count_error parameter to true, the terms aggregation will include doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by each split. -It’s the sum of the size of the largest bucket on each split that didn’t fit into split_size. +It’s the sum of the size of the largest bucket on each split that didn’t fit into `split_size`. Defaults to true when ordering by count desc. 
diff --git a/quickwit/quickwit-search/src/collector.rs b/quickwit/quickwit-search/src/collector.rs index 9a84e569c61..3e82863af13 100644 --- a/quickwit/quickwit-search/src/collector.rs +++ b/quickwit/quickwit-search/src/collector.rs @@ -29,7 +29,10 @@ use quickwit_proto::search::{ SplitSearchError, }; use serde::Deserialize; -use tantivy::aggregation::agg_req::{get_fast_field_names, Aggregations}; +use tantivy::aggregation::agg_req::{ + get_fast_field_names, Aggregation, AggregationVariants, Aggregations, +}; +use tantivy::aggregation::bucket::TermsAggregation; use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults; use tantivy::aggregation::{AggregationLimits, AggregationSegmentCollector}; use tantivy::collector::{Collector, SegmentCollector}; @@ -448,6 +451,22 @@ pub enum QuickwitAggregations { TantivyAggregations(Aggregations), } +fn visit_term_aggregations(aggs: &mut Aggregations, cb: &mut F) +where F: FnMut(&mut TermsAggregation) { + visit_aggregations_mut(aggs, &mut |agg| { + if let AggregationVariants::Terms(terms_agg) = &mut agg.agg { + cb(terms_agg); + } + }); +} +fn visit_aggregations_mut(aggs: &mut Aggregations, cb: &mut F) +where F: FnMut(&mut Aggregation) { + for agg in aggs.values_mut() { + cb(agg); + visit_aggregations_mut(&mut agg.sub_aggregation, cb); + } +} + impl QuickwitAggregations { fn fast_field_names(&self) -> HashSet { match self { @@ -611,16 +630,26 @@ impl Collector for QuickwitCollector { Box::new(collector.for_segment(0, segment_reader)?), )) } - Some(QuickwitAggregations::TantivyAggregations(aggs)) => Some( - AggregationSegmentCollectors::TantivyAggregationSegmentCollector( - AggregationSegmentCollector::from_agg_req_and_reader( - aggs, - segment_reader, - segment_ord, - &self.aggregation_limits, - )?, - ), - ), + Some(QuickwitAggregations::TantivyAggregations(aggs)) => { + let mut aggs = aggs.clone(); + visit_term_aggregations(&mut aggs, &mut |terms_agg| { + // Forward split_size (also shard_size) parameter 
to segment_size, as + // this is the same in context of Quickwit + if let Some(split_size) = &terms_agg.split_size { + terms_agg.segment_size = Some(*split_size); + } + }); + Some( + AggregationSegmentCollectors::TantivyAggregationSegmentCollector( + AggregationSegmentCollector::from_agg_req_and_reader( + &aggs, + segment_reader, + segment_ord, + &self.aggregation_limits, + )?, + ), + ) + } None => None, }; let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?; diff --git a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml index e3db007b2a8..6d6ad84116a 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml @@ -15,7 +15,7 @@ expected: aggregations: date_histo: buckets: - - { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" } + - { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" } - { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z" } --- # Test date histogram aggregation and sub-aggregation @@ -39,7 +39,7 @@ expected: aggregations: date_histo: buckets: - - { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } } + - { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } } - { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } } --- # Test date histogram aggregation + exists and sub-aggregation @@ -67,7 +67,7 @@ expected: aggregations: date_histo: buckets: - - { "doc_count": 3, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", 
"response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } } + - { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } } - { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } } --- # Test range aggregation @@ -89,9 +89,9 @@ expected: aggregations: my_range: buckets: - - { "doc_count": 4, "key": "fast", "to": 50.0 } + - { "doc_count": 5, "key": "fast", "to": 50.0 } - { "doc_count": 0, "from": 50.0, "key": "medium", "to": 80.0 } - - { "doc_count": 3, "from": 80.0, "key": "slow" } + - { "doc_count": 4, "from": 80.0, "key": "slow" } --- # Test term aggs method: [GET] @@ -111,7 +111,7 @@ expected: aggregations: hosts: buckets: - - doc_count: 3 + - doc_count: 4 key: 192.168.0.10 - doc_count: 2 key: 192.168.0.1 @@ -123,13 +123,91 @@ expected: sum_other_doc_count: 0 tags: buckets: - - doc_count: 4 + - doc_count: 5 key: nice - doc_count: 2 key: cool doc_count_error_upper_bound: 0 sum_other_doc_count: 0 --- +# Test term aggs with split_size +# We set split_size to 1, so one document with name "Fritz" will be missing from one split. +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: { match_all: {} } + aggs: + names: + terms: + field: "name" + size: 1 + split_size: 1 +expected: + aggregations: + names: + buckets: + # There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the + # root node, because it gets cut off due to the split_size parameter set to 1. + # We also get doc_count_error_upper_bound: 2, which signals that the result is approximate. 
+ - doc_count: 2 + key: "Fritz" + sum_other_doc_count: 8 + doc_count_error_upper_bound: 2 + +--- +# Test term aggs with shard_size +# shard_size is an alias of split_size +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: { match_all: {} } + aggs: + names: + terms: + field: "name" + size: 1 + shard_size: 1 +expected: + aggregations: + names: + buckets: + # There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the + # root node, because it gets cut off due to the split_size parameter set to 1. + # We also get doc_count_error_upper_bound: 2, which signals that the result is approximate. + - doc_count: 2 + key: "Fritz" + sum_other_doc_count: 8 + doc_count_error_upper_bound: 2 +--- +# Test term aggs with split_size +# Here we increase split_size to 5, so we will get the 3 documents with name "Fritz" +method: [GET] +engines: + - quickwit +endpoint: _elastic/aggregations/_search +json: + query: { match_all: {} } + aggs: + names: + terms: + field: "name" + size: 1 + split_size: 5 +expected: + aggregations: + names: + buckets: + # We get all 3 documents with name "Fritz" + # We also get doc_count_error_upper_bound: 0, so the result is exact. 
+ - doc_count: 3 + key: "Fritz" + sum_other_doc_count: 7 + doc_count_error_upper_bound: 0 +--- # Test date histogram + percentiles sub-aggregation method: [GET] engines: @@ -154,7 +232,7 @@ expected: aggregations: metrics: buckets: - - doc_count: 4 + - doc_count: 5 key: 1420070400000.0 key_as_string: '2015-01-01T00:00:00Z' response: @@ -185,11 +263,11 @@ expected: aggregations: metrics: buckets: - - doc_count: 4 + - doc_count: 5 key: 0.0 - doc_count: 0 key: 50.0 - - doc_count: 3 + - doc_count: 4 key: 100.0 --- diff --git a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml index 37ce5fcd0a6..13f9067627c 100644 --- a/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml +++ b/quickwit/rest-api-tests/scenarii/aggregations/_setup.quickwit.yaml @@ -54,7 +54,8 @@ num_retries: 10 params: commit: force ndjson: - - {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]} + - {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]} + - {"name": "Fred", "response": 100, "id": 3, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]} - {"name": "Manfred", "response": 120, "id": 13, "date": "2015-01-11T12:10:30Z", "host": "192.168.0.11", "tags": ["nice"]} - {"name": "Horst", "id": 2, "date": "2015-01-01T11:11:30Z", "host": "192.168.0.10", "tags": ["nice", "cool"]} - {"name": "Fritz", "response": 30, "id": 5, "host": "192.168.0.1", "tags": ["nice", "cool"]} @@ -65,8 +66,9 @@ endpoint: aggregations/ingest params: commit: force ndjson: - - {"name": "Werner", "response": 20, "id": 0, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"} + - {"name": "Fritz", "response": 30, "id": 0} + - {"name": "Fritz", "response": 30, "id": 0} - {"name": "Holger", "response": 30, "id": 4, "date": "2015-02-06T00:00:00Z", "host": "192.168.0.10"} + - {"name": 
"Werner", "response": 20, "id": 5, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"} - {"name": "Bernhard", "response": 130, "id": 14, "date": "2015-02-16T00:00:00Z", "host": "192.168.0.15"} - - {"name": "Fritz", "response": 30, "id": 5}