fix agg split_size parameter, add docs test
fix aggregation parameter split_size propagation
improve split_size docs
add split_size parameter test
PSeitz committed Feb 26, 2024
1 parent aa02e02 commit 1dad94d
Showing 4 changed files with 143 additions and 27 deletions.
13 changes: 10 additions & 3 deletions docs/reference/aggregation.md
@@ -532,15 +532,22 @@ By default, the top 10 terms with the most documents are returned. Larger values

###### **split_size**

To get more accurate results, we fetch more than `size` terms from each segment/split.
To get more accurate results, we fetch more than `size` terms from each segment/split. `shard_size` is accepted as an alias for this parameter.

Increasing this value will increase the accuracy, but also the CPU and memory usage.
See the [`document count error`](#document-count-error) section for more information on how this parameter impacts accuracy.

`split_size` is the number of terms that are sent from a leaf result to the root node.
Example: If you have 100 splits and `split_size` is 1000, the root node will receive 100_000 terms to merge.
With an average cost of 50 bytes per term, this requires up to 5MB of memory.
The behaviour here deviates from Elasticsearch, since we don't have global ordinals; that means we need to send serialized terms to the root node.

Defaults to `size * 1.5 + 10`.
Defaults to `size * 10`.
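
For illustration, here is a minimal sketch of a terms aggregation request that sets `split_size`, assuming the Elasticsearch-compatible search endpoint; the `host` field name and the numbers are assumptions for the example, not prescribed values. With 100 splits, such a request would send up to 100 * 100 = 10_000 terms to the root node for merging.

```json
{
  "query": { "match_all": {} },
  "aggs": {
    "hosts": {
      "terms": {
        "field": "host",
        "size": 10,
        "split_size": 100
      }
    }
  }
}
```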

###### **show_term_doc_count_error**

If you set the `show_term_doc_count_error` parameter to `true`, the terms aggregation will include `doc_count_error_upper_bound`, which is an upper bound on the error of the `doc_count` returned by each split.
It’s the sum of the size of the largest bucket on each split that didn’t fit into split_size.
It’s the sum of the size of the largest bucket on each split that didn’t fit into `split_size`.

Defaults to true when ordering by count desc.
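
A minimal request sketch for this parameter, again assuming the Elasticsearch-compatible search endpoint and an illustrative `host` field; the terms aggregation result then additionally carries a `doc_count_error_upper_bound` entry.

```json
{
  "query": { "match_all": {} },
  "aggs": {
    "hosts": {
      "terms": {
        "field": "host",
        "size": 10,
        "show_term_doc_count_error": true
      }
    }
  }
}
```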

51 changes: 40 additions & 11 deletions quickwit/quickwit-search/src/collector.rs
@@ -29,7 +29,10 @@ use quickwit_proto::search::{
SplitSearchError,
};
use serde::Deserialize;
use tantivy::aggregation::agg_req::{get_fast_field_names, Aggregations};
use tantivy::aggregation::agg_req::{
    get_fast_field_names, Aggregation, AggregationVariants, Aggregations,
};
use tantivy::aggregation::bucket::TermsAggregation;
use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use tantivy::aggregation::{AggregationLimits, AggregationSegmentCollector};
use tantivy::collector::{Collector, SegmentCollector};
@@ -448,6 +451,22 @@ pub enum QuickwitAggregations {
TantivyAggregations(Aggregations),
}

/// Applies `cb` to every terms aggregation in the request, including sub-aggregations.
fn visit_term_aggregations<F>(aggs: &mut Aggregations, cb: &mut F)
where F: FnMut(&mut TermsAggregation) {
    visit_aggregations_mut(aggs, &mut |agg| {
        if let AggregationVariants::Terms(terms_agg) = &mut agg.agg {
            cb(terms_agg);
        }
    });
}

/// Recursively applies `cb` to every aggregation and its sub-aggregations.
fn visit_aggregations_mut<F>(aggs: &mut Aggregations, cb: &mut F)
where F: FnMut(&mut Aggregation) {
    for agg in aggs.values_mut() {
        cb(agg);
        visit_aggregations_mut(&mut agg.sub_aggregation, cb);
    }
}

impl QuickwitAggregations {
fn fast_field_names(&self) -> HashSet<String> {
match self {
@@ -611,16 +630,26 @@ impl Collector for QuickwitCollector {
Box::new(collector.for_segment(0, segment_reader)?),
))
}
Some(QuickwitAggregations::TantivyAggregations(aggs)) => Some(
    AggregationSegmentCollectors::TantivyAggregationSegmentCollector(
        AggregationSegmentCollector::from_agg_req_and_reader(
            aggs,
            segment_reader,
            segment_ord,
            &self.aggregation_limits,
        )?,
    ),
),
Some(QuickwitAggregations::TantivyAggregations(aggs)) => {
    let mut aggs = aggs.clone();
    visit_term_aggregations(&mut aggs, &mut |terms_agg| {
        // Forward the `split_size` (alias: `shard_size`) parameter to `segment_size`,
        // as the two are equivalent in the context of Quickwit.
        if let Some(split_size) = &terms_agg.split_size {
            terms_agg.segment_size = Some(*split_size);
        }
    });
    Some(
        AggregationSegmentCollectors::TantivyAggregationSegmentCollector(
            AggregationSegmentCollector::from_agg_req_and_reader(
                &aggs,
                segment_reader,
                segment_ord,
                &self.aggregation_limits,
            )?,
        ),
    )
}
None => None,
};
let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?;
@@ -15,7 +15,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z" }
---
# Test date histogram aggregation and sub-aggregation
@@ -39,7 +39,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test date histogram aggregation + exists and sub-aggregation
@@ -67,7 +67,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 3, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test range aggregation
@@ -89,9 +89,9 @@ expected:
aggregations:
my_range:
buckets:
- { "doc_count": 4, "key": "fast", "to": 50.0 }
- { "doc_count": 5, "key": "fast", "to": 50.0 }
- { "doc_count": 0, "from": 50.0, "key": "medium", "to": 80.0 }
- { "doc_count": 3, "from": 80.0, "key": "slow" }
- { "doc_count": 4, "from": 80.0, "key": "slow" }
---
# Test term aggs
method: [GET]
@@ -111,7 +111,7 @@ expected:
aggregations:
hosts:
buckets:
- doc_count: 3
- doc_count: 4
key: 192.168.0.10
- doc_count: 2
key: 192.168.0.1
@@ -123,13 +123,91 @@ expected:
sum_other_doc_count: 0
tags:
buckets:
- doc_count: 4
- doc_count: 5
key: nice
- doc_count: 2
key: cool
doc_count_error_upper_bound: 0
sum_other_doc_count: 0
---
# Test term aggs with split_size
# We set split_size to 1, so the "Fritz" count from one split gets cut off and one document
# with name "Fritz" is missing from the merged result.
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the split_size parameter set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2

---
# Test term aggs with shard_size
# shard_size is an alias for split_size
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
shard_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the shard_size parameter (an alias for split_size) set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2
---
# Test term aggs with split_size
# Here we increase split_size to 5, so we get all 3 documents with name "Fritz"
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 5
expected:
aggregations:
names:
buckets:
# We get all 3 documents with name "Fritz".
# We also get doc_count_error_upper_bound: 0, so the result is exact.
- doc_count: 3
key: "Fritz"
sum_other_doc_count: 7
doc_count_error_upper_bound: 0
---
# Test date histogram + percentiles sub-aggregation
method: [GET]
engines:
@@ -154,7 +232,7 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 1420070400000.0
key_as_string: '2015-01-01T00:00:00Z'
response:
@@ -185,11 +263,11 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 0.0
- doc_count: 0
key: 50.0
- doc_count: 3
- doc_count: 4
key: 100.0

---
@@ -54,7 +54,8 @@ num_retries: 10
params:
commit: force
ndjson:
- {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]}
- {"name": "Fred", "response": 100, "id": 3, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Manfred", "response": 120, "id": 13, "date": "2015-01-11T12:10:30Z", "host": "192.168.0.11", "tags": ["nice"]}
- {"name": "Horst", "id": 2, "date": "2015-01-01T11:11:30Z", "host": "192.168.0.10", "tags": ["nice", "cool"]}
- {"name": "Fritz", "response": 30, "id": 5, "host": "192.168.0.1", "tags": ["nice", "cool"]}
@@ -65,8 +66,9 @@ endpoint: aggregations/ingest
params:
commit: force
ndjson:
- {"name": "Werner", "response": 20, "id": 0, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Holger", "response": 30, "id": 4, "date": "2015-02-06T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Werner", "response": 20, "id": 5, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Bernhard", "response": 130, "id": 14, "date": "2015-02-16T00:00:00Z", "host": "192.168.0.15"}
- {"name": "Fritz", "response": 30, "id": 5}
