fix agg split_size parameter, add docs and test (#4627)
* fix agg split_size parameter, add docs test

fix aggregation parameter split_size propagation
improve split_size docs
add split_size parameter test

* rename split_size to shard_size, update docs

rename split_size to shard_size
update docs
remove variable propagation
PSeitz authored Feb 27, 2024
1 parent 46f0c16 commit c81467d
Showing 3 changed files with 137 additions and 22 deletions.
29 changes: 20 additions & 9 deletions docs/reference/aggregation.md
@@ -500,23 +500,23 @@ Response

#### Document count error
In Quickwit, we have one segment per split, so the results returned from a split are equivalent to the results returned from a segment.
To improve performance, results from one split are cut off at `split_size`.
To improve performance, results from one split are cut off at `shard_size`.
When combining the results of multiple splits, terms that don't make it into a split's top n increase the theoretical upper bound of the error by that split's lowest returned term count.

Even with a larger `split_size` value, doc_count values for a terms aggregation may be
Even with a larger `shard_size` value, doc_count values for a terms aggregation may be
approximate. As a result, any sub-aggregations on the terms aggregation may also be approximate.
`sum_other_doc_count` is the number of documents that didn’t make it into the top `size`
terms. If this is greater than 0, you can be sure that the terms agg had to throw away some
buckets, either because they didn’t fit into `size` on the root node or they didn’t fit into
`split_size` on the leaf node.
`shard_size` on the leaf node.

#### Per bucket document count error
If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will include
doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by
each split. It’s the sum of the size of the largest bucket on each split that didn’t fit
into `split_size`.
into `shard_size`.
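The merge logic behind this error bound can be sketched as follows. This is an illustrative simulation, not Quickwit's actual implementation; the function name, term keys, and counts are made up for the example:

```python
# Sketch: merging per-split top-N term lists and deriving an upper bound
# on the doc_count error (illustrative only, not Quickwit's real code).

def merge_top_terms(per_split_results, size):
    """Each split returns its top `shard_size` terms as (term, count) pairs,
    sorted by count descending. A term cut off on a split could have had a
    count up to the smallest count that split did return, so that smallest
    kept count is added to the error upper bound."""
    merged = {}
    error_upper_bound = 0
    for top_terms in per_split_results:
        for term, count in top_terms:
            merged[term] = merged.get(term, 0) + count
        if top_terms:
            # Largest dropped bucket is at most as big as the smallest kept one.
            error_upper_bound += top_terms[-1][1]
    buckets = sorted(merged.items(), key=lambda kv: -kv[1])[:size]
    return buckets, error_upper_bound

# Two splits, each already truncated to shard_size = 2:
split_a = [("nice", 4), ("cool", 3)]
split_b = [("nice", 3), ("fast", 2)]
buckets, err = merge_top_terms([split_a, split_b], size=2)
# buckets -> [("nice", 7), ("cool", 3)]
# err     -> 3 + 2 = 5
```

A term such as `cool` may have appeared on `split_b` with a count up to 2 and still been cut off there, which is exactly the uncertainty the bound captures.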

#### Parameters

@@ -530,17 +530,28 @@ Currently term aggregation only works on fast fields of type `text`, `f64`, `i64

By default, the top 10 terms with the most documents are returned. Larger values for `size` are more expensive.

###### **split_size**
###### **shard_size**

The get more accurate results, we fetch more than size from each segment/split.
Increasing this value is will increase the accuracy, but also the CPU/memory usage.
To obtain more accurate results, we fetch more than `size` terms from each segment/split.

Defaults to size * 1.5 + 10.
Increasing this value will enhance accuracy but will also increase CPU/memory usage.
Refer to the [`document count error`](#document-count-error) section for more information on how `shard_size` impacts accuracy.

`shard_size` represents the number of terms that are returned from one split.
For example, if there are 100 splits and `shard_size` is set to 1000, the root node may receive up to 100_000 terms to merge.
Assuming an average cost of 50 bytes per term, this would require up to 5MB of memory.
The actual number of terms sent to the root depends on the number of splits handled by one node and how the intermediate results can be merged (e.g., the cardinality of the terms).

Note on differences between Quickwit and Elasticsearch:
* Unlike Elasticsearch, Quickwit does not use global ordinals, so serialized terms need to be sent to the root node.
* The concept of shards in Elasticsearch differs from splits in Quickwit. In Elasticsearch, a shard contains up to 200M documents and is a collection of segments. In contrast, a Quickwit split comprises a single segment, typically with 5M documents. Therefore, `shard_size` in Elasticsearch applies to a group of segments, whereas in Quickwit, it applies to a single segment.

Defaults to `size * 10`.
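The sizing math above can be checked with a short sketch. The 50 bytes per term is the assumed average from the text, and the helper names are made up for illustration; this is not a Quickwit API:

```python
# Sketch of the shard_size sizing math described above
# (assumes ~50 bytes per serialized term, as in the docs).

def default_shard_size(size: int) -> int:
    # Documented default: size * 10.
    return size * 10

def root_merge_upper_bound(num_splits: int, shard_size: int,
                           bytes_per_term: int = 50) -> tuple[int, int]:
    """Upper bound on the number of terms and the bytes the root
    node may have to merge, before any intermediate merging."""
    max_terms = num_splits * shard_size
    return max_terms, max_terms * bytes_per_term

terms, size_bytes = root_merge_upper_bound(num_splits=100, shard_size=1000)
# 100 splits * 1000 terms = 100_000 terms, 5_000_000 bytes (~5 MB)
```

As the docs note, the real number sent to the root is usually lower, since splits handled by the same node are merged before forwarding.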

###### **show_term_doc_count_error**

If you set the show_term_doc_count_error parameter to true, the terms aggregation will include doc_count_error_upper_bound, which is an upper bound to the error on the doc_count returned by each split.
It’s the sum of the size of the largest bucket on each split that didn’t fit into split_size.
It’s the sum of the size of the largest bucket on each split that didn’t fit into `shard_size`.

Defaults to true when ordering by count desc.

122 changes: 112 additions & 10 deletions quickwit/rest-api-tests/scenarii/aggregations/0001-aggregations.yaml
@@ -15,7 +15,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z" }
---
# Test date histogram aggregation and sub-aggregation
@@ -39,7 +39,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test date histogram aggregation + exists and sub-aggregation
@@ -67,7 +67,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 3, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test range aggregation
@@ -89,9 +89,9 @@ expected:
aggregations:
my_range:
buckets:
- { "doc_count": 4, "key": "fast", "to": 50.0 }
- { "doc_count": 5, "key": "fast", "to": 50.0 }
- { "doc_count": 0, "from": 50.0, "key": "medium", "to": 80.0 }
- { "doc_count": 3, "from": 80.0, "key": "slow" }
- { "doc_count": 4, "from": 80.0, "key": "slow" }
---
# Test term aggs
method: [GET]
@@ -111,7 +111,7 @@ expected:
aggregations:
hosts:
buckets:
- doc_count: 3
- doc_count: 4
key: 192.168.0.10
- doc_count: 2
key: 192.168.0.1
@@ -123,13 +123,115 @@ expected:
sum_other_doc_count: 0
tags:
buckets:
- doc_count: 4
- doc_count: 5
key: nice
- doc_count: 2
key: cool
doc_count_error_upper_bound: 0
sum_other_doc_count: 0
---
# Test term aggs with split_size
# We set split_size to 1, so one document with name "Fritz" will be missing from one split.
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the split_size parameter set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2
---
# Test term aggs with shard_size
# segment_size is an alias for shard_size
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
segment_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the segment_size parameter set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2
---
# Test term aggs with shard_size
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
shard_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the shard_size parameter set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2
---
# Test term aggs with split_size
# Here we increase split_size to 5, so we will get the 3 documents with name "Fritz"
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 5
expected:
aggregations:
names:
buckets:
# We get all 3 documents with name "Fritz"
# We also get doc_count_error_upper_bound: 0, so the result is exact.
- doc_count: 3
key: "Fritz"
sum_other_doc_count: 7
doc_count_error_upper_bound: 0
---
# Test date histogram + percentiles sub-aggregation
method: [GET]
engines:
@@ -154,7 +256,7 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 1420070400000.0
key_as_string: '2015-01-01T00:00:00Z'
response:
@@ -185,11 +287,11 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 0.0
- doc_count: 0
key: 50.0
- doc_count: 3
- doc_count: 4
key: 100.0

---
@@ -54,7 +54,8 @@ num_retries: 10
params:
commit: force
ndjson:
- {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Albert", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]}
- {"name": "Fred", "response": 100, "id": 3, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Manfred", "response": 120, "id": 13, "date": "2015-01-11T12:10:30Z", "host": "192.168.0.11", "tags": ["nice"]}
- {"name": "Horst", "id": 2, "date": "2015-01-01T11:11:30Z", "host": "192.168.0.10", "tags": ["nice", "cool"]}
- {"name": "Fritz", "response": 30, "id": 5, "host": "192.168.0.1", "tags": ["nice", "cool"]}
@@ -65,8 +66,9 @@ endpoint: aggregations/ingest
params:
commit: force
ndjson:
- {"name": "Werner", "response": 20, "id": 0, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Holger", "response": 30, "id": 4, "date": "2015-02-06T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Werner", "response": 20, "id": 5, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Bernhard", "response": 130, "id": 14, "date": "2015-02-16T00:00:00Z", "host": "192.168.0.15"}
- {"name": "Fritz", "response": 30, "id": 5}
