fix agg split_size parameter, add docs test
fix aggregation parameter split_size propagation
improve split_size docs
add split_size parameter test
PSeitz committed Feb 26, 2024
1 parent aa02e02 commit 1dad94d
Showing 4 changed files with 143 additions and 27 deletions.
13 changes: 10 additions & 3 deletions docs/reference/aggregation.md
@@ -532,15 +532,22 @@ By default, the top 10 terms with the most documents are returned. Larger values

###### **split_size**

To get more accurate results, we fetch more than `size` terms from each segment/split.
To get more accurate results, we fetch more than `size` terms from each segment/split. `shard_size` is accepted as an alias for this parameter.

Increasing this value will increase the accuracy, but also the CPU and memory usage.
See the [`document count error`](#document-count-error) section for more information on how this parameter impacts accuracy.

`split_size` is the number of terms that are sent from a leaf result to the root node.
Example: If you have 100 splits and `split_size` is 1000, the root node will receive 100_000 terms to merge.
With an average cost of 50 bytes per term, this requires up to 5MB of memory.
The behaviour here deviates from Elasticsearch, since we don't have global ordinals; that means we need to send serialized terms to the root node.

Defaults to `size * 1.5 + 10`.
Defaults to `size * 10`.
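
For illustration, here is a minimal sketch of a terms aggregation request that sets `split_size`, assuming the Elasticsearch-compatible search endpoint; the `host` field name and the numbers are assumptions for the example, not prescribed values. With 100 splits, such a request would send up to 100 * 100 = 10_000 terms to the root node for merging.

```json
{
  "query": { "match_all": {} },
  "aggs": {
    "hosts": {
      "terms": {
        "field": "host",
        "size": 10,
        "split_size": 100
      }
    }
  }
}
```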

###### **show_term_doc_count_error**

If you set the `show_term_doc_count_error` parameter to `true`, the terms aggregation will include `doc_count_error_upper_bound`, which is an upper bound on the error of the `doc_count` returned by each split.
It’s the sum of the size of the largest bucket on each split that didn’t fit into split_size.
It’s the sum of the size of the largest bucket on each split that didn’t fit into `split_size`.

Defaults to true when ordering by count desc.
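
A minimal request sketch for this parameter, again assuming the Elasticsearch-compatible search endpoint and an illustrative `host` field; the terms aggregation result then additionally carries a `doc_count_error_upper_bound` entry.

```json
{
  "query": { "match_all": {} },
  "aggs": {
    "hosts": {
      "terms": {
        "field": "host",
        "size": 10,
        "show_term_doc_count_error": true
      }
    }
  }
}
```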

51 changes: 40 additions & 11 deletions quickwit/quickwit-search/src/collector.rs
@@ -29,7 +29,10 @@ use quickwit_proto::search::{
SplitSearchError,
};
use serde::Deserialize;
use tantivy::aggregation::agg_req::{get_fast_field_names, Aggregations};
use tantivy::aggregation::agg_req::{
    get_fast_field_names, Aggregation, AggregationVariants, Aggregations,
};
use tantivy::aggregation::bucket::TermsAggregation;
use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use tantivy::aggregation::{AggregationLimits, AggregationSegmentCollector};
use tantivy::collector::{Collector, SegmentCollector};
@@ -448,6 +451,22 @@ pub enum QuickwitAggregations {
TantivyAggregations(Aggregations),
}

/// Applies `cb` to every terms aggregation in the request, including sub-aggregations.
fn visit_term_aggregations<F>(aggs: &mut Aggregations, cb: &mut F)
where F: FnMut(&mut TermsAggregation) {
    visit_aggregations_mut(aggs, &mut |agg| {
        if let AggregationVariants::Terms(terms_agg) = &mut agg.agg {
            cb(terms_agg);
        }
    });
}

/// Recursively applies `cb` to every aggregation and its sub-aggregations.
fn visit_aggregations_mut<F>(aggs: &mut Aggregations, cb: &mut F)
where F: FnMut(&mut Aggregation) {
    for agg in aggs.values_mut() {
        cb(agg);
        visit_aggregations_mut(&mut agg.sub_aggregation, cb);
    }
}

impl QuickwitAggregations {
fn fast_field_names(&self) -> HashSet<String> {
match self {
@@ -611,16 +630,26 @@ impl Collector for QuickwitCollector {
Box::new(collector.for_segment(0, segment_reader)?),
))
}
Some(QuickwitAggregations::TantivyAggregations(aggs)) => Some(
    AggregationSegmentCollectors::TantivyAggregationSegmentCollector(
        AggregationSegmentCollector::from_agg_req_and_reader(
            aggs,
            segment_reader,
            segment_ord,
            &self.aggregation_limits,
        )?,
    ),
),
Some(QuickwitAggregations::TantivyAggregations(aggs)) => {
    let mut aggs = aggs.clone();
    visit_term_aggregations(&mut aggs, &mut |terms_agg| {
        // Forward the `split_size` (alias: `shard_size`) parameter to `segment_size`,
        // as the two are equivalent in the context of Quickwit.
        if let Some(split_size) = &terms_agg.split_size {
            terms_agg.segment_size = Some(*split_size);
        }
    });
    Some(
        AggregationSegmentCollectors::TantivyAggregationSegmentCollector(
            AggregationSegmentCollector::from_agg_req_and_reader(
                &aggs,
                segment_reader,
                segment_ord,
                &self.aggregation_limits,
            )?,
        ),
    )
}
None => None,
};
let score_extractor = get_score_extractor(&self.sort_by, segment_reader)?;
@@ -15,7 +15,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z" }
---
# Test date histogram aggregation and sub-aggregation
@@ -39,7 +39,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 5, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test date histogram aggregation + exists and sub-aggregation
@@ -67,7 +67,7 @@ expected:
aggregations:
date_histo:
buckets:
- { "doc_count": 3, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 80.0, "count": 3, "max": 120.0, "min": 20.0, "sum": 240.0 } }
- { "doc_count": 4, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z", "response": { "avg": 85.0, "count": 4, "max": 120.0, "min": 20.0, "sum": 340.0 } }
- { "doc_count": 2, "key": 1422662400000.0, "key_as_string": "2015-01-31T00:00:00Z", "response": { "avg": 80.0, "count": 2, "max": 130.0, "min": 30.0, "sum": 160.0 } }
---
# Test range aggregation
@@ -89,9 +89,9 @@ expected:
aggregations:
my_range:
buckets:
- { "doc_count": 4, "key": "fast", "to": 50.0 }
- { "doc_count": 5, "key": "fast", "to": 50.0 }
- { "doc_count": 0, "from": 50.0, "key": "medium", "to": 80.0 }
- { "doc_count": 3, "from": 80.0, "key": "slow" }
- { "doc_count": 4, "from": 80.0, "key": "slow" }
---
# Test term aggs
method: [GET]
@@ -111,7 +111,7 @@ expected:
aggregations:
hosts:
buckets:
- doc_count: 3
- doc_count: 4
key: 192.168.0.10
- doc_count: 2
key: 192.168.0.1
@@ -123,13 +123,91 @@ expected:
sum_other_doc_count: 0
tags:
buckets:
- doc_count: 4
- doc_count: 5
key: nice
- doc_count: 2
key: cool
doc_count_error_upper_bound: 0
sum_other_doc_count: 0
---
# Test term aggs with split_size
# We set split_size to 1, so the "Fritz" count from one split gets cut off and one document
# with name "Fritz" is missing from the merged result.
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the split_size parameter set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2

---
# Test term aggs with shard_size
# shard_size is an alias for split_size
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
shard_size: 1
expected:
aggregations:
names:
buckets:
# There are 3 documents with name "Fritz" but we only get 2. One does not get passed to the
# root node, because it gets cut off due to the shard_size parameter (an alias for split_size) set to 1.
# We also get doc_count_error_upper_bound: 2, which signals that the result is approximate.
- doc_count: 2
key: "Fritz"
sum_other_doc_count: 8
doc_count_error_upper_bound: 2
---
# Test term aggs with split_size
# Here we increase split_size to 5, so we get all 3 documents with name "Fritz"
method: [GET]
engines:
- quickwit
endpoint: _elastic/aggregations/_search
json:
query: { match_all: {} }
aggs:
names:
terms:
field: "name"
size: 1
split_size: 5
expected:
aggregations:
names:
buckets:
# We get all 3 documents with name "Fritz".
# We also get doc_count_error_upper_bound: 0, so the result is exact.
- doc_count: 3
key: "Fritz"
sum_other_doc_count: 7
doc_count_error_upper_bound: 0
---
# Test date histogram + percentiles sub-aggregation
method: [GET]
engines:
@@ -154,7 +232,7 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 1420070400000.0
key_as_string: '2015-01-01T00:00:00Z'
response:
@@ -185,11 +263,11 @@ expected:
aggregations:
metrics:
buckets:
- doc_count: 4
- doc_count: 5
key: 0.0
- doc_count: 0
key: 50.0
- doc_count: 3
- doc_count: 4
key: 100.0

---
@@ -54,7 +54,8 @@ num_retries: 10
params:
commit: force
ndjson:
- {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Fred", "response": 100, "id": 1, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.10", "tags": ["nice"]}
- {"name": "Fred", "response": 100, "id": 3, "date": "2015-01-01T12:10:30Z", "host": "192.168.0.1", "tags": ["nice"]}
- {"name": "Manfred", "response": 120, "id": 13, "date": "2015-01-11T12:10:30Z", "host": "192.168.0.11", "tags": ["nice"]}
- {"name": "Horst", "id": 2, "date": "2015-01-01T11:11:30Z", "host": "192.168.0.10", "tags": ["nice", "cool"]}
- {"name": "Fritz", "response": 30, "id": 5, "host": "192.168.0.1", "tags": ["nice", "cool"]}
@@ -65,8 +66,9 @@ endpoint: aggregations/ingest
params:
commit: force
ndjson:
- {"name": "Werner", "response": 20, "id": 0, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Fritz", "response": 30, "id": 0}
- {"name": "Holger", "response": 30, "id": 4, "date": "2015-02-06T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Werner", "response": 20, "id": 5, "date": "2015-01-02T00:00:00Z", "host": "192.168.0.10"}
- {"name": "Bernhard", "response": 130, "id": 14, "date": "2015-02-16T00:00:00Z", "host": "192.168.0.15"}
- {"name": "Fritz", "response": 30, "id": 5}
