From 6ec434380883ee0c20a7b164fa1135fc66b9f50a Mon Sep 17 00:00:00 2001 From: minghong Date: Wed, 14 Aug 2024 09:18:09 +0800 Subject: [PATCH] [feat](nereids) adjust stats derive by delta row (#39185) #39222 After analyzing, user may insert new rows. analyzed rows: the rows have been analyzed delta row: rows inserted after analyze job if analyzed rows are filtered out, then we try to estimate filter result by delta row with unknown column stats. ## Proposed changes Issue Number: close #xxx --- .../doris/nereids/stats/FilterEstimation.java | 17 ++++-- .../doris/nereids/stats/StatsCalculator.java | 31 +++++----- .../apache/doris/statistics/Statistics.java | 24 +++++++- .../doris/statistics/StatisticsBuilder.java | 10 +++- .../nereids_p0/delta_row/delta_row.groovy | 57 +++++++++++++++++++ 5 files changed, 117 insertions(+), 22 deletions(-) create mode 100644 regression-test/suites/nereids_p0/delta_row/delta_row.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index b4b4fa5e3f8ebd..6f6e768caae1f6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -87,12 +87,21 @@ public FilterEstimation(Set aggSlots) { /** * This method will update the stats according to the selectivity. */ - public Statistics estimate(Expression expression, Statistics statistics) { + public Statistics estimate(Expression expression, Statistics inputStats) { // For a comparison predicate, only when it's left side is a slot and right side is a literal, we would // consider is a valid predicate. - Statistics stats = expression.accept(this, new EstimationContext(statistics)); - stats.enforceValid(); - return stats; + Statistics outputStats = expression.accept(this, new EstimationContext(inputStats)); + if (outputStats.getRowCount() == 0 && inputStats.getDeltaRowCount() > 0) { + StatisticsBuilder deltaStats = new StatisticsBuilder(); + deltaStats.setDeltaRowCount(0); + deltaStats.setRowCount(inputStats.getDeltaRowCount()); + for (Expression expr : inputStats.columnStatistics().keySet()) { + deltaStats.putColumnStatistics(expr, ColumnStatistic.UNKNOWN); + } + outputStats = expression.accept(this, new EstimationContext(deltaStats.build())); + } + outputStats.enforceValid(); + return outputStats; } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index d711b99655ba52..2c954f9ee4382c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -646,10 +646,10 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { idxId = olapScan.getSelectedIndexId(); } } - if (deltaRowCount > 0 && LOG.isDebugEnabled()) { - LOG.debug("{} is partially analyzed, clear min/max values in column stats", - catalogRelation.getTable().getName()); - } + // if (deltaRowCount > 0 && LOG.isDebugEnabled()) { + // LOG.debug("{} is partially analyzed, clear min/max values in column stats", + // catalogRelation.getTable().getName()); + // } for (SlotReference slotReference : slotSet) { String colName = slotReference.getColumn().isPresent() ? slotReference.getColumn().get().getName() @@ -676,14 +676,14 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { hasUnknownCol = true; } if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) { - if (deltaRowCount > 0) { - // clear min-max to avoid error estimation - // for example, after yesterday data loaded, user send query about yesterday immediately. - // since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer - // estimates the filter result is zero - colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY) - .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY); - } + // if (deltaRowCount > 0) { + // // clear min-max to avoid error estimation + // // for example, after yesterday data loaded, user send query about yesterday immediately. + // // since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer + // // estimates the filter result is zero + // colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY) + // .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY); + // } columnStatisticBuilderMap.put(slotReference, colStatsBuilder); } else { columnStatisticBuilderMap.put(slotReference, new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)); @@ -693,17 +693,18 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { if (hasUnknownCol && ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) { ConnectContext.get().getStatementContext().setHasUnknownColStats(true); } - return normalizeCatalogRelationColumnStatsRowCount(rowCount, columnStatisticBuilderMap); + return normalizeCatalogRelationColumnStatsRowCount(rowCount, columnStatisticBuilderMap, deltaRowCount); } private Statistics normalizeCatalogRelationColumnStatsRowCount(double rowCount, - Map columnStatisticBuilderMap) { + Map columnStatisticBuilderMap, + double deltaRowCount) { Map columnStatisticMap = new HashMap<>(); for (Expression slot : columnStatisticBuilderMap.keySet()) { columnStatisticMap.put(slot, columnStatisticBuilderMap.get(slot).setCount(rowCount).build()); } - return new Statistics(rowCount, columnStatisticMap); + return new Statistics(rowCount, columnStatisticMap, deltaRowCount); } private Statistics computeTopN(TopN topN) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index da6bf93759308e..9cec4cc18d4ba6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -37,15 +37,23 @@ public class Statistics { // the byte size of one tuple private double tupleSize; + private double deltaRowCount = 0.0; + public Statistics(Statistics another) { this.rowCount = another.rowCount; this.expressionToColumnStats = new HashMap<>(another.expressionToColumnStats); this.tupleSize = another.tupleSize; + this.deltaRowCount = another.getDeltaRowCount(); } - public Statistics(double rowCount, Map expressionToColumnStats) { + public Statistics(double rowCount, Map expressionToColumnStats, double deltaRowCount) { this.rowCount = rowCount; this.expressionToColumnStats = expressionToColumnStats; + this.deltaRowCount = deltaRowCount; + } + + public Statistics(double rowCount, Map expressionToColumnStats) { + this(rowCount, expressionToColumnStats, 0); } public ColumnStatistic findColumnStatistics(Expression expression) { @@ -150,7 +158,11 @@ public String toString() { return "-Infinite"; } DecimalFormat format = new DecimalFormat("#,###.##"); - return format.format(rowCount); + String rows = format.format(rowCount); + if (deltaRowCount > 0) { + rows = rows + "(" + format.format(deltaRowCount) + ")"; + } + return rows; } public int getBENumber() { @@ -209,4 +221,12 @@ public Statistics normalizeByRatio(double originRowCount) { } return builder.build(); } + + public double getDeltaRowCount() { + return deltaRowCount; + } + + public void setDeltaRowCount(double deltaRowCount) { + this.deltaRowCount = deltaRowCount; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java index a0e75f7df38090..1e39957354607d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java @@ -28,6 +28,8 @@ public class StatisticsBuilder { private final Map expressionToColumnStats; + private double deltaRowCount = 0.0; + public StatisticsBuilder() { expressionToColumnStats = new HashMap<>(); } @@ -36,6 +38,7 @@ public StatisticsBuilder(Statistics statistics) { this.rowCount = statistics.getRowCount(); expressionToColumnStats = new HashMap<>(); expressionToColumnStats.putAll(statistics.columnStatistics()); + deltaRowCount = statistics.getDeltaRowCount(); } public StatisticsBuilder setRowCount(double rowCount) { @@ -54,7 +57,12 @@ public StatisticsBuilder putColumnStatistics(Expression expression, ColumnStatis return this; } + public StatisticsBuilder setDeltaRowCount(double deltaRowCount) { + this.deltaRowCount = deltaRowCount; + return this; + } + public Statistics build() { - return new Statistics(rowCount, expressionToColumnStats); + return new Statistics(rowCount, expressionToColumnStats, deltaRowCount); } } diff --git a/regression-test/suites/nereids_p0/delta_row/delta_row.groovy b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy new file mode 100644 index 00000000000000..1550d3f6b15e16 --- /dev/null +++ b/regression-test/suites/nereids_p0/delta_row/delta_row.groovy @@ -0,0 +1,57 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("delta_row") { + String database = context.config.getDbNameByFile(context.file) + sql """ + drop database if exists ${database}; + create database ${database}; + use ${database}; + CREATE TABLE IF NOT EXISTS t ( + k int(11) null comment "", + v string replace null comment "", + ) engine=olap + DISTRIBUTED BY HASH(k) BUCKETS 5 properties("replication_num" = "1"); + + insert into t values (1, "a"),(2, "b"),(3, 'c'),(4,'d'); + analyze table t with sync; + """ + explain { + sql "physical plan select * from t where k > 6" + contains("stats=0 ") + contains("stats=4,") + // PhysicalResultSink[120] ( outputExprs=[k#0, v#1] ) + // +--PhysicalDistribute[117]@@1 ( distributionSpec=DistributionSpecGather, stats=0 ) + // +--PhysicalFilter[114]@1 ( predicates=(k#0 > 6), stats=0 ) + // +--PhysicalOlapScan[111]@0 ( qualified=internal.default_cluster:regression_test_nereids_p0_delta_row.t, stats=4, fr=Optional[1] ) + } + + sql "set global enable_auto_analyze=false;" + + sql "insert into t values (10, 'c');" + explain { + sql "physical plan select * from t where k > 6" + contains("stats=0.5 ") + contains("stats=5(1),") + notContains("stats=0 ") + notContains("stats=4,") + // PhysicalResultSink[120] ( outputExprs=[k#0, v#1] ) + // +--PhysicalDistribute[117]@@1 ( distributionSpec=DistributionSpecGather, stats=0.5 ) + // +--PhysicalFilter[114]@1 ( predicates=(k#0 > 6), stats=0.5 ) + // +--PhysicalOlapScan[111]@0 ( qualified=internal.default_cluster:regression_test_nereids_p0_delta_row.t, stats=5(1), fr=Optional[1] ) + } +} \ No newline at end of file