Skip to content

Commit

Permalink
[feat](nereids) adjust stats derive by delta row (#39185)
Browse files Browse the repository at this point in the history
#39222
After analyzing, user may insert new rows. 
analyzed rows: the rows have been analyzed
delta row: rows inserted after analyze job

if analyzed rows are filtered out, then we try to estimate filter result
by delta row with unknown column stats.
  

## Proposed changes

Issue Number: close #xxx

<!--Describe your changes.-->
  • Loading branch information
englefly authored Aug 14, 2024
1 parent a751eb6 commit c9813c8
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 22 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,21 @@ public FilterEstimation(Set<Slot> aggSlots) {
/**
* This method will update the stats according to the selectivity.
*/
public Statistics estimate(Expression expression, Statistics statistics) {
public Statistics estimate(Expression expression, Statistics inputStats) {
// For a comparison predicate, only when it's left side is a slot and right side is a literal, we would
// consider is a valid predicate.
Statistics stats = expression.accept(this, new EstimationContext(statistics));
stats.enforceValid();
return stats;
Statistics outputStats = expression.accept(this, new EstimationContext(inputStats));
if (outputStats.getRowCount() == 0 && inputStats.getDeltaRowCount() > 0) {
StatisticsBuilder deltaStats = new StatisticsBuilder();
deltaStats.setDeltaRowCount(0);
deltaStats.setRowCount(inputStats.getDeltaRowCount());
for (Expression expr : inputStats.columnStatistics().keySet()) {
deltaStats.putColumnStatistics(expr, ColumnStatistic.UNKNOWN);
}
outputStats = expression.accept(this, new EstimationContext(deltaStats.build()));
}
outputStats.enforceValid();
return outputStats;
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -646,10 +646,10 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) {
idxId = olapScan.getSelectedIndexId();
}
}
if (deltaRowCount > 0 && LOG.isDebugEnabled()) {
LOG.debug("{} is partially analyzed, clear min/max values in column stats",
catalogRelation.getTable().getName());
}
// if (deltaRowCount > 0 && LOG.isDebugEnabled()) {
// LOG.debug("{} is partially analyzed, clear min/max values in column stats",
// catalogRelation.getTable().getName());
// }
for (SlotReference slotReference : slotSet) {
String colName = slotReference.getColumn().isPresent()
? slotReference.getColumn().get().getName()
Expand All @@ -676,14 +676,14 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) {
hasUnknownCol = true;
}
if (ConnectContext.get() != null && ConnectContext.get().getSessionVariable().enableStats) {
if (deltaRowCount > 0) {
// clear min-max to avoid error estimation
// for example, after yesterday data loaded, user send query about yesterday immediately.
// since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer
// estimates the filter result is zero
colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
}
// if (deltaRowCount > 0) {
// // clear min-max to avoid error estimation
// // for example, after yesterday data loaded, user send query about yesterday immediately.
// // since yesterday data are not analyzed, the max date is before yesterday, and hence optimizer
// // estimates the filter result is zero
// colStatsBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY)
// .setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
// }
columnStatisticBuilderMap.put(slotReference, colStatsBuilder);
} else {
columnStatisticBuilderMap.put(slotReference, new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN));
Expand All @@ -693,17 +693,18 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) {
if (hasUnknownCol && ConnectContext.get() != null && ConnectContext.get().getStatementContext() != null) {
ConnectContext.get().getStatementContext().setHasUnknownColStats(true);
}
return normalizeCatalogRelationColumnStatsRowCount(rowCount, columnStatisticBuilderMap);
return normalizeCatalogRelationColumnStatsRowCount(rowCount, columnStatisticBuilderMap, deltaRowCount);
}

private Statistics normalizeCatalogRelationColumnStatsRowCount(double rowCount,
Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap) {
Map<Expression, ColumnStatisticBuilder> columnStatisticBuilderMap,
double deltaRowCount) {
Map<Expression, ColumnStatistic> columnStatisticMap = new HashMap<>();
for (Expression slot : columnStatisticBuilderMap.keySet()) {
columnStatisticMap.put(slot,
columnStatisticBuilderMap.get(slot).setCount(rowCount).build());
}
return new Statistics(rowCount, columnStatisticMap);
return new Statistics(rowCount, columnStatisticMap, deltaRowCount);
}

private Statistics computeTopN(TopN topN) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,23 @@ public class Statistics {
// the byte size of one tuple
private double tupleSize;

private double deltaRowCount = 0.0;

public Statistics(Statistics another) {
this.rowCount = another.rowCount;
this.expressionToColumnStats = new HashMap<>(another.expressionToColumnStats);
this.tupleSize = another.tupleSize;
this.deltaRowCount = another.getDeltaRowCount();
}

public Statistics(double rowCount, Map<Expression, ColumnStatistic> expressionToColumnStats) {
public Statistics(double rowCount, Map<Expression, ColumnStatistic> expressionToColumnStats, double deltaRowCount) {
this.rowCount = rowCount;
this.expressionToColumnStats = expressionToColumnStats;
this.deltaRowCount = deltaRowCount;
}

public Statistics(double rowCount, Map<Expression, ColumnStatistic> expressionToColumnStats) {
this(rowCount, expressionToColumnStats, 0);
}

public ColumnStatistic findColumnStatistics(Expression expression) {
Expand Down Expand Up @@ -150,7 +158,11 @@ public String toString() {
return "-Infinite";
}
DecimalFormat format = new DecimalFormat("#,###.##");
return format.format(rowCount);
String rows = format.format(rowCount);
if (deltaRowCount > 0) {
rows = rows + "(" + format.format(deltaRowCount) + ")";
}
return rows;
}

public int getBENumber() {
Expand Down Expand Up @@ -209,4 +221,12 @@ public Statistics normalizeByRatio(double originRowCount) {
}
return builder.build();
}

public double getDeltaRowCount() {
return deltaRowCount;
}

public void setDeltaRowCount(double deltaRowCount) {
this.deltaRowCount = deltaRowCount;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ public class StatisticsBuilder {

private final Map<Expression, ColumnStatistic> expressionToColumnStats;

private double deltaRowCount = 0.0;

public StatisticsBuilder() {
expressionToColumnStats = new HashMap<>();
}
Expand All @@ -36,6 +38,7 @@ public StatisticsBuilder(Statistics statistics) {
this.rowCount = statistics.getRowCount();
expressionToColumnStats = new HashMap<>();
expressionToColumnStats.putAll(statistics.columnStatistics());
deltaRowCount = statistics.getDeltaRowCount();
}

public StatisticsBuilder setRowCount(double rowCount) {
Expand All @@ -54,7 +57,12 @@ public StatisticsBuilder putColumnStatistics(Expression expression, ColumnStatis
return this;
}

public StatisticsBuilder setDeltaRowCount(double deltaRowCount) {
this.deltaRowCount = deltaRowCount;
return this;
}

public Statistics build() {
return new Statistics(rowCount, expressionToColumnStats);
return new Statistics(rowCount, expressionToColumnStats, deltaRowCount);
}
}
57 changes: 57 additions & 0 deletions regression-test/suites/nereids_p0/delta_row/delta_row.groovy
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

suite("delta_row") {
String database = context.config.getDbNameByFile(context.file)
sql """
drop database if exists ${database};
create database ${database};
use ${database};
CREATE TABLE IF NOT EXISTS t (
k int(11) null comment "",
v string replace null comment "",
) engine=olap
DISTRIBUTED BY HASH(k) BUCKETS 5 properties("replication_num" = "1");
insert into t values (1, "a"),(2, "b"),(3, 'c'),(4,'d');
analyze table t with sync;
"""
explain {
sql "physical plan select * from t where k > 6"
contains("stats=0 ")
contains("stats=4,")
// PhysicalResultSink[120] ( outputExprs=[k#0, v#1] )
// +--PhysicalDistribute[117]@@1 ( distributionSpec=DistributionSpecGather, stats=0 )
// +--PhysicalFilter[114]@1 ( predicates=(k#0 > 6), stats=0 )
// +--PhysicalOlapScan[111]@0 ( qualified=internal.default_cluster:regression_test_nereids_p0_delta_row.t, stats=4, fr=Optional[1] )
}

sql "set global enable_auto_analyze=false;"

sql "insert into t values (10, 'c');"
explain {
sql "physical plan select * from t where k > 6"
contains("stats=0.5 ")
contains("stats=5(1),")
notContains("stats=0 ")
notContains("stats=4,")
// PhysicalResultSink[120] ( outputExprs=[k#0, v#1] )
// +--PhysicalDistribute[117]@@1 ( distributionSpec=DistributionSpecGather, stats=0.5 )
// +--PhysicalFilter[114]@1 ( predicates=(k#0 > 6), stats=0.5 )
// +--PhysicalOlapScan[111]@0 ( qualified=internal.default_cluster:regression_test_nereids_p0_delta_row.t, stats=5(1), fr=Optional[1] )
}
}

0 comments on commit c9813c8

Please sign in to comment.