Skip to content

Commit

Permalink
[opt](nereids) clean count usage in ColumnStatistic during stats deri…
Browse files Browse the repository at this point in the history
…ving (apache#40654)

## Proposed changes

Stats deriving refinement, step 1: clean up the usage of count in
ColumnStatistic during stats deriving (mainly for the stats-available case), to
avoid serious stats deriving problems.
a. Use the Statistics rowCount instead of the count in ColumnStatistic during stats
deriving, since these two values may be inconsistent with each other and lead to stats
deriving problems.
b. Remove the setCount interface so that the count field cannot be used
unexpectedly during deriving in the future.
c. Refine the notNullSelectivity computation and the corresponding estimation.

Benchmark plan shape change:
- tpcds query74: no performance impact.

---------

Co-authored-by: zhongjian.xzj <[email protected]>
  • Loading branch information
xzj7019 and zhongjian.xzj authored Sep 19, 2024
1 parent 2385734 commit 80482c5
Show file tree
Hide file tree
Showing 26 changed files with 220 additions and 380 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -603,9 +603,8 @@ private Optional<ColumnStatistic> getHiveColumnStats(String colName) {
if (!parameters.containsKey(NUM_ROWS) || Long.parseLong(parameters.get(NUM_ROWS)) == 0) {
return Optional.empty();
}
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
long count = Long.parseLong(parameters.get(NUM_ROWS));
columnStatisticBuilder.setCount(count);
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count);
// The tableStats length is at most 1.
for (ColumnStatisticsObj tableStat : tableStats) {
if (!tableStat.isSetStatsData()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,7 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) {
.setMaxValue(literalVal)
.setMinValue(literalVal)
.setNdv(1)
.setNumNulls(1)
.setNumNulls(literal.isNullLiteral() ? 1 : 0)
.setAvgSizeByte(1)
.setMinExpr(literal.toLegacyLiteral())
.setMaxExpr(literal.toLegacyLiteral())
Expand Down Expand Up @@ -274,13 +274,13 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
int exprResultTypeWidth = binaryArithmetic.getDataType().width();
double dataSize = exprResultTypeWidth * rowCount;
if (binaryArithmetic instanceof Add) {
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin)
.setMaxValue(leftMax + rightMax)
.setMinExpr(null).setMaxExpr(null).build();
}
if (binaryArithmetic instanceof Subtract) {
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax)
.setMaxValue(leftMax - rightMin).setMinExpr(null)
.setMaxExpr(null).build();
Expand All @@ -297,7 +297,7 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
Math.max(leftMin * rightMin, leftMin * rightMax),
leftMax * rightMin),
leftMax * rightMax);
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max)
.setMaxExpr(null).setMinExpr(null).build();
}
Expand All @@ -312,14 +312,14 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic,
Math.max(leftMin / noneZeroDivisor(rightMin), leftMin / noneZeroDivisor(rightMax)),
leftMax / noneZeroDivisor(rightMin)),
leftMax / noneZeroDivisor(rightMax));
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
.setNumNulls(numNulls).setDataSize(binaryArithmetic.getDataType().width()).setMinValue(min)
.setMaxValue(max).build();
}
if (binaryArithmetic instanceof Mod) {
double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax));
double max = -min;
return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv)
return new ColumnStatisticBuilder().setNdv(ndv)
.setAvgSizeByte(exprResultTypeWidth)
.setDataSize(dataSize)
.setNumNulls(numNulls)
Expand Down Expand Up @@ -363,8 +363,7 @@ public ColumnStatistic visitMax(Max max, Statistics context) {
public ColumnStatistic visitCount(Count count, Statistics context) {
double width = count.getDataType().width();
// for scalar agg, ndv and row count will be normalized by 1 in StatsCalculator.computeAggregate()
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(context.getRowCount())
.setAvgSizeByte(width).build();
return new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setAvgSizeByte(width).build();
}

// TODO: return a proper estimated stat after supports histogram
Expand All @@ -382,14 +381,14 @@ public ColumnStatistic visitAvg(Avg avg, Statistics context) {
@Override
public ColumnStatistic visitYear(Year year, Statistics context) {
ColumnStatistic childStat = year.child().accept(this, context);
double rowCount = context.getRowCount();
long minYear = 1970;
long maxYear = 2038;
return new ColumnStatisticBuilder()
.setCount(childStat.count)
.setNdv(maxYear - minYear + 1)
.setAvgSizeByte(4)
.setNumNulls(childStat.numNulls)
.setDataSize(4 * childStat.count)
.setDataSize(4 * rowCount)
.setMinValue(minYear)
.setMaxValue(maxYear).setMinExpr(null).build();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -323,6 +323,8 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats
selectivity = DEFAULT_INEQUALITY_COEFFICIENT;
} else {
double ndv = statsForLeft.ndv;
double numNulls = statsForLeft.numNulls;
double rowCount = context.statistics.getRowCount();
if (statsForRight.isUnKnown) {
if (ndv >= 1.0) {
selectivity = 1.0 / ndv;
Expand All @@ -338,7 +340,7 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats
} else {
selectivity = DEFAULT_INEQUALITY_COEFFICIENT;
}
selectivity = getNotNullSelectivity(statsForLeft, selectivity);
selectivity = getNotNullSelectivity(numNulls, rowCount, ndv, selectivity);
}
}
Statistics equalStats = context.statistics.withSel(selectivity);
Expand Down Expand Up @@ -451,7 +453,8 @@ A not in (1, 2, 3, 100):
compareExprStatsBuilder.setNumNulls(0);
Statistics estimated = new StatisticsBuilder(context.statistics).build();
ColumnStatistic stats = compareExprStatsBuilder.build();
selectivity = getNotNullSelectivity(stats, selectivity);
selectivity = getNotNullSelectivity(compareExprStats.numNulls, estimated.getRowCount(),
compareExprStats.ndv, selectivity);
estimated = estimated.withSel(selectivity);
estimated.addColumnStats(compareExpr, stats);
context.addKeyIfSlot(compareExpr);
Expand Down Expand Up @@ -546,7 +549,7 @@ public Statistics visitIsNull(IsNull isNull, EstimationContext context) {
outputRowCount = Math.max(outputRowCount, 1);
}
ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(childColStats);
colBuilder.setCount(outputRowCount).setNumNulls(outputRowCount)
colBuilder.setNumNulls(outputRowCount)
.setMaxValue(Double.POSITIVE_INFINITY)
.setMinValue(Double.NEGATIVE_INFINITY)
.setNdv(0);
Expand Down Expand Up @@ -597,7 +600,6 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType
.setMaxValue(Double.POSITIVE_INFINITY)
.setMaxExpr(null)
.setNdv(0)
.setCount(0)
.setNumNulls(0);
} else {
leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats)
Expand All @@ -615,9 +617,8 @@ private Statistics estimateBinaryComparisonFilter(Expression leftExpr, DataType
} else {
sel = Math.max(sel, RANGE_SELECTIVITY_THRESHOLD);
}
sel = getNotNullSelectivity(leftStats, sel);
sel = getNotNullSelectivity(leftStats.numNulls, context.statistics.getRowCount(), leftStats.ndv, sel);
updatedStatistics = context.statistics.withSel(sel);
leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount());
}
updatedStatistics.addColumnStats(leftExpr, leftColumnStatisticBuilder.build());
context.addKeyIfSlot(leftExpr);
Expand Down Expand Up @@ -720,36 +721,27 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati
@Override
public Statistics visitLike(Like like, EstimationContext context) {
StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics);
statsBuilder.setRowCount(context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY);
double rowCount = context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY;
statsBuilder.setRowCount(rowCount);
if (like.left() instanceof Slot) {
ColumnStatistic origin = context.statistics.findColumnStatistics(like.left());
Preconditions.checkArgument(origin != null,
"col stats not found. slot=%s in %s",
like.left().toSql(), like.toSql());
ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin);
double selectivity = StatsMathUtil.divide(DEFAULT_LIKE_COMPARISON_SELECTIVITY, origin.ndv);
double notNullSel = getNotNullSelectivity(origin, selectivity);
colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY)
.setCount(notNullSel * context.statistics.getRowCount()).setNumNulls(0);
colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0);
statsBuilder.putColumnStatistics(like.left(), colBuilder.build());
context.addKeyIfSlot(like.left());
}
return statsBuilder.build();
}

private double getNotNullSelectivity(ColumnStatistic stats, double origSel) {
double rowCount = stats.count;
double numNulls = stats.numNulls;

// comment out the following check since the current rowCount and ndv may be inconsistent
// e.g, rowCount has been reduced by one filter but another filter column's
// ndv and numNull remains originally, which will unexpectedly go into the following
// normalization.

//if (numNulls > rowCount - ndv) {
// numNulls = rowCount - ndv > 0 ? rowCount - ndv : 0;
//}
double notNullSel = rowCount <= 1.0 ? 1.0 : 1 - Statistics.getValidSelectivity(numNulls / rowCount);
private double getNotNullSelectivity(double origNumNulls, double origRowCount, double origNdv, double origSel) {
if (origNumNulls > origRowCount - origNdv) {
origNumNulls = origRowCount - origNdv > 0 ? origRowCount - origNdv : 0;
}
double notNullSel = origRowCount <= 1.0 ? 1.0 : 1 - Statistics
.getValidSelectivity(origNumNulls / origRowCount);
double validSel = origSel * notNullSel;
return Statistics.getValidSelectivity(validSel);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -403,8 +403,7 @@ private Statistics computeOlapScan(OlapScan olapScan) {
for (Slot slot : ((Relation) olapScan).getOutput()) {
if (derivedStats.findColumnStatistics(slot) == null) {
derivedStats.addColumnStats(slot,
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN)
.setCount(derivedRowCount).build());
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN, derivedRowCount).build());
}
}
return derivedStats;
Expand All @@ -431,7 +430,7 @@ private Statistics computeOlapScan(OlapScan olapScan) {
// get row count from any visible slotReference's colStats
for (Slot slot : ((Plan) olapScan).getOutput()) {
builder.putColumnStatistics(slot,
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN).setCount(tableRowCount).build());
new ColumnStatisticBuilder(ColumnStatistic.UNKNOWN, tableRowCount).build());
}
setHasUnknownColStatsInStatementContext();
return builder.setRowCount(tableRowCount).build();
Expand Down Expand Up @@ -463,8 +462,8 @@ private Statistics computeOlapScan(OlapScan olapScan) {
});
for (SlotReference slot : visibleOutputSlots) {
ColumnStatistic cache = getColumnStatsFromPartitionCache(olapScan, slot, selectedPartitionNames);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(selectedPartitionsRowCount);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache,
selectedPartitionsRowCount);
colStatsBuilder.normalizeAvgSizeByte(slot);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
Expand All @@ -478,8 +477,7 @@ private Statistics computeOlapScan(OlapScan olapScan) {
// get table level stats
for (SlotReference slot : visibleOutputSlots) {
ColumnStatistic cache = getColumnStatsFromTableCache((CatalogRelation) olapScan, slot);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(tableRowCount);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, tableRowCount);
colStatsBuilder.normalizeAvgSizeByte(slot);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
Expand Down Expand Up @@ -1062,8 +1060,7 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) {
} else {
cache = getColumnStatsFromTableCache(catalogRelation, slot);
}
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache);
colStatsBuilder.setCount(tableRowCount);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(cache, tableRowCount);
builder.putColumnStatistics(slot, colStatsBuilder.build());
}
checkIfUnknownStatsUsedAsKey(builder);
Expand Down Expand Up @@ -1187,7 +1184,6 @@ private Statistics computeRepeat(Repeat<? extends Plan> repeat) {
ColumnStatistic stats = kv.getValue();
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(stats);
columnStatisticBuilder
.setCount(stats.count < 0 ? stats.count : stats.count * groupingSetNum)
.setNumNulls(stats.numNulls < 0 ? stats.numNulls : stats.numNulls * groupingSetNum)
.setDataSize(stats.dataSize < 0 ? stats.dataSize : stats.dataSize * groupingSetNum);
return Pair.of(kv.getKey(), columnStatisticBuilder.build());
Expand Down Expand Up @@ -1322,12 +1318,11 @@ private Statistics computeGenerate(Generate generate) {
double count = stats.getRowCount() * generate.getGeneratorOutput().size() * statsFactor;
Map<Expression, ColumnStatistic> columnStatsMap = Maps.newHashMap();
for (Map.Entry<Expression, ColumnStatistic> entry : stats.columnStatistics().entrySet()) {
ColumnStatistic columnStatistic = new ColumnStatisticBuilder(entry.getValue()).setCount(count).build();
ColumnStatistic columnStatistic = new ColumnStatisticBuilder(entry.getValue()).build();
columnStatsMap.put(entry.getKey(), columnStatistic);
}
for (Slot output : generate.getGeneratorOutput()) {
ColumnStatistic columnStatistic = new ColumnStatisticBuilder()
.setCount(count)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY)
.setNdv(count)
Expand All @@ -1349,8 +1344,7 @@ private Statistics computeWindow(Window windowOperator) {
"need WindowExpression, but we meet " + expr);
WindowExpression windExpr = (WindowExpression) expr.child(0);
ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder();
colStatsBuilder.setCount(childStats.getRowCount())
.setOriginal(null);
colStatsBuilder.setOriginal(null);

Double partitionCount = windExpr.getPartitionKeys().stream().map(key -> {
ColumnStatistic keyStats = childStats.findColumnStatistics(key);
Expand All @@ -1365,8 +1359,7 @@ private Statistics computeWindow(Window windowOperator) {

if (partitionCount == -1.0) {
// partition key stats are all unknown
colStatsBuilder.setCount(childStats.getRowCount())
.setNdv(1)
colStatsBuilder.setNdv(1)
.setMinValue(Double.NEGATIVE_INFINITY)
.setMaxValue(Double.POSITIVE_INFINITY);
} else {
Expand Down Expand Up @@ -1411,7 +1404,7 @@ private Statistics computeWindow(Window windowOperator) {
private ColumnStatistic unionColumn(ColumnStatistic leftStats, double leftRowCount, ColumnStatistic rightStats,
double rightRowCount, DataType dataType) {
if (leftStats.isUnKnown() || rightStats.isUnKnown()) {
return new ColumnStatisticBuilder(leftStats).setCount(leftRowCount + rightRowCount).build();
return new ColumnStatisticBuilder(leftStats).build();
}
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
columnStatisticBuilder.setMaxValue(Math.max(leftStats.maxValue, rightStats.maxValue));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ public Statistics computeStats(List<Slot> slots) {

Map<Expression, ColumnStatistic> columnToStatistics = Maps.newHashMap();
ColumnStatisticBuilder statBuilder = new ColumnStatisticBuilder()
.setCount(rowNum).setAvgSizeByte(8).setNumNulls(0).setDataSize(8);
.setAvgSizeByte(8).setNumNulls(0).setDataSize(8);
if (numberTvf.getUseConst()) { // a column of const value
long value = numberTvf.getConstValue();
statBuilder = statBuilder.setNdv(1).setMinValue(value).setMaxValue(value)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,7 @@ public ColumnStatistic toColumnStatistic() {
return ColumnStatistic.UNKNOWN;
}
try {
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder();
columnStatisticBuilder.setCount(count);
ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count);
columnStatisticBuilder.setNdv(ndv);
columnStatisticBuilder.setNumNulls(nullCount);
columnStatisticBuilder.setDataSize(dataSizeInBytes);
Expand Down
Loading

0 comments on commit 80482c5

Please sign in to comment.