From 8f5122be3ecb9a50c201cf779fcf5b2a4e68f242 Mon Sep 17 00:00:00 2001 From: kikyo Date: Sun, 8 Oct 2023 10:06:25 +0800 Subject: [PATCH] stats --- .../java/org/apache/doris/common/Config.java | 7 + .../apache/doris/common/FeMetaVersion.java | 4 +- fe/fe-core/src/main/cup/sql_parser.cup | 12 +- .../doris/analysis/AnalyzeProperties.java | 54 +- .../apache/doris/analysis/AnalyzeStmt.java | 13 +- .../apache/doris/analysis/AnalyzeTblStmt.java | 54 +- .../apache/doris/analysis/PartitionNames.java | 38 +- .../doris/analysis/ShowAnalyzeStmt.java | 84 +- .../doris/analysis/ShowTableStatsStmt.java | 39 +- .../java/org/apache/doris/catalog/Env.java | 14 +- .../catalog/InternalSchemaInitializer.java | 40 +- .../org/apache/doris/catalog/OlapTable.java | 53 ++ .../java/org/apache/doris/catalog/Table.java | 11 + .../org/apache/doris/catalog/TableIf.java | 13 + .../doris/catalog/external/ExternalTable.java | 21 + .../catalog/external/HMSExternalTable.java | 59 +- .../catalog/external/JdbcExternalTable.java | 18 +- .../doris/common/ThreadPoolManager.java | 9 + .../apache/doris/datasource/CatalogIf.java | 3 + .../apache/doris/datasource/CatalogMgr.java | 6 + .../doris/datasource/ExternalCatalog.java | 16 + .../doris/datasource/InternalCatalog.java | 5 + .../apache/doris/journal/JournalEntity.java | 17 + .../doris/nereids/cost/CostModelV1.java | 63 +- .../org/apache/doris/nereids/cost/CostV1.java | 30 +- .../processor/post/RuntimeFilterPruner.java | 21 +- .../nereids/stats/ExpressionEstimation.java | 54 +- .../doris/nereids/stats/FilterEstimation.java | 379 ++++++--- .../doris/nereids/stats/JoinEstimation.java | 63 +- .../doris/nereids/stats/StatsCalculator.java | 44 +- .../nereids/trees/expressions/Properties.java | 96 +++ .../expressions/functions/table/Hdfs.java | 8 +- .../expressions/functions/table/Local.java | 6 +- .../expressions/functions/table/Numbers.java | 10 +- .../trees/expressions/functions/table/S3.java | 8 +- .../functions/table/TableValuedFunction.java | 8 +- 
.../visitor/ExpressionVisitor.java | 5 + .../apache/doris/nereids/types/MapType.java | 2 + .../nereids/types/coercion/AnyDataType.java | 18 +- .../org/apache/doris/persist/EditLog.java | 19 +- .../doris/persist/TableStatsDeletionLog.java | 47 ++ .../org/apache/doris/qe/AuditLogHelper.java | 120 +++ .../java/org/apache/doris/qe/DdlExecutor.java | 3 - .../qe/InternalQueryExecutionException.java | 24 + .../org/apache/doris/qe/SessionVariable.java | 20 + .../org/apache/doris/qe/ShowExecutor.java | 37 +- .../org/apache/doris/qe/StmtExecutor.java | 42 +- .../apache/doris/statistics/AnalysisInfo.java | 116 ++- .../doris/statistics/AnalysisInfoBuilder.java | 39 +- .../doris/statistics/AnalysisManager.java | 792 ++++++++++-------- .../doris/statistics/AnalysisState.java | 2 + .../statistics/AnalysisTaskExecutor.java | 53 +- .../statistics/AnalysisTaskScheduler.java | 108 --- .../doris/statistics/AnalysisTaskWrapper.java | 12 + .../doris/statistics/BaseAnalysisTask.java | 61 +- .../apache/doris/statistics/ColStatsData.java | 82 ++ .../apache/doris/statistics/ColStatsMeta.java | 58 ++ .../doris/statistics/ColumnStatistic.java | 144 +--- .../statistics/ColumnStatisticBuilder.java | 47 +- .../ColumnStatisticsCacheLoader.java | 62 +- .../doris/statistics/HMSAnalysisTask.java | 254 +++--- .../org/apache/doris/statistics/HistData.java | 36 + .../apache/doris/statistics/Histogram.java | 18 +- .../statistics/HistogramCacheLoader.java | 1 - .../doris/statistics/JdbcAnalysisTask.java | 9 +- .../doris/statistics/OlapAnalysisJob.java | 56 ++ .../doris/statistics/OlapAnalysisTask.java | 116 ++- .../apache/doris/statistics/ResultRow.java | 59 ++ .../doris/statistics/StatisticConstants.java | 54 +- .../doris/statistics/StatisticRange.java | 96 ++- .../doris/statistics/StatisticalType.java | 2 + .../apache/doris/statistics/Statistics.java | 133 +-- .../statistics/StatisticsAutoAnalyzer.java | 215 ----- .../statistics/StatisticsAutoCollector.java | 196 +++++ 
.../doris/statistics/StatisticsBuilder.java | 2 +- .../doris/statistics/StatisticsCache.java | 158 ++-- .../doris/statistics/StatisticsCleaner.java | 21 +- .../doris/statistics/StatisticsCollector.java | 86 ++ .../statistics/StatisticsPeriodCollector.java | 50 ++ .../statistics/StatisticsRepository.java | 144 +--- .../doris/statistics/StatsDeriveResult.java | 7 - .../org/apache/doris/statistics/StatsId.java | 58 ++ .../doris/statistics/TableStatistic.java | 61 -- .../statistics/TableStatisticBuilder.java | 51 -- .../TableStatisticsCacheLoader.java | 60 -- .../doris/statistics/TableStatsMeta.java | 137 +++ .../doris/statistics/TaskStatusWrapper.java | 33 + .../doris/statistics/util/InternalQuery.java | 21 +- .../statistics/util/InternalQueryResult.java | 242 ------ .../doris/statistics/util/SimpleQueue.java | 65 ++ .../doris/statistics/util/StatisticsUtil.java | 163 +++- .../joinorder/hypergraph/OtherJoinTest.java | 31 +- .../nereids/stats/FilterEstimationTest.java | 85 +- .../doris/nereids/util/HyperGraphBuilder.java | 51 +- .../doris/statistics/AnalysisJobTest.java | 33 +- .../statistics/AnalysisTaskExecutorTest.java | 37 +- .../apache/doris/statistics/CacheTest.java | 192 +++-- .../doris/statistics/HistogramTaskTest.java | 14 +- .../statistics/StatsDeriveResultTest.java | 5 +- .../doris/statistics/StatsMockUtil.java | 48 ++ .../util/InternalQueryResultTest.java | 119 --- gensrc/thrift/FrontendService.thrift | 2 +- .../suites/statistics/analyze_stats.groovy | 92 +- 103 files changed, 3881 insertions(+), 2725 deletions(-) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java delete mode 100644 
fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java delete mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java create mode 100644 fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java create mode 100644 fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java delete mode 100644 fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java 
b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index bffa3e559778a27..cbb42db0e7cb46e 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -2165,4 +2165,11 @@ public class Config extends ConfigBase { "min buckets of auto bucket" }) public static int autobucket_min_buckets = 1; + + @ConfField + public static int full_auto_analyze_simultaneously_running_task_num = 1; + + @ConfField + public static final int period_analyze_simultaneously_running_task_num = 1; + } diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/FeMetaVersion.java b/fe/fe-common/src/main/java/org/apache/doris/common/FeMetaVersion.java index b1e42d343adf0e9..503263bf3ca4a36 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/FeMetaVersion.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/FeMetaVersion.java @@ -67,8 +67,10 @@ public final class FeMetaVersion { // For AnalysisInfo public static final int VERSION_123 = 123; + public static final int VERSION_124 = 124; + // note: when increment meta version, should assign the latest version to VERSION_CURRENT - public static final int VERSION_CURRENT = VERSION_123; + public static final int VERSION_CURRENT = VERSION_124; // all logs meta version should >= the minimum version, so that we could remove many if clause, for example // if (FE_METAVERSION < VERSION_94) ... 
diff --git a/fe/fe-core/src/main/cup/sql_parser.cup b/fe/fe-core/src/main/cup/sql_parser.cup index 531177fdd0b4d96..3c09cc092cc52b3 100644 --- a/fe/fe-core/src/main/cup/sql_parser.cup +++ b/fe/fe-core/src/main/cup/sql_parser.cup @@ -4126,13 +4126,17 @@ show_param ::= RESULT = new ShowCreateMaterializedViewStmt(mvName, tableName); :} /* show analyze job */ - | KW_ANALYZE opt_table_name:tbl opt_wild_where order_by_clause:orderByClause limit_clause:limitClause + | KW_ANALYZE opt_table_name:tbl opt_wild_where {: - RESULT = new ShowAnalyzeStmt(tbl, parser.where, orderByClause, limitClause); + RESULT = new ShowAnalyzeStmt(tbl, parser.where, false); :} - | KW_ANALYZE INTEGER_LITERAL:jobId opt_wild_where order_by_clause:orderByClause limit_clause:limitClause + | KW_ANALYZE INTEGER_LITERAL:jobId opt_wild_where {: - RESULT = new ShowAnalyzeStmt(jobId, parser.where, orderByClause, limitClause); + RESULT = new ShowAnalyzeStmt(jobId, parser.where); + :} + | KW_AUTO KW_ANALYZE opt_table_name:tbl opt_wild_where + {: + RESULT = new ShowAnalyzeStmt(tbl, parser.where, true); :} | KW_ANALYZE KW_TASK KW_STATUS INTEGER_LITERAL:jobId {: diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java index ccb122bc26986cf..d7e639da3a5bec3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeProperties.java @@ -22,16 +22,18 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import com.google.common.collect.ImmutableSet; +import com.google.gson.annotations.SerializedName; import org.apache.commons.lang3.StringUtils; +import org.apache.logging.log4j.core.util.CronExpression; +import java.util.HashMap; import java.util.Map; import java.util.Optional; import java.util.concurrent.TimeUnit; +// TODO: Remove map public class AnalyzeProperties { - private final Map properties; - public 
static final String PROPERTY_SYNC = "sync"; public static final String PROPERTY_INCREMENTAL = "incremental"; public static final String PROPERTY_AUTOMATIC = "automatic"; @@ -41,6 +43,23 @@ public class AnalyzeProperties { public static final String PROPERTY_ANALYSIS_TYPE = "analysis.type"; public static final String PROPERTY_PERIOD_SECONDS = "period.seconds"; + public static final String PROPERTY_FORCE_FULL = "force.full"; + + public static final AnalyzeProperties DEFAULT_PROP = new AnalyzeProperties(new HashMap() { + { + put(AnalyzeProperties.PROPERTY_SYNC, "false"); + put(AnalyzeProperties.PROPERTY_AUTOMATIC, "false"); + put(AnalyzeProperties.PROPERTY_ANALYSIS_TYPE, AnalysisType.FUNDAMENTALS.toString()); + } + }); + + public static final String PROPERTY_PERIOD_CRON = "period.cron"; + + private CronExpression cronExpression; + + @SerializedName("analyzeProperties") + private final Map properties; + private static final ImmutableSet PROPERTIES_SET = new ImmutableSet.Builder() .add(PROPERTY_SYNC) .add(PROPERTY_INCREMENTAL) @@ -50,6 +69,8 @@ public class AnalyzeProperties { .add(PROPERTY_NUM_BUCKETS) .add(PROPERTY_ANALYSIS_TYPE) .add(PROPERTY_PERIOD_SECONDS) + .add(PROPERTY_PERIOD_CRON) + .add(PROPERTY_FORCE_FULL) .build(); public AnalyzeProperties(Map properties) { @@ -72,6 +93,7 @@ public void check() throws AnalysisException { checkAnalysisMode(msgTemplate); checkAnalysisType(msgTemplate); checkScheduleType(msgTemplate); + checkPeriod(); } public boolean isSync() { @@ -115,6 +137,10 @@ public long getPeriodTimeInMs() { return TimeUnit.SECONDS.toMillis(minutes); } + public CronExpression getCron() { + return cronExpression; + } + private void checkPeriodSeconds() throws AnalysisException { if (properties.containsKey(PROPERTY_PERIOD_SECONDS)) { checkNumericProperty(PROPERTY_PERIOD_SECONDS, properties.get(PROPERTY_PERIOD_SECONDS), @@ -207,6 +233,22 @@ private void checkScheduleType(String msgTemplate) throws AnalysisException { } } + private void checkPeriod() throws 
AnalysisException { + if (properties.containsKey(PROPERTY_PERIOD_SECONDS) + && properties.containsKey(PROPERTY_PERIOD_CRON)) { + throw new AnalysisException(PROPERTY_PERIOD_SECONDS + " and " + PROPERTY_PERIOD_CRON + + " couldn't be set simultaneously"); + } + String cronExprStr = properties.get(PROPERTY_PERIOD_CRON); + if (cronExprStr != null) { + try { + cronExpression = new CronExpression(cronExprStr); + } catch (java.text.ParseException e) { + throw new AnalysisException("Invalid cron expression: " + cronExprStr); + } + } + } + private void checkNumericProperty(String key, String value, int lowerBound, int upperBound, boolean includeBoundary, String errorMsg) throws AnalysisException { if (!StringUtils.isNumeric(value)) { @@ -226,6 +268,14 @@ public boolean isSample() { || properties.containsKey(PROPERTY_SAMPLE_ROWS); } + public boolean forceFull() { + return properties.containsKey(PROPERTY_FORCE_FULL); + } + + public boolean isSampleRows() { + return properties.containsKey(PROPERTY_SAMPLE_ROWS); + } + public String toSQL() { StringBuilder sb = new StringBuilder(); sb.append("PROPERTIES("); diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java index 6f1f7c64d8f84cf..ae2c6a7ff4830f4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeStmt.java @@ -23,6 +23,8 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.logging.log4j.core.util.CronExpression; + import java.util.Map; public class AnalyzeStmt extends StatementBase { @@ -55,7 +57,8 @@ public ScheduleType getScheduleType() { if (analyzeProperties.isAutomatic()) { return ScheduleType.AUTOMATIC; } - return analyzeProperties.getPeriodTimeInMs() > 0 ? 
ScheduleType.PERIOD : ScheduleType.ONCE; + return analyzeProperties.getPeriodTimeInMs() > 0 || analyzeProperties.getCron() != null + ? ScheduleType.PERIOD : ScheduleType.ONCE; } public boolean isSync() { @@ -86,4 +89,12 @@ public AnalyzeProperties getAnalyzeProperties() { public RedirectStatus getRedirectStatus() { return RedirectStatus.FORWARD_WITH_SYNC; } + + public CronExpression getCron() { + return analyzeProperties.getCron(); + } + + public boolean forceFull() { + return analyzeProperties.forceFull(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java index 920b60627a86539..cbc66f367f260f8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/AnalyzeTblStmt.java @@ -24,6 +24,7 @@ import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.View; +import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; @@ -41,6 +42,7 @@ import com.google.common.collect.Sets; import org.apache.commons.lang3.StringUtils; +import java.util.Collections; import java.util.List; import java.util.Optional; import java.util.Set; @@ -84,7 +86,7 @@ public class AnalyzeTblStmt extends AnalyzeStmt { private final TableName tableName; private List columnNames; - private List partitionNames; + private PartitionNames partitionNames; private boolean isAllColumns; // after analyzed @@ -97,7 +99,7 @@ public AnalyzeTblStmt(TableName tableName, AnalyzeProperties properties) { super(properties); this.tableName = tableName; - this.partitionNames = partitionNames == null ? 
null : partitionNames.getPartitionNames(); + this.partitionNames = partitionNames; this.columnNames = columnNames; this.analyzeProperties = properties; this.isAllColumns = columnNames == null; @@ -166,11 +168,9 @@ public void check() throws AnalysisException { analyzeProperties.check(); // TODO support external table - if (analyzeProperties.isSample()) { - if (!(table instanceof OlapTable)) { - throw new AnalysisException("Sampling statistics " - + "collection of external tables is not supported"); - } + if (analyzeProperties.isSampleRows() && !(table instanceof OlapTable)) { + throw new AnalysisException("Sampling statistics " + + "collection of external tables is not supported with rows, use percent instead."); } if (analyzeProperties.isSync() && (analyzeProperties.isAutomatic() || analyzeProperties.getPeriodTimeInMs() != 0)) { @@ -181,6 +181,9 @@ public void check() throws AnalysisException { throw new AnalysisException("Automatic collection " + "and period statistics collection cannot be set at same time"); } + if (analyzeProperties.isSample() && analyzeProperties.forceFull()) { + throw new AnalysisException("Impossible to analyze with sample and full simultaneously"); + } } private void checkColumn() throws AnalysisException { @@ -196,7 +199,8 @@ private void checkColumn() throws AnalysisException { } } if (containsUnsupportedTytpe) { - if (!ConnectContext.get().getSessionVariable().enableAnalyzeComplexTypeColumn) { + if (ConnectContext.get() == null + || !ConnectContext.get().getSessionVariable().enableAnalyzeComplexTypeColumn) { columnNames = columnNames.stream() .filter(c -> !StatisticsUtil.isUnsupportedType(table.getColumn(c).getType())) .collect(Collectors.toList()); @@ -236,14 +240,33 @@ public Set getColumnNames() { } public Set getPartitionNames() { - Set partitions = partitionNames == null ? 
table.getPartitionNames() : Sets.newHashSet(partitionNames); - if (isSamplingPartition()) { - int partNum = ConnectContext.get().getSessionVariable().getExternalTableAnalyzePartNum(); - partitions = partitions.stream().limit(partNum).collect(Collectors.toSet()); + if (partitionNames == null || partitionNames.getPartitionNames() == null) { + if (table instanceof ExternalTable) { + // External table couldn't return all partitions when partitionNames is not set. + // Because Analyze Table command for external table could specify partition names. + return Collections.emptySet(); + } + return table.getPartitionNames(); } + Set partitions = Sets.newHashSet(); + partitions.addAll(partitionNames.getPartitionNames()); return partitions; } + public boolean isAllPartitions() { + if (partitionNames == null) { + return false; + } + return partitionNames.isAllPartitions(); + } + + public long getPartitionCount() { + if (partitionNames == null) { + return 0; + } + return partitionNames.getCount(); + } + public boolean isPartitionOnly() { return partitionNames != null; } @@ -260,8 +283,13 @@ public boolean isSamplingPartition() { } private void checkAnalyzePriv(String dbName, String tblName) throws AnalysisException { + ConnectContext ctx = ConnectContext.get(); + // means it a system analyze + if (ctx == null) { + return; + } if (!Env.getCurrentEnv().getAccessManager() - .checkTblPriv(ConnectContext.get(), dbName, tblName, PrivPredicate.SELECT)) { + .checkTblPriv(ctx, dbName, tblName, PrivPredicate.SELECT)) { ErrorReport.reportAnalysisException( ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "ANALYZE", diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java index 1140dfc67776410..ca26a2978e0e546 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/PartitionNames.java @@ -48,15 +48,37 @@ public class 
PartitionNames implements ParseNode, Writable { // true if these partitions are temp partitions @SerializedName(value = "isTemp") private final boolean isTemp; + private final boolean allPartitions; + private final long count; + // Default partition count to collect statistic for external table. + private static final long DEFAULT_PARTITION_COUNT = 100; public PartitionNames(boolean isTemp, List partitionNames) { this.partitionNames = partitionNames; this.isTemp = isTemp; + this.allPartitions = false; + this.count = 0; } public PartitionNames(PartitionNames other) { this.partitionNames = Lists.newArrayList(other.partitionNames); this.isTemp = other.isTemp; + this.allPartitions = other.allPartitions; + this.count = 0; + } + + public PartitionNames(boolean allPartitions) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = allPartitions; + this.count = 0; + } + + public PartitionNames(long partitionCount) { + this.partitionNames = null; + this.isTemp = false; + this.allPartitions = false; + this.count = partitionCount; } public List getPartitionNames() { @@ -67,9 +89,23 @@ public boolean isTemp() { return isTemp; } + public boolean isAllPartitions() { + return allPartitions; + } + + public long getCount() { + return count; + } + @Override public void analyze(Analyzer analyzer) throws AnalysisException { - if (partitionNames.isEmpty()) { + if (allPartitions && count > 0) { + throw new AnalysisException("All partition and partition count couldn't be set at the same time."); + } + if (allPartitions || count > 0) { + return; + } + if (partitionNames == null || partitionNames.isEmpty()) { throw new AnalysisException("No partition specified in partition lists"); } // check if partition name is not empty string diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java index 95035641a7e86d3..fb19cb2fd5bf950 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowAnalyzeStmt.java @@ -25,7 +25,6 @@ import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.UserException; -import org.apache.doris.common.util.OrderByPair; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSetMetaData; @@ -35,10 +34,6 @@ import com.google.common.base.Strings; import com.google.common.collect.ImmutableList; -import java.util.ArrayList; -import java.util.List; -import java.util.stream.IntStream; - /** * ShowAnalyzeStmt is used to show statistics job info. * syntax: @@ -69,36 +64,30 @@ public class ShowAnalyzeStmt extends ShowStmt { .build(); private long jobId; - private TableName dbTableName; - private Expr whereClause; - private LimitElement limitElement; - private List orderByElements; + private final TableName dbTableName; + private final Expr whereClause; + + // extract from predicate private String stateValue; - private ArrayList orderByPairs; - public ShowAnalyzeStmt() { - } + private final boolean auto; + public ShowAnalyzeStmt(TableName dbTableName, - Expr whereClause, - List orderByElements, - LimitElement limitElement) { + Expr whereClause, boolean auto) { this.dbTableName = dbTableName; this.whereClause = whereClause; - this.orderByElements = orderByElements; - this.limitElement = limitElement; + this.auto = auto; + } public ShowAnalyzeStmt(long jobId, - Expr whereClause, - List orderByElements, - LimitElement limitElement) { + Expr whereClause) { Preconditions.checkArgument(jobId > 0, "JobId must greater than 0."); this.jobId = jobId; this.dbTableName = null; this.whereClause = whereClause; - this.orderByElements = orderByElements; - this.limitElement = limitElement; + this.auto = false; } public long getJobId() { @@ -111,12 +100,6 @@ public String 
getStateValue() { return stateValue; } - public ArrayList getOrderByPairs() { - Preconditions.checkArgument(isAnalyzed(), - "The orderByPairs must be obtained after the parsing is complete"); - return orderByPairs; - } - public Expr getWhereClause() { Preconditions.checkArgument(isAnalyzed(), "The whereClause must be obtained after the parsing is complete"); @@ -124,13 +107,6 @@ public Expr getWhereClause() { return whereClause; } - public long getLimit() { - if (limitElement != null && limitElement.hasLimit()) { - return limitElement.getLimit(); - } - return -1L; - } - @Override public void analyze(Analyzer analyzer) throws UserException { if (!Config.enable_stats) { @@ -149,21 +125,6 @@ public void analyze(Analyzer analyzer) throws UserException { if (whereClause != null) { analyzeSubPredicate(whereClause); } - - // analyze order by - if (orderByElements != null && !orderByElements.isEmpty()) { - orderByPairs = new ArrayList<>(); - for (OrderByElement orderByElement : orderByElements) { - if (orderByElement.getExpr() instanceof SlotRef) { - SlotRef slotRef = (SlotRef) orderByElement.getExpr(); - int index = analyzeColumn(slotRef.getColumnName()); - OrderByPair orderByPair = new OrderByPair(index, !orderByElement.getIsAsc()); - orderByPairs.add(orderByPair); - } else { - throw new AnalysisException("Should order by column"); - } - } - } } @Override @@ -279,25 +240,6 @@ public String toSql() { sb.append(whereClause.toSql()); } - // Order By clause - if (orderByElements != null) { - sb.append(" "); - sb.append("ORDER BY"); - sb.append(" "); - IntStream.range(0, orderByElements.size()).forEach(i -> { - sb.append(orderByElements.get(i).getExpr().toSql()); - sb.append((orderByElements.get(i).getIsAsc()) ? " ASC" : " DESC"); - sb.append((i + 1 != orderByElements.size()) ? 
", " : ""); - }); - } - - if (getLimit() != -1L) { - sb.append(" "); - sb.append("LIMIT"); - sb.append(" "); - sb.append(getLimit()); - } - return sb.toString(); } @@ -309,4 +251,8 @@ public String toString() { public TableName getDbTableName() { return dbTableName; } + + public boolean isAuto() { + return auto; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java index da10d5c492b1fee..fe499fa1b0849f2 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java +++ b/fe/fe-core/src/main/java/org/apache/doris/analysis/ShowTableStatsStmt.java @@ -32,12 +32,13 @@ import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSet; import org.apache.doris.qe.ShowResultSetMetaData; -import org.apache.doris.statistics.TableStatistic; -import org.apache.doris.statistics.util.StatisticsUtil; +import org.apache.doris.statistics.TableStatsMeta; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; +import java.sql.Date; +import java.util.ArrayList; import java.util.List; public class ShowTableStatsStmt extends ShowStmt { @@ -45,9 +46,12 @@ public class ShowTableStatsStmt extends ShowStmt { // TODO add more columns private static final ImmutableList TITLE_NAMES = new ImmutableList.Builder() + .add("updated_rows") + .add("query_times") .add("row_count") - .add("update_time") - .add("last_analyze_time") + .add("updated_time") + .add("columns") + .add("trigger") .build(); private final TableName tableName; @@ -126,12 +130,33 @@ public long getPartitionId() { return table.getPartition(partitionName).getId(); } - public ShowResultSet constructResultSet(TableStatistic tableStatistic) { + public ShowResultSet constructResultSet(TableStatsMeta tableStatistic) { + if (tableStatistic == null) { + return new ShowResultSet(getMetaData(), new ArrayList<>()); + } List> result = 
Lists.newArrayList(); List row = Lists.newArrayList(); + row.add(String.valueOf(tableStatistic.updatedRows)); + row.add(String.valueOf(tableStatistic.queriedTimes.get())); row.add(String.valueOf(tableStatistic.rowCount)); - row.add(String.valueOf(tableStatistic.updateTime)); - row.add(StatisticsUtil.getReadableTime(tableStatistic.lastAnalyzeTimeInMs)); + row.add(new Date(tableStatistic.updatedTime).toString()); + row.add(tableStatistic.analyzeColumns().toString()); + row.add(tableStatistic.jobType.toString()); + result.add(row); + return new ShowResultSet(getMetaData(), result); + } + + public ShowResultSet constructResultSet(long rowCount) { + List> result = Lists.newArrayList(); + List row = Lists.newArrayList(); + row.add(""); + row.add(""); + row.add(String.valueOf(rowCount)); + row.add(""); + row.add(""); + row.add(""); + row.add(""); + row.add(""); result.add(row); return new ShowResultSet(getMetaData(), result); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index ea4410f60c6f589..89b63455590ba34 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -211,7 +211,7 @@ import org.apache.doris.resource.workloadgroup.WorkloadGroupMgr; import org.apache.doris.service.FrontendOptions; import org.apache.doris.statistics.AnalysisManager; -import org.apache.doris.statistics.StatisticsAutoAnalyzer; +import org.apache.doris.statistics.StatisticsAutoCollector; import org.apache.doris.statistics.StatisticsCache; import org.apache.doris.statistics.StatisticsCleaner; import org.apache.doris.statistics.query.QueryStats; @@ -457,7 +457,7 @@ public class Env { */ private final LoadManagerAdapter loadManagerAdapter; - private StatisticsAutoAnalyzer statisticsAutoAnalyzer; + private StatisticsAutoCollector statisticsAutoCollector; private HiveTransactionMgr hiveTransactionMgr; @@ -663,7 +663,7 @@ private 
Env(boolean isCheckpointCatalog) { this.extMetaCacheMgr = new ExternalMetaCacheMgr(); this.analysisManager = new AnalysisManager(); this.statisticsCleaner = new StatisticsCleaner(); - this.statisticsAutoAnalyzer = new StatisticsAutoAnalyzer(); + this.statisticsAutoCollector = new StatisticsAutoCollector(); this.globalFunctionMgr = new GlobalFunctionMgr(); this.workloadGroupMgr = new WorkloadGroupMgr(); this.queryStats = new QueryStats(); @@ -907,8 +907,8 @@ public void initialize(String[] args) throws Exception { if (statisticsCleaner != null) { statisticsCleaner.start(); } - if (statisticsAutoAnalyzer != null) { - statisticsAutoAnalyzer.start(); + if (statisticsAutoCollector != null) { + statisticsAutoCollector.start(); } } @@ -5421,8 +5421,8 @@ public LoadManagerAdapter getLoadManagerAdapter() { return loadManagerAdapter; } - public StatisticsAutoAnalyzer getStatisticsAutoAnalyzer() { - return statisticsAutoAnalyzer; + public StatisticsAutoCollector getStatisticsAutoCollector() { + return statisticsAutoCollector; } public QueryStats getQueryStats() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java index fd42de00f7834df..b7fdec73f026798 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/InternalSchemaInitializer.java @@ -83,7 +83,6 @@ public void run() { return; } Database database = op.get(); - modifyTblReplicaCount(database, StatisticConstants.ANALYSIS_TBL_NAME); modifyTblReplicaCount(database, StatisticConstants.STATISTIC_TBL_NAME); modifyTblReplicaCount(database, StatisticConstants.HISTOGRAM_TBL_NAME); } @@ -126,7 +125,6 @@ public void modifyTblReplicaCount(Database database, String tblName) { } private void createTbl() throws UserException { - Env.getCurrentEnv().getInternalCatalog().createTable(buildAnalysisTblStmt()); 
Env.getCurrentEnv().getInternalCatalog().createTable(buildStatisticsTblStmt()); Env.getCurrentEnv().getInternalCatalog().createTable(buildHistogramTblStmt()); } @@ -145,41 +143,6 @@ public static void createDB() { } } - @VisibleForTesting - public CreateTableStmt buildAnalysisTblStmt() throws UserException { - TableName tableName = new TableName("", - FeConstants.INTERNAL_DB_NAME, StatisticConstants.ANALYSIS_TBL_NAME); - List columnDefs = new ArrayList<>(); - columnDefs.add(new ColumnDef("id", TypeDef.createVarchar(StatisticConstants.ID_LEN))); - columnDefs.add(new ColumnDef("catalog_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("db_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("tbl_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - columnDefs.add(new ColumnDef("idx_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN))); - ColumnDef partId = new ColumnDef("part_id", TypeDef.createVarchar(StatisticConstants.MAX_NAME_LEN)); - partId.setAllowNull(true); - columnDefs.add(partId); - columnDefs.add(new ColumnDef("count", TypeDef.create(PrimitiveType.BIGINT))); - columnDefs.add(new ColumnDef("last_analyze_time_in_ms", TypeDef.create(PrimitiveType.BIGINT))); - columnDefs.add(new ColumnDef("update_time", TypeDef.create(PrimitiveType.DATETIME))); - String engineName = "olap"; - ArrayList uniqueKeys = Lists.newArrayList("id", "catalog_id", - "db_id", "tbl_id", "idx_id", "part_id"); - KeysDesc keysDesc = new KeysDesc(KeysType.UNIQUE_KEYS, uniqueKeys); - DistributionDesc distributionDesc = new HashDistributionDesc( - StatisticConstants.STATISTIC_TABLE_BUCKET_COUNT, uniqueKeys); - Map properties = new HashMap() { - { - put("replication_num", String.valueOf( - Math.max(1, Config.min_replication_num_per_tablet))); - } - }; - CreateTableStmt createTableStmt = new CreateTableStmt(true, false, - tableName, columnDefs, engineName, keysDesc, null, distributionDesc, - 
properties, null, "Doris internal statistics table, DO NOT MODIFY IT", null); - StatisticsUtil.analyze(createTableStmt); - return createTableStmt; - } - @VisibleForTesting public CreateTableStmt buildStatisticsTblStmt() throws UserException { TableName tableName = new TableName("", @@ -281,8 +244,7 @@ private boolean created() { } return false; } - return db.getTable(StatisticConstants.HISTOGRAM_TBL_NAME).isPresent() - && db.getTable(StatisticConstants.ANALYSIS_TBL_NAME).isPresent(); + return db.getTable(StatisticConstants.HISTOGRAM_TBL_NAME).isPresent(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java index 9975ba0230ac926..f59df2554d36b2c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -55,6 +55,8 @@ import org.apache.doris.statistics.HistogramTask; import org.apache.doris.statistics.MVAnalysisTask; import org.apache.doris.statistics.OlapAnalysisTask; +import org.apache.doris.statistics.TableStatsMeta; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TColumn; @@ -2249,4 +2251,55 @@ public void analyze(String dbName) { } } } + + @Override + public Map> findReAnalyzeNeededPartitions() { + TableIf table = this; + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); + Set allPartitions = table.getPartitionNames().stream().map(table::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + if (tableStats == null) { + return table.getBaseSchema().stream().collect(Collectors.toMap(Column::getName, v -> allPartitions)); + } + Map> colToPart = new HashMap<>(); + for (Column col : table.getBaseSchema()) { + long lastUpdateTime = 
tableStats.findColumnLastUpdateTime(col.getName()); + Set partitions = table.getPartitionNames().stream() + .map(table::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= lastUpdateTime).map(Partition::getName) + .collect(Collectors.toSet()); + colToPart.put(col.getName(), partitions); + } + return colToPart; + } + + public long getDataSize(boolean singleReplica) { + long dataSize = 0; + for (Partition partition : getAllPartitions()) { + dataSize += partition.getDataSize(singleReplica); + } + return dataSize; + } + + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + if (tblStats == null) { + return true; + } + long rowCount = getRowCount(); + // TODO: Do we need to analyze an empty table? + if (rowCount == 0) { + return false; + } + if (!tblStats.analyzeColumns().containsAll(getBaseSchema() + .stream() + .map(Column::getName) + .collect(Collectors.toSet()))) { + return true; + } + long updateRows = tblStats.updatedRows.get(); + int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); + return tblHealth < Config.table_stats_health_threshold; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java index 0c50fc42b4b1355..ba7e55c7d86629e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Table.java @@ -30,6 +30,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.base.Preconditions; @@ -557,4 +558,14 @@ public Optional getColumnStatistic(String colName) { public void analyze(String dbName) { } + + @Override + public Map> findReAnalyzeNeededPartitions() { + return Collections.emptyMap(); + } + + 
@Override + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java index 78717f0eca769d6..108d227e5916697 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/TableIf.java @@ -23,6 +23,7 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import com.google.common.collect.Lists; @@ -33,6 +34,7 @@ import java.io.IOException; import java.util.Collections; import java.util.List; +import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.TimeUnit; @@ -236,5 +238,16 @@ default boolean isManagedTable() { default long getLastUpdateTime() { return -1L; } + + Map> findReAnalyzeNeededPartitions(); + + default long getDataSize(boolean singleReplica) { + // TODO: Each tableIf should impl it by itself. 
+ return 0; + } + + boolean needReAnalyzeTable(TableStatsMeta tblStats); + + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java index f4c76cda7a98a05..6a1630e297c93f4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/ExternalTable.java @@ -35,8 +35,10 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.ColumnStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; +import com.google.common.collect.Sets; import com.google.gson.annotations.SerializedName; import lombok.Getter; import org.apache.commons.lang3.NotImplementedException; @@ -46,10 +48,14 @@ import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.ReentrantReadWriteLock; +import java.util.stream.Collectors; /** * External table represent tables that are not self-managed by Doris. @@ -373,4 +379,19 @@ public void gsonPostProcess() throws IOException { rwLock = new ReentrantReadWriteLock(true); objectCreated = false; } + + @Override + public boolean needReAnalyzeTable(TableStatsMeta tblStats) { + // TODO: Find a way to decide if this external table need to be reanalyzed. + // For now, simply return true for all external tables. + return true; + } + + @Override + public Map> findReAnalyzeNeededPartitions() { + HashSet partitions = Sets.newHashSet(); + // TODO: Find a way to collect external table partitions that need to be analyzed. 
+ partitions.add("Dummy Partition"); + return getBaseSchema().stream().collect(Collectors.toMap(Column::getName, k -> partitions)); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java index 022428036661934..5852bf69aae4355 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/HMSExternalTable.java @@ -32,7 +32,7 @@ import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.HMSAnalysisTask; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.thrift.THiveTable; import org.apache.doris.thrift.TTableDescriptor; @@ -57,6 +57,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.io.IOException; import java.math.BigDecimal; import java.math.BigInteger; import java.time.LocalDate; @@ -102,13 +103,16 @@ public class HMSExternalTable extends ExternalTable { SUPPORTED_HUDI_FILE_FORMATS.add("com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat"); } - private volatile org.apache.hadoop.hive.metastore.api.Table remoteTable = null; - private List partitionColumns; + protected volatile org.apache.hadoop.hive.metastore.api.Table remoteTable = null; + protected List partitionColumns; - private DLAType dlaType = DLAType.UNKNOWN; + protected DLAType dlaType = DLAType.UNKNOWN; + + // Not as precise as row count in TableStats, but better than none. 
+ private long estimatedRowCount = -1; public enum DLAType { - UNKNOWN, HIVE, HUDI, ICEBERG + UNKNOWN, HIVE, HUDI, ICEBERG, DELTALAKE } /** @@ -123,6 +127,10 @@ public HMSExternalTable(long id, String name, String dbName, HMSExternalCatalog super(id, name, catalog, dbName, TableType.HMS_EXTERNAL_TABLE); } + public HMSExternalTable(long id, String name, String dbName, HMSExternalCatalog catalog, TableType type) { + super(id, name, catalog, dbName, type); + } + public boolean isSupportedHmsTable() { makeSureInitialized(); return dlaType != DLAType.UNKNOWN; @@ -146,6 +154,7 @@ protected synchronized void makeSureInitialized() { } } objectCreated = true; + estimatedRowCount = getRowCountFromExternalSource(true); } } @@ -269,10 +278,19 @@ public long getUpdateTime() { @Override public long getRowCount() { makeSureInitialized(); + long rowCount = getRowCountFromExternalSource(false); + if (rowCount == -1) { + LOG.debug("Will estimate row count from file list."); + rowCount = StatisticsUtil.getRowCountFromFileList(this); + } + return rowCount; + } + + private long getRowCountFromExternalSource(boolean isInit) { long rowCount; switch (dlaType) { case HIVE: - rowCount = StatisticsUtil.getHiveRowCount(this); + rowCount = StatisticsUtil.getHiveRowCount(this, isInit); break; case ICEBERG: rowCount = StatisticsUtil.getIcebergRowCount(this); @@ -281,10 +299,6 @@ public long getRowCount() { LOG.warn("getRowCount for dlaType {} is not supported.", dlaType); rowCount = -1; } - if (rowCount == -1) { - LOG.debug("Will estimate row count from file list."); - rowCount = StatisticsUtil.getRowCountFromFileList(this); - } return rowCount; } @@ -422,13 +436,20 @@ public List getHudiSchema(List hmsSchema) { @Override public long estimatedRowCount() { try { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - catalog.getId(), catalog.getDbOrAnalysisException(dbName).getId(), id); - if (tableStatistics.isPresent()) { - long rowCount = 
tableStatistics.get().rowCount; + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(id); + if (tableStats != null) { + long rowCount = tableStats.rowCount; LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); return rowCount; } + + if (estimatedRowCount != -1) { + return estimatedRowCount; + } + // Cache the estimated row count in this structure + // even if the table never gets analyzed, since the row estimation might be expensive due to RPC. + estimatedRowCount = getRowCount(); + return estimatedRowCount; } catch (Exception e) { LOG.warn("Fail to get row count for table {}", name, e); } @@ -449,7 +470,7 @@ private List getIcebergSchema(List hmsSchema) { return tmpSchema; } - private void initPartitionColumns(List schema) { + protected void initPartitionColumns(List schema) { List partitionKeys = remoteTable.getPartitionKeys().stream().map(FieldSchema::getName) .collect(Collectors.toList()); partitionColumns = Lists.newArrayListWithCapacity(partitionKeys.size()); @@ -480,7 +501,7 @@ public Optional getColumnStatistic(String colName) { return getHiveColumnStats(colName); case ICEBERG: return StatisticsUtil.getIcebergColumnStats(colName, - Env.getCurrentEnv().getExtMetaCacheMgr().getIcebergMetadataCache().getIcebergTable(this)); + Env.getCurrentEnv().getExtMetaCacheMgr().getIcebergMetadataCache().getIcebergTable(this)); default: LOG.warn("get column stats for dlaType {} is not supported.", dlaType); } @@ -608,6 +629,12 @@ private void setStatData(Column col, ColumnStatisticsData data, ColumnStatisticB builder.setMaxValue(Double.MAX_VALUE); } } + + @Override + public void gsonPostProcess() throws IOException { + super.gsonPostProcess(); + estimatedRowCount = -1; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java index 051bfa5e585d1d0..a02c59080fc4ebd 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/external/JdbcExternalTable.java @@ -24,14 +24,13 @@ import org.apache.doris.statistics.AnalysisInfo; import org.apache.doris.statistics.BaseAnalysisTask; import org.apache.doris.statistics.JdbcAnalysisTask; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.thrift.TTableDescriptor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.util.List; -import java.util.Optional; /** * Elasticsearch external table. @@ -112,16 +111,11 @@ public BaseAnalysisTask createAnalysisTask(AnalysisInfo info) { @Override public long getRowCount() { makeSureInitialized(); - try { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - catalog.getId(), catalog.getDbOrAnalysisException(dbName).getId(), id); - if (tableStatistics.isPresent()) { - long rowCount = tableStatistics.get().rowCount; - LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); - return rowCount; - } - } catch (Exception e) { - LOG.warn("Fail to get row count for table {}", name, e); + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(id); + if (tableStats != null) { + long rowCount = tableStats.rowCount; + LOG.debug("Estimated row count for db {} table {} is {}.", dbName, name, rowCount); + return rowCount; } return 1; } diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java index be8731b6b254441..31d608b8a258a94 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/ThreadPoolManager.java @@ -134,6 +134,15 @@ public static ThreadPoolExecutor newDaemonFixedThreadPool(int 
numThread, int que poolName, needRegisterMetric); } + public static ThreadPoolExecutor newDaemonFixedThreadPool(int numThread, int queueSize, + String poolName, + boolean needRegisterMetric, + RejectedExecutionHandler handler) { + return newDaemonThreadPool(numThread, numThread, KEEP_ALIVE_TIME, TimeUnit.SECONDS, + new LinkedBlockingQueue<>(queueSize), handler, + poolName, needRegisterMetric); + } + public static ThreadPoolExecutor newDaemonFixedPriorityThreadPool(int numThread, int initQueueSize, Comparator comparator, Class tClass, String poolName, boolean needRegisterMetric) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java index 69e51b6326c5d68..2cf0fb7db2de559 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogIf.java @@ -172,4 +172,7 @@ default CatalogLog constructEditLog() { public Collection getAllDbs(); public ConcurrentHashMap getIdToDb(); + + public boolean enableAutoAnalyze(); + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java index ca5c8faf8c5cb1d..253cfa2f699697e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/CatalogMgr.java @@ -62,8 +62,10 @@ import java.io.DataOutput; import java.io.IOException; import java.util.Arrays; +import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.TreeMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Function; @@ -1113,5 +1115,9 @@ public void gsonPostProcess() throws IOException { public Map getIdToCatalog() { return idToCatalog; } + + public Set getCopyOfCatalog() { + return new HashSet<>(idToCatalog.values()); + } } diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java index 692e4c5f159db64..be4e6c498552e3e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/ExternalCatalog.java @@ -74,6 +74,8 @@ public abstract class ExternalCatalog implements CatalogIf>, Writable, GsonPostProcessable { private static final Logger LOG = LogManager.getLogger(ExternalCatalog.class); + public static final String ENABLE_AUTO_ANALYZE = "enable.auto.analyze"; + // Unique id of this catalog, will be assigned after catalog is loaded. @SerializedName(value = "id") protected long id; @@ -604,4 +606,18 @@ public Collection getAllDbs() { public ConcurrentHashMap getIdToDb() { return new ConcurrentHashMap<>(idToDb); } + + @Override + public boolean enableAutoAnalyze() { + // By default, external catalog disables auto analyze, users could set catalog property to enable it: + // "enable.auto.analyze" = true Map properties = catalogProperty.getProperties(); + boolean ret = false; + if (properties.containsKey(ENABLE_AUTO_ANALYZE) + && properties.get(ENABLE_AUTO_ANALYZE).equalsIgnoreCase("true")) { + ret = true; + } + return ret; + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java index 9960d71b7dd07ec..773f7d709e626e5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/InternalCatalog.java @@ -3153,4 +3153,9 @@ public ConcurrentHashMap getIdToDb() { public Collection getAllDbs() { return new HashSet<>(idToDb.values()); } + + @Override + public boolean enableAutoAnalyze() { + return true; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java 
b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java index affeea2de1426bc..51d7803510d2f46 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java +++ b/fe/fe-core/src/main/java/org/apache/doris/journal/JournalEntity.java @@ -111,6 +111,7 @@ import org.apache.doris.persist.TableInfo; import org.apache.doris.persist.TablePropertyInfo; import org.apache.doris.persist.TableRenameColumnInfo; +import org.apache.doris.persist.TableStatsDeletionLog; import org.apache.doris.persist.TruncateTableInfo; import org.apache.doris.plugin.PluginInfo; import org.apache.doris.policy.DropPolicyLog; @@ -118,6 +119,7 @@ import org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.workloadgroup.WorkloadGroup; import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; import org.apache.doris.transaction.TransactionState; @@ -844,6 +846,21 @@ public void readFields(DataInput in) throws IOException { isRead = true; break; } + case OperationType.OP_UPDATE_TABLE_STATS: { + data = TableStatsMeta.read(in); + isRead = true; + break; + } + case OperationType.OP_PERSIST_AUTO_JOB: { + data = AnalysisInfo.read(in); + isRead = true; + break; + } + case OperationType.OP_DELETE_TABLE_STATS: { + data = TableStatsDeletionLog.read(in); + isRead = true; + break; + } default: { IOException e = new IOException(); LOG.error("UNKNOWN Operation Type {}", opCode, e); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java index 1f7255b7990ace1..aa8f4d6cc7cfda6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostModelV1.java @@ -86,8 +86,7 @@ public static Cost addChildCost(Plan plan, Cost planCost, Cost childCost, int in CostV1 
planCostV1 = (CostV1) planCost; return new CostV1(childCostV1.getCpuCost() + planCostV1.getCpuCost(), childCostV1.getMemoryCost() + planCostV1.getMemoryCost(), - childCostV1.getNetworkCost() + planCostV1.getNetworkCost(), - childCostV1.getPenalty() + planCostV1.getPenalty()); + childCostV1.getNetworkCost() + planCostV1.getNetworkCost()); } @Override @@ -118,7 +117,7 @@ public Cost visitPhysicalStorageLayerAggregate( CostV1 costValue = (CostV1) storageLayerAggregate.getRelation().accept(this, context); // multiply a factor less than 1, so we can select PhysicalStorageLayerAggregate as far as possible return new CostV1(costValue.getCpuCost() * 0.7, costValue.getMemoryCost(), - costValue.getNetworkCost(), costValue.getPenalty()); + costValue.getNetworkCost()); } @Override @@ -150,14 +149,14 @@ public Cost visitPhysicalQuickSort( // TODO: consider two-phase sort and enforcer. Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); + + double childRowCount = childStatistics.getRowCount(); + double rowCount = statistics.getRowCount(); if (physicalQuickSort.getSortPhase().isGather()) { // Now we do more like two-phase sort, so penalise one-phase sort - statistics = statistics.withRowCount(statistics.getRowCount() * 100); + rowCount *= 100; } - return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + return CostV1.of(childRowCount, rowCount, childRowCount); } @Override @@ -165,14 +164,14 @@ public Cost visitPhysicalTopN(PhysicalTopN topN, PlanContext con // TODO: consider two-phase sort and enforcer. 
Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); + + double childRowCount = childStatistics.getRowCount(); + double rowCount = statistics.getRowCount(); if (topN.getSortPhase().isGather()) { // Now we do more like two-phase sort, so penalise one-phase sort - statistics = statistics.withRowCount(statistics.getRowCount() * 100); + rowCount *= 100; } - return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + return CostV1.of(childRowCount, rowCount, childRowCount); } @Override @@ -186,9 +185,9 @@ public Cost visitPhysicalPartitionTopN(PhysicalPartitionTopN par Statistics statistics = context.getStatisticsWithCheck(); Statistics childStatistics = context.getChildStatistics(0); return CostV1.of( - childStatistics.getRowCount(), - statistics.getRowCount(), - childStatistics.getRowCount()); + childStatistics.getRowCount(), + statistics.getRowCount(), + childStatistics.getRowCount()); } @Override @@ -287,30 +286,38 @@ public Cost visitPhysicalHashJoin( pattern2: (L join1 Agg1) join2 agg2 in pattern2, join1 and join2 takes more time, but Agg1 and agg2 can be processed in parallel. 
*/ - double penalty = HEAVY_OPERATOR_PUNISH_FACTOR - * Math.min(probeStats.getPenalty(), buildStats.getPenalty()); - if (buildStats.getWidth() >= 2) { - //penalty for right deep tree - penalty += rightRowCount; - } if (physicalHashJoin.getJoinType().isCrossJoin()) { return CostV1.of(leftRowCount + rightRowCount + outputRowCount, 0, - leftRowCount + rightRowCount, - penalty); + leftRowCount + rightRowCount + ); } if (context.isBroadcastJoin()) { - double broadcastJoinPenalty = broadCastJoinBalancePenalty(probeStats, buildStats); - return CostV1.of(leftRowCount * broadcastJoinPenalty + rightRowCount + outputRowCount, + // compared with shuffle join, bc join will be taken a penalty for both build and probe side; + // currently we use the following factor as the penalty factor: + // build side factor: totalInstanceNumber to the power of 2, standing for the additional effort for + // bigger cost for building hash table, taken on rightRowCount + // probe side factor: totalInstanceNumber to the power of 2, standing for the additional effort for + // bigger cost for ProbeWhenBuildSideOutput effort and ProbeWhenSearchHashTableTime + // on the output rows, taken on outputRowCount() + double probeSideFactor = 1.0; + double buildSideFactor = ConnectContext.get().getSessionVariable().getBroadcastRightTableScaleFactor(); + int parallelInstance = Math.max(1, ConnectContext.get().getSessionVariable().getParallelExecInstanceNum()); + int totalInstanceNumber = parallelInstance * beNumber; + if (buildSideFactor <= 1.0) { + // use totalInstanceNumber to the power of 2 as the default factor value + buildSideFactor = Math.pow(totalInstanceNumber, 0.5); + } + // TODO: since the outputs rows may expand a lot, penalty on it will cause bc never be chosen. + // will refine this in next generation cost model. 
+ return CostV1.of(leftRowCount + rightRowCount * buildSideFactor + outputRowCount * probeSideFactor, rightRowCount, - 0, 0 ); } return CostV1.of(leftRowCount + rightRowCount + outputRowCount, rightRowCount, - 0, 0 ); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java index b5c5b50bd2e7b3a..bf1cc425999f7c0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/cost/CostV1.java @@ -19,23 +19,19 @@ class CostV1 implements Cost { private static final CostV1 INFINITE = new CostV1(Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY, - Double.POSITIVE_INFINITY, Double.POSITIVE_INFINITY); - private static final CostV1 ZERO = new CostV1(0, 0, 0, 0); + Double.POSITIVE_INFINITY); + private static final CostV1 ZERO = new CostV1(0, 0, 0); private final double cpuCost; private final double memoryCost; private final double networkCost; - //penalty for - // 1. right deep tree - // 2. right XXX join - private final double penalty; private final double cost; /** * Constructor of CostEstimate. 
*/ - public CostV1(double cpuCost, double memoryCost, double networkCost, double penaltiy) { + public CostV1(double cpuCost, double memoryCost, double networkCost) { // TODO: fix stats cpuCost = Double.max(0, cpuCost); memoryCost = Double.max(0, memoryCost); @@ -43,11 +39,10 @@ public CostV1(double cpuCost, double memoryCost, double networkCost, double pena this.cpuCost = cpuCost; this.memoryCost = memoryCost; this.networkCost = networkCost; - this.penalty = penaltiy; CostWeight costWeight = CostWeight.get(); this.cost = costWeight.cpuWeight * cpuCost + costWeight.memoryWeight * memoryCost - + costWeight.networkWeight * networkCost + costWeight.penaltyWeight * penalty; + + costWeight.networkWeight * networkCost; } public CostV1(double cost) { @@ -55,7 +50,6 @@ public CostV1(double cost) { this.cpuCost = 0; this.networkCost = 0; this.memoryCost = 0; - this.penalty = 0; } public static CostV1 infinite() { @@ -78,28 +72,20 @@ public double getNetworkCost() { return networkCost; } - public double getPenalty() { - return penalty; - } - public double getValue() { return cost; } - public static CostV1 of(double cpuCost, double maxMemory, double networkCost, double rightDeepPenaltiy) { - return new CostV1(cpuCost, maxMemory, networkCost, rightDeepPenaltiy); - } - public static CostV1 of(double cpuCost, double maxMemory, double networkCost) { - return new CostV1(cpuCost, maxMemory, networkCost, 0); + return new CostV1(cpuCost, maxMemory, networkCost); } public static CostV1 ofCpu(double cpuCost) { - return new CostV1(cpuCost, 0, 0, 0); + return new CostV1(cpuCost, 0, 0); } public static CostV1 ofMemory(double memoryCost) { - return new CostV1(0, memoryCost, 0, 0); + return new CostV1(0, memoryCost, 0); } @Override @@ -107,7 +93,7 @@ public String toString() { StringBuilder sb = new StringBuilder(); sb.append("[").append((long) cpuCost).append("/") .append((long) memoryCost).append("/").append((long) networkCost) - .append("/").append((long) penalty).append("]"); + 
.append("/").append("]"); return sb.toString(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java index aa1f10aa47849e9..c9cc43d0c29d9fa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/processor/post/RuntimeFilterPruner.java @@ -27,7 +27,6 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalAssertNumRows; import org.apache.doris.nereids.trees.plans.physical.PhysicalDistribute; import org.apache.doris.nereids.trees.plans.physical.PhysicalFilter; -import org.apache.doris.nereids.trees.plans.physical.PhysicalHashAggregate; import org.apache.doris.nereids.trees.plans.physical.PhysicalHashJoin; import org.apache.doris.nereids.trees.plans.physical.PhysicalLimit; import org.apache.doris.nereids.trees.plans.physical.PhysicalProject; @@ -54,17 +53,6 @@ */ public class RuntimeFilterPruner extends PlanPostProcessor { - // ******************************* - // Physical plans - // ******************************* - @Override - public PhysicalHashAggregate visitPhysicalHashAggregate( - PhysicalHashAggregate agg, CascadesContext context) { - agg.child().accept(this, context); - context.getRuntimeFilterContext().addEffectiveSrcNode(agg); - return agg; - } - @Override public PhysicalQuickSort visitPhysicalQuickSort(PhysicalQuickSort sort, CascadesContext context) { sort.child().accept(this, context); @@ -165,7 +153,9 @@ public PhysicalAssertNumRows visitPhysicalAssertNumRows(PhysicalAssertNumRows buildNdvInProbeRange * (1 + ColumnStatistic.STATS_ERROR); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java index b8a7975a087fadb..aa1903e7b37ebb5 100644 --- 
a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/ExpressionEstimation.java @@ -151,7 +151,7 @@ public ColumnStatistic visitIf(If function, Statistics context) { return new ColumnStatisticBuilder() .setNdv(2) .setMinValue(0) - .setMaxValue(Double.MAX_VALUE) + .setMaxValue(Double.POSITIVE_INFINITY) .setAvgSizeByte(8) .setNumNulls(0) .build(); @@ -206,13 +206,15 @@ public ColumnStatistic visitLiteral(Literal literal, Statistics context) { return ColumnStatistic.UNKNOWN; } double literalVal = literal.getDouble(); - ColumnStatisticBuilder columnStatBuilder = new ColumnStatisticBuilder(); - columnStatBuilder.setMaxValue(literalVal); - columnStatBuilder.setMinValue(literalVal); - columnStatBuilder.setNdv(1); - columnStatBuilder.setNumNulls(1); - columnStatBuilder.setAvgSizeByte(1); - return columnStatBuilder.build(); + return new ColumnStatisticBuilder() + .setMaxValue(literalVal) + .setMinValue(literalVal) + .setNdv(1) + .setNumNulls(1) + .setAvgSizeByte(1) + .setMinExpr(literal.toLegacyLiteral()) + .setMaxExpr(literal.toLegacyLiteral()) + .build(); } @Override @@ -241,13 +243,13 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, if (binaryArithmetic instanceof Add) { return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin) - .setMaxValue(leftMax + rightMax).setSelectivity(1.0) + .setMaxValue(leftMax + rightMax) .setMinExpr(null).setMaxExpr(null).build(); } if (binaryArithmetic instanceof Subtract) { return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax) - .setMaxValue(leftMax - rightMin).setSelectivity(1.0).setMinExpr(null) + .setMaxValue(leftMax - rightMin).setMinExpr(null) 
.setMaxExpr(null).build(); } // TODO: stat for multiply and divide produced by below algorithm may have huge deviation with reality. @@ -259,11 +261,11 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, leftMax * rightMax); double max = Math.max( Math.max( - Math.max(leftMin * rightMin, leftMin * rightMax), - leftMax * rightMin), + Math.max(leftMin * rightMin, leftMin * rightMax), + leftMax * rightMin), leftMax * rightMax); return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) - .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max).setSelectivity(1.0) + .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max) .setMaxExpr(null).setMinExpr(null).build(); } if (binaryArithmetic instanceof Divide || binaryArithmetic instanceof IntegralDivide) { @@ -279,7 +281,7 @@ public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, leftMax / noneZeroDivisor(rightMax)); return new ColumnStatisticBuilder().setCount(rowCount).setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte) .setNumNulls(numNulls).setDataSize(binaryArithmetic.getDataType().width()).setMinValue(min) - .setMaxValue(max).setSelectivity(1.0).build(); + .setMaxValue(max).build(); } if (binaryArithmetic instanceof Mod) { double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax)); @@ -309,13 +311,12 @@ public ColumnStatistic visitMin(Min min, Statistics context) { } /* we keep columnStat.min and columnStat.max, but set ndv=1. 
- if there is group-by keys, we will update ndv when visiting group clause + if there is group-by keys, we will update count when visiting group clause */ double width = min.child().getDataType().width(); - return new ColumnStatisticBuilder().setCount(1).setNdv(1).setAvgSizeByte(width).setNumNulls(width) - .setDataSize(child.getDataType().width()).setMinValue(columnStat.minValue) - .setMaxValue(columnStat.maxValue).setSelectivity(1.0) - .setMinExpr(null).build(); + return new ColumnStatisticBuilder().setCount(1).setNdv(1).setAvgSizeByte(width) + .setMinValue(columnStat.minValue).setMinExpr(columnStat.minExpr) + .setMaxValue(columnStat.maxValue).setMaxExpr(columnStat.maxExpr).build(); } @Override @@ -327,19 +328,20 @@ public ColumnStatistic visitMax(Max max, Statistics context) { } /* we keep columnStat.min and columnStat.max, but set ndv=1. - if there is group-by keys, we will update ndv when visiting group clause + if there is group-by keys, we will update count when visiting group clause */ int width = max.child().getDataType().width(); - return new ColumnStatisticBuilder().setCount(1D).setNdv(1D).setAvgSizeByte(width).setNumNulls(0) - .setDataSize(width).setMinValue(columnStat.minValue).setMaxValue(columnStat.maxValue) - .setSelectivity(1.0).setMaxExpr(null).setMinExpr(null).build(); + return new ColumnStatisticBuilder().setCount(1D).setNdv(1D).setAvgSizeByte(width) + .setMinValue(columnStat.minValue).setMinExpr(columnStat.minExpr) + .setMaxValue(columnStat.maxValue).setMaxExpr(columnStat.maxExpr) + .build(); } @Override public ColumnStatistic visitCount(Count count, Statistics context) { double width = count.getDataType().width(); return new ColumnStatisticBuilder().setCount(1D).setAvgSizeByte(width).setNumNulls(0) - .setDataSize(width).setMinValue(0).setMaxValue(context.getRowCount()).setSelectivity(1.0) + .setDataSize(width).setMinValue(0).setMaxValue(context.getRowCount()) .setMaxExpr(null).setMinExpr(null).build(); } @@ -367,7 +369,7 @@ public 
ColumnStatistic visitYear(Year year, Statistics context) { .setNumNulls(childStat.numNulls) .setDataSize(4 * childStat.count) .setMinValue(minYear) - .setMaxValue(maxYear).setSelectivity(1.0).setMinExpr(null).build(); + .setMaxValue(maxYear).setMinExpr(null).build(); } @Override @@ -378,7 +380,7 @@ public ColumnStatistic visitWeekOfYear(WeekOfYear weekOfYear, Statistics context .setNdv(54) .setAvgSizeByte(width) .setNumNulls(childStat.numNulls) - .setDataSize(1).setMinValue(1).setMaxValue(53).setSelectivity(1.0).setMinExpr(null) + .setDataSize(1).setMinValue(1).setMaxValue(53).setMinExpr(null) .build(); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java index c5ddbd285b37f44..f06c9d1cc4f4ee0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/FilterEstimation.java @@ -17,10 +17,10 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.nereids.stats.FilterEstimation.EstimationContext; import org.apache.doris.nereids.trees.TreeNode; import org.apache.doris.nereids.trees.expressions.And; -import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.ComparisonPredicate; import org.apache.doris.nereids.trees.expressions.CompoundPredicate; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -28,6 +28,7 @@ import org.apache.doris.nereids.trees.expressions.GreaterThan; import org.apache.doris.nereids.trees.expressions.GreaterThanEqual; import org.apache.doris.nereids.trees.expressions.InPredicate; +import org.apache.doris.nereids.trees.expressions.IsNull; import org.apache.doris.nereids.trees.expressions.LessThan; import org.apache.doris.nereids.trees.expressions.LessThanEqual; import org.apache.doris.nereids.trees.expressions.Like; @@ -37,6 
+38,7 @@ import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.expressions.functions.Function; +import org.apache.doris.nereids.trees.expressions.literal.Literal; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.statistics.Bucket; import org.apache.doris.statistics.ColumnStatistic; @@ -47,10 +49,11 @@ import org.apache.doris.statistics.Statistics; import org.apache.doris.statistics.StatisticsBuilder; +import com.google.common.base.Preconditions; +import com.google.common.collect.Sets; + import java.util.ArrayList; import java.util.List; -import java.util.Map; -import java.util.Map.Entry; import java.util.Set; import java.util.function.Predicate; @@ -81,7 +84,9 @@ public FilterEstimation(Set aggSlots) { public Statistics estimate(Expression expression, Statistics statistics) { // For a comparison predicate, only when it's left side is a slot and right side is a literal, we would // consider is a valid predicate. 
- return expression.accept(this, new EstimationContext(statistics)); + Statistics stats = expression.accept(this, new EstimationContext(statistics)); + stats.enforceValid(); + return stats; } @Override @@ -94,7 +99,7 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation Expression leftExpr = predicate.child(0); Expression rightExpr = predicate.child(1); Statistics leftStats = leftExpr.accept(this, context); - Statistics andStats = rightExpr.accept(new FilterEstimation(), + Statistics andStats = rightExpr.accept(this, new EstimationContext(leftStats)); if (predicate instanceof And) { return andStats; @@ -102,27 +107,29 @@ public Statistics visitCompoundPredicate(CompoundPredicate predicate, Estimation Statistics rightStats = rightExpr.accept(this, context); double rowCount = leftStats.getRowCount() + rightStats.getRowCount() - andStats.getRowCount(); Statistics orStats = context.statistics.withRowCount(rowCount); - for (Map.Entry entry : orStats.columnStatistics().entrySet()) { - ColumnStatistic leftColStats = leftStats.findColumnStatistics(entry.getKey()); - ColumnStatistic rightColStats = rightStats.findColumnStatistics(entry.getKey()); - ColumnStatisticBuilder estimatedColStatsBuilder = new ColumnStatisticBuilder(entry.getValue()); - if (leftColStats.minValue <= rightColStats.minValue) { - estimatedColStatsBuilder.setMinValue(leftColStats.minValue); - estimatedColStatsBuilder.setMinExpr(leftColStats.minExpr); - } else { - estimatedColStatsBuilder.setMinValue(rightColStats.minValue); - estimatedColStatsBuilder.setMinExpr(rightColStats.minExpr); - } - if (leftColStats.maxValue >= rightColStats.maxValue) { - estimatedColStatsBuilder.setMaxValue(leftColStats.maxValue); - estimatedColStatsBuilder.setMaxExpr(leftColStats.maxExpr); - } else { - estimatedColStatsBuilder.setMaxValue(rightColStats.maxValue); - estimatedColStatsBuilder.setMaxExpr(rightColStats.maxExpr); + Set leftInputSlots = leftExpr.getInputSlots(); + Set rightInputSlots = 
rightExpr.getInputSlots(); + for (Slot slot : context.keyColumns) { + if (leftInputSlots.contains(slot) && rightInputSlots.contains(slot)) { + ColumnStatistic leftColStats = leftStats.findColumnStatistics(slot); + ColumnStatistic rightColStats = rightStats.findColumnStatistics(slot); + StatisticRange leftRange = StatisticRange.from(leftColStats, slot.getDataType()); + StatisticRange rightRange = StatisticRange.from(rightColStats, slot.getDataType()); + StatisticRange union = leftRange.union(rightRange); + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder( + context.statistics.findColumnStatistics(slot)); + colBuilder.setMinValue(union.getLow()).setMinExpr(union.getLowExpr()) + .setMaxValue(union.getHigh()).setMaxExpr(union.getHighExpr()) + .setNdv(union.getDistinctValues()); + orStats.addColumnStats(slot, colBuilder.build()); } } return orStats; } + // should not come here + Preconditions.checkArgument(false, + "unsupported compound operator: %s in %s", + predicate.getClass().getName(), predicate.toSql()); return context.statistics; } @@ -172,25 +179,27 @@ public Statistics visitComparisonPredicate(ComparisonPredicate cp, EstimationCon } private Statistics updateLessThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - double val, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context, boolean contains) { if (statsForLeft.hasHistogram()) { - return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains); + return estimateLessThanLiteralWithHistogram(leftExpr, statsForLeft, + statsForRight.maxValue, context, contains); } - //rightRange.distinctValues should not be used - StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, val, statsForLeft.ndv, - leftExpr.getDataType()); + StatisticRange rightRange = new StatisticRange(statsForLeft.minValue, statsForLeft.minExpr, + statsForRight.maxValue, statsForRight.maxExpr, + statsForLeft.ndv, 
leftExpr.getDataType()); return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); } private Statistics updateGreaterThanLiteral(Expression leftExpr, ColumnStatistic statsForLeft, - double val, EstimationContext context, boolean contains) { + ColumnStatistic statsForRight, EstimationContext context, boolean contains) { if (statsForLeft.hasHistogram()) { - return estimateGreaterThanLiteralWithHistogram(leftExpr, statsForLeft, val, context, contains); + return estimateGreaterThanLiteralWithHistogram(leftExpr, statsForLeft, + statsForRight.minValue, context, contains); } - //rightRange.distinctValues should not be used - StatisticRange rightRange = new StatisticRange(val, statsForLeft.maxValue, + StatisticRange rightRange = new StatisticRange(statsForRight.minValue, statsForRight.minExpr, + statsForLeft.maxValue, statsForLeft.maxExpr, statsForLeft.ndv, leftExpr.getDataType()); return estimateBinaryComparisonFilter(leftExpr, statsForLeft, rightRange, context); } @@ -204,12 +213,12 @@ private Statistics calculateWhenLiteralRight(ComparisonPredicate cp, if (cp instanceof EqualTo || cp instanceof NullSafeEqual) { return estimateEqualTo(cp, statsForLeft, statsForRight, context); } else { - double val = statsForRight.maxValue; if (cp instanceof LessThan || cp instanceof LessThanEqual) { - return updateLessThanLiteral(cp.left(), statsForLeft, val, context, cp instanceof LessThanEqual); + return updateLessThanLiteral(cp.left(), statsForLeft, statsForRight, + context, cp instanceof LessThanEqual); } else if (cp instanceof GreaterThan || cp instanceof GreaterThanEqual) { - return updateGreaterThanLiteral(cp.left(), statsForLeft, val, context, + return updateGreaterThanLiteral(cp.left(), statsForLeft, statsForRight, context, cp instanceof GreaterThanEqual); } else { throw new RuntimeException(String.format("Unexpected expression : %s", cp.toSql())); @@ -234,19 +243,10 @@ private Statistics estimateEqualTo(ComparisonPredicate cp, ColumnStatistic stats 
Statistics equalStats = context.statistics.withSel(selectivity); Expression left = cp.left(); - if (left instanceof Cast) { - left = ((Cast) left).child(); - } - if (left instanceof SlotReference) { - Slot leftSlot = (SlotReference) left; - //update min/max of cp.left - ColumnStatistic columnStats = equalStats.findColumnStatistics(leftSlot); - ColumnStatisticBuilder colStatsBuilder = new ColumnStatisticBuilder(columnStats); - colStatsBuilder.setMaxValue(val); - colStatsBuilder.setMinValue(val); - colStatsBuilder.setNdv(1); - colStatsBuilder.setNumNulls(0); - equalStats.addColumnStats(leftSlot, colStatsBuilder.build()); + equalStats.addColumnStats(left, statsForRight); + context.addKeyIfSlot(left); + if (!(left instanceof SlotReference)) { + left.accept(new ColumnStatsAdjustVisitor(), equalStats); } return equalStats; } @@ -275,8 +275,12 @@ public Statistics visitInPredicate(InPredicate inPredicate, EstimationContext co return context.statistics.withSel(DEFAULT_IN_COEFFICIENT); } List options = inPredicate.getOptions(); - double maxOption = 0; - double minOption = Double.MAX_VALUE; + // init minOption and maxOption by compareExpr.max and compareExpr.min respectively, + // and then adjust min/max by options + double minOptionValue = compareExprStats.maxValue; + double maxOptionValue = compareExprStats.minValue; + LiteralExpr minOptionLiteral = compareExprStats.maxExpr; + LiteralExpr maxOptionLiteral = compareExprStats.minExpr; /* suppose A.(min, max) = (0, 10), A.ndv=10 A in ( 1, 2, 5, 100): validInOptCount = 3, that is (1, 2, 5) @@ -292,86 +296,199 @@ A not in (1, 2, 3, 100): A.(min, max) not changed A.selectivity = 7/10 */ - double validInOptCount = 0; + int validInOptCount = 0; double selectivity = 1.0; ColumnStatisticBuilder compareExprStatsBuilder = new ColumnStatisticBuilder(compareExprStats); - + int nonLiteralOptionCount = 0; for (Expression option : options) { ColumnStatistic optionStats = ExpressionEstimation.estimate(option, context.statistics); - double 
validOptionNdv = compareExprStats.ndvIntersection(optionStats); - if (validOptionNdv > 0.0) { - validInOptCount += validOptionNdv; - maxOption = Math.max(optionStats.maxValue, maxOption); - minOption = Math.min(optionStats.minValue, minOption); + if (option instanceof Literal) { + // remove the options which is out of compareExpr.range + if (compareExprStats.minValue <= optionStats.maxValue + && optionStats.maxValue <= compareExprStats.maxValue) { + validInOptCount++; + LiteralExpr optionLiteralExpr = ((Literal) option).toLegacyLiteral(); + if (maxOptionLiteral == null || optionLiteralExpr.compareTo(maxOptionLiteral) >= 0) { + maxOptionLiteral = optionLiteralExpr; + maxOptionValue = optionStats.maxValue; + } + + if (minOptionLiteral == null || optionLiteralExpr.compareTo(minOptionLiteral) <= 0) { + minOptionLiteral = optionLiteralExpr; + minOptionValue = optionStats.minValue; + } + } + } else { + nonLiteralOptionCount++; + } + } + if (nonLiteralOptionCount > 0) { + // A in (x+1, ...) + // "x+1" is not literal, and if const-fold can not handle it, it blocks estimation of min/max value. + // and hence, we do not adjust compareExpr.stats.range. 
+ int newNdv = nonLiteralOptionCount + validInOptCount; + if (newNdv < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(newNdv); + selectivity = StatsMathUtil.divide(newNdv, compareExprStats.ndv); + } else { + selectivity = 1.0; + } + } else { + maxOptionValue = Math.min(maxOptionValue, compareExprStats.maxValue); + minOptionValue = Math.max(minOptionValue, compareExprStats.minValue); + compareExprStatsBuilder.setMaxValue(maxOptionValue); + compareExprStatsBuilder.setMaxExpr(maxOptionLiteral); + compareExprStatsBuilder.setMinValue(minOptionValue); + compareExprStatsBuilder.setMinExpr(minOptionLiteral); + if (validInOptCount < compareExprStats.ndv) { + compareExprStatsBuilder.setNdv(validInOptCount); + selectivity = StatsMathUtil.divide(validInOptCount, compareExprStats.ndv); + } else { + selectivity = 1.0; } } - maxOption = Math.min(maxOption, compareExprStats.maxValue); - minOption = Math.max(minOption, compareExprStats.minValue); - compareExprStatsBuilder.setMaxValue(maxOption); - compareExprStatsBuilder.setMinValue(minOption); - - selectivity = StatsMathUtil.minNonNaN(1.0, validInOptCount / compareExprStats.ndv); - compareExprStatsBuilder.setNdv(validInOptCount); Statistics estimated = new Statistics(context.statistics); estimated = estimated.withSel(selectivity); - if (compareExpr instanceof SlotReference) { - estimated.addColumnStats(compareExpr, - compareExprStatsBuilder.build()); - } + estimated.addColumnStats(compareExpr, + compareExprStatsBuilder.build()); + context.addKeyIfSlot(compareExpr); return estimated; } + // Right Now, we just assume the selectivity is 1 when stats is Unknown + private Statistics handleUnknownCase(EstimationContext context) { + return context.statistics; + } + @Override public Statistics visitNot(Not not, EstimationContext context) { - Statistics childStats = new FilterEstimation().estimate(not.child(), context.statistics); + if (context.statistics.isInputSlotsUnknown(not.getInputSlots())) { + return 
handleUnknownCase(context); + } + Expression child = not.child(); + Statistics childStats = child.accept(this, context); //if estimated rowCount is 0, adjust to 1 to make upper join reorder reasonable. double rowCount = Math.max(context.statistics.getRowCount() - childStats.getRowCount(), 1); StatisticsBuilder statisticsBuilder = new StatisticsBuilder(context.statistics).setRowCount(rowCount); - for (Entry entry : context.statistics.columnStatistics().entrySet()) { - Expression expr = entry.getKey(); - ColumnStatistic originColStats = entry.getValue(); - ColumnStatistic childColStats = childStats.findColumnStatistics(expr); - double originNonNullCount = Math.max(originColStats.count - originColStats.numNulls, 0); - double childNonNullCount = Math.max(childColStats.count - childColStats.numNulls, 0); - double supersetValuesPerDistinctValue = StatsMathUtil.divide(originNonNullCount, originColStats.ndv); - double subsetValuesPerDistinctValue = StatsMathUtil.divide(childNonNullCount, childColStats.ndv); - double ndv; - if (supersetValuesPerDistinctValue <= subsetValuesPerDistinctValue) { - ndv = Math.max(originColStats.ndv - childColStats.ndv, 0); - } else { - ndv = originColStats.ndv; + // update key col stats + for (Slot slot : not.child().getInputSlots()) { + ColumnStatistic originColStats = context.statistics.findColumnStatistics(slot); + ColumnStatistic childColStats = childStats.findColumnStatistics(slot); + if (context.isKeySlot(slot)) { + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(childColStats); + // update column stats for + // 1. not (A=B) + // 2. not A in (...) + // 3. not A is null + // 4. 
not A like XXX + colBuilder.setNumNulls(0); + Preconditions.checkArgument( + child instanceof EqualTo + || child instanceof InPredicate + || child instanceof IsNull + || child instanceof Like, + "Not-predicate meet unexpected child: %s", child.toSql()); + if (child instanceof Like) { + rowCount = context.statistics.getRowCount() - childStats.getRowCount(); + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + } else if (child instanceof InPredicate) { + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } else if (child instanceof IsNull) { + colBuilder.setNdv(originColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } else if (child instanceof EqualTo) { + colBuilder.setNdv(originColStats.ndv - childColStats.ndv); + colBuilder.setMinValue(originColStats.minValue) + .setMinExpr(originColStats.minExpr) + .setMaxValue(originColStats.maxValue) + .setMaxExpr(originColStats.maxExpr); + } + statisticsBuilder.putColumnStatistics(slot, colBuilder.build()); } - double nullCount = Math.max(originColStats.numNulls - childColStats.numNulls, 0); - ColumnStatistic columnStatistic = new ColumnStatisticBuilder(originColStats) - .setNdv(ndv) - .setNumNulls(nullCount) - .build(); - statisticsBuilder.putColumnStatistics(expr, columnStatistic); } + return statisticsBuilder.build(); } + @Override + public Statistics visitIsNull(IsNull isNull, EstimationContext context) { + ColumnStatistic childStats = ExpressionEstimation.estimate(isNull.child(), context.statistics); + if (childStats.isUnKnown()) { + return new StatisticsBuilder(context.statistics).build(); + } + double outputRowCount = childStats.numNulls; + ColumnStatisticBuilder colBuilder = new 
ColumnStatisticBuilder(childStats); + colBuilder.setCount(outputRowCount).setNumNulls(outputRowCount) + .setMaxValue(Double.POSITIVE_INFINITY) + .setMinValue(Double.NEGATIVE_INFINITY) + .setNdv(0); + StatisticsBuilder builder = new StatisticsBuilder(context.statistics); + builder.putColumnStatistics(isNull.child(), colBuilder.build()); + context.addKeyIfSlot(isNull.child()); + return builder.build(); + } + static class EstimationContext { private final Statistics statistics; + private final Set keyColumns = Sets.newHashSet(); + public EstimationContext(Statistics statistics) { this.statistics = statistics; } + + public void addKeyIfSlot(Expression expr) { + if (expr instanceof Slot) { + keyColumns.add((Slot) expr); + } + } + + public boolean isKeySlot(Expression expr) { + if (expr instanceof Slot) { + return keyColumns.contains((Slot) expr); + } + return false; + } } private Statistics estimateBinaryComparisonFilter(Expression leftExpr, ColumnStatistic leftStats, StatisticRange rightRange, EstimationContext context) { StatisticRange leftRange = - new StatisticRange(leftStats.minValue, leftStats.maxValue, leftStats.ndv, leftExpr.getDataType()); + new StatisticRange(leftStats.minValue, leftStats.minExpr, leftStats.maxValue, leftStats.maxExpr, + leftStats.ndv, leftExpr.getDataType()); StatisticRange intersectRange = leftRange.cover(rightRange); - ColumnStatisticBuilder leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) - .setMinValue(intersectRange.getLow()) - .setMaxValue(intersectRange.getHigh()) - .setNdv(intersectRange.getDistinctValues()); - double sel = leftRange.overlapPercentWith(rightRange); - Statistics updatedStatistics = context.statistics.withSel(sel); - leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); + + ColumnStatisticBuilder leftColumnStatisticBuilder; + Statistics updatedStatistics; + if (intersectRange.isEmpty()) { + updatedStatistics = context.statistics.withRowCount(0); + leftColumnStatisticBuilder = new 
ColumnStatisticBuilder(leftStats) + .setMinValue(Double.NEGATIVE_INFINITY) + .setMinExpr(null) + .setMaxValue(Double.POSITIVE_INFINITY) + .setMaxExpr(null) + .setNdv(0) + .setCount(0); + } else { + leftColumnStatisticBuilder = new ColumnStatisticBuilder(leftStats) + .setMinValue(intersectRange.getLow()) + .setMinExpr(intersectRange.getLowExpr()) + .setMaxValue(intersectRange.getHigh()) + .setMaxExpr(intersectRange.getHighExpr()) + .setNdv(intersectRange.getDistinctValues()); + double sel = leftRange.overlapPercentWith(rightRange); + updatedStatistics = context.statistics.withSel(sel); + leftColumnStatisticBuilder.setCount(updatedStatistics.getRowCount()); + } updatedStatistics.addColumnStats(leftExpr, leftColumnStatisticBuilder.build()); + context.addKeyIfSlot(leftExpr); leftExpr.accept(new ColumnStatsAdjustVisitor(), updatedStatistics); return updatedStatistics; } @@ -381,36 +498,17 @@ private Statistics estimateColumnEqualToColumn(Expression leftExpr, ColumnStatis StatisticRange leftRange = StatisticRange.from(leftStats, leftExpr.getDataType()); StatisticRange rightRange = StatisticRange.from(rightStats, rightExpr.getDataType()); StatisticRange leftIntersectRight = leftRange.intersect(rightRange); - StatisticRange rightIntersectLeft = rightRange.intersect(leftIntersectRight); - ColumnStatisticBuilder leftBuilder = new ColumnStatisticBuilder(leftStats); - leftBuilder.setNdv(leftIntersectRight.getDistinctValues()); - leftBuilder.setMinValue(leftIntersectRight.getLow()); - leftBuilder.setMaxValue(leftIntersectRight.getHigh()); - ColumnStatisticBuilder rightBuilder = new ColumnStatisticBuilder(rightStats); - rightBuilder.setNdv(rightIntersectLeft.getDistinctValues()); - rightBuilder.setMinValue(rightIntersectLeft.getLow()); - rightBuilder.setMaxValue(rightIntersectLeft.getDistinctValues()); - double sel; - double reduceRatio = 0.25; - double bothSideReducedRatio = 0.9; - if (!leftStats.rangeChanged() && !rightStats.rangeChanged() - && leftStats.ndv < 
leftStats.getOriginalNdv() * bothSideReducedRatio - && rightStats.ndv < rightStats.getOriginalNdv() * bothSideReducedRatio) { - double sel1; - if (leftStats.ndv > rightStats.ndv) { - sel1 = 1 / StatsMathUtil.nonZeroDivisor(leftStats.ndv); - } else { - sel1 = 1 / StatsMathUtil.nonZeroDivisor(rightStats.ndv); - } - double sel2 = Math.min(rightStats.ndv / rightStats.getOriginalNdv(), - leftStats.ndv / leftStats.getOriginalNdv()); - sel = sel1 * Math.pow(sel2, reduceRatio); - } else { - sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); - } + StatisticRange intersect = rightRange.intersect(leftIntersectRight); + ColumnStatisticBuilder intersectBuilder = new ColumnStatisticBuilder(leftStats); + intersectBuilder.setNdv(intersect.getDistinctValues()); + intersectBuilder.setMinValue(intersect.getLow()); + intersectBuilder.setMaxValue(intersect.getHigh()); + double sel = 1 / StatsMathUtil.nonZeroDivisor(Math.max(leftStats.ndv, rightStats.ndv)); Statistics updatedStatistics = context.statistics.withSel(sel); - updatedStatistics.addColumnStats(leftExpr, leftBuilder.build()); - updatedStatistics.addColumnStats(rightExpr, rightBuilder.build()); + updatedStatistics.addColumnStats(leftExpr, intersectBuilder.build()); + updatedStatistics.addColumnStats(rightExpr, intersectBuilder.build()); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return updatedStatistics; } @@ -426,6 +524,8 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati context.statistics.getRowCount() - rightStats.numNulls)); statistics.addColumnStats(leftExpr, new ColumnStatisticBuilder(leftStats).setNumNulls(0.0).build()); statistics.addColumnStats(rightExpr, new ColumnStatisticBuilder(rightStats).setNumNulls(0.0).build()); + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return statistics; } double leftOverlapPercent = leftRange.overlapPercentWith(rightRange); @@ -433,8 +533,8 @@ private Statistics 
estimateColumnLessThanColumn(Expression leftExpr, ColumnStati if (leftOverlapPercent == 0) { return context.statistics.withRowCount(0.0); } - StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, - rightStats.minValue, Double.NaN, leftExpr.getDataType()); + StatisticRange leftAlwaysLessThanRightRange = new StatisticRange(leftStats.minValue, leftStats.minExpr, + rightStats.minValue, rightStats.minExpr, Double.NaN, leftExpr.getDataType()); double leftAlwaysLessThanRightPercent = 0; if (leftRange.getLow() < rightRange.getLow()) { leftAlwaysLessThanRightPercent = leftRange.overlapPercentWith(leftAlwaysLessThanRightRange); @@ -448,8 +548,10 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati double rightOverlappingRangeFraction = rightRange.overlapPercentWith(leftRange); double rightAlwaysGreaterRangeFraction = 0; if (leftRange.getHigh() < rightRange.getHigh()) { - rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange(leftRange.getHigh(), - rightRange.getHigh(), Double.NaN, rightExpr.getDataType())); + rightAlwaysGreaterRangeFraction = rightRange.overlapPercentWith(new StatisticRange( + leftRange.getHigh(), leftRange.getHighExpr(), + rightRange.getHigh(), rightRange.getHighExpr(), + Double.NaN, rightExpr.getDataType())); } ColumnStatistic rightColumnStatistic = new ColumnStatisticBuilder(rightStats) .setMinValue(Math.max(leftRange.getLow(), rightRange.getLow())) @@ -460,6 +562,8 @@ private Statistics estimateColumnLessThanColumn(Expression leftExpr, ColumnStati double sel = leftAlwaysLessThanRightPercent + leftOverlapPercent * rightOverlappingRangeFraction * DEFAULT_INEQUALITY_COEFFICIENT + leftOverlapPercent * rightAlwaysGreaterRangeFraction; + context.addKeyIfSlot(leftExpr); + context.addKeyIfSlot(rightExpr); return context.statistics.withSel(sel) .addColumnStats(leftExpr, leftColumnStatistic) .addColumnStats(rightExpr, rightColumnStatistic); @@ -493,6 +597,7 @@ private 
Statistics estimateLessThanLiteralWithHistogram(Expression leftExpr, Col .setMaxValue(numVal) .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } } @@ -520,7 +625,7 @@ private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, double overlapCountInBucket = overlapPercentInBucket * bucket.count; double sel = StatsMathUtil.minNonNaN(1, (leftHist.size() - bucket.preSum - (bucket.count - overlapCountInBucket)) - / context.statistics.getRowCount()); + / context.statistics.getRowCount()); List updatedBucketList = new ArrayList<>(); updatedBucketList.add(new Bucket(numVal, bucket.upper, overlapPercentInBucket * bucket.count, 0, overlapPercentInBucket * bucket.ndv)); @@ -529,6 +634,7 @@ private Statistics estimateGreaterThanLiteralWithHistogram(Expression leftExpr, .setMaxValue(numVal) .setHistogram(new HistogramBuilder(leftHist).setBuckets(updatedBucketList).build()) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } } @@ -556,11 +662,24 @@ private Statistics estimateEqualToWithHistogram(Expression leftExpr, ColumnStati .setMaxValue(numVal) .setMinValue(numVal) .build(); + context.addKeyIfSlot(leftExpr); return context.statistics.withSel(sel).addColumnStats(leftExpr, columnStatistic); } @Override public Statistics visitLike(Like like, EstimationContext context) { - return context.statistics.withSel(DEFAULT_LIKE_COMPARISON_SELECTIVITY); + StatisticsBuilder statsBuilder = new StatisticsBuilder(context.statistics); + statsBuilder.setRowCount(context.statistics.getRowCount() * DEFAULT_LIKE_COMPARISON_SELECTIVITY); + if (like.left() instanceof Slot) { + ColumnStatistic origin = context.statistics.findColumnStatistics(like.left()); + Preconditions.checkArgument(origin != null, + "col stats not found. 
slot=%s in %s", + like.left().toSql(), like.toSql()); + ColumnStatisticBuilder colBuilder = new ColumnStatisticBuilder(origin); + colBuilder.setNdv(origin.ndv * DEFAULT_LIKE_COMPARISON_SELECTIVITY).setNumNulls(0); + statsBuilder.putColumnStatistics(like.left(), colBuilder.build()); + context.addKeyIfSlot(like.left()); + } + return statsBuilder.build(); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java index a40e409fae40c0f..ef4575e3308d7aa 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/JoinEstimation.java @@ -72,16 +72,7 @@ private static boolean hashJoinConditionContainsUnknownColumnStats(Statistics le return false; } - private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { - if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { - double rowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); - rowCount = Math.max(1, rowCount); - return new StatisticsBuilder() - .setRowCount(rowCount) - .putColumnStatistics(leftStats.columnStatistics()) - .putColumnStatistics(rightStats.columnStatistics()) - .build(); - } + private static Statistics estimateHashJoin(Statistics leftStats, Statistics rightStats, Join join) { /* * When we estimate filter A=B, * if any side of equation, A or B, is almost unique, the confidence level of estimation is high. 
@@ -129,7 +120,7 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig .putColumnStatistics(rightStats.columnStatistics()) .build(); - double outputRowCount = 1; + double outputRowCount; if (!trustableConditions.isEmpty()) { List joinConditionSels = trustableConditions.stream() .map(expression -> estimateJoinConditionSel(crossJoinStats, expression)) @@ -138,20 +129,47 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig double sel = 1.0; double denominator = 1.0; - for (int i = 0; i < joinConditionSels.size(); i++) { - sel *= Math.pow(joinConditionSels.get(i), 1 / denominator); + for (Double joinConditionSel : joinConditionSels) { + sel *= Math.pow(joinConditionSel, 1 / denominator); denominator *= 2; } outputRowCount = Math.max(1, crossJoinStats.getRowCount() * sel); outputRowCount = outputRowCount * Math.pow(0.9, unTrustableCondition.size()); - innerJoinStats = crossJoinStats.updateRowCountOnly(outputRowCount); } else { outputRowCount = Math.max(leftStats.getRowCount(), rightStats.getRowCount()); Optional ratio = unTrustEqualRatio.stream().min(Double::compareTo); if (ratio.isPresent()) { outputRowCount = Math.max(1, outputRowCount * ratio.get()); } - innerJoinStats = crossJoinStats.updateRowCountOnly(outputRowCount); + } + innerJoinStats = crossJoinStats.withRowCountAndEnforceValid(outputRowCount); + return innerJoinStats; + } + + private static Statistics estimateNestLoopJoin(Statistics leftStats, Statistics rightStats, Join join) { + return new StatisticsBuilder() + .setRowCount(Math.max(1, leftStats.getRowCount() * rightStats.getRowCount())) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } + + private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rightStats, Join join) { + if (hashJoinConditionContainsUnknownColumnStats(leftStats, rightStats, join)) { + double rowCount = Math.max(leftStats.getRowCount(), 
rightStats.getRowCount()); + rowCount = Math.max(1, rowCount); + return new StatisticsBuilder() + .setRowCount(rowCount) + .putColumnStatistics(leftStats.columnStatistics()) + .putColumnStatistics(rightStats.columnStatistics()) + .build(); + } + + Statistics innerJoinStats; + if (join.getHashJoinConjuncts().isEmpty()) { + innerJoinStats = estimateNestLoopJoin(leftStats, rightStats, join); + } else { + innerJoinStats = estimateHashJoin(leftStats, rightStats, join); } if (!join.getOtherJoinConjuncts().isEmpty()) { @@ -162,9 +180,6 @@ private static Statistics estimateInnerJoin(Statistics leftStats, Statistics rig innerJoinStats = new StatisticsBuilder(innerJoinStats).setRowCount(1).build(); } } - - innerJoinStats.setWidth(leftStats.getWidth() + rightStats.getWidth()); - innerJoinStats.setPenalty(0); return innerJoinStats; } @@ -242,10 +257,9 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri double baseRowCount = join.getJoinType().isLeftSemiOrAntiJoin() ? 
leftStats.getRowCount() : rightStats.getRowCount(); rowCount = Math.min(innerJoinStats.getRowCount(), baseRowCount); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else { StatisticsBuilder builder; - double originalRowCount = leftStats.getRowCount(); if (join.getJoinType().isLeftSemiOrAntiJoin()) { builder = new StatisticsBuilder(leftStats); builder.setRowCount(rowCount); @@ -253,10 +267,9 @@ private static Statistics estimateSemiOrAnti(Statistics leftStats, Statistics ri //right semi or anti builder = new StatisticsBuilder(rightStats); builder.setRowCount(rowCount); - originalRowCount = rightStats.getRowCount(); } Statistics outputStats = builder.build(); - outputStats.fix(rowCount, originalRowCount); + outputStats.enforceValid(); return outputStats; } } @@ -276,15 +289,15 @@ public static Statistics estimate(Statistics leftStats, Statistics rightStats, J Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(leftStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(leftStats.getRowCount(), rowCount); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.RIGHT_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); double rowCount = Math.max(rightStats.getRowCount(), innerJoinStats.getRowCount()); rowCount = Math.max(rowCount, rightStats.getRowCount()); - return innerJoinStats.withRowCount(rowCount); + return innerJoinStats.withRowCountAndEnforceValid(rowCount); } else if (joinType == JoinType.FULL_OUTER_JOIN) { Statistics innerJoinStats = estimateInnerJoin(leftStats, rightStats, join); - return innerJoinStats.withRowCount(leftStats.getRowCount() + return innerJoinStats.withRowCountAndEnforceValid(leftStats.getRowCount() + rightStats.getRowCount() + innerJoinStats.getRowCount()); } else if (joinType == 
JoinType.CROSS_JOIN) { return new StatisticsBuilder() diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java index 24ec929e820b00a..f64c6bf8022f08a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/stats/StatsCalculator.java @@ -17,10 +17,7 @@ package org.apache.doris.nereids.stats; -import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.OlapTable; -import org.apache.doris.catalog.SchemaTable; import org.apache.doris.catalog.TableIf; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; @@ -608,7 +605,7 @@ private Statistics computeCatalogRelation(CatalogRelation catalogRelation) { double rowCount = catalogRelation.getTable().estimatedRowCount(); for (SlotReference slotReference : slotSet) { String colName = slotReference.getName(); - boolean shouldIgnoreThisCol = shouldIgnoreCol(table, slotReference.getColumn().get()); + boolean shouldIgnoreThisCol = StatisticConstants.shouldIgnoreCol(table, slotReference.getColumn().get()); if (colName == null) { throw new RuntimeException(String.format("Invalid slot: %s", slotReference.getExprId())); @@ -646,16 +643,20 @@ private Statistics computeTopN(TopN topN) { } private Statistics computePartitionTopN(PartitionTopN partitionTopN) { - Statistics stats = groupExpression.childStatistics(0); - double rowCount = stats.getRowCount(); + Statistics childStats = groupExpression.childStatistics(0); + double rowCount = childStats.getRowCount(); List partitionKeys = partitionTopN.getPartitionKeys(); if (!partitionTopN.hasGlobalLimit() && !partitionKeys.isEmpty()) { // If there is no global limit. 
So result for the cardinality estimation is: // NDV(partition key) * partitionLimit - Map childSlotToColumnStats = stats.columnStatistics(); List partitionByKeyStats = partitionKeys.stream() - .filter(childSlotToColumnStats::containsKey) - .map(childSlotToColumnStats::get) + .map(partitionKey -> { + ColumnStatistic partitionKeyStats = childStats.findColumnStatistics(partitionKey); + if (partitionKeyStats == null) { + partitionKeyStats = new ExpressionEstimation().visit(partitionKey, childStats); + } + return partitionKeyStats; + }) .filter(s -> !s.isUnKnown) .collect(Collectors.toList()); if (partitionByKeyStats.isEmpty()) { @@ -663,7 +664,7 @@ private Statistics computePartitionTopN(PartitionTopN partitionTopN) { rowCount = rowCount * DEFAULT_COLUMN_NDV_RATIO; } else { rowCount = Math.min(rowCount, partitionByKeyStats.stream().map(s -> s.ndv) - .max(Double::compare).get()); + .max(Double::compare).get() * partitionTopN.getPartitionLimit()); } } else { rowCount = Math.min(rowCount, partitionTopN.getPartitionLimit()); @@ -671,7 +672,7 @@ private Statistics computePartitionTopN(PartitionTopN partitionTopN) { // TODO: for the filter push down window situation, we will prune the row count twice // because we keep the pushed down filter. And it will be calculated twice, one of them in 'PartitionTopN' // and the other is in 'Filter'. It's hard to dismiss. 
- return stats.updateRowCountOnly(rowCount); + return childStats.withRowCountAndEnforceValid(rowCount); } private Statistics computeLimit(Limit limit) { @@ -740,9 +741,7 @@ private Statistics computeAggregate(Aggregate aggregate) { builder.setDataSize(rowCount * outputExpression.getDataType().width()); slotToColumnStats.put(outputExpression.toSlot(), columnStat); } - return new Statistics(rowCount, slotToColumnStats, childStats.getWidth(), - childStats.getPenalty() + childStats.getRowCount()); - // TODO: Update ColumnStats properly, add new mapping from output slot to ColumnStats + return new Statistics(rowCount, slotToColumnStats); } private Statistics computeRepeat(Repeat repeat) { @@ -760,8 +759,7 @@ private Statistics computeRepeat(Repeat repeat) { .setDataSize(stats.dataSize < 0 ? stats.dataSize : stats.dataSize * groupingSetNum); return Pair.of(kv.getKey(), columnStatisticBuilder.build()); }).collect(Collectors.toMap(Pair::key, Pair::value)); - return new Statistics(rowCount < 0 ? rowCount : rowCount * groupingSetNum, columnStatisticMap, - childStats.getWidth(), childStats.getPenalty()); + return new Statistics(rowCount < 0 ? 
rowCount : rowCount * groupingSetNum, columnStatisticMap); } private Statistics computeProject(Project project) { @@ -771,7 +769,7 @@ private Statistics computeProject(Project project) { ColumnStatistic columnStatistic = ExpressionEstimation.estimate(projection, childStats); return new SimpleEntry<>(projection.toSlot(), columnStatistic); }).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue, (item1, item2) -> item1)); - return new Statistics(childStats.getRowCount(), columnsStats, childStats.getWidth(), childStats.getPenalty()); + return new Statistics(childStats.getRowCount(), columnsStats); } private Statistics computeOneRowRelation(List projects) { @@ -1068,16 +1066,4 @@ public Statistics visitPhysicalCTEAnchor( return groupExpression.childStatistics(1); } - private boolean shouldIgnoreCol(TableIf tableIf, Column c) { - if (tableIf instanceof SchemaTable) { - return true; - } - if (tableIf instanceof OlapTable) { - OlapTable olapTable = (OlapTable) tableIf; - if (StatisticConstants.STATISTICS_DB_BLACK_LIST.contains(olapTable.getQualifiedDbName())) { - return true; - } - } - return !c.isVisible(); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java new file mode 100644 index 000000000000000..db0c78c1f78f579 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/Properties.java @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions; + +import org.apache.doris.nereids.exceptions.UnboundException; +import org.apache.doris.nereids.trees.expressions.shape.LeafExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.DataType; +import org.apache.doris.nereids.types.MapType; + +import com.google.common.collect.ImmutableList; + +import java.util.Map; +import java.util.Objects; +import java.util.stream.Collectors; + +/** + * Properties + */ +public class Properties extends Expression implements LeafExpression { + + private final Map keyValues; + + public Properties(Map properties) { + super(ImmutableList.of()); + this.keyValues = Objects.requireNonNull(properties, "properties can not be null"); + } + + public Map getMap() { + return keyValues; + } + + @Override + public boolean nullable() { + return false; + } + + @Override + public DataType getDataType() throws UnboundException { + return MapType.SYSTEM_DEFAULT; + } + + @Override + public String toSql() { + return getMap() + .entrySet() + .stream() + .map(kv -> "'" + kv.getKey() + "' = '" + kv.getValue() + "'") + .collect(Collectors.joining(", ")); + } + + @Override + public String toString() { + return "Properties(" + toSql() + ")"; + } + + @Override + public boolean equals(Object o) { + if (this == o) { + return true; + } + if (o == null || getClass() != o.getClass()) { + return false; + } + if (!super.equals(o)) { + return false; + } + Properties that = (Properties) o; + return Objects.equals(keyValues, 
that.keyValues); + } + + @Override + public int hashCode() { + return Objects.hash(super.hashCode(), keyValues); + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitProperties(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java index d82878c4a48f2a5..c66684cd22c657e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Hdfs.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.HdfsTableValuedFunction; @@ -30,13 +30,13 @@ /** hdfs */ public class Hdfs extends TableValuedFunction { - public Hdfs(TVFProperties properties) { + public Hdfs(Properties properties) { super("hdfs", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, (List) getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, (List) getArgumentsTypes()); } @Override @@ -46,7 +46,7 @@ protected TableValuedFunctionIf toCatalogFunction() { return new HdfsTableValuedFunction(arguments); } catch (Throwable t) { throw new AnalysisException("Can not build HdfsTableValuedFunction by " - + this + ": " + t.getMessage(), t); + + this + ": " + t.getMessage(), t); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java 
b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java index cd4169c1c9592a9..d45a4c939433f54 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Local.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.LocalTableValuedFunction; @@ -31,13 +31,13 @@ * local */ public class Local extends TableValuedFunction { - public Local(TVFProperties properties) { + public Local(Properties properties) { super("local", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes()); } @Override diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java index 0043ab9c1f7835f..c5febcf9749f447 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/Numbers.java @@ -24,8 +24,8 @@ import org.apache.doris.nereids.exceptions.AnalysisException; import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Properties; import 
org.apache.doris.nereids.trees.expressions.Slot; -import org.apache.doris.nereids.trees.expressions.TVFProperties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.BigIntType; import org.apache.doris.statistics.ColumnStatistic; @@ -42,7 +42,7 @@ /** Numbers */ public class Numbers extends TableValuedFunction { - public Numbers(TVFProperties properties) { + public Numbers(Properties properties) { super("numbers", properties); } @@ -72,7 +72,7 @@ public Statistics computeStats(List slots) { Map columnToStatistics = Maps.newHashMap(); ColumnStatistic columnStat = new ColumnStatisticBuilder() .setCount(rowNum).setNdv(rowNum).setAvgSizeByte(8).setNumNulls(0).setDataSize(8).setMinValue(0) - .setMaxValue(rowNum - 1).setSelectivity(1.0 / rowNum) + .setMaxValue(rowNum - 1) .setMinExpr(new IntLiteral(0, Type.BIGINT)) .setMaxExpr(new IntLiteral(rowNum - 1, Type.BIGINT)) .build(); @@ -101,7 +101,7 @@ public PhysicalProperties getPhysicalProperties() { @Override public Numbers withChildren(List children) { Preconditions.checkArgument(children().size() == 1 - && children().get(0) instanceof TVFProperties); - return new Numbers((TVFProperties) children.get(0)); + && children().get(0) instanceof Properties); + return new Numbers((Properties) children.get(0)); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java index d871219b7a8e309..29d8ad082186142 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/S3.java @@ -19,7 +19,7 @@ import org.apache.doris.catalog.FunctionSignature; import org.apache.doris.nereids.exceptions.AnalysisException; -import org.apache.doris.nereids.trees.expressions.TVFProperties; +import 
org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; import org.apache.doris.nereids.types.coercion.AnyDataType; import org.apache.doris.tablefunction.S3TableValuedFunction; @@ -29,13 +29,13 @@ /** s3 */ public class S3 extends TableValuedFunction { - public S3(TVFProperties properties) { + public S3(Properties properties) { super("s3", properties); } @Override public FunctionSignature customSignature() { - return FunctionSignature.of(AnyDataType.INSTANCE, getArgumentsTypes()); + return FunctionSignature.of(AnyDataType.INSTANCE_WITHOUT_INDEX, getArgumentsTypes()); } @Override @@ -45,7 +45,7 @@ protected TableValuedFunctionIf toCatalogFunction() { return new S3TableValuedFunction(arguments); } catch (Throwable t) { throw new AnalysisException("Can not build S3TableValuedFunction by " - + this + ": " + t.getMessage(), t); + + this + ": " + t.getMessage(), t); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java index 6443eab9728539b..5acc73eb75a9f83 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/table/TableValuedFunction.java @@ -23,8 +23,8 @@ import org.apache.doris.nereids.exceptions.UnboundException; import org.apache.doris.nereids.properties.PhysicalProperties; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.Slot; -import org.apache.doris.nereids.trees.expressions.TVFProperties; import org.apache.doris.nereids.trees.expressions.functions.BoundFunction; import 
org.apache.doris.nereids.trees.expressions.functions.CustomSignature; import org.apache.doris.nereids.trees.expressions.shape.UnaryExpression; @@ -57,7 +57,7 @@ public abstract class TableValuedFunction extends BoundFunction implements Unary } }); - public TableValuedFunction(String functionName, TVFProperties tvfProperties) { + public TableValuedFunction(String functionName, Properties tvfProperties) { super(functionName, tvfProperties); } @@ -78,8 +78,8 @@ public Statistics computeStats(List slots) { return new Statistics(0, columnToStatistics); } - public TVFProperties getTVFProperties() { - return (TVFProperties) child(0); + public Properties getTVFProperties() { + return (Properties) child(0); } public final String getTableName() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java index 4517f444e0ba209..d95f35a6f69ca14 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ExpressionVisitor.java @@ -63,6 +63,7 @@ import org.apache.doris.nereids.trees.expressions.NullSafeEqual; import org.apache.doris.nereids.trees.expressions.Or; import org.apache.doris.nereids.trees.expressions.OrderExpression; +import org.apache.doris.nereids.trees.expressions.Properties; import org.apache.doris.nereids.trees.expressions.ScalarSubquery; import org.apache.doris.nereids.trees.expressions.Slot; import org.apache.doris.nereids.trees.expressions.SlotReference; @@ -117,6 +118,10 @@ public abstract class ExpressionVisitor public abstract R visit(Expression expr, C context); + public R visitProperties(Properties properties, C context) { + return visit(properties, context); + } + @Override public R visitAggregateFunction(AggregateFunction aggregateFunction, C context) { return 
visitBoundFunction(aggregateFunction, context); diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java index 40e0dae0fdb47a2..c576dcb9933a422 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/MapType.java @@ -29,6 +29,8 @@ public class MapType extends DataType { public static final MapType INSTANCE = new MapType(); + public static final MapType SYSTEM_DEFAULT = new MapType(); + public static final int WIDTH = 24; private MapType() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java index be07a7ee7b825ff..e1097df65f1404f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/types/coercion/AnyDataType.java @@ -26,7 +26,18 @@ */ public class AnyDataType implements AbstractDataType { - public static final AnyDataType INSTANCE = new AnyDataType(); + public static final AnyDataType INSTANCE_WITHOUT_INDEX = new AnyDataType(-1); + + public static final AnyDataType INSTANCE = new AnyDataType(-1); + + private final int index; + + public AnyDataType(int index) { + if (index < 0) { + index = -1; + } + this.index = index; + } @Override public DataType defaultConcreteType() { @@ -47,4 +58,9 @@ public Type toCatalogDataType() { public String simpleString() { return "any"; } + + public int getIndex() { + return index; + } + } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java index 89299a7661bbd89..3554dd37c210bc4 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/EditLog.java @@ -82,6 +82,7 @@ import 
org.apache.doris.policy.StoragePolicy; import org.apache.doris.resource.workloadgroup.WorkloadGroup; import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.system.Backend; import org.apache.doris.system.Frontend; import org.apache.doris.transaction.TransactionState; @@ -1060,16 +1061,16 @@ public static void loadJournal(Env env, Long logId, JournalEntity journal) { env.getBinlogManager().addBarrierLog(log, logId); break; } - // For backward compatible with 2.0.3 case OperationType.OP_UPDATE_TABLE_STATS: { + env.getAnalysisManager().replayUpdateTableStatsStatus((TableStatsMeta) journal.getData()); break; } - // For backward compatible with 2.0.3 case OperationType.OP_PERSIST_AUTO_JOB: { + env.getAnalysisManager().replayPersistSysJob((AnalysisInfo) journal.getData()); break; } - // For backward compatible with 2.0.3 case OperationType.OP_DELETE_TABLE_STATS: { + env.getAnalysisManager().replayTableStatsDeletion((TableStatsDeletionLog) journal.getData()); break; } default: { @@ -1875,4 +1876,16 @@ public long logBarrier(BarrierLog log) { LOG.info("logId {}, barrier {}", logId, log); return logId; } + + public void logCreateTableStats(TableStatsMeta tableStats) { + logEdit(OperationType.OP_UPDATE_TABLE_STATS, tableStats); + } + + public void logAutoJob(AnalysisInfo analysisInfo) { + logEdit(OperationType.OP_PERSIST_AUTO_JOB, analysisInfo); + } + + public void logDeleteTableStats(TableStatsDeletionLog log) { + logEdit(OperationType.OP_DELETE_TABLE_STATS, log); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java b/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java new file mode 100644 index 000000000000000..4016ff0139e79bd --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/persist/TableStatsDeletionLog.java @@ -0,0 +1,47 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license 
agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.persist; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; + +public class TableStatsDeletionLog implements Writable { + + @SerializedName("id") + public final long id; + + public TableStatsDeletionLog(long id) { + this.id = id; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static TableStatsDeletionLog read(DataInput dataInput) throws IOException { + return GsonUtils.GSON.fromJson(Text.readString(dataInput), TableStatsDeletionLog.class); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java new file mode 100644 index 000000000000000..40f870eee11c0c7 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/AuditLogHelper.java @@ -0,0 +1,120 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.qe; + +import org.apache.doris.analysis.InsertStmt; +import org.apache.doris.analysis.Queriable; +import org.apache.doris.analysis.StatementBase; +import org.apache.doris.catalog.Env; +import org.apache.doris.cluster.ClusterNamespace; +import org.apache.doris.common.Config; +import org.apache.doris.common.util.DebugUtil; +import org.apache.doris.metric.MetricRepo; +import org.apache.doris.plugin.AuditEvent.EventType; +import org.apache.doris.qe.QueryState.MysqlStateType; +import org.apache.doris.service.FrontendOptions; + +import io.opentelemetry.api.trace.Span; +import io.opentelemetry.api.trace.SpanContext; +import io.opentelemetry.context.Context; +import org.apache.commons.codec.digest.DigestUtils; + +public class AuditLogHelper { + + public static void logAuditLog(ConnectContext ctx, String origStmt, StatementBase parsedStmt, + org.apache.doris.proto.Data.PQueryStatistics statistics, boolean printFuzzyVariables) { + origStmt = origStmt.replace("\n", " "); + // slow query + long endTime = System.currentTimeMillis(); + long elapseMs = endTime - ctx.getStartTime(); + SpanContext spanContext = Span.fromContext(Context.current()).getSpanContext(); + + ctx.getAuditEventBuilder().setEventType(EventType.AFTER_QUERY) + 
.setDb(ClusterNamespace.getNameFromFullName(ctx.getDatabase())) + .setState(ctx.getState().toString()) + .setErrorCode(ctx.getState().getErrorCode() == null ? 0 : ctx.getState().getErrorCode().getCode()) + .setErrorMessage((ctx.getState().getErrorMessage() == null ? "" : + ctx.getState().getErrorMessage().replace("\n", " ").replace("\t", " "))) + .setQueryTime(elapseMs) + .setScanBytes(statistics == null ? 0 : statistics.getScanBytes()) + .setScanRows(statistics == null ? 0 : statistics.getScanRows()) + .setCpuTimeMs(statistics == null ? 0 : statistics.getCpuMs()) + .setPeakMemoryBytes(statistics == null ? 0 : statistics.getMaxPeakMemoryBytes()) + .setReturnRows(ctx.getReturnRows()) + .setStmtId(ctx.getStmtId()) + .setQueryId(ctx.queryId() == null ? "NaN" : DebugUtil.printId(ctx.queryId())) + .setTraceId(spanContext.isValid() ? spanContext.getTraceId() : "") + .setWorkloadGroup(ctx.getWorkloadGroupName()) + .setFuzzyVariables(!printFuzzyVariables ? "" : ctx.getSessionVariable().printFuzzyVariables()); + + if (ctx.getState().isQuery()) { + MetricRepo.COUNTER_QUERY_ALL.increase(1L); + MetricRepo.USER_COUNTER_QUERY_ALL.getOrAdd(ctx.getQualifiedUser()).increase(1L); + if (ctx.getState().getStateType() == MysqlStateType.ERR + && ctx.getState().getErrType() != QueryState.ErrType.ANALYSIS_ERR) { + // err query + MetricRepo.COUNTER_QUERY_ERR.increase(1L); + MetricRepo.USER_COUNTER_QUERY_ERR.getOrAdd(ctx.getQualifiedUser()).increase(1L); + } else if (ctx.getState().getStateType() == MysqlStateType.OK + || ctx.getState().getStateType() == MysqlStateType.EOF) { + // ok query + MetricRepo.HISTO_QUERY_LATENCY.update(elapseMs); + MetricRepo.USER_HISTO_QUERY_LATENCY.getOrAdd(ctx.getQualifiedUser()).update(elapseMs); + + if (elapseMs > Config.qe_slow_log_ms) { + String sqlDigest = DigestUtils.md5Hex(((Queriable) parsedStmt).toDigest()); + ctx.getAuditEventBuilder().setSqlDigest(sqlDigest); + } + } + ctx.getAuditEventBuilder().setIsQuery(true); + if (ctx.getQueryDetail() != null) { 
+ ctx.getQueryDetail().setEventTime(endTime); + ctx.getQueryDetail().setEndTime(endTime); + ctx.getQueryDetail().setLatency(elapseMs); + ctx.getQueryDetail().setState(QueryDetail.QueryMemState.FINISHED); + QueryDetailQueue.addOrUpdateQueryDetail(ctx.getQueryDetail()); + ctx.setQueryDetail(null); + } + } else { + ctx.getAuditEventBuilder().setIsQuery(false); + } + ctx.getAuditEventBuilder().setIsNereids(ctx.getState().isNereids); + + ctx.getAuditEventBuilder().setFeIp(FrontendOptions.getLocalHostAddress()); + + // We put origin query stmt at the end of audit log, for parsing the log more convenient. + if (!ctx.getState().isQuery() && (parsedStmt != null && parsedStmt.needAuditEncryption())) { + ctx.getAuditEventBuilder().setStmt(parsedStmt.toSql()); + } else { + if (parsedStmt instanceof InsertStmt && !((InsertStmt) parsedStmt).needLoadManager() + && ((InsertStmt) parsedStmt).isValuesOrConstantSelect()) { + // INSERT INTO VALUES may be very long, so we only log at most 1K bytes. + int length = Math.min(1024, origStmt.length()); + ctx.getAuditEventBuilder().setStmt(origStmt.substring(0, length)); + } else { + ctx.getAuditEventBuilder().setStmt(origStmt); + } + } + if (!Env.getCurrentEnv().isMaster()) { + if (ctx.executor.isForwardToMaster()) { + ctx.getAuditEventBuilder().setState(ctx.executor.getProxyStatus()); + } + } + Env.getCurrentAuditEventProcessor().handleAuditEvent(ctx.getAuditEventBuilder().build()); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java index 15b7bcc883ab67d..2b468556fe017ce 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/DdlExecutor.java @@ -38,7 +38,6 @@ import org.apache.doris.analysis.AlterRoutineLoadStmt; import org.apache.doris.analysis.AlterSqlBlockRuleStmt; import org.apache.doris.analysis.AlterSystemStmt; -import org.apache.doris.analysis.AlterTableStatsStmt; 
import org.apache.doris.analysis.AlterTableStmt; import org.apache.doris.analysis.AlterUserStmt; import org.apache.doris.analysis.AlterViewStmt; @@ -160,8 +159,6 @@ public static void execute(Env env, DdlStmt ddlStmt) throws Exception { env.createMaterializedView((CreateMaterializedViewStmt) ddlStmt); } else if (ddlStmt instanceof AlterTableStmt) { env.alterTable((AlterTableStmt) ddlStmt); - } else if (ddlStmt instanceof AlterTableStatsStmt) { - StatisticsRepository.alterTableStatistics((AlterTableStatsStmt) ddlStmt); } else if (ddlStmt instanceof AlterColumnStatsStmt) { StatisticsRepository.alterColumnStatistics((AlterColumnStatsStmt) ddlStmt); } else if (ddlStmt instanceof AlterViewStmt) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java b/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java new file mode 100644 index 000000000000000..c368533c53df720 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/InternalQueryExecutionException.java @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.qe; + +public class InternalQueryExecutionException extends RuntimeException { + public InternalQueryExecutionException(String message, Throwable cause) { + super(message, cause); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index d42474965488ce0..3451811445370ac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -403,6 +403,8 @@ public class SessionVariable implements Serializable, Writable { public static final String TEST_QUERY_CACHE_HIT = "test_query_cache_hit"; + public static final String ENABLE_FULL_AUTO_ANALYZE = "enable_full_auto_analyze"; + public static final List DEBUG_VARIABLES = ImmutableList.of( SKIP_DELETE_PREDICATE, SKIP_DELETE_BITMAP, @@ -1142,6 +1144,24 @@ public void setMaxJoinNumberOfReorder(int maxJoinNumberOfReorder) { options = {"none", "sql_cache", "partition_cache"}) public String testQueryCacheHit = "none"; + @VariableMgr.VarAttr(name = ENABLE_FULL_AUTO_ANALYZE, + description = {"该参数控制是否开启自动收集", "Set false to disable auto analyze"}, + flag = VariableMgr.GLOBAL) + public boolean enableFullAutoAnalyze = true; + + @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_START_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", + description = {"该参数定义自动ANALYZE例程的开始时间", + "This parameter defines the start time for the automatic ANALYZE routine."}, + flag = VariableMgr.GLOBAL) + public String fullAutoAnalyzeStartTime = "00:00:00"; + + @VariableMgr.VarAttr(name = FULL_AUTO_ANALYZE_END_TIME, needForward = true, checker = "checkAnalyzeTimeFormat", + description = {"该参数定义自动ANALYZE例程的结束时间", + "This parameter defines the end time for the automatic ANALYZE routine."}, + flag = VariableMgr.GLOBAL) + public String fullAutoAnalyzeEndTime = "02:00:00"; + + // If this fe is in fuzzy mode, then will use 
initFuzzyModeVariables to generate some variables, // not the default value set in the code. public void initFuzzyModeVariables() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java index 0ebcb1c7dcf2ed0..fe447c2746f5cc1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/ShowExecutor.java @@ -138,7 +138,6 @@ import org.apache.doris.catalog.TabletInvertedIndex; import org.apache.doris.catalog.TabletMeta; import org.apache.doris.catalog.View; -import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.clone.DynamicPartitionScheduler; import org.apache.doris.cluster.ClusterNamespace; @@ -196,7 +195,7 @@ import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.Histogram; import org.apache.doris.statistics.StatisticsRepository; -import org.apache.doris.statistics.TableStatistic; +import org.apache.doris.statistics.TableStatsMeta; import org.apache.doris.statistics.query.QueryStatsUtil; import org.apache.doris.system.Backend; import org.apache.doris.system.Diagnoser; @@ -240,7 +239,6 @@ import java.util.List; import java.util.Map; import java.util.Objects; -import java.util.Optional; import java.util.Set; import java.util.concurrent.TimeUnit; import java.util.function.Predicate; @@ -2392,29 +2390,16 @@ private void handleShowDataSkew() throws AnalysisException { private void handleShowTableStats() { ShowTableStatsStmt showTableStatsStmt = (ShowTableStatsStmt) stmt; TableIf tableIf = showTableStatsStmt.getTable(); - long partitionId = showTableStatsStmt.getPartitionId(); - boolean showCache = showTableStatsStmt.isCached(); - try { - if (tableIf instanceof ExternalTable && showCache) { - Optional tableStatistics = Env.getCurrentEnv().getStatisticsCache().getTableStatistics( - 
tableIf.getDatabase().getCatalog().getId(), - tableIf.getDatabase().getId(), - tableIf.getId()); - if (tableStatistics.isPresent()) { - resultSet = showTableStatsStmt.constructResultSet(tableStatistics.get()); - } else { - resultSet = showTableStatsStmt.constructResultSet(TableStatistic.UNKNOWN); - } - } else if (partitionId > 0) { - TableStatistic partStats = StatisticsRepository.fetchTableLevelOfPartStats(partitionId); - resultSet = showTableStatsStmt.constructResultSet(partStats); - } else { - TableStatistic tableStats = StatisticsRepository.fetchTableLevelStats(tableIf.getId()); - resultSet = showTableStatsStmt.constructResultSet(tableStats); - } - } catch (DdlException e) { - LOG.warn("Table statistics do not exist: {}", tableIf.getName()); - resultSet = showTableStatsStmt.constructResultSet(TableStatistic.UNKNOWN); + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(tableIf.getId()); + /* + HMSExternalTable table will fetch row count from HMS + or estimate with file size and schema if it's not analyzed. + tableStats == null means it's not analyzed, in this case show the estimated row count. 
+ */ + if (tableStats == null && tableIf instanceof HMSExternalTable) { + resultSet = showTableStatsStmt.constructResultSet(tableIf.estimatedRowCount()); + } else { + resultSet = showTableStatsStmt.constructResultSet(tableStats); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java index 6f99fd765a9ba8f..2d67e29415cf020 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/StmtExecutor.java @@ -142,8 +142,8 @@ import org.apache.doris.rewrite.mvrewrite.MVSelectFailedException; import org.apache.doris.rpc.RpcException; import org.apache.doris.service.FrontendOptions; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.statistics.util.InternalQueryBuffer; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.task.LoadEtlTask; import org.apache.doris.thrift.TFileFormatType; import org.apache.doris.thrift.TFileType; @@ -2480,7 +2480,8 @@ public List executeInternalQuery() { planner = new NereidsPlanner(statementContext); planner.plan(parsedStmt, context.getSessionVariable().toThrift()); } catch (Exception e) { - LOG.warn("fall back to legacy planner, because: {}", e.getMessage(), e); + LOG.warn("Arrow Flight SQL fall back to legacy planner, because: {}", + e.getMessage(), e); parsedStmt = null; planner = null; context.getState().setNereids(false); @@ -2492,10 +2493,9 @@ public List executeInternalQuery() { analyze(context.getSessionVariable().toThrift()); } } catch (Exception e) { - throw new RuntimeException("Failed to execute internal SQL. " - + Util.getRootCauseMessage(e) + " " + originStmt.toString(), e); + LOG.warn("Failed to run internal SQL: {}", originStmt, e); + throw new RuntimeException("Failed to execute internal SQL. 
" + Util.getRootCauseMessage(e), e); } - planner.getFragments(); RowBatch batch; coord = new Coordinator(context, analyzer, planner, context.getStatsErrorEstimator()); profile.addExecutionProfile(coord.getExecutionProfile()); @@ -2503,8 +2503,7 @@ public List executeInternalQuery() { QeProcessorImpl.INSTANCE.registerQuery(context.queryId(), new QeProcessorImpl.QueryInfo(context, originStmt.originStmt, coord)); } catch (UserException e) { - throw new RuntimeException("Failed to execute internal SQL. " - + " " + Util.getRootCauseMessage(e) + originStmt.toString(), e); + throw new RuntimeException("Failed to execute internal SQL. " + Util.getRootCauseMessage(e), e); } Span queryScheduleSpan = context.getTracer() @@ -2513,8 +2512,7 @@ public List executeInternalQuery() { coord.exec(); } catch (Exception e) { queryScheduleSpan.recordException(e); - throw new RuntimeException("Failed to execute internal SQL. " - + Util.getRootCauseMessage(e) + " " + originStmt.toString(), e); + throw new InternalQueryExecutionException(e.getMessage() + Util.getRootCauseMessage(e), e); } finally { queryScheduleSpan.end(); } @@ -2531,21 +2529,19 @@ public List executeInternalQuery() { } } catch (Exception e) { fetchResultSpan.recordException(e); - throw new RuntimeException("Failed to execute internal SQL. " + Util.getRootCauseMessage(e) + " " - + originStmt.toString(), e); + throw new RuntimeException("Failed to fetch internal SQL result. 
" + Util.getRootCauseMessage(e), e); } finally { fetchResultSpan.end(); } } finally { + AuditLogHelper.logAuditLog(context, originStmt.toString(), parsedStmt, getQueryStatisticsForAuditLog(), + true); QeProcessorImpl.INSTANCE.unregisterQuery(context.queryId()); } } private List convertResultBatchToResultRows(TResultBatch batch) { List columns = parsedStmt.getColLabels(); - List types = parsedStmt.getResultExprs().stream() - .map(e -> e.getType().getPrimitiveType()) - .collect(Collectors.toList()); List resultRows = new ArrayList<>(); List rows = batch.getRows(); for (ByteBuffer buffer : rows) { @@ -2556,8 +2552,7 @@ private List convertResultBatchToResultRows(TResultBatch batch) { String value = queryBuffer.readStringWithLength(); values.add(value); } - - ResultRow resultRow = new ResultRow(columns, types, values); + ResultRow resultRow = new ResultRow(values); resultRows.add(resultRow); } return resultRows; @@ -2579,6 +2574,21 @@ public void setProfileType(ProfileType profileType) { public void setProxyResultSet(ShowResultSet proxyResultSet) { this.proxyResultSet = proxyResultSet; } + + public ConnectContext getContext() { + return context; + } + + public OriginStatement getOriginStmt() { + return originStmt; + } + + public String getOriginStmtInString() { + if (originStmt != null && originStmt.originStmt != null) { + return originStmt.originStmt; + } + return ""; + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java index ee39582aac4df0e..e3d5c8a91b8f7d9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfo.java @@ -22,19 +22,21 @@ import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.persist.gson.GsonUtils; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import 
org.apache.doris.statistics.util.StatisticsUtil; import com.google.gson.Gson; import com.google.gson.annotations.SerializedName; import com.google.gson.reflect.TypeToken; +import org.apache.commons.lang3.StringUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.util.CronExpression; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; import java.lang.reflect.Type; +import java.text.ParseException; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -46,6 +48,7 @@ public class AnalysisInfo implements Writable { private static final Logger LOG = LogManager.getLogger(AnalysisInfo.class); + // TODO: useless, remove it later public enum AnalysisMode { INCREMENTAL, FULL @@ -66,10 +69,12 @@ public enum JobType { // submit by user directly MANUAL, // submit by system automatically - SYSTEM + SYSTEM; } public enum ScheduleType { + // Job created by AutoCollector is also `ONCE` type, this is because it runs once only and should be removed + // when its information is expired ONCE, PERIOD, AUTOMATIC @@ -95,6 +100,7 @@ public enum ScheduleType { @SerializedName("tblName") public final String tblName; + // TODO: Map here is wired, List is enough @SerializedName("colToPartitions") public final Map> colToPartitions; @@ -151,24 +157,39 @@ public enum ScheduleType { // True means this task is a table level task for external table. // This kind of task is mainly to collect the number of rows of a table. 
@SerializedName("externalTableLevelTask") - public boolean externalTableLevelTask; + public final boolean externalTableLevelTask; @SerializedName("partitionOnly") - public boolean partitionOnly; + public final boolean partitionOnly; @SerializedName("samplingPartition") - public boolean samplingPartition; + public final boolean samplingPartition; + + @SerializedName("isAllPartition") + public final boolean isAllPartition; + + @SerializedName("partitionCount") + public final long partitionCount; // For serialize @SerializedName("cronExpr") public String cronExprStr; + @SerializedName("progress") + public String progress; + + public CronExpression cronExpression; + + @SerializedName("forceFull") + public final boolean forceFull; + public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogName, String dbName, String tblName, Map> colToPartitions, Set partitionNames, String colName, Long indexId, JobType jobType, AnalysisMode analysisMode, AnalysisMethod analysisMethod, AnalysisType analysisType, int samplePercent, int sampleRows, int maxBucketNum, long periodTimeInMs, String message, long lastExecTimeInMs, long timeCostInMs, AnalysisState state, ScheduleType scheduleType, - boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition) { + boolean isExternalTableLevelTask, boolean partitionOnly, boolean samplingPartition, + boolean isAllPartition, long partitionCount, CronExpression cronExpression, boolean forceFull) { this.jobId = jobId; this.taskId = taskId; this.taskIds = taskIds; @@ -195,6 +216,13 @@ public AnalysisInfo(long jobId, long taskId, List taskIds, String catalogN this.externalTableLevelTask = isExternalTableLevelTask; this.partitionOnly = partitionOnly; this.samplingPartition = samplingPartition; + this.isAllPartition = isAllPartition; + this.partitionCount = partitionCount; + this.cronExpression = cronExpression; + if (cronExpression != null) { + this.cronExprStr = cronExpression.getCronExpression(); + } + 
this.forceFull = forceFull; } @Override @@ -205,11 +233,11 @@ public String toString() { sj.add("DBName: " + dbName); sj.add("TableName: " + tblName); sj.add("ColumnName: " + colName); - sj.add("TaskType: " + analysisType.toString()); - sj.add("TaskMode: " + analysisMode.toString()); - sj.add("TaskMethod: " + analysisMethod.toString()); + sj.add("TaskType: " + analysisType); + sj.add("TaskMode: " + analysisMode); + sj.add("TaskMethod: " + analysisMethod); sj.add("Message: " + message); - sj.add("CurrentState: " + state.toString()); + sj.add("CurrentState: " + state); if (samplePercent > 0) { sj.add("SamplePercent: " + samplePercent); } @@ -231,6 +259,10 @@ public String toString() { if (periodTimeInMs > 0) { sj.add("periodTimeInMs: " + StatisticsUtil.getReadableTime(periodTimeInMs)); } + if (StringUtils.isNotEmpty(cronExprStr)) { + sj.add("cronExpr: " + cronExprStr); + } + sj.add("forceFull: " + forceFull); return sj.toString(); } @@ -246,60 +278,6 @@ public void addTaskId(long taskId) { taskIds.add(taskId); } - // TODO: use thrift - public static AnalysisInfo fromResultRow(ResultRow resultRow) { - try { - AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder(); - long jobId = Long.parseLong(resultRow.getColumnValue("job_id")); - analysisInfoBuilder.setJobId(jobId); - long taskId = Long.parseLong(resultRow.getColumnValue("task_id")); - analysisInfoBuilder.setTaskId(taskId); - String catalogName = resultRow.getColumnValue("catalog_name"); - analysisInfoBuilder.setCatalogName(catalogName); - String dbName = resultRow.getColumnValue("db_name"); - analysisInfoBuilder.setDbName(dbName); - String tblName = resultRow.getColumnValue("tbl_name"); - analysisInfoBuilder.setTblName(tblName); - String colName = resultRow.getColumnValue("col_name"); - analysisInfoBuilder.setColName(colName); - long indexId = Long.parseLong(resultRow.getColumnValue("index_id")); - analysisInfoBuilder.setIndexId(indexId); - String partitionNames = 
resultRow.getColumnValue("col_partitions"); - Map> colToPartitions = getColToPartition(partitionNames); - analysisInfoBuilder.setColToPartitions(colToPartitions); - String jobType = resultRow.getColumnValue("job_type"); - analysisInfoBuilder.setJobType(JobType.valueOf(jobType)); - String analysisType = resultRow.getColumnValue("analysis_type"); - analysisInfoBuilder.setAnalysisType(AnalysisType.valueOf(analysisType)); - String analysisMode = resultRow.getColumnValue("analysis_mode"); - analysisInfoBuilder.setAnalysisMode(AnalysisMode.valueOf(analysisMode)); - String analysisMethod = resultRow.getColumnValue("analysis_method"); - analysisInfoBuilder.setAnalysisMethod(AnalysisMethod.valueOf(analysisMethod)); - String scheduleType = resultRow.getColumnValue("schedule_type"); - analysisInfoBuilder.setScheduleType(ScheduleType.valueOf(scheduleType)); - String state = resultRow.getColumnValue("state"); - analysisInfoBuilder.setState(AnalysisState.valueOf(state)); - String samplePercent = resultRow.getColumnValue("sample_percent"); - analysisInfoBuilder.setSamplePercent(StatisticsUtil.convertStrToInt(samplePercent)); - String sampleRows = resultRow.getColumnValue("sample_rows"); - analysisInfoBuilder.setSampleRows(StatisticsUtil.convertStrToInt(sampleRows)); - String maxBucketNum = resultRow.getColumnValue("max_bucket_num"); - analysisInfoBuilder.setMaxBucketNum(StatisticsUtil.convertStrToInt(maxBucketNum)); - String periodTimeInMs = resultRow.getColumnValue("period_time_in_ms"); - analysisInfoBuilder.setPeriodTimeInMs(StatisticsUtil.convertStrToInt(periodTimeInMs)); - String lastExecTimeInMs = resultRow.getColumnValue("last_exec_time_in_ms"); - analysisInfoBuilder.setLastExecTimeInMs(StatisticsUtil.convertStrToLong(lastExecTimeInMs)); - String timeCostInMs = resultRow.getColumnValue("time_cost_in_ms"); - analysisInfoBuilder.setTimeCostInMs(StatisticsUtil.convertStrToLong(timeCostInMs)); - String message = resultRow.getColumnValue("message"); - 
analysisInfoBuilder.setMessage(message); - return analysisInfoBuilder.build(); - } catch (Exception e) { - LOG.warn("Failed to deserialize analysis task info.", e); - return null; - } - } - public String getColToPartitionStr() { if (colToPartitions == null || colToPartitions.isEmpty()) { return ""; @@ -362,7 +340,15 @@ public static AnalysisInfo read(DataInput dataInput) throws IOException { return analysisInfoBuilder.build(); } else { String json = Text.readString(dataInput); - return GsonUtils.GSON.fromJson(json, AnalysisInfo.class); + AnalysisInfo analysisInfo = GsonUtils.GSON.fromJson(json, AnalysisInfo.class); + if (analysisInfo.cronExprStr != null) { + try { + analysisInfo.cronExpression = new CronExpression(analysisInfo.cronExprStr); + } catch (ParseException e) { + LOG.warn("Cron expression of job is invalid, there is a bug", e); + } + } + return analysisInfo; } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java index 2fd0e25d727cc13..7e97b4d951f5cac 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisInfoBuilder.java @@ -23,6 +23,8 @@ import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.logging.log4j.core.util.CronExpression; + import java.util.List; import java.util.Map; import java.util.Set; @@ -54,6 +56,12 @@ public class AnalysisInfoBuilder { private boolean externalTableLevelTask; private boolean partitionOnly; private boolean samplingPartition; + private boolean isAllPartition; + private long partitionCount; + + private CronExpression cronExpression; + + private boolean forceFull; public AnalysisInfoBuilder() { } @@ -85,6 +93,10 @@ public AnalysisInfoBuilder(AnalysisInfo info) { externalTableLevelTask = info.externalTableLevelTask; 
partitionOnly = info.partitionOnly; samplingPartition = info.samplingPartition; + isAllPartition = info.isAllPartition; + partitionCount = info.partitionCount; + cronExpression = info.cronExpression; + forceFull = info.forceFull; } public AnalysisInfoBuilder setJobId(long jobId) { @@ -217,11 +229,30 @@ public AnalysisInfoBuilder setSamplingPartition(boolean samplingPartition) { return this; } + public AnalysisInfoBuilder setAllPartition(boolean isAllPartition) { + this.isAllPartition = isAllPartition; + return this; + } + + public AnalysisInfoBuilder setPartitionCount(long partitionCount) { + this.partitionCount = partitionCount; + return this; + } + + public void setCronExpression(CronExpression cronExpression) { + this.cronExpression = cronExpression; + } + + public void setForceFull(boolean forceFull) { + this.forceFull = forceFull; + } + public AnalysisInfo build() { return new AnalysisInfo(jobId, taskId, taskIds, catalogName, dbName, tblName, colToPartitions, partitionNames, colName, indexId, jobType, analysisMode, analysisMethod, analysisType, samplePercent, sampleRows, maxBucketNum, periodTimeInMs, message, lastExecTimeInMs, timeCostInMs, state, scheduleType, - externalTableLevelTask, partitionOnly, samplingPartition); + externalTableLevelTask, partitionOnly, samplingPartition, isAllPartition, partitionCount, + cronExpression, forceFull); } public AnalysisInfoBuilder copy() { @@ -248,6 +279,10 @@ public AnalysisInfoBuilder copy() { .setTimeCostInMs(timeCostInMs) .setState(state) .setScheduleType(scheduleType) - .setExternalTableLevelTask(externalTableLevelTask); + .setExternalTableLevelTask(externalTableLevelTask) + .setSamplingPartition(samplingPartition) + .setPartitionOnly(partitionOnly) + .setAllPartition(isAllPartition) + .setPartitionCount(partitionCount); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java index 
5ac0c0cd5248c15..b488bd385ce9513 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisManager.java @@ -18,6 +18,7 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.AnalyzeDBStmt; +import org.apache.doris.analysis.AnalyzeProperties; import org.apache.doris.analysis.AnalyzeStmt; import org.apache.doris.analysis.AnalyzeTblStmt; import org.apache.doris.analysis.DropAnalyzeJobStmt; @@ -28,12 +29,9 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.MaterializedIndexMeta; -import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.Partition; import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.TableIf; -import org.apache.doris.catalog.TableIf.TableType; import org.apache.doris.catalog.View; import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; @@ -41,13 +39,16 @@ import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.FeMetaVersion; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; +import org.apache.doris.common.io.Text; import org.apache.doris.common.io.Writable; import org.apache.doris.common.util.Daemon; import org.apache.doris.common.util.Util; -import org.apache.doris.datasource.CatalogIf; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.persist.AnalyzeDeletionLog; +import org.apache.doris.persist.TableStatsDeletionLog; +import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSet; import org.apache.doris.qe.ShowResultSetMetaData; @@ -56,25 +57,31 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import 
org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.SimpleQueue; import org.apache.doris.statistics.util.StatisticsUtil; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; -import com.google.common.collect.Maps; +import com.google.common.reflect.TypeToken; import com.google.common.util.concurrent.ThreadFactoryBuilder; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.core.util.CronExpression; import org.jetbrains.annotations.Nullable; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; import java.util.Comparator; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -93,26 +100,141 @@ public class AnalysisManager extends Daemon implements Writable { - public AnalysisTaskScheduler taskScheduler; - private static final Logger LOG = LogManager.getLogger(AnalysisManager.class); - private ConcurrentMap> analysisJobIdToTaskMap = new ConcurrentHashMap<>(); + // Tracking running manually submitted async tasks, keep in mem only + protected final ConcurrentMap> analysisJobIdToTaskMap = new ConcurrentHashMap<>(); private StatisticsCache statisticsCache; private AnalysisTaskExecutor taskExecutor; + // Store task information in metadata. 
private final Map analysisTaskInfoMap = Collections.synchronizedMap(new TreeMap<>()); + + // Store job information in metadata private final Map analysisJobInfoMap = Collections.synchronizedMap(new TreeMap<>()); + // Tracking system submitted job, keep in mem only + protected final Map systemJobInfoMap = new ConcurrentHashMap<>(); + + // Tracking and control sync analyze tasks, keep in mem only private final ConcurrentMap ctxToSyncTask = new ConcurrentHashMap<>(); + private final Map idToTblStats = new ConcurrentHashMap<>(); + + protected SimpleQueue autoJobs = createSimpleQueue(null, this); + + private final Function userJobStatusUpdater = w -> { + AnalysisInfo info = w.info; + AnalysisState taskState = w.taskState; + String message = w.message; + long time = w.time; + if (analysisJobIdToTaskMap.get(info.jobId) == null) { + return null; + } + info.state = taskState; + info.message = message; + // Update the task cost time when task finished or failed. And only log the final state. + if (taskState.equals(AnalysisState.FINISHED) || taskState.equals(AnalysisState.FAILED)) { + info.timeCostInMs = time - info.lastExecTimeInMs; + info.lastExecTimeInMs = time; + logCreateAnalysisTask(info); + } + info.lastExecTimeInMs = time; + AnalysisInfo job = analysisJobInfoMap.get(info.jobId); + // Job may get deleted during execution. + if (job == null) { + return null; + } + // Synchronize the job state change in job level. + synchronized (job) { + job.lastExecTimeInMs = time; + // Set the job state to RUNNING when its first task becomes RUNNING. 
+ if (info.state.equals(AnalysisState.RUNNING) && job.state.equals(AnalysisState.PENDING)) { + job.state = AnalysisState.RUNNING; + replayCreateAnalysisJob(job); + } + boolean allFinished = true; + boolean hasFailure = false; + for (BaseAnalysisTask task : analysisJobIdToTaskMap.get(info.jobId).values()) { + AnalysisInfo taskInfo = task.info; + if (taskInfo.state.equals(AnalysisState.RUNNING) || taskInfo.state.equals(AnalysisState.PENDING)) { + allFinished = false; + break; + } + if (taskInfo.state.equals(AnalysisState.FAILED)) { + hasFailure = true; + } + } + if (allFinished) { + if (hasFailure) { + job.state = AnalysisState.FAILED; + } else { + job.state = AnalysisState.FINISHED; + try { + updateTableStats(job); + } catch (Throwable e) { + LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); + } + } + logCreateAnalysisJob(job); + analysisJobIdToTaskMap.remove(job.jobId); + } + } + return null; + }; + + private final String progressDisplayTemplate = "%d Finished | %d Failed | %d In Progress | %d Total"; + + protected final Function systemJobStatusUpdater = w -> { + AnalysisInfo info = w.info; + info.state = w.taskState; + info.message = w.message; + AnalysisInfo job = systemJobInfoMap.get(info.jobId); + if (job == null) { + return null; + } + int failedCount = 0; + StringJoiner reason = new StringJoiner(", "); + Map taskMap = analysisJobIdToTaskMap.get(info.jobId); + for (BaseAnalysisTask task : taskMap.values()) { + if (task.info.state.equals(AnalysisState.RUNNING) || task.info.state.equals(AnalysisState.PENDING)) { + return null; + } + if (task.info.state.equals(AnalysisState.FAILED)) { + failedCount++; + reason.add(task.info.message); + } + } + try { + updateTableStats(job); + } catch (Throwable e) { + LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); + } finally { + job.lastExecTimeInMs = System.currentTimeMillis(); + job.message = reason.toString(); + job.progress = String.format(progressDisplayTemplate, + 
taskMap.size() - failedCount, failedCount, 0, taskMap.size()); + if (failedCount > 0) { + job.message = reason.toString(); + job.state = AnalysisState.FAILED; + } else { + job.state = AnalysisState.FINISHED; + } + autoJobs.offer(job); + systemJobInfoMap.remove(info.jobId); + } + return null; + }; + + private final Function[] updaters = + new Function[] {userJobStatusUpdater, systemJobStatusUpdater}; + public AnalysisManager() { super(TimeUnit.SECONDS.toMillis(StatisticConstants.ANALYZE_MANAGER_INTERVAL_IN_SECS)); if (!Env.isCheckpointThread()) { - this.taskScheduler = new AnalysisTaskScheduler(); - this.taskExecutor = new AnalysisTaskExecutor(taskScheduler); + this.taskExecutor = new AnalysisTaskExecutor(Config.statistics_simultaneously_running_task_num); this.statisticsCache = new StatisticsCache(); taskExecutor.start(); } @@ -124,7 +246,7 @@ protected void runOneCycle() { } private void clear() { - clearMeta(analysisJobInfoMap, (a) -> + clearExpiredAnalysisInfo(analysisJobInfoMap, (a) -> a.scheduleType.equals(ScheduleType.ONCE) && System.currentTimeMillis() - a.lastExecTimeInMs > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), @@ -132,7 +254,7 @@ private void clear() { Env.getCurrentEnv().getEditLog().logDeleteAnalysisJob(new AnalyzeDeletionLog(id)); return null; }); - clearMeta(analysisTaskInfoMap, (a) -> System.currentTimeMillis() - a.lastExecTimeInMs + clearExpiredAnalysisInfo(analysisTaskInfoMap, (a) -> System.currentTimeMillis() - a.lastExecTimeInMs > TimeUnit.DAYS.toMillis(StatisticConstants.ANALYSIS_JOB_INFO_EXPIRATION_TIME_IN_DAYS), (id) -> { Env.getCurrentEnv().getEditLog().logDeleteAnalysisTask(new AnalyzeDeletionLog(id)); @@ -140,7 +262,7 @@ private void clear() { }); } - private void clearMeta(Map infoMap, Predicate isExpired, + private void clearExpiredAnalysisInfo(Map infoMap, Predicate isExpired, Function writeLog) { synchronized (infoMap) { List expired = new ArrayList<>(); @@ -161,6 +283,9 @@ public 
StatisticsCache getStatisticsCache() { } public void createAnalyze(AnalyzeStmt analyzeStmt, boolean proxy) throws DdlException { + if (!StatisticsUtil.statsTblAvailable() && !FeConstants.runningUnitTest) { + throw new DdlException("Stats table not available, please make sure your cluster status is normal"); + } if (analyzeStmt instanceof AnalyzeDBStmt) { createAnalysisJobs((AnalyzeDBStmt) analyzeStmt, proxy); } else if (analyzeStmt instanceof AnalyzeTblStmt) { @@ -170,38 +295,53 @@ public void createAnalyze(AnalyzeStmt analyzeStmt, boolean proxy) throws DdlExce public void createAnalysisJobs(AnalyzeDBStmt analyzeDBStmt, boolean proxy) throws DdlException { DatabaseIf db = analyzeDBStmt.getDb(); + // Using auto analyzer if user specifies. + if (analyzeDBStmt.getAnalyzeProperties().getProperties().containsKey("use.auto.analyzer")) { + Env.getCurrentEnv().getStatisticsAutoCollector().analyzeDb(db); + return; + } + List analysisInfos = buildAnalysisInfosForDB(db, analyzeDBStmt.getAnalyzeProperties()); + if (!analyzeDBStmt.isSync()) { + sendJobId(analysisInfos, proxy); + } + } + + public List buildAnalysisInfosForDB(DatabaseIf db, AnalyzeProperties analyzeProperties) { + db.readLock(); List tbls = db.getTables(); List analysisInfos = new ArrayList<>(); - db.readLock(); try { List analyzeStmts = new ArrayList<>(); for (TableIf table : tbls) { if (table instanceof View) { continue; } - TableName tableName = new TableName(analyzeDBStmt.getCtlIf().getName(), db.getFullName(), + TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), table.getName()); // columnNames null means to add all visitable columns. 
// Will get all the visible columns in analyzeTblStmt.check() - AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(analyzeDBStmt.getAnalyzeProperties(), tableName, + AnalyzeTblStmt analyzeTblStmt = new AnalyzeTblStmt(analyzeProperties, tableName, null, db.getId(), table); try { analyzeTblStmt.check(); } catch (AnalysisException analysisException) { - throw new DdlException(analysisException.getMessage(), analysisException); + LOG.warn("Failed to build analyze job: {}", + analysisException.getMessage(), analysisException); } analyzeStmts.add(analyzeTblStmt); } for (AnalyzeTblStmt analyzeTblStmt : analyzeStmts) { - analysisInfos.add(buildAndAssignJob(analyzeTblStmt)); - } - if (!analyzeDBStmt.isSync()) { - sendJobId(analysisInfos, proxy); + try { + analysisInfos.add(buildAndAssignJob(analyzeTblStmt)); + } catch (DdlException e) { + LOG.warn("Failed to build analyze job: {}", + e.getMessage(), e); + } } } finally { db.readUnlock(); } - + return analysisInfos; } // Each analyze stmt corresponding to an analysis job. 
@@ -214,11 +354,8 @@ public void createAnalysisJob(AnalyzeTblStmt stmt, boolean proxy) throws DdlExce } @Nullable - private AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException { - if (!StatisticsUtil.statsTblAvailable() && !FeConstants.runningUnitTest) { - throw new DdlException("Stats table not available, please make sure your cluster status is normal"); - } - + @VisibleForTesting + protected AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException { AnalysisInfo jobInfo = buildAnalysisJobInfo(stmt); if (jobInfo.colToPartitions.isEmpty()) { // No statistics need to be collected or updated @@ -228,66 +365,44 @@ private AnalysisInfo buildAndAssignJob(AnalyzeTblStmt stmt) throws DdlException boolean isSync = stmt.isSync(); Map analysisTaskInfos = new HashMap<>(); createTaskForEachColumns(jobInfo, analysisTaskInfos, isSync); - createTaskForMVIdx(jobInfo, analysisTaskInfos, isSync); - if (stmt.isAllColumns()) { - createTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); - } - if (!isSync) { - persistAnalysisJob(jobInfo); - analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); - } - if (!isSync) { - try { - updateTableStats(jobInfo); - } catch (Throwable e) { - throw new DdlException("Failed to update Table statistics"); - } + if (!jobInfo.partitionOnly && stmt.isAllColumns() + && StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, isSync); } - if (isSync) { syncExecute(analysisTaskInfos.values()); + updateTableStats(jobInfo); return null; } - - analysisTaskInfos.values().forEach(taskScheduler::schedule); - return jobInfo; - } - - // Analysis job created by the system - public void createAnalysisJob(AnalysisInfo info) throws DdlException { - AnalysisInfo jobInfo = buildAnalysisJobInfo(info); - if (jobInfo.colToPartitions.isEmpty()) { - // No statistics need to be collected or updated - return; + 
recordAnalysisJob(jobInfo); + analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); + // TODO: maybe we should update table stats only when all task succeeded. + updateTableStats(jobInfo); + if (!jobInfo.scheduleType.equals(ScheduleType.PERIOD)) { + analysisTaskInfos.values().forEach(taskExecutor::submitTask); } - - Map analysisTaskInfos = new HashMap<>(); - createTaskForEachColumns(jobInfo, analysisTaskInfos, false); - createTaskForMVIdx(jobInfo, analysisTaskInfos, false); - if (!jobInfo.jobType.equals(JobType.SYSTEM)) { - persistAnalysisJob(jobInfo); - analysisJobIdToTaskMap.put(jobInfo.jobId, analysisTaskInfos); - } - - analysisTaskInfos.values().forEach(taskScheduler::schedule); + return jobInfo; } private void sendJobId(List analysisInfos, boolean proxy) { List columns = new ArrayList<>(); + columns.add(new Column("Job_Id", ScalarType.createVarchar(19))); columns.add(new Column("Catalog_Name", ScalarType.createVarchar(1024))); columns.add(new Column("DB_Name", ScalarType.createVarchar(1024))); columns.add(new Column("Table_Name", ScalarType.createVarchar(1024))); columns.add(new Column("Columns", ScalarType.createVarchar(1024))); - columns.add(new Column("Job_Id", ScalarType.createVarchar(19))); ShowResultSetMetaData commonResultSetMetaData = new ShowResultSetMetaData(columns); List> resultRows = new ArrayList<>(); for (AnalysisInfo analysisInfo : analysisInfos) { + if (analysisInfo == null) { + continue; + } List row = new ArrayList<>(); + row.add(String.valueOf(analysisInfo.jobId)); row.add(analysisInfo.catalogName); row.add(analysisInfo.dbName); row.add(analysisInfo.tblName); row.add(analysisInfo.colName); - row.add(String.valueOf(analysisInfo.jobId)); resultRows.add(row); } ShowResultSet commonResultSet = new ShowResultSet(commonResultSetMetaData, resultRows); @@ -317,13 +432,13 @@ private void sendJobId(List analysisInfos, boolean proxy) { * TODO Supports incremental collection of statistics from materialized views */ private Map> 
validateAndGetPartitions(TableIf table, Set columnNames, - Set partitionNames, AnalysisType analysisType, AnalysisMode analysisMode) throws DdlException { + Set partitionNames, AnalysisType analysisType) throws DdlException { long tableId = table.getId(); Map> columnToPartitions = columnNames.stream() .collect(Collectors.toMap( columnName -> columnName, - columnName -> new HashSet<>(partitionNames) + columnName -> new HashSet<>(partitionNames == null ? Collections.emptySet() : partitionNames) )); if (analysisType == AnalysisType.HISTOGRAM) { @@ -340,7 +455,7 @@ private Map> validateAndGetPartitions(TableIf table, Set> existColAndPartsForStats = StatisticsRepository + Map> existColAndPartsForStats = StatisticsRepository .fetchColAndPartsForStats(tableId); if (existColAndPartsForStats.isEmpty()) { @@ -348,42 +463,30 @@ private Map> validateAndGetPartitions(TableIf table, Set existPartIdsForStats = new HashSet<>(); + Set existPartIdsForStats = new HashSet<>(); existColAndPartsForStats.values().forEach(existPartIdsForStats::addAll); - Map idToPartition = StatisticsUtil.getPartitionIdToName(table); + Set idToPartition = StatisticsUtil.getPartitionIds(table); // Get an invalid set of partitions (those partitions were deleted) - Set invalidPartIds = existPartIdsForStats.stream() - .filter(id -> !idToPartition.containsKey(id)).collect(Collectors.toSet()); + Set invalidPartIds = existPartIdsForStats.stream() + .filter(id -> !idToPartition.contains(id)).collect(Collectors.toSet()); if (!invalidPartIds.isEmpty()) { // Delete invalid partition statistics to avoid affecting table statistics StatisticsRepository.dropStatistics(invalidPartIds); } - if (analysisMode == AnalysisMode.INCREMENTAL && analysisType == AnalysisType.FUNDAMENTALS) { - existColAndPartsForStats.values().forEach(partIds -> partIds.removeAll(invalidPartIds)); - // In incremental collection mode, just collect the uncollected partition statistics - existColAndPartsForStats.forEach((columnName, partitionIds) -> { 
- Set existPartitions = partitionIds.stream() - .map(idToPartition::get) - .collect(Collectors.toSet()); - columnToPartitions.computeIfPresent(columnName, (colName, partNames) -> { - partNames.removeAll(existPartitions); - return partNames; - }); - }); - if (invalidPartIds.isEmpty()) { - // There is no invalid statistics, so there is no need to update table statistics, - // remove columns that do not require re-collection of statistics - columnToPartitions.entrySet().removeIf(entry -> entry.getValue().isEmpty()); - } + if (analysisType == AnalysisType.FUNDAMENTALS) { + return table.findReAnalyzeNeededPartitions(); } return columnToPartitions; } - private AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { - AnalysisInfoBuilder taskInfoBuilder = new AnalysisInfoBuilder(); + // Make sure colName of job has all the column as this AnalyzeStmt specified, no matter whether it will be analyzed + // or not. + @VisibleForTesting + public AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlException { + AnalysisInfoBuilder infoBuilder = new AnalysisInfoBuilder(); long jobId = Env.getCurrentEnv().getNextId(); String catalogName = stmt.getCatalogName(); String db = stmt.getDBName(); @@ -395,140 +498,75 @@ private AnalysisInfo buildAnalysisJobInfo(AnalyzeTblStmt stmt) throws DdlExcepti Set partitionNames = stmt.getPartitionNames(); boolean partitionOnly = stmt.isPartitionOnly(); boolean isSamplingPartition = stmt.isSamplingPartition(); + boolean isAllPartition = stmt.isAllPartitions(); + long partitionCount = stmt.getPartitionCount(); int samplePercent = stmt.getSamplePercent(); int sampleRows = stmt.getSampleRows(); AnalysisType analysisType = stmt.getAnalysisType(); AnalysisMode analysisMode = stmt.getAnalysisMode(); AnalysisMethod analysisMethod = stmt.getAnalysisMethod(); ScheduleType scheduleType = stmt.getScheduleType(); + CronExpression cronExpression = stmt.getCron(); - taskInfoBuilder.setJobId(jobId); - 
taskInfoBuilder.setCatalogName(catalogName); - taskInfoBuilder.setDbName(db); - taskInfoBuilder.setTblName(tblName); + infoBuilder.setJobId(jobId); + infoBuilder.setCatalogName(catalogName); + infoBuilder.setDbName(db); + infoBuilder.setTblName(tblName); + // TODO: Refactor later, DON'T MODIFY IT RIGHT NOW StringJoiner stringJoiner = new StringJoiner(",", "[", "]"); for (String colName : columnNames) { stringJoiner.add(colName); } - taskInfoBuilder.setColName(stringJoiner.toString()); - taskInfoBuilder.setPartitionNames(partitionNames); - taskInfoBuilder.setPartitionOnly(partitionOnly); - taskInfoBuilder.setSamplingPartition(isSamplingPartition); - taskInfoBuilder.setJobType(JobType.MANUAL); - taskInfoBuilder.setState(AnalysisState.PENDING); - taskInfoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); - taskInfoBuilder.setAnalysisType(analysisType); - taskInfoBuilder.setAnalysisMode(analysisMode); - taskInfoBuilder.setAnalysisMethod(analysisMethod); - taskInfoBuilder.setScheduleType(scheduleType); - taskInfoBuilder.setLastExecTimeInMs(0); - + infoBuilder.setColName(stringJoiner.toString()); + infoBuilder.setPartitionNames(partitionNames); + infoBuilder.setPartitionOnly(partitionOnly); + infoBuilder.setSamplingPartition(isSamplingPartition); + infoBuilder.setAllPartition(isAllPartition); + infoBuilder.setPartitionCount(partitionCount); + infoBuilder.setJobType(JobType.MANUAL); + infoBuilder.setState(AnalysisState.PENDING); + infoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); + infoBuilder.setAnalysisType(analysisType); + infoBuilder.setAnalysisMode(analysisMode); + infoBuilder.setAnalysisMethod(analysisMethod); + infoBuilder.setScheduleType(scheduleType); + infoBuilder.setLastExecTimeInMs(0); + infoBuilder.setCronExpression(cronExpression); + infoBuilder.setForceFull(stmt.forceFull()); if (analysisMethod == AnalysisMethod.SAMPLE) { - taskInfoBuilder.setSamplePercent(samplePercent); - taskInfoBuilder.setSampleRows(sampleRows); + 
infoBuilder.setSamplePercent(samplePercent); + infoBuilder.setSampleRows(sampleRows); } if (analysisType == AnalysisType.HISTOGRAM) { int numBuckets = stmt.getNumBuckets(); int maxBucketNum = numBuckets > 0 ? numBuckets : StatisticConstants.HISTOGRAM_MAX_BUCKET_NUM; - taskInfoBuilder.setMaxBucketNum(maxBucketNum); + infoBuilder.setMaxBucketNum(maxBucketNum); } - if (scheduleType == ScheduleType.PERIOD) { - long periodTimeInMs = stmt.getPeriodTimeInMs(); - taskInfoBuilder.setPeriodTimeInMs(periodTimeInMs); - } + long periodTimeInMs = stmt.getPeriodTimeInMs(); + infoBuilder.setPeriodTimeInMs(periodTimeInMs); Map> colToPartitions = validateAndGetPartitions(table, columnNames, - partitionNames, analysisType, analysisMode); - taskInfoBuilder.setColToPartitions(colToPartitions); - taskInfoBuilder.setTaskIds(Lists.newArrayList()); - - return taskInfoBuilder.build(); - } - - private AnalysisInfo buildAnalysisJobInfo(AnalysisInfo jobInfo) { - AnalysisInfoBuilder taskInfoBuilder = new AnalysisInfoBuilder(); - taskInfoBuilder.setJobId(jobInfo.jobId); - taskInfoBuilder.setCatalogName(jobInfo.catalogName); - taskInfoBuilder.setDbName(jobInfo.dbName); - taskInfoBuilder.setTblName(jobInfo.tblName); - taskInfoBuilder.setJobType(JobType.SYSTEM); - taskInfoBuilder.setState(AnalysisState.PENDING); - taskInfoBuilder.setLastExecTimeInMs(System.currentTimeMillis()); - taskInfoBuilder.setAnalysisType(jobInfo.analysisType); - taskInfoBuilder.setAnalysisMode(jobInfo.analysisMode); - taskInfoBuilder.setAnalysisMethod(jobInfo.analysisMethod); - taskInfoBuilder.setScheduleType(jobInfo.scheduleType); - taskInfoBuilder.setSamplePercent(jobInfo.samplePercent); - taskInfoBuilder.setSampleRows(jobInfo.sampleRows); - taskInfoBuilder.setMaxBucketNum(jobInfo.maxBucketNum); - taskInfoBuilder.setPeriodTimeInMs(jobInfo.periodTimeInMs); - taskInfoBuilder.setLastExecTimeInMs(jobInfo.lastExecTimeInMs); - try { - TableIf table = StatisticsUtil - .findTable(jobInfo.catalogName, jobInfo.dbName, 
jobInfo.tblName); - Map> colToPartitions = validateAndGetPartitions(table, jobInfo.colToPartitions.keySet(), - jobInfo.partitionNames, jobInfo.analysisType, jobInfo.analysisMode); - taskInfoBuilder.setColToPartitions(colToPartitions); - } catch (Throwable e) { - throw new RuntimeException(e); - } - return taskInfoBuilder.build(); + partitionNames, analysisType); + infoBuilder.setColToPartitions(colToPartitions); + infoBuilder.setTaskIds(Lists.newArrayList()); + + return infoBuilder.build(); } - private void persistAnalysisJob(AnalysisInfo jobInfo) throws DdlException { + @VisibleForTesting + public void recordAnalysisJob(AnalysisInfo jobInfo) throws DdlException { if (jobInfo.scheduleType == ScheduleType.PERIOD && jobInfo.lastExecTimeInMs > 0) { return; } AnalysisInfoBuilder jobInfoBuilder = new AnalysisInfoBuilder(jobInfo); AnalysisInfo analysisInfo = jobInfoBuilder.setTaskId(-1).build(); - logCreateAnalysisJob(analysisInfo); + replayCreateAnalysisJob(analysisInfo); } - private void createTaskForMVIdx(AnalysisInfo jobInfo, Map analysisTasks, - boolean isSync) throws DdlException { - TableIf table; - try { - table = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - } catch (Throwable e) { - LOG.warn(e.getMessage()); - return; - } - - TableType type = table.getType(); - if (jobInfo.analysisType != AnalysisType.INDEX || !type.equals(TableType.OLAP)) { - // not need to collect statistics for materialized view - return; - } - - OlapTable olapTable = (OlapTable) table; - - try { - olapTable.readLock(); - for (MaterializedIndexMeta meta : olapTable.getIndexIdToMeta().values()) { - if (meta.getDefineStmt() == null) { - continue; - } - long indexId = meta.getIndexId(); - long taskId = Env.getCurrentEnv().getNextId(); - AnalysisInfoBuilder indexTaskInfoBuilder = new AnalysisInfoBuilder(jobInfo); - AnalysisInfo analysisInfo = indexTaskInfoBuilder.setIndexId(indexId) - .setTaskId(taskId).setLastExecTimeInMs(System.currentTimeMillis()).build(); 
- jobInfo.addTaskId(taskId); - if (isSync) { - return; - } - analysisTasks.put(taskId, createTask(analysisInfo)); - logCreateAnalysisTask(analysisInfo); - } - } finally { - olapTable.readUnlock(); - } - } - - private void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, + public void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, boolean isSync) throws DdlException { Map> columnToPartitions = jobInfo.colToPartitions; for (Entry> entry : columnToPartitions.entrySet()) { @@ -549,7 +587,7 @@ private void createTaskForEachColumns(AnalysisInfo jobInfo, Map analysisTasks, boolean isSync) throws DdlException { - TableIf table; - try { - table = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - } catch (Throwable e) { - LOG.warn(e.getMessage()); - return; - } - if (jobInfo.analysisType == AnalysisType.HISTOGRAM || !(table instanceof ExternalTable)) { + + if (jobInfo.analysisType == AnalysisType.HISTOGRAM) { return; } AnalysisInfoBuilder colTaskInfoBuilder = new AnalysisInfoBuilder(jobInfo); @@ -593,120 +626,49 @@ private void createTaskForExternalTable(AnalysisInfo jobInfo, return; } try { - logCreateAnalysisTask(analysisInfo); + replayCreateAnalysisTask(analysisInfo); } catch (Exception e) { throw new DdlException("Failed to create analysis task", e); } } public void updateTaskStatus(AnalysisInfo info, AnalysisState taskState, String message, long time) { - if (analysisJobIdToTaskMap.get(info.jobId) == null) { - return; - } - info.state = taskState; - info.message = message; - // Update the task cost time when task finished or failed. And only log the final state. 
- if (taskState.equals(AnalysisState.FINISHED) || taskState.equals(AnalysisState.FAILED)) { - info.timeCostInMs = time - info.lastExecTimeInMs; - info.lastExecTimeInMs = time; - logCreateAnalysisTask(info); - } - info.lastExecTimeInMs = time; - AnalysisInfo job = analysisJobInfoMap.get(info.jobId); - // Synchronize the job state change in job level. - synchronized (job) { - job.lastExecTimeInMs = time; - // Set the job state to RUNNING when its first task becomes RUNNING. - if (info.state.equals(AnalysisState.RUNNING) && job.state.equals(AnalysisState.PENDING)) { - job.state = AnalysisState.RUNNING; - replayCreateAnalysisJob(job); - } - boolean allFinished = true; - boolean hasFailure = false; - for (BaseAnalysisTask task : analysisJobIdToTaskMap.get(info.jobId).values()) { - AnalysisInfo taskInfo = task.info; - if (taskInfo.state.equals(AnalysisState.RUNNING) || taskInfo.state.equals(AnalysisState.PENDING)) { - allFinished = false; - break; - } - if (taskInfo.state.equals(AnalysisState.FAILED)) { - hasFailure = true; - } - } - if (allFinished) { - if (hasFailure) { - job.state = AnalysisState.FAILED; - logCreateAnalysisJob(job); - } else { - job.state = AnalysisState.FINISHED; - if (job.jobType.equals(JobType.SYSTEM)) { - try { - updateTableStats(job); - } catch (Throwable e) { - LOG.warn("Failed to update Table statistics in job: {}", info.toString(), e); - } - } - logCreateAnalysisJob(job); - } - analysisJobIdToTaskMap.remove(job.jobId); - } - } + TaskStatusWrapper taskStatusWrapper = new TaskStatusWrapper(info, taskState, message, time); + updaters[info.jobType.ordinal()].apply(taskStatusWrapper); } - private void updateTableStats(AnalysisInfo jobInfo) throws Throwable { - Map params = buildTableStatsParams(jobInfo); + @VisibleForTesting + public void updateTableStats(AnalysisInfo jobInfo) { TableIf tbl = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - - // update olap table stats - if (tbl.getType() == TableType.OLAP) { - 
OlapTable table = (OlapTable) tbl; - updateOlapTableStats(table, params); + // External Table update table stats after table level task finished. + if (tbl instanceof ExternalTable) { + return; + } + TableStatsMeta tableStats = findTableStatsStatus(tbl.getId()); + if (tableStats == null) { + updateTableStatsStatus(new TableStatsMeta(tbl.getId(), tbl.estimatedRowCount(), jobInfo)); + } else { + tableStats.updateByJob(jobInfo); + logCreateTableStats(tableStats); } - // External Table doesn't collect table stats here. - // We create task for external table to collect table/partition level statistics. - } - - @SuppressWarnings("rawtypes") - private Map buildTableStatsParams(AnalysisInfo jobInfo) throws Throwable { - CatalogIf catalog = StatisticsUtil.findCatalog(jobInfo.catalogName); - DatabaseIf db = StatisticsUtil.findDatabase(jobInfo.catalogName, jobInfo.dbName); - TableIf tbl = StatisticsUtil.findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - String indexId = String.valueOf(jobInfo.indexId); - String id = StatisticsUtil.constructId(tbl.getId(), indexId); - Map commonParams = new HashMap<>(); - commonParams.put("id", id); - commonParams.put("catalogId", String.valueOf(catalog.getId())); - commonParams.put("dbId", String.valueOf(db.getId())); - commonParams.put("tblId", String.valueOf(tbl.getId())); - commonParams.put("indexId", indexId); - commonParams.put("lastAnalyzeTimeInMs", String.valueOf(System.currentTimeMillis())); - return commonParams; } - private void updateOlapTableStats(OlapTable table, Map params) throws Throwable { - for (Partition partition : table.getPartitions()) { - HashMap partParams = Maps.newHashMap(params); - long rowCount = partition.getBaseIndex().getRowCount(); - partParams.put("id", StatisticsUtil - .constructId(params.get("id"), partition.getId())); - partParams.put("partId", String.valueOf(partition.getId())); - partParams.put("rowCount", String.valueOf(rowCount)); - StatisticsRepository.persistTableStats(partParams); + 
public List showAnalysisJob(ShowAnalyzeStmt stmt) { + if (stmt.isAuto()) { + // It's ok to sync on this field, it would only be assigned when instance init or do checkpoint + synchronized (autoJobs) { + return findShowAnalyzeResult(autoJobs, stmt); + } } - - HashMap tblParams = Maps.newHashMap(params); - long rowCount = table.getRowCount(); - tblParams.put("partId", "NULL"); - tblParams.put("rowCount", String.valueOf(rowCount)); - StatisticsRepository.persistTableStats(tblParams); + return findShowAnalyzeResult(analysisJobInfoMap.values(), stmt); } - public List showAnalysisJob(ShowAnalyzeStmt stmt) { + protected List findShowAnalyzeResult(Collection analysisInfos, ShowAnalyzeStmt stmt) { String state = stmt.getStateValue(); TableName tblName = stmt.getDbTableName(); - return analysisJobInfoMap.values().stream() + return analysisInfos.stream() .filter(a -> stmt.getJobId() == 0 || a.jobId == stmt.getJobId()) .filter(a -> state == null || a.state.equals(AnalysisState.valueOf(state))) .filter(a -> tblName == null || a.catalogName.equals(tblName.getCtl()) @@ -737,10 +699,11 @@ public String getJobProgress(long jobId) { break; } } - return String.format("%d Finished/%d Failed/%d In Progress/%d Total", finished, failed, inProgress, total); + return String.format(progressDisplayTemplate, finished, failed, inProgress, total); } - private void syncExecute(Collection tasks) { + @VisibleForTesting + public void syncExecute(Collection tasks) { SyncTaskCollection syncTaskCollection = new SyncTaskCollection(tasks); ConnectContext ctx = ConnectContext.get(); try { @@ -754,7 +717,8 @@ private void syncExecute(Collection tasks) { private ThreadPoolExecutor createThreadPoolForSyncAnalyze() { String poolName = "SYNC ANALYZE THREAD POOL"; - return new ThreadPoolExecutor(0, 64, + return new ThreadPoolExecutor(0, + ConnectContext.get().getSessionVariable().parallelSyncAnalyzeTaskNum, 0, TimeUnit.SECONDS, new SynchronousQueue(), new 
ThreadFactoryBuilder().setDaemon(true).setNameFormat("SYNC ANALYZE" + "-%d") @@ -767,17 +731,23 @@ public void dropStats(DropStatsStmt dropStatsStmt) throws DdlException { Env.getCurrentEnv().getStatisticsCleaner().clear(); return; } + Set cols = dropStatsStmt.getColumnNames(); long tblId = dropStatsStmt.getTblId(); - StatisticsRepository.dropStatistics(tblId, cols); - for (String col : cols) { - Env.getCurrentEnv().getStatisticsCache().invalidate(tblId, -1L, col); + TableStatsMeta tableStats = findTableStatsStatus(dropStatsStmt.getTblId()); + if (tableStats == null) { + return; } - if (dropStatsStmt.dropTableRowCount()) { - StatisticsRepository.dropExternalTableStatistics(tblId); - // Table cache key doesn't care about catalog id and db id, because the table id is globally unique. - Env.getCurrentEnv().getStatisticsCache().invalidateTableStats(-1, -1, tblId); + if (cols == null) { + tableStats.reset(); + } else { + dropStatsStmt.getColumnNames().forEach(tableStats::removeColumn); + for (String col : cols) { + Env.getCurrentEnv().getStatisticsCache().invalidate(tblId, -1L, col); + } } + logCreateTableStats(tableStats); + StatisticsRepository.dropStatistics(tblId, cols); } public void handleKillAnalyzeStmt(KillAnalysisJobStmt killAnalysisJobStmt) throws DdlException { @@ -872,15 +842,15 @@ public void execute(ThreadPoolExecutor executor) { executor.submit(() -> { try { if (cancelled) { + errorMessages.add("Query timeout or user cancelled." 
+ + "Could set analyze_timeout to a bigger value."); return; } try { task.execute(); - updateSyncTaskStatus(task, AnalysisState.FINISHED); } catch (Throwable t) { colNames.add(task.info.colName); errorMessages.add(Util.getRootCauseMessage(t)); - updateSyncTaskStatus(task, AnalysisState.FAILED); LOG.warn("Failed to analyze, info: {}", task, t); } } finally { @@ -898,32 +868,22 @@ public void execute(ThreadPoolExecutor executor) { + "] Reasons: " + String.join(",", errorMessages)); } } - - private void updateSyncTaskStatus(BaseAnalysisTask task, AnalysisState state) { - Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(task.info, state, "", System.currentTimeMillis()); - } - } - - public List findAutomaticAnalysisJobs() { - synchronized (analysisJobInfoMap) { - return analysisJobInfoMap.values().stream() - .filter(a -> - a.scheduleType.equals(ScheduleType.AUTOMATIC) - && (!(a.state.equals(AnalysisState.RUNNING) - || a.state.equals(AnalysisState.PENDING))) - && System.currentTimeMillis() - a.lastExecTimeInMs - > TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes)) - .collect(Collectors.toList()); - } } public List findPeriodicJobs() { synchronized (analysisJobInfoMap) { + Predicate p = a -> { + if (a.state.equals(AnalysisState.RUNNING)) { + return false; + } + if (a.cronExpression == null) { + return a.scheduleType.equals(ScheduleType.PERIOD) + && System.currentTimeMillis() - a.lastExecTimeInMs > a.periodTimeInMs; + } + return a.cronExpression.getTimeAfter(new Date(a.lastExecTimeInMs)).before(new Date()); + }; return analysisJobInfoMap.values().stream() - .filter(a -> a.scheduleType.equals(ScheduleType.PERIOD) - && (a.state.equals(AnalysisState.FINISHED)) - && System.currentTimeMillis() - a.lastExecTimeInMs > a.periodTimeInMs) + .filter(p) .collect(Collectors.toList()); } } @@ -937,7 +897,8 @@ public List findTasks(long jobId) { public List findTasksByTaskIds(long jobId) { AnalysisInfo jobInfo = analysisJobInfoMap.get(jobId); if (jobInfo != 
null && jobInfo.taskIds != null) { - return jobInfo.taskIds.stream().map(id -> analysisTaskInfoMap.get(id)).collect(Collectors.toList()); + return jobInfo.taskIds.stream().map(analysisTaskInfoMap::get).filter(i -> i != null) + .collect(Collectors.toList()); } return null; } @@ -963,34 +924,187 @@ public void dropAnalyzeJob(DropAnalyzeJobStmt analyzeJobStmt) throws DdlExceptio public static AnalysisManager readFields(DataInput in) throws IOException { AnalysisManager analysisManager = new AnalysisManager(); - doRead(in, analysisManager.analysisJobInfoMap, true); - doRead(in, analysisManager.analysisTaskInfoMap, false); + readAnalysisInfo(in, analysisManager.analysisJobInfoMap, true); + readAnalysisInfo(in, analysisManager.analysisTaskInfoMap, false); + readIdToTblStats(in, analysisManager.idToTblStats); + readAutoJobs(in, analysisManager); return analysisManager; } - private static void doRead(DataInput in, Map map, boolean job) throws IOException { + private static void readAnalysisInfo(DataInput in, Map map, boolean job) throws IOException { int size = in.readInt(); for (int i = 0; i < size; i++) { AnalysisInfo analysisInfo = AnalysisInfo.read(in); + // Unfinished manual once job/tasks doesn't need to keep in memory anymore. + if (needAbandon(analysisInfo)) { + continue; + } map.put(job ? analysisInfo.jobId : analysisInfo.taskId, analysisInfo); } } + // Need to abandon the unfinished manual once jobs/tasks while loading image and replay journal. + // Journal only store finished tasks and jobs. 
+ public static boolean needAbandon(AnalysisInfo analysisInfo) { + if (analysisInfo == null) { + return true; + } + if ((AnalysisState.PENDING.equals(analysisInfo.state) || AnalysisState.RUNNING.equals(analysisInfo.state)) + && ScheduleType.ONCE.equals(analysisInfo.scheduleType) + && JobType.MANUAL.equals(analysisInfo.jobType)) { + return true; + } + return false; + } + + private static void readIdToTblStats(DataInput in, Map map) throws IOException { + int size = in.readInt(); + for (int i = 0; i < size; i++) { + TableStatsMeta tableStats = TableStatsMeta.read(in); + map.put(tableStats.tblId, tableStats); + } + } + + private static void readAutoJobs(DataInput in, AnalysisManager analysisManager) throws IOException { + if (Env.getCurrentEnvJournalVersion() > FeMetaVersion.VERSION_123) { + Type type = new TypeToken>() {}.getType(); + Collection autoJobs = GsonUtils.GSON.fromJson(Text.readString(in), type); + analysisManager.autoJobs = analysisManager.createSimpleQueue(autoJobs, analysisManager); + } + } + @Override public void write(DataOutput out) throws IOException { - doWrite(out, analysisJobInfoMap); - doWrite(out, analysisTaskInfoMap); + writeJobInfo(out, analysisJobInfoMap); + writeJobInfo(out, analysisTaskInfoMap); + writeTableStats(out); + writeAutoJobsStatus(out); } - private void doWrite(DataOutput out, Map infoMap) throws IOException { + private void writeJobInfo(DataOutput out, Map infoMap) throws IOException { out.writeInt(infoMap.size()); for (Entry entry : infoMap.entrySet()) { entry.getValue().write(out); } } + private void writeTableStats(DataOutput out) throws IOException { + out.writeInt(idToTblStats.size()); + for (Entry entry : idToTblStats.entrySet()) { + entry.getValue().write(out); + } + } + + private void writeAutoJobsStatus(DataOutput output) throws IOException { + Type type = new TypeToken>() {}.getType(); + String autoJobs = GsonUtils.GSON.toJson(this.autoJobs, type); + Text.writeString(output, autoJobs); + } + // For unit test use only. 
public void addToJobIdTasksMap(long jobId, Map tasks) { analysisJobIdToTaskMap.put(jobId, tasks); } + + public TableStatsMeta findTableStatsStatus(long tblId) { + return idToTblStats.get(tblId); + } + + // Invoke this when load transaction finished. + public void updateUpdatedRows(long tblId, long rows) { + TableStatsMeta statsStatus = idToTblStats.get(tblId); + if (statsStatus != null) { + statsStatus.updatedRows.addAndGet(rows); + logCreateTableStats(statsStatus); + } + } + + public void updateTableStatsStatus(TableStatsMeta tableStats) { + replayUpdateTableStatsStatus(tableStats); + logCreateTableStats(tableStats); + } + + public void replayUpdateTableStatsStatus(TableStatsMeta tableStats) { + idToTblStats.put(tableStats.tblId, tableStats); + } + + public void logCreateTableStats(TableStatsMeta tableStats) { + Env.getCurrentEnv().getEditLog().logCreateTableStats(tableStats); + } + + public void registerSysJob(AnalysisInfo jobInfo, Map taskInfos) { + jobInfo.state = AnalysisState.RUNNING; + systemJobInfoMap.put(jobInfo.jobId, jobInfo); + analysisJobIdToTaskMap.put(jobInfo.jobId, taskInfos); + } + + @VisibleForTesting + protected Set findReAnalyzeNeededPartitions(TableIf table) { + TableStatsMeta tableStats = findTableStatsStatus(table.getId()); + if (tableStats == null) { + return table.getPartitionNames().stream().map(table::getPartition) + .filter(Partition::hasData).map(Partition::getName).collect(Collectors.toSet()); + } + return table.getPartitionNames().stream() + .map(table::getPartition) + .filter(Partition::hasData) + .filter(partition -> + partition.getVisibleVersionTime() >= tableStats.updatedTime).map(Partition::getName) + .collect(Collectors.toSet()); + } + + protected void logAutoJob(AnalysisInfo autoJob) { + Env.getCurrentEnv().getEditLog().logAutoJob(autoJob); + } + + public void replayPersistSysJob(AnalysisInfo analysisInfo) { + autoJobs.offer(analysisInfo); + } + + protected SimpleQueue createSimpleQueue(Collection collection, + AnalysisManager 
analysisManager) { + return new SimpleQueue<>(Config.auto_analyze_job_record_count, + a -> { + // FE is not ready when replaying log and operations triggered by replaying + // shouldn't be logged again. + if (Env.getCurrentEnv().isReady() && Env.getCurrentEnv().isMaster() && !Env.isCheckpointThread()) { + analysisManager.logAutoJob(a); + } + return null; + }, + a -> { + // DO NOTHING + return null; + }, null); + } + + // Remove col stats status from TableStats if failed load some col stats after analyze corresponding column so that + // we could make sure it would be analyzed again soon if user or system submit job for that column again. + public void removeColStatsStatus(long tblId, String colName) { + TableStatsMeta tableStats = findTableStatsStatus(tblId); + if (tableStats != null) { + tableStats.removeColumn(colName); + } + } + + public void removeTableStats(long tblId) { + if (!idToTblStats.containsKey(tblId)) { + return; + } + TableStatsDeletionLog log = new TableStatsDeletionLog(tblId); + Env.getCurrentEnv().getEditLog().logDeleteTableStats(log); + replayTableStatsDeletion(log); + } + + public void replayTableStatsDeletion(TableStatsDeletionLog log) { + idToTblStats.remove(log.id); + } + + public ColStatsMeta findColStatsMeta(long tblId, String colName) { + TableStatsMeta tableStats = findTableStatsStatus(tblId); + if (tableStats == null) { + return null; + } + return tableStats.findColumnStatsMeta(colName); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java index bab8a462e8a5bd7..3abc4c224faad2a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisState.java @@ -18,7 +18,9 @@ package org.apache.doris.statistics; public enum AnalysisState { + // When analyze job/task created, but never run PENDING, + // When analyze job/task is in running queue 
RUNNING, FINISHED, FAILED; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java index b5ec7aeb8768070..4b133ce0ebfc686 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskExecutor.java @@ -17,6 +17,7 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Env; import org.apache.doris.common.Config; import org.apache.doris.common.ThreadPoolManager; import org.apache.doris.common.ThreadPoolManager.BlockedPolicy; @@ -35,26 +36,30 @@ public class AnalysisTaskExecutor extends Thread { private static final Logger LOG = LogManager.getLogger(AnalysisTaskExecutor.class); - private final ThreadPoolExecutor executors = ThreadPoolManager.newDaemonThreadPool( - Config.statistics_simultaneously_running_task_num, - Config.statistics_simultaneously_running_task_num, 0, - TimeUnit.DAYS, new LinkedBlockingQueue<>(), - new BlockedPolicy("Analysis Job Executor", Integer.MAX_VALUE), - "Analysis Job Executor", true); - - private final AnalysisTaskScheduler taskScheduler; + private final ThreadPoolExecutor executors; private final BlockingQueue taskQueue = new PriorityBlockingQueue(20, Comparator.comparingLong(AnalysisTaskWrapper::getStartTime)); - public AnalysisTaskExecutor(AnalysisTaskScheduler jobExecutor) { - this.taskScheduler = jobExecutor; + public AnalysisTaskExecutor(int simultaneouslyRunningTaskNum) { + if (!Env.isCheckpointThread()) { + executors = ThreadPoolManager.newDaemonThreadPool( + simultaneouslyRunningTaskNum, + simultaneouslyRunningTaskNum, 0, + TimeUnit.DAYS, new LinkedBlockingQueue<>(), + new BlockedPolicy("Analysis Job Executor", Integer.MAX_VALUE), + "Analysis Job Executor", true); + } else { + executors = null; + } } @Override public void run() { - fetchAndExecute(); + if (Env.isCheckpointThread()) { + return; + } 
cancelExpiredTask(); } @@ -82,22 +87,7 @@ private void doCancelExpiredJob() { } } - public void fetchAndExecute() { - Thread t = new Thread(() -> { - for (;;) { - try { - doFetchAndExecute(); - } catch (Throwable throwable) { - LOG.warn(throwable); - } - } - }, "Analysis Task Submitter"); - t.setDaemon(true); - t.start(); - } - - private void doFetchAndExecute() { - BaseAnalysisTask task = taskScheduler.getPendingTasks(); + public void submitTask(BaseAnalysisTask task) { AnalysisTaskWrapper taskWrapper = new AnalysisTaskWrapper(this, task); executors.submit(taskWrapper); } @@ -105,4 +95,13 @@ private void doFetchAndExecute() { public void putJob(AnalysisTaskWrapper wrapper) throws Exception { taskQueue.put(wrapper); } + + public boolean idle() { + return executors.getQueue().isEmpty(); + } + + public void clear() { + executors.getQueue().clear(); + taskQueue.clear(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java deleted file mode 100644 index 5c9de2b58b22b9e..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskScheduler.java +++ /dev/null @@ -1,108 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.catalog.Env; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.Comparator; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.PriorityQueue; -import java.util.Queue; -import java.util.Set; - -public class AnalysisTaskScheduler { - - private static final Logger LOG = LogManager.getLogger(AnalysisTaskScheduler.class); - - private final PriorityQueue systemJobQueue = - new PriorityQueue<>(Comparator.comparingLong(BaseAnalysisTask::getLastExecTime)); - - private final Queue manualJobQueue = new LinkedList<>(); - - private final Set systemJobSet = new HashSet<>(); - - private final Set manualJobSet = new HashSet<>(); - - public synchronized void schedule(BaseAnalysisTask analysisTask) { - try { - - switch (analysisTask.info.jobType) { - case MANUAL: - addToManualJobQueue(analysisTask); - break; - case SYSTEM: - addToSystemQueue(analysisTask); - break; - default: - throw new IllegalArgumentException("Unknown job type: " + analysisTask.info.jobType); - } - } catch (Throwable t) { - Env.getCurrentEnv().getAnalysisManager().updateTaskStatus( - analysisTask.info, AnalysisState.FAILED, t.getMessage(), System.currentTimeMillis()); - } - } - - // Make sure invoker of this method is synchronized on object. - - private void addToSystemQueue(BaseAnalysisTask analysisJobInfo) { - if (systemJobSet.contains(analysisJobInfo)) { - return; - } - systemJobSet.add(analysisJobInfo); - systemJobQueue.add(analysisJobInfo); - notify(); - } - - // Make sure invoker of this method is synchronized on object. 
- private void addToManualJobQueue(BaseAnalysisTask analysisJobInfo) { - if (manualJobSet.contains(analysisJobInfo)) { - return; - } - manualJobSet.add(analysisJobInfo); - manualJobQueue.add(analysisJobInfo); - notify(); - } - - public synchronized BaseAnalysisTask getPendingTasks() { - while (true) { - if (!manualJobQueue.isEmpty()) { - return pollAndRemove(manualJobQueue, manualJobSet); - } - if (!systemJobQueue.isEmpty()) { - return pollAndRemove(systemJobQueue, systemJobSet); - } - try { - wait(); - } catch (Exception e) { - LOG.warn("Thread get interrupted when waiting for pending jobs", e); - return null; - } - } - } - - // Poll from queue, remove from set. Make sure invoker of this method is synchronized on object. - private BaseAnalysisTask pollAndRemove(Queue q, Set s) { - BaseAnalysisTask t = q.poll(); - s.remove(t); - return t; - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java index 7f55469f5335860..9aa3d85992b32c0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/AnalysisTaskWrapper.java @@ -18,11 +18,15 @@ package org.apache.doris.statistics; import org.apache.doris.catalog.Env; +import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.Util; +import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.time.LocalTime; import java.util.concurrent.FutureTask; public class AnalysisTaskWrapper extends FutureTask { @@ -52,6 +56,14 @@ public void run() { if (task.killed) { return; } + if (task.info.scheduleType.equals(ScheduleType.AUTOMATIC) && !StatisticsUtil.inAnalyzeTime( + LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + // TODO: Do we need a 
separate AnalysisState here? + Env.getCurrentEnv().getAnalysisManager() + .updateTaskStatus(task.info, AnalysisState.FAILED, "Auto task" + + "doesn't get executed within specified time range", System.currentTimeMillis()); + return; + } executor.putJob(this); super.run(); Object result = get(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 719df8769a43b99..fd99c97e83ba40f 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -21,7 +21,11 @@ import org.apache.doris.catalog.DatabaseIf; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; +import org.apache.doris.common.Config; import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.qe.AuditLogHelper; +import org.apache.doris.qe.QueryState; +import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.qe.StmtExecutor; import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; @@ -64,7 +68,7 @@ public abstract class BaseAnalysisTask { protected static final String INSERT_COL_STATISTICS = "INSERT INTO " + "${internalDB}.${columnStatTbl}" + " SELECT id, catalog_id, db_id, tbl_id, idx_id, col_id, part_id, row_count, " - + " ndv, null_count, min, max, data_size, update_time\n" + + " ndv, null_count, CAST(min AS string), CAST(max AS string), data_size, update_time\n" + " FROM \n" + " (SELECT CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + " ${catalogId} AS catalog_id, " @@ -111,7 +115,7 @@ public BaseAnalysisTask(AnalysisInfo info) { init(info); } - private void init(AnalysisInfo info) { + protected void init(AnalysisInfo info) { catalog = Env.getCurrentEnv().getCatalogMgr().getCatalog(info.catalogName); if (catalog == null) { 
Env.getCurrentEnv().getAnalysisManager().updateTaskStatus(info, AnalysisState.FAILED, @@ -166,6 +170,9 @@ protected void executeWithRetry() { doExecute(); break; } catch (Throwable t) { + if (killed) { + throw new RuntimeException(t); + } LOG.warn("Failed to execute analysis task, retried times: {}", retriedTimes++, t); if (retriedTimes > StatisticConstants.ANALYZE_TASK_RETRY_TIMES) { throw new RuntimeException(t); @@ -181,12 +188,16 @@ protected void afterExecution() { if (killed) { return; } - Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); + long tblId = tbl.getId(); + String colName = col.getName(); + if (!Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tblId, -1, colName)) { + Env.getCurrentEnv().getAnalysisManager().removeColStatsStatus(tblId, colName); + } } protected void setTaskStateToRunning() { Env.getCurrentEnv().getAnalysisManager() - .updateTaskStatus(info, AnalysisState.RUNNING, "", System.currentTimeMillis()); + .updateTaskStatus(info, AnalysisState.RUNNING, "", System.currentTimeMillis()); } public void cancel() { @@ -199,10 +210,6 @@ public void cancel() { String.format("Job has been cancelled: %s", info.message), System.currentTimeMillis()); } - public long getLastExecTime() { - return info.lastExecTimeInMs; - } - public long getJobId() { return info.jobId; } @@ -216,21 +223,49 @@ protected String getDataSizeFunction(Column column) { } protected String getSampleExpression() { - if (info.analysisMethod == AnalysisMethod.FULL) { + if (info.forceFull) { return ""; } - // TODO Add sampling methods for external tables + int sampleRows = info.sampleRows; + if (info.analysisMethod == AnalysisMethod.FULL) { + if (Config.enable_auto_sample + && tbl.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes) { + sampleRows = Config.huge_table_default_sample_rows; + } else { + return ""; + } + } if (info.samplePercent > 0) { return String.format("TABLESAMPLE(%d PERCENT)", info.samplePercent); } else 
{ - return String.format("TABLESAMPLE(%d ROWS)", info.sampleRows); + return String.format("TABLESAMPLE(%d ROWS)", sampleRows); } } @Override public String toString() { return String.format("Job id [%d], Task id [%d], catalog [%s], db [%s], table [%s], column [%s]", - info.jobId, info.taskId, catalog.getName(), db.getFullName(), tbl.getName(), - col == null ? "TableRowCount" : col.getName()); + info.jobId, info.taskId, catalog.getName(), db.getFullName(), tbl.getName(), + col == null ? "TableRowCount" : col.getName()); + } + + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + if (killed) { + return; + } + LOG.debug("execute internal sql: {}", stmtExecutor.getOriginStmt()); + try { + stmtExecutor.execute(); + QueryState queryState = stmtExecutor.getContext().getState(); + if (queryState.getStateType().equals(MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, stmtExecutor.getOriginStmt().toString(), + queryState.getErrorMessage())); + } + } finally { + AuditLogHelper.logAuditLog(stmtExecutor.getContext(), stmtExecutor.getOriginStmt().toString(), + stmtExecutor.getParsedStmt(), stmtExecutor.getQueryStatisticsForAuditLog(), + true); + } } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java new file mode 100644 index 000000000000000..a14e32d726ba20a --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.statistics.util.StatisticsUtil; + +import java.util.StringJoiner; + +/** + * Used to convert data from ResultRow. + * 0: id + * 1: catalog_id + * 2: db_id + * 3: tbl_id + * 4: idx_id + * 5: col_id + * 6: part_id + * 7: count + * 8: ndv + * 9: null_count + * 10: min + * 11: max + * 12: data_size_in_bytes + * 13: update_time + */ +public class ColStatsData { + public final StatsId statsId; + public final long count; + public final long ndv; + + public final long nullCount; + + public final String minLit; + public final String maxLit; + + public final long dataSizeInBytes; + + public final String updateTime; + + public ColStatsData(ResultRow row) { + this.statsId = new StatsId(row); + this.count = Long.parseLong(row.get(7)); + this.ndv = Long.parseLong(row.getWithDefault(8, "0")); + this.nullCount = Long.parseLong(row.getWithDefault(9, "0")); + this.minLit = row.get(10); + this.maxLit = row.get(11); + this.dataSizeInBytes = Long.parseLong(row.getWithDefault(12, "0")); + this.updateTime = row.get(13); + } + + public String toSQL(boolean roundByParentheses) { + StringJoiner sj = null; + if (roundByParentheses) { + sj = new StringJoiner(",", "(" + statsId.toSQL() + ",", ")"); + } else { + sj = new StringJoiner(",", statsId.toSQL(), ""); + } + sj.add(String.valueOf(count)); + sj.add(String.valueOf(ndv)); + 
sj.add(String.valueOf(nullCount)); + sj.add(StatisticsUtil.quote(minLit)); + sj.add(StatisticsUtil.quote(maxLit)); + sj.add(String.valueOf(dataSizeInBytes)); + sj.add(StatisticsUtil.quote(updateTime)); + return sj.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java new file mode 100644 index 000000000000000..445641b25056106 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsMeta.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; +import org.apache.doris.statistics.AnalysisInfo.AnalysisType; +import org.apache.doris.statistics.AnalysisInfo.JobType; + +import com.google.gson.annotations.SerializedName; + +import java.util.concurrent.atomic.AtomicLong; + +public class ColStatsMeta { + + @SerializedName("updateTime") + public long updatedTime; + + @SerializedName("method") + public AnalysisMethod analysisMethod; + + @SerializedName("type") + public AnalysisType analysisType; + + @SerializedName("queriedTimes") + public final AtomicLong queriedTimes = new AtomicLong(); + + // TODO: For column that manually analyzed, we should use same analyze method as user specified. + @SerializedName("trigger") + public JobType jobType; + + public ColStatsMeta(long updatedTime, AnalysisMethod analysisMethod, + AnalysisType analysisType, JobType jobType, long queriedTimes) { + this.updatedTime = updatedTime; + this.analysisMethod = analysisMethod; + this.analysisType = analysisType; + this.jobType = jobType; + this.queriedTimes.addAndGet(queriedTimes); + } + + public void clear() { + updatedTime = 0; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index 7986cb07a57b0c8..c6b019f669b65d3 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -19,26 +19,26 @@ import org.apache.doris.analysis.LiteralExpr; import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.PartitionInfo; import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; -import org.apache.doris.common.DdlException; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import 
org.apache.doris.statistics.util.StatisticsUtil; +import com.google.common.collect.Sets; import com.google.gson.annotations.SerializedName; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.json.JSONObject; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class ColumnStatistic { + public static final double STATS_ERROR = 0.1D; + public static final StatsType NDV = StatsType.NDV; public static final StatsType AVG_SIZE = StatsType.AVG_SIZE; public static final StatsType MAX_SIZE = StatsType.MAX_SIZE; @@ -50,30 +50,17 @@ public class ColumnStatistic { public static ColumnStatistic UNKNOWN = new ColumnStatisticBuilder().setAvgSizeByte(1).setNdv(1) .setNumNulls(1).setCount(1).setMaxValue(Double.POSITIVE_INFINITY).setMinValue(Double.NEGATIVE_INFINITY) - .setSelectivity(1.0).setIsUnknown(true) + .setIsUnknown(true).setUpdatedTime("") .build(); public static ColumnStatistic ZERO = new ColumnStatisticBuilder().setAvgSizeByte(0).setNdv(0) .setNumNulls(0).setCount(0).setMaxValue(Double.NaN).setMinValue(Double.NaN) - .setSelectivity(0) .build(); - public static final Set UNSUPPORTED_TYPE = new HashSet<>(); - - static { - UNSUPPORTED_TYPE.add(Type.HLL); - UNSUPPORTED_TYPE.add(Type.BITMAP); - UNSUPPORTED_TYPE.add(Type.ARRAY); - UNSUPPORTED_TYPE.add(Type.STRUCT); - UNSUPPORTED_TYPE.add(Type.MAP); - UNSUPPORTED_TYPE.add(Type.QUANTILE_STATE); - UNSUPPORTED_TYPE.add(Type.AGG_STATE); - UNSUPPORTED_TYPE.add(Type.JSONB); - UNSUPPORTED_TYPE.add(Type.VARIANT); - UNSUPPORTED_TYPE.add(Type.TIME); - UNSUPPORTED_TYPE.add(Type.TIMEV2); - UNSUPPORTED_TYPE.add(Type.LAMBDA_FUNCTION); - } + public static final Set UNSUPPORTED_TYPE = Sets.newHashSet( + Type.HLL, Type.BITMAP, Type.ARRAY, Type.STRUCT, Type.MAP, Type.QUANTILE_STATE, Type.AGG_STATE, Type.JSONB, + Type.VARIANT, Type.TIME, Type.TIMEV2, Type.LAMBDA_FUNCTION + ); @SerializedName("count") public final double count; @@ 
-90,19 +77,6 @@ public class ColumnStatistic { @SerializedName("maxValue") public final double maxValue; public final boolean isUnKnown; - /* - selectivity of Column T1.A: - if T1.A = T2.B is the inner join condition, for a given `b` in B, b in - intersection of range(A) and range(B), selectivity means the probability that - the equation can be satisfied. - We take tpch as example. - l_orderkey = o_orderkey and o_orderstatus='o' - there are 3 distinct o_orderstatus in orders table. filter o_orderstatus='o' reduces orders table by 1/3 - because o_orderkey is primary key, thus the o_orderkey.selectivity = 1/3, - and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. - But after filter, other columns' selectivity is still 1.0 - */ - public final double selectivity; /* originalNdv is the ndv in stats of ScanNode. ndv may be changed after filter or join, @@ -111,7 +85,6 @@ and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. */ public final ColumnStatistic original; - // For display only. public final LiteralExpr minExpr; public final LiteralExpr maxExpr; @@ -119,14 +92,17 @@ and after join(l_orderkey = o_orderkey), lineitem is reduced by 1/3. // assign value when do stats estimation. 
public final Histogram histogram; - public final Map partitionIdToColStats = new HashMap<>(); + @SerializedName("partitionIdToColStats") + public final Map partitionIdToColStats = new HashMap<>(); public final String updatedTime; + public final PartitionInfo partitionInfo; + public ColumnStatistic(double count, double ndv, ColumnStatistic original, double avgSizeByte, double numNulls, double dataSize, double minValue, double maxValue, - double selectivity, LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram, - String updatedTime) { + LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown, Histogram histogram, + String updatedTime, PartitionInfo partitionInfo) { this.count = count; this.ndv = ndv; this.original = original; @@ -135,31 +111,30 @@ public ColumnStatistic(double count, double ndv, ColumnStatistic original, doubl this.dataSize = dataSize; this.minValue = minValue; this.maxValue = maxValue; - this.selectivity = selectivity; this.minExpr = minExpr; this.maxExpr = maxExpr; this.isUnKnown = isUnKnown; this.histogram = histogram; this.updatedTime = updatedTime; + this.partitionInfo = partitionInfo; } public static ColumnStatistic fromResultRow(List resultRows) { - Map partitionIdToColStats = new HashMap<>(); + Map partitionIdToColStats = new HashMap<>(); ColumnStatistic columnStatistic = null; try { for (ResultRow resultRow : resultRows) { - String partId = resultRow.getColumnValue("part_id"); + String partId = resultRow.get(6); if (partId == null) { columnStatistic = fromResultRow(resultRow); } else { - partitionIdToColStats.put(Long.parseLong(partId), fromResultRow(resultRow)); + partitionIdToColStats.put(partId, fromResultRow(resultRow)); } } } catch (Throwable t) { LOG.debug("Failed to deserialize column stats", t); return ColumnStatistic.UNKNOWN; } - // Means last analyze failed or interrupted for some reason. 
if (columnStatistic == null) { return ColumnStatistic.UNKNOWN; } @@ -168,47 +143,44 @@ public static ColumnStatistic fromResultRow(List resultRows) { } // TODO: use thrift - public static ColumnStatistic fromResultRow(ResultRow resultRow) { + public static ColumnStatistic fromResultRow(ResultRow row) { try { ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); - double count = Double.parseDouble(resultRow.getColumnValueWithDefault("count", "0")); + double count = Double.parseDouble(row.get(7)); columnStatisticBuilder.setCount(count); - double ndv = Double.parseDouble(resultRow.getColumnValueWithDefault("ndv", "0")); - if (0.99 * count < ndv && ndv < 1.01 * count) { - ndv = count; - } + double ndv = Double.parseDouble(row.getWithDefault(8, "0")); columnStatisticBuilder.setNdv(ndv); - String nullCount = resultRow.getColumnValueWithDefault("null_count", "0"); + String nullCount = row.getWithDefault(9, "0"); columnStatisticBuilder.setNumNulls(Double.parseDouble(nullCount)); columnStatisticBuilder.setDataSize(Double - .parseDouble(resultRow.getColumnValueWithDefault("data_size_in_bytes", "0"))); + .parseDouble(row.getWithDefault(12, "0"))); columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0 ? 
0 : columnStatisticBuilder.getDataSize() / columnStatisticBuilder.getCount()); - long catalogId = Long.parseLong(resultRow.getColumnValue("catalog_id")); - long idxId = Long.parseLong(resultRow.getColumnValue("idx_id")); - long dbID = Long.parseLong(resultRow.getColumnValue("db_id")); - long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id")); - String colName = resultRow.getColumnValue("col_id"); + long catalogId = Long.parseLong(row.get(1)); + long idxId = Long.parseLong(row.get(4)); + long dbID = Long.parseLong(row.get(2)); + long tblId = Long.parseLong(row.get(3)); + String colName = row.get(5); Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName); if (col == null) { - LOG.warn("Failed to deserialize column statistics, ctlId: {} dbId: {}" - + "tblId: {} column: {} not exists", + LOG.debug("Failed to deserialize column statistics, ctlId: {} dbId: {}" + + "tblId: {} column: {} not exists", catalogId, dbID, tblId, colName); return ColumnStatistic.UNKNOWN; } - String min = resultRow.getColumnValue("min"); - String max = resultRow.getColumnValue("max"); + String min = row.get(10); + String max = row.get(11); if (min != null && !min.equalsIgnoreCase("NULL")) { try { columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min)); columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min)); } catch (AnalysisException e) { LOG.warn("Failed to deserialize column {} min value {}.", col, min, e); - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } } else { - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } if (max != null && !max.equalsIgnoreCase("NULL")) { try { @@ -216,16 +188,12 @@ public static ColumnStatistic fromResultRow(ResultRow resultRow) { columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max)); } catch 
(AnalysisException e) { LOG.warn("Failed to deserialize column {} max value {}.", col, max, e); - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); } } else { - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); } - columnStatisticBuilder.setSelectivity(1.0); - Histogram histogram = Env.getCurrentEnv().getStatisticsCache().getHistogram(tblId, idxId, colName) - .orElse(null); - columnStatisticBuilder.setHistogram(histogram); - columnStatisticBuilder.setUpdatedTime(resultRow.getColumnValue("update_time")); + columnStatisticBuilder.setUpdatedTime(row.get(13)); return columnStatisticBuilder.build(); } catch (Exception e) { LOG.warn("Failed to deserialize column statistics.", e); @@ -237,25 +205,12 @@ public static boolean isAlmostUnique(double ndv, double rowCount) { return rowCount * 0.9 < ndv && ndv < rowCount * 1.1; } - public ColumnStatistic copy() { - return new ColumnStatisticBuilder().setCount(count).setNdv(ndv).setAvgSizeByte(avgSizeByte) - .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(minValue) - .setMaxValue(maxValue).setMinExpr(minExpr).setMaxExpr(maxExpr) - .setSelectivity(selectivity).setIsUnknown(isUnKnown).build(); - } - public ColumnStatistic updateByLimit(long limit, double rowCount) { double ratio = 0; if (rowCount != 0) { ratio = limit / rowCount; } double newNdv = Math.ceil(Math.min(ndv, limit)); - double newSelectivity = selectivity; - if (newNdv != 0) { - newSelectivity = newSelectivity * newNdv / ndv; - } else { - newSelectivity = 0; - } return new ColumnStatisticBuilder() .setCount(Math.ceil(limit)) .setNdv(newNdv) @@ -266,7 +221,6 @@ public ColumnStatistic updateByLimit(long limit, double rowCount) { .setMaxValue(maxValue) .setMinExpr(minExpr) .setMaxExpr(maxExpr) - .setSelectivity(newSelectivity) .setIsUnknown(isUnKnown) .build(); } @@ -282,14 +236,11 @@ public ColumnStatistic 
updateBySelectivity(double selectivity, double rowCount) ColumnStatisticBuilder builder = new ColumnStatisticBuilder(this); Double rowsAfterFilter = rowCount * selectivity; if (isAlmostUnique(ndv, rowCount)) { - builder.setSelectivity(this.selectivity * selectivity); builder.setNdv(ndv * selectivity); } else { if (ndv > rowsAfterFilter) { - builder.setSelectivity(this.selectivity * rowsAfterFilter / ndv); builder.setNdv(rowsAfterFilter); } else { - builder.setSelectivity(this.selectivity); builder.setNdv(this.ndv); } } @@ -332,8 +283,8 @@ public boolean enclosed(ColumnStatistic other) { @Override public String toString() { - return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f", - ndv, minValue, minExpr, maxValue, maxExpr, count); + return isUnKnown ? "unknown" : String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, avgSizeByte=%f", + ndv, minValue, minExpr, maxValue, maxExpr, count, avgSizeByte); } public JSONObject toJson() { @@ -355,17 +306,16 @@ public JSONObject toJson() { statistic.put("MaxValueType", "Normal"); statistic.put("MaxValue", maxValue); } - statistic.put("Selectivity", selectivity); statistic.put("Count", count); statistic.put("AvgSizeByte", avgSizeByte); statistic.put("NumNulls", numNulls); statistic.put("DataSize", dataSize); - statistic.put("Selectivity", selectivity); statistic.put("MinExpr", minExpr); statistic.put("MaxExpr", maxExpr); statistic.put("IsUnKnown", isUnKnown); statistic.put("Histogram", Histogram.serializeToJson(histogram)); statistic.put("Original", original); + statistic.put("LastUpdatedTime", updatedTime); return statistic; } @@ -410,12 +360,11 @@ public static ColumnStatistic fromJson(String statJson) { stat.getDouble("DataSize"), minValue, maxValue, - stat.getDouble("Selectivity"), null, null, stat.getBoolean("IsUnKnown"), Histogram.deserializeFromJson(stat.getString("Histogram")), - stat.getString("lastUpdatedTine") + stat.getString("LastUpdatedTime"), null ); } @@ -443,12 
+392,7 @@ public boolean isUnKnown() { return isUnKnown; } - public void loadPartitionStats(long tableId, long idxId, String colName) throws DdlException { - List resultRows = StatisticsRepository.loadPartStats(tableId, idxId, colName); - for (ResultRow resultRow : resultRows) { - String partId = resultRow.getColumnValue("part_id"); - ColumnStatistic columnStatistic = ColumnStatistic.fromResultRow(resultRow); - partitionIdToColStats.put(Long.parseLong(partId), columnStatistic); - } + public void putPartStats(String partId, ColumnStatistic columnStatistic) { + this.partitionIdToColStats.put(partId, columnStatistic); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java index 6ca2cc55b7922fd..fa4cf7ebc99cb42 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticBuilder.java @@ -18,6 +18,10 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.catalog.PartitionInfo; + +import java.util.HashMap; +import java.util.Map; public class ColumnStatisticBuilder { private double count; @@ -27,7 +31,6 @@ public class ColumnStatisticBuilder { private double dataSize; private double minValue; private double maxValue; - private double selectivity = 1.0; private LiteralExpr minExpr; private LiteralExpr maxExpr; @@ -37,11 +40,24 @@ public class ColumnStatisticBuilder { private ColumnStatistic original; + private Map partitionIdToColStats = new HashMap<>(); + private String updatedTime; + private PartitionInfo partitionInfo; + public ColumnStatisticBuilder() { } + public PartitionInfo getPartitionInfo() { + return partitionInfo; + } + + public ColumnStatisticBuilder setPartitionInfo(PartitionInfo partitionInfo) { + this.partitionInfo = partitionInfo; + return this; + } + public 
ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.count = columnStatistic.count; this.ndv = columnStatistic.ndv; @@ -50,13 +66,14 @@ public ColumnStatisticBuilder(ColumnStatistic columnStatistic) { this.dataSize = columnStatistic.dataSize; this.minValue = columnStatistic.minValue; this.maxValue = columnStatistic.maxValue; - this.selectivity = columnStatistic.selectivity; this.minExpr = columnStatistic.minExpr; this.maxExpr = columnStatistic.maxExpr; this.isUnknown = columnStatistic.isUnKnown; this.histogram = columnStatistic.histogram; this.original = columnStatistic.original; + this.partitionIdToColStats.putAll(columnStatistic.partitionIdToColStats); this.updatedTime = columnStatistic.updatedTime; + this.partitionInfo = columnStatistic.partitionInfo; } public ColumnStatisticBuilder setCount(double count) { @@ -99,11 +116,6 @@ public ColumnStatisticBuilder setMaxValue(double maxValue) { return this; } - public ColumnStatisticBuilder setSelectivity(double selectivity) { - this.selectivity = selectivity; - return this; - } - public ColumnStatisticBuilder setMinExpr(LiteralExpr minExpr) { this.minExpr = minExpr; return this; @@ -147,10 +159,6 @@ public double getMaxValue() { return maxValue; } - public double getSelectivity() { - return selectivity; - } - public LiteralExpr getMinExpr() { return minExpr; } @@ -176,18 +184,23 @@ public String getUpdatedTime() { return updatedTime; } - public void setUpdatedTime(String updatedTime) { + public ColumnStatisticBuilder setUpdatedTime(String updatedTime) { this.updatedTime = updatedTime; + return this; } public ColumnStatistic build() { dataSize = Math.max((count - numNulls + 1) * avgSizeByte, 0); - if (original == null) { + if (original == null && !isUnknown) { original = new ColumnStatistic(count, ndv, null, avgSizeByte, numNulls, - dataSize, minValue, maxValue, selectivity, minExpr, maxExpr, false, - histogram, updatedTime); + dataSize, minValue, maxValue, minExpr, maxExpr, + isUnknown, histogram, 
updatedTime, partitionInfo); + original.partitionIdToColStats.putAll(partitionIdToColStats); } - return new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls, - dataSize, minValue, maxValue, selectivity, minExpr, maxExpr, isUnknown, histogram, updatedTime); + ColumnStatistic colStats = new ColumnStatistic(count, ndv, original, avgSizeByte, numNulls, + dataSize, minValue, maxValue, minExpr, maxExpr, + isUnknown, histogram, updatedTime, partitionInfo); + colStats.partitionIdToColStats.putAll(partitionIdToColStats); + return colStats; } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java index d94a90b75f0c55a..281a0e8250206ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatisticsCacheLoader.java @@ -19,7 +19,8 @@ import org.apache.doris.catalog.Env; import org.apache.doris.catalog.TableIf; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.common.ThreadPoolManager; +import org.apache.doris.qe.InternalQueryExecutionException; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.logging.log4j.LogManager; @@ -27,16 +28,23 @@ import java.util.List; import java.util.Optional; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.ThreadPoolExecutor.DiscardOldestPolicy; public class ColumnStatisticsCacheLoader extends StatisticsCacheLoader> { private static final Logger LOG = LogManager.getLogger(ColumnStatisticsCacheLoader.class); + private static final ThreadPoolExecutor singleThreadPool = ThreadPoolManager.newDaemonFixedThreadPool( + StatisticConstants.RETRY_LOAD_THREAD_POOL_SIZE, + StatisticConstants.RETRY_LOAD_QUEUE_SIZE, "STATS_RELOAD", + true, + new DiscardOldestPolicy()); + @Override protected Optional 
doLoad(StatisticsCacheKey key) { // Load from statistics table. - Optional columnStatistic = loadFromStatsTable(key.tableId, - key.idxId, key.colName); + Optional columnStatistic = loadFromStatsTable(key); if (columnStatistic.isPresent()) { return columnStatistic; } @@ -52,8 +60,14 @@ protected Optional doLoad(StatisticsCacheKey key) { return columnStatistic; } - private Optional loadFromStatsTable(long tableId, long idxId, String colName) { - List columnResults = StatisticsRepository.loadColStats(tableId, idxId, colName); + private Optional loadFromStatsTable(StatisticsCacheKey key) { + List columnResults = null; + try { + columnResults = StatisticsRepository.loadColStats(key.tableId, key.idxId, key.colName); + } catch (InternalQueryExecutionException e) { + retryLoad(key); + return Optional.empty(); + } ColumnStatistic columnStatistics; try { columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults); @@ -67,4 +81,42 @@ private Optional loadFromStatsTable(long tableId, long idxId, S return Optional.of(columnStatistics); } } + + private void retryLoad(StatisticsCacheKey key) { + singleThreadPool.submit(new RetryTask(key, 1)); + } + + private static class RetryTask implements Runnable { + StatisticsCacheKey key; + int retryTimes; + + public RetryTask(StatisticsCacheKey key, int retryTimes) { + this.key = key; + this.retryTimes = retryTimes; + } + + @Override + public void run() { + List columnResults = null; + try { + columnResults = StatisticsRepository.loadColStats(key.tableId, key.idxId, key.colName); + } catch (InternalQueryExecutionException e) { + if (this.retryTimes < StatisticConstants.LOAD_RETRY_TIMES) { + retryTimes++; + singleThreadPool.submit(this); + } + return; + } + ColumnStatistic columnStatistics; + try { + columnStatistics = StatisticsUtil.deserializeToColumnStatistics(columnResults); + } catch (Exception e) { + LOG.warn("Exception to deserialize column statistics", e); + return; + } + if (columnStatistics != null) { + 
Env.getCurrentEnv().getStatisticsCache().putCache(key, columnStatistics); + } + } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index d569cd79bd4aa0f..a446cc2610a0385 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -17,29 +17,28 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.FeConstants; -import org.apache.doris.common.util.TimeUtils; import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.util.InternalQueryResult; import org.apache.doris.statistics.util.StatisticsUtil; -import org.apache.commons.lang3.StringUtils; +import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import java.time.Instant; -import java.time.LocalDateTime; -import java.time.ZoneId; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; public class HMSAnalysisTask extends BaseAnalysisTask { private static final Logger LOG = LogManager.getLogger(HMSAnalysisTask.class); @@ -49,7 +48,7 @@ public class HMSAnalysisTask extends BaseAnalysisTask { public static final String NUM_FILES = "numFiles"; public static final String TIMESTAMP = "transient_lastDdlTime"; - private static final String ANALYZE_SQL_TABLE_TEMPLATE = "INSERT INTO " + private static final String ANALYZE_TABLE_TEMPLATE = "INSERT INTO " + 
"${internalDB}.${columnStatTbl}" + " SELECT " + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " @@ -59,19 +58,17 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "${idxId} AS idx_id, " + "'${colId}' AS col_id, " + "NULL AS part_id, " - + "COUNT(1) AS row_count, " + + "${countExpr} AS row_count, " + "NDV(`${colName}`) AS ndv, " - + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "${nullCountExpr} AS null_count, " + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " + "${dataSizeFunction} AS data_size, " + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; - private static final String ANALYZE_SQL_PARTITION_TEMPLATE = "INSERT INTO " - + "${internalDB}.${columnStatTbl}" - + " SELECT " - + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, " + private static final String ANALYZE_PARTITION_TEMPLATE = " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + "${catalogId} AS catalog_id, " + "${dbId} AS db_id, " + "${tblId} AS tbl_id, " @@ -84,22 +81,22 @@ public class HMSAnalysisTask extends BaseAnalysisTask { + "MIN(`${colName}`) AS min, " + "MAX(`${colName}`) AS max, " + "${dataSizeFunction} AS data_size, " - + "NOW() " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + + "NOW() FROM `${catalogName}`.`${dbName}`.`${tblName}` where "; - private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT COUNT(1) as rowCount " - + "FROM `${catalogName}`.`${dbName}`.`${tblName}`"; + private static final String ANALYZE_TABLE_COUNT_TEMPLATE = "SELECT ${countExpr} as rowCount " + + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${sampleExpr}"; + + // cache stats for each partition, it would be inserted into column_statistics in a batch. 
+ private final List> buf = new ArrayList<>(); private final boolean isTableLevelTask; - private final boolean isSamplingPartition; private final boolean isPartitionOnly; - private final Set partitionNames; + private Set partitionNames; private HMSExternalTable table; public HMSAnalysisTask(AnalysisInfo info) { super(info); isTableLevelTask = info.externalTableLevelTask; - isSamplingPartition = info.samplingPartition; isPartitionOnly = info.partitionOnly; partitionNames = info.partitionNames; table = (HMSExternalTable) tbl; @@ -114,42 +111,17 @@ public void doExecute() throws Exception { } /** - * Get table row count and insert the result to __internal_schema.table_statistics + * Get table row count */ private void getTableStats() throws Exception { - // Get table level information. An example sql for table stats: - // INSERT INTO __internal_schema.table_statistics VALUES - // ('13055', 13002, 13038, 13055, -1, 'NULL', 5, 1686111064658, NOW()) - Map parameters = table.getRemoteTable().getParameters(); - if (isPartitionOnly) { - for (String partId : partitionNames) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_PARTITION_TEMPLATE); - sb.append(" where "); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String value = splits[i].split("=")[1]; - splits[i] = splits[i].replace(value, "\'" + value + "\'"); - } - sb.append(StringUtils.join(splits, " and ")); - Map params = buildTableStatsParams(partId); - setParameterData(parameters, params); - List columnResult = - StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) - .replace(sb.toString())); - String rowCount = columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); - } - } else { - Map params = buildTableStatsParams(null); - List columnResult = - StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) - .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); - String rowCount = 
columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); - } + Map params = buildTableStatsParams(null); + List columnResult = + StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) + .replace(ANALYZE_TABLE_COUNT_TEMPLATE)); + String rowCount = columnResult.get(0).get(0); + Env.getCurrentEnv().getAnalysisManager() + .updateTableStatsStatus( + new TableStatsMeta(table.getId(), Long.parseLong(rowCount), info)); } /** @@ -173,67 +145,102 @@ private void getTableColumnStats() throws Exception { // 0 AS data_size, // NOW() FROM `hive`.`tpch100`.`region` if (isPartitionOnly) { - for (String partId : partitionNames) { - StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - sb.append(" where "); - String[] splits = partId.split("/"); - for (int i = 0; i < splits.length; i++) { - String value = splits[i].split("=")[1]; - splits[i] = splits[i].replace(value, "\'" + value + "\'"); - } - sb.append(StringUtils.join(splits, " and ")); - Map params = buildTableStatsParams(partId); - params.put("internalDB", FeConstants.INTERNAL_DB_NAME); - params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); - params.put("colName", col.getName()); - params.put("colId", info.colName); - params.put("dataSizeFunction", getDataSizeFunction(col)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(sb.toString()); - executeInsertSql(sql); + getPartitionNames(); + List partitionAnalysisSQLs = new ArrayList<>(); + for (String partId : this.partitionNames) { + partitionAnalysisSQLs.add(generateSqlForPartition(partId)); } + execSQLs(partitionAnalysisSQLs); } else { StringBuilder sb = new StringBuilder(); - sb.append(ANALYZE_SQL_TABLE_TEMPLATE); - if (isSamplingPartition) { - sb.append(" where 1=1 "); - String[] splitExample = partitionNames.stream().findFirst().get().split("/"); - int parts = splitExample.length; - 
List partNames = new ArrayList<>(); - for (String split : splitExample) { - partNames.add(split.split("=")[0]); - } - List> valueLists = new ArrayList<>(); - for (int i = 0; i < parts; i++) { - valueLists.add(new ArrayList<>()); - } - for (String partId : partitionNames) { - String[] partIds = partId.split("/"); - for (int i = 0; i < partIds.length; i++) { - valueLists.get(i).add("\'" + partIds[i].split("=")[1] + "\'"); - } - } - for (int i = 0; i < parts; i++) { - sb.append(" and "); - sb.append(partNames.get(i)); - sb.append(" in ("); - sb.append(StringUtils.join(valueLists.get(i), ",")); - sb.append(") "); - } - } + sb.append(ANALYZE_TABLE_TEMPLATE); Map params = buildTableStatsParams("NULL"); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); params.put("colName", col.getName()); params.put("colId", info.colName); params.put("dataSizeFunction", getDataSizeFunction(col)); + params.put("nullCountExpr", getNullCountExpression()); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String sql = stringSubstitutor.replace(sb.toString()); executeInsertSql(sql); } } + private void getPartitionNames() { + if (partitionNames == null) { + if (info.isAllPartition) { + partitionNames = table.getPartitionNames(); + } else if (info.partitionCount > 0) { + partitionNames = table.getPartitionNames().stream() + .limit(info.partitionCount).collect(Collectors.toSet()); + } + if (partitionNames == null || partitionNames.isEmpty()) { + throw new RuntimeException("Not a partition table or no partition specified."); + } + } + } + + private String generateSqlForPartition(String partId) { + StringBuilder sb = new StringBuilder(); + sb.append(ANALYZE_PARTITION_TEMPLATE); + String[] splits = partId.split("/"); + for (int i = 0; i < splits.length; i++) { + String[] kv = splits[i].split("="); + sb.append(kv[0]); + sb.append("='"); + sb.append(kv[1]); + sb.append("'"); + if (i < splits.length - 1) { + 
sb.append(" and "); + } + } + Map params = buildTableStatsParams(partId); + params.put("internalDB", FeConstants.INTERNAL_DB_NAME); + params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); + params.put("colName", col.getName()); + params.put("colId", info.colName); + params.put("dataSizeFunction", getDataSizeFunction(col)); + return new StringSubstitutor(params).replace(sb.toString()); + } + + public void execSQLs(List partitionAnalysisSQLs) throws Exception { + long startTime = System.currentTimeMillis(); + LOG.debug("analyze task {} start at {}", info.toString(), new Date()); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { + List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); + for (List group : sqlGroups) { + if (killed) { + return; + } + StringJoiner partitionCollectSQL = new StringJoiner(" UNION ALL "); + group.forEach(partitionCollectSQL::add); + stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); + buf.add(stmtExecutor.executeInternalQuery() + .stream().map(ColStatsData::new).collect(Collectors.toList())); + QueryState queryState = r.connectContext.getState(); + if (queryState.getStateType().equals(QueryState.MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, partitionCollectSQL, + queryState.getErrorMessage())); + } + } + for (List colStatsDataList : buf) { + StringBuilder batchInsertSQL = + new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES "); + StringJoiner sj = new StringJoiner(","); + colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); + batchInsertSQL.append(sj); + stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); + executeWithExceptionOnFail(stmtExecutor); + } + } finally { + LOG.debug("analyze task {} end. 
cost {}ms", info, System.currentTimeMillis() - startTime); + } + + } + private void executeInsertSql(String sql) throws Exception { long startTime = System.currentTimeMillis(); try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { @@ -270,6 +277,8 @@ private Map buildTableStatsParams(String partId) { commonParams.put("catalogName", catalog.getName()); commonParams.put("dbName", db.getFullName()); commonParams.put("tblName", tbl.getName()); + commonParams.put("sampleExpr", getSampleExpression()); + commonParams.put("countExpr", getCountExpression()); if (col != null) { commonParams.put("type", col.getType().toString()); } @@ -277,28 +286,39 @@ private Map buildTableStatsParams(String partId) { return commonParams; } - private void setParameterData(Map parameters, Map params) { - String numRows = ""; - String timestamp = ""; - if (parameters.containsKey(NUM_ROWS)) { - numRows = parameters.get(NUM_ROWS); + protected String getCountExpression() { + if (info.samplePercent > 0) { + return String.format("ROUND(COUNT(1) * 100 / %d)", info.samplePercent); + } else { + return "COUNT(1)"; } - if (parameters.containsKey(TIMESTAMP)) { - timestamp = parameters.get(TIMESTAMP); + } + + protected String getNullCountExpression() { + if (info.samplePercent > 0) { + return String.format("ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 100 / %d)", + info.samplePercent); + } else { + return "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END)"; + } + } + + protected String getDataSizeFunction(Column column) { + String originFunction = super.getDataSizeFunction(column); + if (info.samplePercent > 0 && !isPartitionOnly) { + return String.format("ROUND((%s) * 100 / %d)", originFunction, info.samplePercent); + } else { + return originFunction; } - params.put("numRows", numRows); - params.put("rowCount", numRows); - params.put("update_time", TimeUtils.DATETIME_FORMAT.format( - LocalDateTime.ofInstant(Instant.ofEpochMilli(Long.parseLong(timestamp) * 1000), - 
ZoneId.systemDefault()))); } @Override protected void afterExecution() { - if (isTableLevelTask) { - Env.getCurrentEnv().getStatisticsCache().refreshTableStatsSync(catalog.getId(), db.getId(), tbl.getId()); - } else { - Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); + // Table level task doesn't need to sync any value to sync stats, it stores the value in metadata. + // Partition only task doesn't need to refresh cached. + if (isTableLevelTask || isPartitionOnly) { + return; } + Env.getCurrentEnv().getStatisticsCache().syncLoadColStats(tbl.getId(), -1, col.getName()); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java new file mode 100644 index 000000000000000..85f2fe45bd5761d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistData.java @@ -0,0 +1,36 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +public class HistData { + + public final StatsId statsId; + + public final double sampleRate; + + public final String buckets; + + public final String updateTime; + + public HistData(ResultRow row) { + this.statsId = new StatsId(row); + this.sampleRate = Double.parseDouble(row.get(7)); + this.buckets = row.get(8); + this.updateTime = row.get(9); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java index 05e2c199ed9083a..2068c368c40f52e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Histogram.java @@ -20,7 +20,6 @@ import org.apache.doris.catalog.Column; import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.catalog.Type; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.base.Strings; @@ -62,13 +61,12 @@ public Histogram(Type dataType, double sampleRate, int numBuckets, List public static Histogram fromResultRow(ResultRow resultRow) { try { HistogramBuilder histogramBuilder = new HistogramBuilder(); - - long catalogId = Long.parseLong(resultRow.getColumnValue("catalog_id")); - long idxId = Long.parseLong(resultRow.getColumnValue("idx_id")); - long dbId = Long.parseLong(resultRow.getColumnValue("db_id")); - long tblId = Long.parseLong(resultRow.getColumnValue("tbl_id")); - - String colName = resultRow.getColumnValue("col_id"); + HistData histData = new HistData(resultRow); + long catalogId = histData.statsId.catalogId; + long idxId = histData.statsId.idxId; + long dbId = histData.statsId.dbId; + long tblId = histData.statsId.tblId; + String colName = histData.statsId.colId; Column col = StatisticsUtil.findColumn(catalogId, dbId, tblId, idxId, colName); if (col == null) { LOG.warn("Failed to deserialize histogram 
statistics, ctlId: {} dbId: {}" @@ -79,10 +77,10 @@ public static Histogram fromResultRow(ResultRow resultRow) { Type dataType = col.getType(); histogramBuilder.setDataType(dataType); - double sampleRate = Double.parseDouble(resultRow.getColumnValue("sample_rate")); + double sampleRate = histData.sampleRate; histogramBuilder.setSampleRate(sampleRate); - String json = resultRow.getColumnValue("buckets"); + String json = histData.buckets; JsonObject jsonObj = JsonParser.parseString(json).getAsJsonObject(); int bucketNum = jsonObj.get("num_buckets").getAsInt(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java index 0e0752409231d6d..d9928f2a6392617 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HistogramCacheLoader.java @@ -18,7 +18,6 @@ package org.apache.doris.statistics; import org.apache.doris.common.FeConstants; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.commons.collections.CollectionUtils; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java index 25b9db0b2c7bdef..58be1510b44b10c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/JdbcAnalysisTask.java @@ -23,7 +23,6 @@ import org.apache.doris.qe.AutoCloseConnectContext; import org.apache.doris.qe.QueryState; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.util.InternalQueryResult; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.commons.text.StringSubstitutor; @@ -81,11 +80,11 @@ public void doExecute() throws Exception { */ 
private void getTableStats() throws Exception { Map params = buildTableStatsParams(null); - List columnResult = + List columnResult = StatisticsUtil.execStatisticQuery(new StringSubstitutor(params).replace(ANALYZE_TABLE_COUNT_TEMPLATE)); - String rowCount = columnResult.get(0).getColumnValue("rowCount"); - params.put("rowCount", rowCount); - StatisticsRepository.persistTableStats(params); + String rowCount = columnResult.get(0).get(0); + Env.getCurrentEnv().getAnalysisManager() + .updateTableStatsStatus(new TableStatsMeta(table.getId(), Long.parseLong(rowCount), info)); } /** diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java new file mode 100644 index 000000000000000..877a4f5bd093646 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisJob.java @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import java.util.List; + +public class OlapAnalysisJob { + + + + private List columns; + + private static String collectPartionStatsSQLTemplate = + " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "${partId} AS part_id, " + + "COUNT(1) AS row_count, " + + "NDV(`${colName}`) AS ndv, " + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "MIN(`${colName}`) AS min, " + + "MAX(`${colName}`) AS max, " + + "${dataSizeFunction} AS data_size, " + + "NOW() "; + + + protected void beforeExecution() { + } + + public void execute() { + } + + protected void afterExecution() { + + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 257708de54f78bd..180ac9d9839c5e0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -23,37 +23,66 @@ import org.apache.doris.qe.QueryState; import org.apache.doris.qe.QueryState.MysqlStateType; import org.apache.doris.qe.StmtExecutor; +import org.apache.doris.statistics.AnalysisInfo.JobType; import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import org.apache.commons.text.StringSubstitutor; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.StringJoiner; +import java.util.stream.Collectors; /** * Each task analyze one column. 
*/ public class OlapAnalysisTask extends BaseAnalysisTask { - private static final String ANALYZE_PARTITION_SQL_TEMPLATE = INSERT_PART_STATISTICS - + "FROM `${dbName}`.`${tblName}` " - + "PARTITION ${partName} ${sampleExpr}"; - // TODO Currently, NDV is computed for the full table; in fact, // NDV should only be computed for the relevant partition. private static final String ANALYZE_COLUMN_SQL_TEMPLATE = INSERT_COL_STATISTICS + " (SELECT NDV(`${colName}`) AS ndv " + " FROM `${dbName}`.`${tblName}` ${sampleExpr}) t2\n"; + private static final String collectPartitionStatsSQLTemplate = + " SELECT " + + "CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}', '-', ${partId}) AS id, " + + "${catalogId} AS catalog_id, " + + "${dbId} AS db_id, " + + "${tblId} AS tbl_id, " + + "${idxId} AS idx_id, " + + "'${colId}' AS col_id, " + + "${partId} AS part_id, " + + "COUNT(1) AS row_count, " + + "NDV(`${colName}`) AS ndv, " + + "SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) AS null_count, " + + "MIN(`${colName}`) AS min, " + + "MAX(`${colName}`) AS max, " + + "${dataSizeFunction} AS data_size, " + + "NOW() FROM `${dbName}`.`${tblName}` PARTITION ${partitionName} ${sampleExpr}"; + + // cache stats for each partition, it would be inserted into column_statistics in a batch. 
+ private final List> buf = new ArrayList<>(); + + @VisibleForTesting + public OlapAnalysisTask() { + } + public OlapAnalysisTask(AnalysisInfo info) { super(info); } public void doExecute() throws Exception { + Set partitionNames = info.colToPartitions.get(info.colName); + if (partitionNames.isEmpty()) { + return; + } Map params = new HashMap<>(); params.put("internalDB", FeConstants.INTERNAL_DB_NAME); params.put("columnStatTbl", StatisticConstants.STATISTIC_TBL_NAME); @@ -70,56 +99,65 @@ public void doExecute() throws Exception { List partitionAnalysisSQLs = new ArrayList<>(); try { tbl.readLock(); - Set partNames = info.colToPartitions.get(info.colName); - for (String partName : partNames) { - Partition part = tbl.getPartition(partName); + + for (String partitionName : partitionNames) { + Partition part = tbl.getPartition(partitionName); if (part == null) { continue; } - params.put("partId", String.valueOf(tbl.getPartition(partName).getId())); + params.put("partId", String.valueOf(tbl.getPartition(partitionName).getId())); // Avoid error when get the default partition - params.put("partName", "`" + partName + "`"); + params.put("partitionName", "`" + partitionName + "`"); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - partitionAnalysisSQLs.add(stringSubstitutor.replace(ANALYZE_PARTITION_SQL_TEMPLATE)); + partitionAnalysisSQLs.add(stringSubstitutor.replace(collectPartitionStatsSQLTemplate)); } } finally { tbl.readUnlock(); } - execSQLs(partitionAnalysisSQLs); - params.remove("partId"); - params.put("type", col.getType().toString()); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE); - execSQL(sql); - } - - @VisibleForTesting - public void execSQLs(List partitionAnalysisSQLs) throws Exception { - for (String sql : partitionAnalysisSQLs) { - execSQL(sql); - } + execSQLs(partitionAnalysisSQLs, params); } @VisibleForTesting - public void execSQL(String 
sql) throws Exception { - if (killed) { - return; - } + public void execSQLs(List partitionAnalysisSQLs, Map params) throws Exception { long startTime = System.currentTimeMillis(); - LOG.info("ANALYZE SQL : " + sql + " start at " + startTime); - try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext()) { - r.connectContext.getSessionVariable().disableNereidsPlannerOnce(); - stmtExecutor = new StmtExecutor(r.connectContext, sql); - r.connectContext.setExecutor(stmtExecutor); - stmtExecutor.execute(); - QueryState queryState = r.connectContext.getState(); - if (queryState.getStateType().equals(MysqlStateType.ERR)) { - throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", - info.catalogName, info.dbName, info.colName, sql, queryState.getErrorMessage())); + LOG.debug("analyze task {} start at {}", info.toString(), new Date()); + try (AutoCloseConnectContext r = StatisticsUtil.buildConnectContext(info.jobType.equals(JobType.SYSTEM))) { + List> sqlGroups = Lists.partition(partitionAnalysisSQLs, StatisticConstants.UNION_ALL_LIMIT); + for (List group : sqlGroups) { + if (killed) { + return; + } + StringJoiner partitionCollectSQL = new StringJoiner("UNION ALL"); + group.forEach(partitionCollectSQL::add); + stmtExecutor = new StmtExecutor(r.connectContext, partitionCollectSQL.toString()); + buf.add(stmtExecutor.executeInternalQuery() + .stream().map(ColStatsData::new).collect(Collectors.toList())); + QueryState queryState = r.connectContext.getState(); + if (queryState.getStateType().equals(MysqlStateType.ERR)) { + throw new RuntimeException(String.format("Failed to analyze %s.%s.%s, error: %s sql: %s", + info.catalogName, info.dbName, info.colName, partitionCollectSQL, + queryState.getErrorMessage())); + } + } + for (List colStatsDataList : buf) { + StringBuilder batchInsertSQL = + new StringBuilder("INSERT INTO " + StatisticConstants.FULL_QUALIFIED_STATS_TBL_NAME + + " VALUES "); + StringJoiner sj = new StringJoiner(","); + 
colStatsDataList.forEach(c -> sj.add(c.toSQL(true))); + batchInsertSQL.append(sj.toString()); + stmtExecutor = new StmtExecutor(r.connectContext, batchInsertSQL.toString()); + executeWithExceptionOnFail(stmtExecutor); } + params.put("type", col.getType().toString()); + StringSubstitutor stringSubstitutor = new StringSubstitutor(params); + String sql = stringSubstitutor.replace(ANALYZE_COLUMN_SQL_TEMPLATE); + stmtExecutor = new StmtExecutor(r.connectContext, sql); + executeWithExceptionOnFail(stmtExecutor); } finally { - LOG.info("Analyze SQL: " + sql + " cost time: " + (System.currentTimeMillis() - startTime) + "ms"); + LOG.debug("analyze task {} end. cost {}ms", info, + System.currentTimeMillis() - startTime); } - } + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java new file mode 100644 index 000000000000000..9945175a228a932 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ResultRow.java @@ -0,0 +1,59 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import com.google.gson.annotations.SerializedName; + +import java.util.Collections; +import java.util.List; +import java.util.StringJoiner; + +public class ResultRow { + @SerializedName("values") + private final List values; + + public ResultRow(List values) { + this.values = values; + } + + public List getValues() { + return values != null ? values : Collections.emptyList(); + } + + @Override + public String toString() { + StringJoiner sj = new StringJoiner(",", "ResultRow:{", "}"); + for (String val : values) { + sj.add(val); + } + return sj.toString(); + } + + public String get(int idx) { + return values.get(idx); + } + + /** + * If analyze an empty table, some stats would be null, return a default value + * to avoid npe would deserialize it. + */ + public String getWithDefault(int idx, String defaultVal) { + String val = values.get(idx); + return val == null ? defaultVal : val; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java index a2194834030b5d0..e6b8297d0c0b013 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticConstants.java @@ -17,6 +17,9 @@ package org.apache.doris.statistics; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.TableIf; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.FeConstants; import org.apache.doris.system.SystemInfoService; @@ -26,22 +29,16 @@ import java.util.concurrent.TimeUnit; public class StatisticConstants { - public static final String ANALYSIS_TBL_NAME = "table_statistics"; public static final String STATISTIC_TBL_NAME = "column_statistics"; - public static final String HISTOGRAM_TBL_NAME = "histogram_statistics"; public static final int MAX_NAME_LEN = 64; 
public static final int ID_LEN = 4096; - public static final int STATISTICS_CACHE_VALID_DURATION_IN_HOURS = 24 * 2; - public static final int STATISTICS_CACHE_REFRESH_INTERVAL = 24 * 2; - public static final int ROW_COUNT_CACHE_VALID_DURATION_IN_HOURS = 12; - /** * Bucket count fot column_statistics and analysis_job table. */ @@ -63,26 +60,51 @@ public class StatisticConstants { public static final int HISTOGRAM_MAX_BUCKET_NUM = 128; - /** - * The health of the table indicates the health of the table statistics, rang in [0, 100]. - * Below this threshold will automatically re-collect statistics. TODO make it in fe.conf - */ - public static final int TABLE_STATS_HEALTH_THRESHOLD = 80; - public static final int ANALYZE_MANAGER_INTERVAL_IN_SECS = 60; - public static List STATISTICS_DB_BLACK_LIST = new ArrayList<>(); + public static List SYSTEM_DBS = new ArrayList<>(); + + public static int ANALYZE_TASK_RETRY_TIMES = 5; public static final String DB_NAME = SystemInfoService.DEFAULT_CLUSTER + ":" + FeConstants.INTERNAL_DB_NAME; + public static final String FULL_QUALIFIED_STATS_TBL_NAME = FeConstants.INTERNAL_DB_NAME + "." + STATISTIC_TBL_NAME; + public static final int STATISTIC_INTERNAL_TABLE_REPLICA_NUM = 3; - public static int ANALYZE_TASK_RETRY_TIMES = 3; + public static final int RETRY_LOAD_QUEUE_SIZE = 1000; + + public static final int RETRY_LOAD_THREAD_POOL_SIZE = 1; + + public static final int LOAD_RETRY_TIMES = 3; + + // union more relation than 512 may cause StackOverFlowException in the future. 
+ public static final int UNION_ALL_LIMIT = 512; + + public static final String FULL_AUTO_ANALYZE_START_TIME = "00:00:00"; + public static final String FULL_AUTO_ANALYZE_END_TIME = "23:59:59"; static { - STATISTICS_DB_BLACK_LIST.add(SystemInfoService.DEFAULT_CLUSTER + SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + FeConstants.INTERNAL_DB_NAME); - STATISTICS_DB_BLACK_LIST.add(SystemInfoService.DEFAULT_CLUSTER + SYSTEM_DBS.add(SystemInfoService.DEFAULT_CLUSTER + ClusterNamespace.CLUSTER_DELIMITER + "information_schema"); } + + public static boolean isSystemTable(TableIf tableIf) { + if (tableIf instanceof OlapTable) { + OlapTable olapTable = (OlapTable) tableIf; + if (StatisticConstants.SYSTEM_DBS.contains(olapTable.getQualifiedDbName())) { + return true; + } + } + return false; + } + + public static boolean shouldIgnoreCol(TableIf tableIf, Column c) { + if (isSystemTable(tableIf)) { + return true; + } + return !c.isVisible(); + } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java index d3bdbdabd0b0e5d..74b77c2ee7c91ff 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticRange.java @@ -17,6 +17,8 @@ package org.apache.doris.statistics; +import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.common.Pair; import org.apache.doris.nereids.types.DataType; import java.util.Objects; @@ -29,20 +31,47 @@ public class StatisticRange { * {@code NaN} represents empty range ({@code high} must be {@code NaN} too) */ private final double low; + + private final LiteralExpr lowExpr; /** * {@code NaN} represents empty range ({@code low} must be {@code NaN} too) */ private final double high; + private final LiteralExpr highExpr; + private final double distinctValues; private final DataType dataType; - public 
StatisticRange(double low, double high, double distinctValues, DataType dataType) { + private final boolean isEmpty; + + public StatisticRange(double low, LiteralExpr lowExpr, double high, LiteralExpr highExpr, + double distinctValues, DataType dataType) { + this(low, lowExpr, high, highExpr, distinctValues, dataType, false); + } + + private StatisticRange(double low, LiteralExpr lowExpr, double high, LiteralExpr highExpr, + double distinctValues, DataType dataType, boolean isEmpty) { this.low = low; + this.lowExpr = lowExpr; this.high = high; + this.highExpr = highExpr; this.distinctValues = distinctValues; this.dataType = dataType; + this.isEmpty = isEmpty; + } + + public LiteralExpr getLowExpr() { + return lowExpr; + } + + public LiteralExpr getHighExpr() { + return highExpr; + } + + public DataType getDataType() { + return dataType; } public double overlapPercentWith(StatisticRange other) { @@ -79,19 +108,29 @@ public double overlapPercentWith(StatisticRange other) { } public static StatisticRange empty(DataType dataType) { - return new StatisticRange(Double.NaN, Double.NaN, 0, dataType); + return new StatisticRange(Double.NEGATIVE_INFINITY, null, Double.POSITIVE_INFINITY, + null, 0, dataType, true); } public boolean isEmpty() { - return Double.isNaN(low) && Double.isNaN(high); + return isEmpty; } public boolean isBothInfinite() { return Double.isInfinite(low) && Double.isInfinite(high); } - public static StatisticRange from(ColumnStatistic column, DataType dataType) { - return new StatisticRange(column.minValue, column.maxValue, column.ndv, dataType); + public boolean isInfinite() { + return Double.isInfinite(low) || Double.isInfinite(high); + } + + public boolean isFinite() { + return Double.isFinite(low) && Double.isFinite(high); + } + + public static StatisticRange from(ColumnStatistic colStats, DataType dataType) { + return new StatisticRange(colStats.minValue, colStats.minExpr, colStats.maxValue, colStats.maxExpr, + colStats.ndv, dataType); } public 
double getLow() { @@ -107,22 +146,49 @@ public double length() { } public StatisticRange intersect(StatisticRange other) { - double newLow = Math.max(low, other.low); - double newHigh = Math.min(high, other.high); + Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); + double newLow = biggerLow.first; + LiteralExpr newLowExpr = biggerLow.second; + + Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); + double newHigh = smallerHigh.first; + LiteralExpr newHighExpr = smallerHigh.second; if (newLow <= newHigh) { - return new StatisticRange(newLow, newHigh, overlappingDistinctValues(other), dataType); + return new StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, + overlappingDistinctValues(other), dataType); } return empty(dataType); } + public Pair minPair(double r1, LiteralExpr e1, double r2, LiteralExpr e2) { + if (r1 < r2) { + return Pair.of(r1, e1); + } + return Pair.of(r2, e2); + } + + public Pair maxPair(double r1, LiteralExpr e1, double r2, LiteralExpr e2) { + if (r1 > r2) { + return Pair.of(r1, e1); + } + return Pair.of(r2, e2); + } + public StatisticRange cover(StatisticRange other) { - double newLow = Math.max(low, other.low); - double newHigh = Math.min(high, other.high); + // double newLow = Math.max(low, other.low); + // double newHigh = Math.min(high, other.high); + Pair biggerLow = maxPair(low, lowExpr, other.low, other.lowExpr); + double newLow = biggerLow.first; + LiteralExpr newLowExpr = biggerLow.second; + Pair smallerHigh = minPair(high, highExpr, other.high, other.highExpr); + double newHigh = smallerHigh.first; + LiteralExpr newHighExpr = smallerHigh.second; + if (newLow <= newHigh) { double overlapPercentOfLeft = overlapPercentWith(other); double overlapDistinctValuesLeft = overlapPercentOfLeft * distinctValues; double coveredDistinctValues = minExcludeNaN(distinctValues, overlapDistinctValuesLeft); - return new StatisticRange(newLow, newHigh, coveredDistinctValues, dataType); + return new 
StatisticRange(newLow, newLowExpr, newHigh, newHighExpr, coveredDistinctValues, dataType); } return empty(dataType); } @@ -135,7 +201,10 @@ public StatisticRange union(StatisticRange other) { double maxOverlapNDV = Math.max(overlapNDVThis, overlapNDVOther); double newNDV = maxOverlapNDV + ((1 - overlapPercentThis) * distinctValues) + ((1 - overlapPercentOther) * other.distinctValues); - return new StatisticRange(Math.min(low, other.low), Math.max(high, other.high), newNDV, dataType); + Pair smallerMin = minPair(low, lowExpr, other.low, other.lowExpr); + Pair biggerHigh = maxPair(high, highExpr, other.high, other.highExpr); + return new StatisticRange(smallerMin.first, smallerMin.second, + biggerHigh.first, biggerHigh.second, newNDV, dataType); } private double overlappingDistinctValues(StatisticRange other) { @@ -170,7 +239,4 @@ public double getDistinctValues() { return distinctValues; } - public static StatisticRange fromColumnStatistics(ColumnStatistic columnStatistic, DataType dataType) { - return new StatisticRange(columnStatistic.minValue, columnStatistic.maxValue, columnStatistic.ndv, dataType); - } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java index 67dd9bb05432bb6..7fe9b03cbcfa70a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticalType.java @@ -22,6 +22,7 @@ public enum StatisticalType { AGG_NODE, ANALYTIC_EVAL_NODE, ASSERT_NUM_ROWS_NODE, + CTE_SCAN_NODE, BROKER_SCAN_NODE, NESTED_LOOP_JOIN_NODE, EMPTY_SET_NODE, @@ -54,4 +55,5 @@ public enum StatisticalType { METADATA_SCAN_NODE, JDBC_SCAN_NODE, TEST_EXTERNAL_TABLE, + GROUP_COMMIT_SCAN_NODE } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java index 5c628aaba302ca3..77c221f5931b381 100644 
--- a/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/Statistics.java @@ -17,17 +17,18 @@ package org.apache.doris.statistics; -import org.apache.doris.nereids.stats.ExpressionEstimation; import org.apache.doris.nereids.stats.StatsMathUtil; import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.Slot; import java.text.DecimalFormat; import java.util.HashMap; import java.util.Map; import java.util.Map.Entry; +import java.util.Set; public class Statistics { - private static int K_BYTES = 1024; + private static final int K_BYTES = 1024; private final double rowCount; @@ -36,33 +37,10 @@ public class Statistics { // the byte size of one tuple private double tupleSize; - @Deprecated - private double width; - - @Deprecated - private double penalty; - - /** - * after filter, compute the new ndv of a column - * @param ndv original ndv of column - * @param newRowCount the row count of table after filter - * @param oldRowCount the row count of table before filter - * @return the new ndv after filter - */ - public static double computeNdv(double ndv, double newRowCount, double oldRowCount) { - if (newRowCount > oldRowCount) { - return ndv; - } - double selectOneTuple = newRowCount / StatsMathUtil.nonZeroDivisor(oldRowCount); - double allTuplesOfSameDistinctValueNotSelected = Math.pow((1 - selectOneTuple), oldRowCount / ndv); - return Math.min(ndv * (1 - allTuplesOfSameDistinctValueNotSelected), newRowCount); - } - public Statistics(Statistics another) { this.rowCount = another.rowCount; this.expressionToColumnStats = new HashMap<>(another.expressionToColumnStats); - this.width = another.width; - this.penalty = another.penalty; + this.tupleSize = another.tupleSize; } public Statistics(double rowCount, Map expressionToColumnStats) { @@ -70,14 +48,6 @@ public Statistics(double rowCount, Map expressionTo this.expressionToColumnStats = 
expressionToColumnStats; } - public Statistics(double rowCount, Map expressionToColumnStats, double width, - double penalty) { - this.rowCount = rowCount; - this.expressionToColumnStats = expressionToColumnStats; - this.width = width; - this.penalty = penalty; - } - public ColumnStatistic findColumnStatistics(Expression expression) { return expressionToColumnStats.get(expression); } @@ -90,53 +60,46 @@ public double getRowCount() { return rowCount; } - /* - * Return a stats with new rowCount and fix each column stats. - */ public Statistics withRowCount(double rowCount) { - if (Double.isNaN(rowCount)) { - return this; - } - Statistics statistics = new Statistics(rowCount, new HashMap<>(expressionToColumnStats), width, penalty); - statistics.fix(rowCount, StatsMathUtil.nonZeroDivisor(this.rowCount)); - return statistics; + return new Statistics(rowCount, new HashMap<>(expressionToColumnStats)); } /** * Update by count. */ - public Statistics updateRowCountOnly(double rowCount) { + public Statistics withRowCountAndEnforceValid(double rowCount) { Statistics statistics = new Statistics(rowCount, expressionToColumnStats); - for (Entry entry : expressionToColumnStats.entrySet()) { - ColumnStatistic columnStatistic = entry.getValue(); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); - columnStatisticBuilder.setNdv(Math.min(columnStatistic.ndv, rowCount)); - double nullFactor = (rowCount - columnStatistic.numNulls) / rowCount; - columnStatisticBuilder.setNumNulls(nullFactor * rowCount); - columnStatisticBuilder.setCount(rowCount); - statistics.addColumnStats(entry.getKey(), columnStatisticBuilder.build()); - } + statistics.enforceValid(); return statistics; } - /** - * Fix by sel. 
- */ - public void fix(double newRowCount, double originRowCount) { - double sel = newRowCount / originRowCount; + public void enforceValid() { for (Entry entry : expressionToColumnStats.entrySet()) { ColumnStatistic columnStatistic = entry.getValue(); - ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); - columnStatisticBuilder.setNdv(computeNdv(columnStatistic.ndv, newRowCount, originRowCount)); - columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls * sel, newRowCount)); - columnStatisticBuilder.setCount(newRowCount); - expressionToColumnStats.put(entry.getKey(), columnStatisticBuilder.build()); + if (!checkColumnStatsValid(columnStatistic)) { + double ndv = Math.min(columnStatistic.ndv, rowCount); + ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(columnStatistic); + columnStatisticBuilder.setNdv(ndv); + columnStatisticBuilder.setNumNulls(Math.min(columnStatistic.numNulls, rowCount - ndv)); + columnStatisticBuilder.setCount(rowCount); + columnStatistic = columnStatisticBuilder.build(); + expressionToColumnStats.put(entry.getKey(), columnStatistic); + } } } + public boolean checkColumnStatsValid(ColumnStatistic columnStatistic) { + return columnStatistic.ndv <= rowCount + && columnStatistic.numNulls <= rowCount - columnStatistic.ndv; + } + public Statistics withSel(double sel) { sel = StatsMathUtil.minNonNaN(sel, 1); - return withRowCount(rowCount * sel); + if (Double.isNaN(rowCount)) { + return this; + } + double newCount = rowCount * sel; + return new Statistics(newCount, new HashMap<>(expressionToColumnStats)); } public Statistics addColumnStats(Expression expression, ColumnStatistic columnStatistic) { @@ -144,9 +107,10 @@ public Statistics addColumnStats(Expression expression, ColumnStatistic columnSt return this; } - public Statistics merge(Statistics statistics) { - expressionToColumnStats.putAll(statistics.expressionToColumnStats); - return this; + public boolean 
isInputSlotsUnknown(Set inputs) { + return inputs.stream() + .allMatch(s -> expressionToColumnStats.containsKey(s) + && expressionToColumnStats.get(s).isUnKnown); } private double computeTupleSize() { @@ -183,53 +147,20 @@ public String toString() { return format.format(rowCount); } - public void setWidth(double width) { - this.width = width; - } - - public void setPenalty(double penalty) { - this.penalty = penalty; - } - - public double getWidth() { - return width; - } - - public double getPenalty() { - return penalty; - } - public int getBENumber() { return 1; } public static Statistics zero(Statistics statistics) { Statistics zero = new Statistics(0, new HashMap<>()); - for (Map.Entry entry : statistics.expressionToColumnStats.entrySet()) { + for (Entry entry : statistics.expressionToColumnStats.entrySet()) { zero.addColumnStats(entry.getKey(), ColumnStatistic.ZERO); } return zero; } - public boolean almostUniqueExpression(Expression expr) { - ExpressionEstimation estimator = new ExpressionEstimation(); - double ndvErrorThreshold = 0.9; - ColumnStatistic colStats = expr.accept(estimator, this); - if (colStats.ndv > colStats.count * ndvErrorThreshold) { - return true; - } - return false; - } - - public boolean isStatsUnknown(Expression expr) { - ExpressionEstimation estimator = new ExpressionEstimation(); - ColumnStatistic colStats = expr.accept(estimator, this); - return colStats.isUnKnown; - } - /** * merge this and other colStats.ndv, choose min - * @param other */ public void updateNdv(Statistics other) { for (Expression expr : expressionToColumnStats.keySet()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java deleted file mode 100644 index aae783ca8b278b7..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoAnalyzer.java +++ /dev/null @@ -1,215 +0,0 @@ -// Licensed to the Apache Software 
Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.analysis.DdlStmt; -import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.Partition; -import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.Config; -import org.apache.doris.common.DdlException; -import org.apache.doris.common.util.MasterDaemon; -import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.StatisticsUtil; - -import com.google.common.collect.Maps; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.util.Collection; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - -public class StatisticsAutoAnalyzer extends MasterDaemon { - - private static final Logger LOG = LogManager.getLogger(StatisticsAutoAnalyzer.class); - - public StatisticsAutoAnalyzer() { - super("Automatic Analyzer", TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes)); - } - - @Override - protected void runAfterCatalogReady() { - if (!Env.getCurrentEnv().isMaster()) { 
- return; - } - if (!StatisticsUtil.statsTblAvailable()) { - return; - } - if (Config.enable_auto_collect_statistics) { - analyzePeriodically(); - analyzeAutomatically(); - } - } - - public void autoAnalyzeStats(DdlStmt ddlStmt) { - // TODO Monitor some DDL statements, and then trigger automatic analysis tasks - } - - private void analyzePeriodically() { - try { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - List jobInfos = analysisManager.findPeriodicJobs(); - for (AnalysisInfo jobInfo : jobInfos) { - jobInfo = new AnalysisInfoBuilder(jobInfo).setJobType(JobType.SYSTEM).build(); - analysisManager.createAnalysisJob(jobInfo); - } - } catch (DdlException e) { - LOG.warn("Failed to periodically analyze the statistics." + e); - } - } - - private void analyzeAutomatically() { - AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); - List jobInfos = analysisManager.findAutomaticAnalysisJobs(); - for (AnalysisInfo jobInfo : jobInfos) { - AnalysisInfo checkedJobInfo = null; - try { - checkedJobInfo = checkAutomaticJobInfo(jobInfo); - if (checkedJobInfo != null) { - analysisManager.createAnalysisJob(checkedJobInfo); - } - } catch (Throwable t) { - LOG.warn("Failed to create analyze job: {}", checkedJobInfo); - } - - } - } - - /** - * Check if automatic analysis of statistics is required. - *

- * Step1: check the health of the table, if the health is good, - * there is no need to re-analyze, or check partition - *

- * Step2: check the partition update time, if the partition is not updated - * after the statistics is analyzed, there is no need to re-analyze - *

- * Step3: if the partition is updated after the statistics is analyzed, - * check the health of the partition, if the health is good, there is no need to re-analyze - * - Step3.1: check the analyzed partition statistics - * - Step3.2: Check for new partitions for which statistics were not analyzed - *

- * TODO new columns is not currently supported to analyze automatically - * - * @param jobInfo analysis job info - * @return new job info after check - * @throws Throwable failed to check - */ - private AnalysisInfo checkAutomaticJobInfo(AnalysisInfo jobInfo) throws Throwable { - long lastExecTimeInMs = jobInfo.lastExecTimeInMs; - TableIf table = StatisticsUtil - .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); - TableStatistic tblStats = StatisticsRepository.fetchTableLevelStats(table.getId()); - - if (tblStats == TableStatistic.UNKNOWN) { - LOG.warn("Failed to automatically analyze statistics, " - + "no corresponding table statistics for job: {}", jobInfo.toString()); - throw new DdlException("No corresponding table statistics for automatic job."); - } - - if (!needReanalyzeTable(table, tblStats)) { - return null; - } - - Set needRunPartitions = new HashSet<>(); - Set statsPartitions = jobInfo.colToPartitions.values() - .stream() - .flatMap(Collection::stream) - .collect(Collectors.toSet()); - - checkAnalyzedPartitions(table, statsPartitions, needRunPartitions, lastExecTimeInMs); - checkNewPartitions(table, needRunPartitions, lastExecTimeInMs); - - if (needRunPartitions.isEmpty()) { - return null; - } - - return getAnalysisJobInfo(jobInfo, table, needRunPartitions); - } - - private boolean needReanalyzeTable(TableIf table, TableStatistic tblStats) { - long rowCount = table.getRowCount(); - long updateRows = Math.abs(rowCount - tblStats.rowCount); - int tblHealth = StatisticsUtil.getTableHealth(rowCount, updateRows); - return tblHealth < StatisticConstants.TABLE_STATS_HEALTH_THRESHOLD; - } - - private void checkAnalyzedPartitions(TableIf table, Set statsPartitions, - Set needRunPartitions, long lastExecTimeInMs) throws DdlException { - for (String statsPartition : statsPartitions) { - Partition partition = table.getPartition(statsPartition); - if (partition == null) { - // Partition that has been deleted also need to - // be reanalyzed (delete 
partition statistics later) - needRunPartitions.add(statsPartition); - continue; - } - TableStatistic partitionStats = StatisticsRepository - .fetchTableLevelOfPartStats(partition.getId()); - if (partitionStats == TableStatistic.UNKNOWN) { - continue; - } - if (needReanalyzePartition(lastExecTimeInMs, partition, partitionStats)) { - needRunPartitions.add(partition.getName()); - } - } - } - - private boolean needReanalyzePartition(long lastExecTimeInMs, Partition partition, TableStatistic partStats) { - long partUpdateTime = partition.getVisibleVersionTime(); - if (partUpdateTime < lastExecTimeInMs) { - return false; - } - long pRowCount = partition.getBaseIndex().getRowCount(); - long pUpdateRows = Math.abs(pRowCount - partStats.rowCount); - int partHealth = StatisticsUtil.getTableHealth(pRowCount, pUpdateRows); - return partHealth < StatisticConstants.TABLE_STATS_HEALTH_THRESHOLD; - } - - private void checkNewPartitions(TableIf table, Set needRunPartitions, long lastExecTimeInMs) { - Set partitionNames = table.getPartitionNames(); - partitionNames.removeAll(needRunPartitions); - needRunPartitions.addAll( - partitionNames.stream() - .map(table::getPartition) - .filter(partition -> partition.getVisibleVersionTime() >= lastExecTimeInMs) - .map(Partition::getName) - .collect(Collectors.toSet()) - ); - } - - private AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, - Set needRunPartitions) { - Map> newColToPartitions = Maps.newHashMap(); - Map> colToPartitions = jobInfo.colToPartitions; - colToPartitions.keySet().forEach(colName -> { - Column column = table.getColumn(colName); - if (column != null) { - newColToPartitions.put(colName, needRunPartitions); - } - }); - return new AnalysisInfoBuilder(jobInfo) - .setColToPartitions(newColToPartitions).build(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java new file mode 
100644 index 000000000000000..fe535b0fb4ab0ed --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsAutoCollector.java @@ -0,0 +1,196 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.analysis.TableName; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.DatabaseIf; +import org.apache.doris.catalog.Env; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.TableIf; +import org.apache.doris.catalog.external.ExternalTable; +import org.apache.doris.common.Config; +import org.apache.doris.common.util.TimeUtils; +import org.apache.doris.datasource.CatalogIf; +import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod; +import org.apache.doris.statistics.AnalysisInfo.JobType; +import org.apache.doris.statistics.AnalysisInfo.ScheduleType; +import org.apache.doris.statistics.util.StatisticsUtil; + +import com.google.common.collect.Maps; +import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.time.LocalTime; +import java.util.ArrayList; +import java.util.Collection; +import 
java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; + +public class StatisticsAutoCollector extends StatisticsCollector { + + private static final Logger LOG = LogManager.getLogger(StatisticsAutoCollector.class); + + public StatisticsAutoCollector() { + super("Automatic Analyzer", + TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes), + new AnalysisTaskExecutor(Config.full_auto_analyze_simultaneously_running_task_num)); + } + + @Override + protected void collect() { + if (!StatisticsUtil.inAnalyzeTime(LocalTime.now(TimeUtils.getTimeZone().toZoneId()))) { + analysisTaskExecutor.clear(); + return; + } + if (StatisticsUtil.enableAutoAnalyze()) { + analyzeAll(); + } + } + + @SuppressWarnings({"rawtypes", "unchecked"}) + private void analyzeAll() { + Set catalogs = Env.getCurrentEnv().getCatalogMgr().getCopyOfCatalog(); + for (CatalogIf ctl : catalogs) { + if (!ctl.enableAutoAnalyze()) { + continue; + } + Collection dbs = ctl.getAllDbs(); + for (DatabaseIf databaseIf : dbs) { + if (StatisticConstants.SYSTEM_DBS.contains(databaseIf.getFullName())) { + continue; + } + analyzeDb(databaseIf); + } + } + } + + public void analyzeDb(DatabaseIf databaseIf) { + List analysisInfos = constructAnalysisInfo(databaseIf); + for (AnalysisInfo analysisInfo : analysisInfos) { + analysisInfo = getReAnalyzeRequiredPart(analysisInfo); + if (analysisInfo == null) { + continue; + } + try { + createSystemAnalysisJob(analysisInfo); + } catch (Exception e) { + LOG.warn("Failed to create analysis job", e); + } + } + } + + protected List constructAnalysisInfo(DatabaseIf db) { + List analysisInfos = new ArrayList<>(); + for (TableIf table : db.getTables()) { + if (skip(table)) { + continue; + } + createAnalyzeJobForTbl(db, analysisInfos, table); + } + return analysisInfos; + } + + // return true if skip auto analyze this time. 
+ protected boolean skip(TableIf table) { + if (!(table instanceof OlapTable || table instanceof ExternalTable)) { + return true; + } + if (table.getDataSize(true) < Config.huge_table_lower_bound_size_in_bytes) { + return false; + } + TableStatsMeta tableStats = Env.getCurrentEnv().getAnalysisManager().findTableStatsStatus(table.getId()); + return System.currentTimeMillis() - tableStats.updatedTime < Config.huge_table_auto_analyze_interval_in_millis; + } + + protected void createAnalyzeJobForTbl(DatabaseIf db, + List analysisInfos, TableIf table) { + AnalysisMethod analysisMethod = table.getDataSize(true) > Config.huge_table_lower_bound_size_in_bytes + ? AnalysisMethod.SAMPLE : AnalysisMethod.FULL; + TableName tableName = new TableName(db.getCatalog().getName(), db.getFullName(), + table.getName()); + AnalysisInfo jobInfo = new AnalysisInfoBuilder() + .setJobId(Env.getCurrentEnv().getNextId()) + .setCatalogName(db.getCatalog().getName()) + .setDbName(db.getFullName()) + .setTblName(tableName.getTbl()) + .setColName( + table.getBaseSchema().stream().filter(c -> !StatisticsUtil.isUnsupportedType(c.getType())) + .map( + Column::getName).collect(Collectors.joining(",")) + ) + .setAnalysisType(AnalysisInfo.AnalysisType.FUNDAMENTALS) + .setAnalysisMode(AnalysisInfo.AnalysisMode.INCREMENTAL) + .setAnalysisMethod(analysisMethod) + .setSampleRows(Config.huge_table_default_sample_rows) + .setScheduleType(ScheduleType.AUTOMATIC) + .setState(AnalysisState.PENDING) + .setTaskIds(new ArrayList<>()) + .setLastExecTimeInMs(System.currentTimeMillis()) + .setJobType(JobType.SYSTEM).build(); + analysisInfos.add(jobInfo); + } + + @VisibleForTesting + protected AnalysisInfo getReAnalyzeRequiredPart(AnalysisInfo jobInfo) { + TableIf table = StatisticsUtil + .findTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName); + AnalysisManager analysisManager = Env.getServingEnv().getAnalysisManager(); + TableStatsMeta tblStats = analysisManager.findTableStatsStatus(table.getId()); + + if 
(!table.needReAnalyzeTable(tblStats)) { + return null; + } + + Map> needRunPartitions = table.findReAnalyzeNeededPartitions(); + + if (needRunPartitions.isEmpty()) { + return null; + } + + return new AnalysisInfoBuilder(jobInfo).setColToPartitions(needRunPartitions).build(); + } + + @VisibleForTesting + protected AnalysisInfo getAnalysisJobInfo(AnalysisInfo jobInfo, TableIf table, + Set needRunPartitions) { + Map> newColToPartitions = Maps.newHashMap(); + Map> colToPartitions = jobInfo.colToPartitions; + if (colToPartitions == null) { + for (Column c : table.getColumns()) { + if (StatisticsUtil.isUnsupportedType(c.getType())) { + continue; + } + newColToPartitions.put(c.getName(), needRunPartitions); + } + } else { + colToPartitions.keySet().forEach(colName -> { + Column column = table.getColumn(colName); + if (column != null) { + newColToPartitions.put(colName, needRunPartitions); + } + }); + } + return new AnalysisInfoBuilder(jobInfo) + .setColToPartitions(newColToPartitions).build(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java index 3622006542d93de..a0e75f7df380907 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsBuilder.java @@ -26,7 +26,7 @@ public class StatisticsBuilder { private double rowCount; - private Map expressionToColumnStats; + private final Map expressionToColumnStats; public StatisticsBuilder() { expressionToColumnStats = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java index cb9e4ca322882fb..c9b049a8cfc0830 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCache.java @@ -24,7 
+24,6 @@ import org.apache.doris.ha.FrontendNodeType; import org.apache.doris.persist.gson.GsonUtils; import org.apache.doris.qe.ConnectContext; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.Frontend; import org.apache.doris.thrift.FrontendService; @@ -34,16 +33,21 @@ import com.github.benmanes.caffeine.cache.AsyncLoadingCache; import com.github.benmanes.caffeine.cache.Caffeine; import org.apache.commons.collections.CollectionUtils; +import org.apache.hudi.common.util.VisibleForTesting; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.time.Duration; +import java.util.HashMap; +import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ExecutionException; import java.util.concurrent.ThreadPoolExecutor; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; public class StatisticsCache { @@ -59,7 +63,6 @@ public class StatisticsCache { private final ColumnStatisticsCacheLoader columnStatisticsCacheLoader = new ColumnStatisticsCacheLoader(); private final HistogramCacheLoader histogramCacheLoader = new HistogramCacheLoader(); - private final TableStatisticsCacheLoader tableStatisticsCacheLoader = new TableStatisticsCacheLoader(); private final AsyncLoadingCache> columnStatisticsCache = Caffeine.newBuilder() @@ -75,20 +78,12 @@ public class StatisticsCache { .executor(threadPool) .buildAsync(histogramCacheLoader); - private final AsyncLoadingCache> tableStatisticsCache = - Caffeine.newBuilder() - .maximumSize(Config.stats_cache_size) - .refreshAfterWrite(Duration.ofHours(StatisticConstants.STATISTICS_CACHE_REFRESH_INTERVAL)) - .executor(threadPool) - .buildAsync(tableStatisticsCacheLoader); - { threadPool.submit(() -> { while (true) { try { 
columnStatisticsCacheLoader.removeExpiredInProgressing(); histogramCacheLoader.removeExpiredInProgressing(); - tableStatisticsCacheLoader.removeExpiredInProgressing(); } catch (Throwable t) { // IGNORE } @@ -141,23 +136,6 @@ public Optional getHistogram(long tblId, long idxId, String colName) return Optional.empty(); } - public Optional getTableStatistics(long catalogId, long dbId, long tableId) { - ConnectContext ctx = ConnectContext.get(); - if (ctx != null && ctx.getSessionVariable().internalSession) { - return Optional.empty(); - } - StatisticsCacheKey k = new StatisticsCacheKey(catalogId, dbId, tableId); - try { - CompletableFuture> f = tableStatisticsCache.get(k); - if (f.isDone()) { - return f.get(); - } - } catch (Exception e) { - LOG.warn("Unexpected exception while returning Histogram", e); - } - return Optional.empty(); - } - public void invalidate(long tblId, long idxId, String colName) { columnStatisticsCache.synchronous().invalidate(new StatisticsCacheKey(tblId, idxId, colName)); } @@ -174,14 +152,6 @@ public void refreshColStatsSync(long catalogId, long dbId, long tblId, long idxI columnStatisticsCache.synchronous().refresh(new StatisticsCacheKey(catalogId, dbId, tblId, idxId, colName)); } - public void invalidateTableStats(long catalogId, long dbId, long tblId) { - tableStatisticsCache.synchronous().invalidate(new StatisticsCacheKey(catalogId, dbId, tblId)); - } - - public void refreshTableStatsSync(long catalogId, long dbId, long tblId) { - tableStatisticsCache.synchronous().refresh(new StatisticsCacheKey(catalogId, dbId, tblId)); - } - public void refreshHistogramSync(long tblId, long idxId, String colName) { histogramCache.synchronous().refresh(new StatisticsCacheKey(tblId, idxId, colName)); } @@ -219,83 +189,113 @@ private void doPreHeat() { if (CollectionUtils.isEmpty(recentStatsUpdatedCols)) { return; } + Map keyToColStats = new HashMap<>(); for (ResultRow r : recentStatsUpdatedCols) { try { - long tblId = 
Long.parseLong(r.getColumnValue("tbl_id")); - long idxId = Long.parseLong(r.getColumnValue("idx_id")); - String colId = r.getColumnValue("col_id"); + StatsId statsId = new StatsId(r); + long tblId = statsId.tblId; + long idxId = statsId.idxId; + String colId = statsId.colId; final StatisticsCacheKey k = new StatisticsCacheKey(tblId, idxId, colId); final ColumnStatistic c = ColumnStatistic.fromResultRow(r); - c.loadPartitionStats(tblId, idxId, colId); + keyToColStats.put(k, c); putCache(k, c); } catch (Throwable t) { LOG.warn("Error when preheating stats cache", t); } } + try { + loadPartStats(keyToColStats); + } catch (Exception e) { + LOG.warn("Fucka", e); + } } - public void syncLoadColStats(long tableId, long idxId, String colName) { + /** + * Return false if the log of corresponding stats load is failed. + */ + public boolean syncLoadColStats(long tableId, long idxId, String colName) { List columnResults = StatisticsRepository.loadColStats(tableId, idxId, colName); final StatisticsCacheKey k = new StatisticsCacheKey(tableId, idxId, colName); final ColumnStatistic c = ColumnStatistic.fromResultRow(columnResults); if (c == ColumnStatistic.UNKNOWN) { - return; + return false; } putCache(k, c); + if (ColumnStatistic.UNKNOWN == c) { + return false; + } TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest = new TUpdateFollowerStatsCacheRequest(); updateFollowerStatsCacheRequest.key = GsonUtils.GSON.toJson(k); - updateFollowerStatsCacheRequest.colStats = GsonUtils.GSON.toJson(c); + updateFollowerStatsCacheRequest.statsRows = columnResults.stream().map(GsonUtils.GSON::toJson).collect( + Collectors.toList()); for (Frontend frontend : Env.getCurrentEnv().getFrontends(FrontendNodeType.FOLLOWER)) { - if (frontend.getHost().equals(Env.getCurrentEnv().getSelfNode().getHost())) { - // Doesn't need to send request to current node. 
+ if (StatisticsUtil.isMaster(frontend)) { continue; } - TNetworkAddress address = new TNetworkAddress(frontend.getHost(), - frontend.getRpcPort()); - FrontendService.Client client = null; - try { - client = ClientPool.frontendPool.borrowObject(address); - client.updateStatsCache(updateFollowerStatsCacheRequest); - } catch (Throwable t) { - LOG.warn("Failed to sync stats to follower: {}", address, t); - } finally { - if (client != null) { - ClientPool.frontendPool.returnObject(address, client); - } - } + sendStats(frontend, updateFollowerStatsCacheRequest); } - + return true; } - public void putCache(StatisticsCacheKey k, ColumnStatistic c) { - CompletableFuture> f = new CompletableFuture>() { - - @Override - public Optional get() throws InterruptedException, ExecutionException { - return Optional.of(c); + @VisibleForTesting + public void sendStats(Frontend frontend, TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + TNetworkAddress address = new TNetworkAddress(frontend.getHost(), + frontend.getRpcPort()); + FrontendService.Client client = null; + try { + client = ClientPool.frontendPool.borrowObject(address); + client.updateStatsCache(updateFollowerStatsCacheRequest); + } catch (Throwable t) { + LOG.warn("Failed to sync stats to follower: {}", address, t); + } finally { + if (client != null) { + ClientPool.frontendPool.returnObject(address, client); } + } + } - @Override - public boolean isDone() { - return true; - } + public void putCache(StatisticsCacheKey k, ColumnStatistic c) { + CompletableFuture> f = new CompletableFuture>(); + f.obtrudeValue(Optional.of(c)); + columnStatisticsCache.put(k, f); + } - @Override - public boolean complete(Optional value) { - return true; + private void loadPartStats(Map keyToColStats) { + final int batchSize = Config.expr_children_limit; + Set keySet = new HashSet<>(); + for (StatisticsCacheKey statisticsCacheKey : keyToColStats.keySet()) { + if (keySet.size() < batchSize - 1) { + 
keySet.add(statisticsCacheKey); + } else { + List partStats = StatisticsRepository.loadPartStats(keySet); + addPartStatsToColStats(keyToColStats, partStats); + keySet = new HashSet<>(); } + } + if (!keySet.isEmpty()) { + List partStats = StatisticsRepository.loadPartStats(keySet); + addPartStatsToColStats(keyToColStats, partStats); + } + } - @Override - public Optional join() { - return Optional.of(c); + private void addPartStatsToColStats(Map keyToColStats, + List partsStats) { + for (ResultRow r : partsStats) { + try { + StatsId statsId = new StatsId(r); + long tblId = statsId.tblId; + long idxId = statsId.idxId; + String partId = statsId.partId; + String colId = statsId.colId; + ColumnStatistic partStats = ColumnStatistic.fromResultRow(r); + keyToColStats.get(new StatisticsCacheKey(tblId, idxId, colId)).putPartStats(partId, partStats); + } catch (Throwable t) { + LOG.warn("Failed to deserialized part stats", t); } - }; - if (c.isUnKnown) { - return; } - columnStatisticsCache.put(k, f); } } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java index 9aaee6bf1d72bae..6521a8b4a5999b6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCleaner.java @@ -27,7 +27,6 @@ import org.apache.doris.common.util.MasterDaemon; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.datasource.InternalCatalog; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.SystemInfoService; @@ -195,31 +194,32 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats, lon pos += StatisticConstants.FETCH_LIMIT; for (ResultRow r : rows) { try { - String id = r.getColumnValue("id"); - long catalogId = 
Long.parseLong(r.getColumnValue("catalog_id")); + StatsId statsId = new StatsId(r); + String id = statsId.id; + long catalogId = statsId.catalogId; if (!idToCatalog.containsKey(catalogId)) { expiredStats.expiredCatalog.add(catalogId); continue; } - long dbId = Long.parseLong(r.getColumnValue("db_id")); + long dbId = statsId.dbId; if (!idToDb.containsKey(dbId)) { expiredStats.expiredDatabase.add(dbId); continue; } - long tblId = Long.parseLong(r.getColumnValue("tbl_id")); + long tblId = statsId.tblId; if (!idToTbl.containsKey(tblId)) { expiredStats.expiredTable.add(tblId); continue; } - long idxId = Long.parseLong(r.getColumnValue("idx_id")); + long idxId = statsId.idxId; if (idxId != -1 && !idToMVIdx.containsKey(idxId)) { expiredStats.expiredIdxId.add(idxId); continue; } TableIf t = idToTbl.get(tblId); - String colId = r.getColumnValue("col_id"); + String colId = statsId.colId; if (t.getColumn(colId) == null) { expiredStats.ids.add(id); continue; @@ -228,12 +228,11 @@ private long findExpiredStats(OlapTable statsTbl, ExpiredStats expiredStats, lon continue; } OlapTable olapTable = (OlapTable) t; - String partIdStr = r.getColumnValue("part_id"); - if (partIdStr == null) { + String partId = statsId.partId; + if (partId == null) { continue; } - long partId = Long.parseLong(partIdStr); - if (!olapTable.getPartitionIds().contains(partId)) { + if (!olapTable.getPartitionIds().contains(Long.parseLong(partId))) { expiredStats.ids.add(id); } } catch (Exception e) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java new file mode 100644 index 000000000000000..2d5c48168357fc2 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsCollector.java @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.statistics.util.StatisticsUtil; + +import org.apache.hudi.common.util.VisibleForTesting; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.HashMap; +import java.util.Map; + +public abstract class StatisticsCollector extends MasterDaemon { + + private static final Logger LOG = LogManager.getLogger(StatisticsCollector.class); + + protected final AnalysisTaskExecutor analysisTaskExecutor; + + + public StatisticsCollector(String name, long intervalMs, AnalysisTaskExecutor analysisTaskExecutor) { + super(name, intervalMs); + this.analysisTaskExecutor = analysisTaskExecutor; + analysisTaskExecutor.start(); + } + + @Override + protected void runAfterCatalogReady() { + if (!Env.getCurrentEnv().isMaster()) { + return; + } + if (!StatisticsUtil.statsTblAvailable()) { + LOG.info("Stats table not available, skip"); + return; + } + if (Env.isCheckpointThread()) { + return; + } + + if (!analysisTaskExecutor.idle()) { + LOG.info("Analyze tasks those submitted in last time is not finished, skip"); + return; + } + collect(); + } + + 
protected abstract void collect(); + + // Analysis job created by the system + @VisibleForTesting + protected void createSystemAnalysisJob(AnalysisInfo jobInfo) + throws DdlException { + if (jobInfo.colToPartitions.isEmpty()) { + // No statistics need to be collected or updated + return; + } + + Map analysisTaskInfos = new HashMap<>(); + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + analysisManager.createTaskForEachColumns(jobInfo, analysisTaskInfos, false); + if (StatisticsUtil.isExternalTable(jobInfo.catalogName, jobInfo.dbName, jobInfo.tblName)) { + analysisManager.createTableLevelTaskForExternalTable(jobInfo, analysisTaskInfos, false); + } + Env.getCurrentEnv().getAnalysisManager().registerSysJob(jobInfo, analysisTaskInfos); + analysisTaskInfos.values().forEach(analysisTaskExecutor::submitTask); + } + +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java new file mode 100644 index 000000000000000..f34ad0f1221de7f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsPeriodCollector.java @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import org.apache.doris.catalog.Env; +import org.apache.doris.common.Config; + +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.List; +import java.util.concurrent.TimeUnit; + +public class StatisticsPeriodCollector extends StatisticsCollector { + private static final Logger LOG = LogManager.getLogger(StatisticsPeriodCollector.class); + + public StatisticsPeriodCollector() { + super("Automatic Analyzer", + TimeUnit.MINUTES.toMillis(Config.auto_check_statistics_in_minutes) / 2, + new AnalysisTaskExecutor(Config.period_analyze_simultaneously_running_task_num)); + } + + @Override + protected void collect() { + try { + AnalysisManager analysisManager = Env.getCurrentEnv().getAnalysisManager(); + List jobInfos = analysisManager.findPeriodicJobs(); + for (AnalysisInfo jobInfo : jobInfos) { + createSystemAnalysisJob(jobInfo); + } + } catch (Exception e) { + LOG.warn("Failed to periodically analyze the statistics." 
+ e); + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index d20bb358c1807b4..cd3cc67f3c91c7d 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -18,7 +18,6 @@ package org.apache.doris.statistics; import org.apache.doris.analysis.AlterColumnStatsStmt; -import org.apache.doris.analysis.AlterTableStatsStmt; import org.apache.doris.analysis.TableName; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Env; @@ -28,16 +27,15 @@ import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; import org.apache.doris.statistics.util.DBObjects; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.system.SystemInfoService; -import com.google.common.collect.ImmutableMap; import com.google.common.collect.Maps; import org.apache.commons.text.StringSubstitutor; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; @@ -95,38 +93,18 @@ public class StatisticsRepository { + " ORDER BY update_time " + "LIMIT ${limit} OFFSET ${offset}"; - private static final String FETCH_STATS_PART_ID = "SELECT col_id, part_id FROM " + private static final String FETCH_STATS_PART_ID = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE tbl_id = ${tblId}" + " AND part_id IS NOT NULL"; - private static final String PERSIST_TABLE_STATS_TEMPLATE = "INSERT INTO " - + FeConstants.INTERNAL_DB_NAME + "." 
+ StatisticConstants.ANALYSIS_TBL_NAME - + " VALUES('${id}', ${catalogId}, ${dbId}, ${tblId}, ${indexId}, ${partId}, ${rowCount}," - + " ${lastAnalyzeTimeInMs}, NOW())"; - - private static final String FETCH_TABLE_LEVEL_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE tbl_id = ${tblId}" - + " AND part_id IS NULL"; - - private static final String FETCH_TABLE_LEVEL_PART_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE part_id = ${partId}"; - - - private static final String FETCH_PART_TABLE_STATS_TEMPLATE = "SELECT * FROM " - + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.ANALYSIS_TBL_NAME - + " WHERE tbl_id = ${tblId}" - + " AND part_id IS NOT NULL"; - private static final String QUERY_COLUMN_STATISTICS = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE " + "tbl_id=${tblId} AND idx_id=${idxId} AND col_id='${colId}'"; private static final String QUERY_PARTITION_STATISTICS = "SELECT * FROM " + FeConstants.INTERNAL_DB_NAME + "." + StatisticConstants.STATISTIC_TBL_NAME + " WHERE " - + " tbl_id=${tblId} AND idx_id=${idxId} AND col_id='${colId}' " + + " ${inPredicate}" + " AND part_id IS NOT NULL"; public static ColumnStatistic queryColumnStatisticsByName(long tableId, String colName) { @@ -201,8 +179,7 @@ private static String constructId(Object... 
params) { return stringJoiner.toString(); } - public static void dropStatistics(Set partIds) throws DdlException { - dropStatisticsByPartId(partIds, StatisticConstants.ANALYSIS_TBL_NAME); + public static void dropStatistics(Set partIds) throws DdlException { dropStatisticsByPartId(partIds, StatisticConstants.STATISTIC_TBL_NAME); } @@ -211,18 +188,6 @@ public static void dropStatistics(long tblId, Set colNames) throws DdlEx dropStatisticsByColName(tblId, colNames, StatisticConstants.HISTOGRAM_TBL_NAME); } - public static void dropExternalTableStatistics(long tblId) throws DdlException { - Map params = new HashMap<>(); - String inPredicate = String.format("tbl_id = %s", tblId); - params.put("tblName", StatisticConstants.ANALYSIS_TBL_NAME); - params.put("condition", inPredicate); - try { - StatisticsUtil.execUpdate(new StringSubstitutor(params).replace(DROP_TABLE_STATISTICS_TEMPLATE)); - } catch (Exception e) { - throw new DdlException(e.getMessage(), e); - } - } - public static void dropStatisticsByColName(long tblId, Set colNames, String statsTblName) throws DdlException { Map params = new HashMap<>(); @@ -237,7 +202,7 @@ public static void dropStatisticsByColName(long tblId, Set colNames, Str } } - public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { + public static void dropStatisticsByPartId(Set partIds, String statsTblName) throws DdlException { Map params = new HashMap<>(); String right = StatisticsUtil.joinElementsToString(partIds, ","); String inPredicate = String.format(" part_id IN (%s)", right); @@ -250,34 +215,6 @@ public static void dropStatisticsByPartId(Set partIds, String statsTblName } } - public static void persistTableStats(Map params) throws Exception { - StatisticsUtil.execUpdate(PERSIST_TABLE_STATS_TEMPLATE, params); - } - - public static void alterTableStatistics(AlterTableStatsStmt alterTableStatsStmt) throws Exception { - TableName tableName = alterTableStatsStmt.getTableName(); - DBObjects objects 
= StatisticsUtil.convertTableNameToObjects(tableName); - String rowCount = alterTableStatsStmt.getValue(StatsType.ROW_COUNT); - TableStatisticBuilder builder = new TableStatisticBuilder(); - builder.setRowCount(Long.parseLong(rowCount)); - builder.setLastAnalyzeTimeInMs(0); - TableStatistic tableStatistic = builder.build(); - Map params = new HashMap<>(); - String id = StatisticsUtil.constructId(objects.table.getId(), -1); - params.put("id", id); - params.put("catalogId", String.valueOf(objects.catalog.getId())); - params.put("dbId", String.valueOf(objects.db.getId())); - params.put("tblId", String.valueOf(objects.table.getId())); - params.put("indexId", "-1"); - params.put("partId", "NULL"); - params.put("rowCount", String.valueOf(tableStatistic.rowCount)); - params.put("lastAnalyzeTimeInMs", "0"); - StatisticsUtil.execUpdate(PERSIST_TABLE_STATS_TEMPLATE, params); - // TODO update statistics cache - // Env.getCurrentEnv().getStatisticsCache() - // .updateColStatsCache(objects.table.getId(), -1, builder.build()); - } - public static void alterColumnStatistics(AlterColumnStatsStmt alterColumnStatsStmt) throws Exception { TableName tableName = alterColumnStatsStmt.getTableName(); List partitionIds = alterColumnStatsStmt.getPartitionIds(); @@ -359,25 +296,24 @@ public static List fetchStatsFullName(long limit, long offset) { return StatisticsUtil.execStatisticQuery(new StringSubstitutor(params).replace(FETCH_STATS_FULL_NAME)); } - public static Map> fetchColAndPartsForStats(long tblId) { + public static Map> fetchColAndPartsForStats(long tblId) { Map params = Maps.newHashMap(); params.put("tblId", String.valueOf(tblId)); StringSubstitutor stringSubstitutor = new StringSubstitutor(params); String partSql = stringSubstitutor.replace(FETCH_STATS_PART_ID); List resultRows = StatisticsUtil.execStatisticQuery(partSql); - Map> columnToPartitions = Maps.newHashMap(); + Map> columnToPartitions = Maps.newHashMap(); resultRows.forEach(row -> { try { - String colId = 
row.getColumnValue("col_id"); - String partId = row.getColumnValue("part_id"); - if (partId == null) { + StatsId statsId = new StatsId(row); + if (statsId.partId == null) { return; } - columnToPartitions.computeIfAbsent(colId, - k -> new HashSet<>()).add(Long.valueOf(partId)); - } catch (NumberFormatException | DdlException e) { + columnToPartitions.computeIfAbsent(String.valueOf(statsId.colId), + k -> new HashSet<>()).add(statsId.partId); + } catch (NumberFormatException e) { LOG.warn("Failed to obtain the column and partition for statistics.", e); } @@ -386,50 +322,6 @@ public static Map> fetchColAndPartsForStats(long tblId) { return columnToPartitions; } - public static TableStatistic fetchTableLevelStats(long tblId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("tblId", String.valueOf(tblId)); - String sql = StatisticsUtil.replaceParams(FETCH_TABLE_LEVEL_STATS_TEMPLATE, params); - List resultRows = StatisticsUtil.execStatisticQuery(sql); - if (resultRows.size() == 1) { - return TableStatistic.fromResultRow(resultRows.get(0)); - } - throw new DdlException("Query result is not as expected: " + sql); - } - - public static TableStatistic fetchTableLevelOfPartStats(long partId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("partId", String.valueOf(partId)); - String sql = StatisticsUtil.replaceParams(FETCH_TABLE_LEVEL_PART_STATS_TEMPLATE, params); - List resultRows = StatisticsUtil.execStatisticQuery(sql); - if (resultRows.size() == 1) { - return TableStatistic.fromResultRow(resultRows.get(0)); - } - throw new DdlException("Query result is not as expected: " + sql); - } - - public static Map fetchTableLevelOfIdPartStats(long tblId) throws DdlException { - ImmutableMap params = ImmutableMap - .of("tblId", String.valueOf(tblId)); - StringSubstitutor stringSubstitutor = new StringSubstitutor(params); - String sql = stringSubstitutor.replace(FETCH_PART_TABLE_STATS_TEMPLATE); - List resultRows = 
StatisticsUtil.execStatisticQuery(sql); - - if (resultRows.size() == 0) { - return Collections.emptyMap(); - } - - Map idToPartitionTableStats = Maps.newHashMap(); - - for (ResultRow resultRow : resultRows) { - long partId = Long.parseLong(resultRow.getColumnValue("part_id")); - TableStatistic partStats = TableStatistic.fromResultRow(resultRow); - idToPartitionTableStats.put(partId, partStats); - } - - return idToPartitionTableStats; - } - public static List loadColStats(long tableId, long idxId, String colName) { Map params = new HashMap<>(); params.put("tblId", String.valueOf(tableId)); @@ -440,12 +332,14 @@ public static List loadColStats(long tableId, long idxId, String colN .replace(QUERY_COLUMN_STATISTICS)); } - public static List loadPartStats(long tableId, long idxId, String colName) { + public static List loadPartStats(Collection keys) { + String inPredicate = "CONCAT(tbl_id, '-', idx_id, '-', col_id) in (%s)"; + StringJoiner sj = new StringJoiner(","); + for (StatisticsCacheKey statisticsCacheKey : keys) { + sj.add("'" + statisticsCacheKey.toString() + "'"); + } Map params = new HashMap<>(); - params.put("tblId", String.valueOf(tableId)); - params.put("idxId", String.valueOf(idxId)); - params.put("colId", colName); - + params.put("inPredicate", String.format(inPredicate, sj.toString())); return StatisticsUtil.execStatisticQuery(new StringSubstitutor(params) .replace(QUERY_PARTITION_STATISTICS)); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java index 6010daa6db49042..8c301f911be95b0 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsDeriveResult.java @@ -144,13 +144,6 @@ public StatsDeriveResult updateByLimit(long limit) { return statsDeriveResult; } - public StatsDeriveResult merge(StatsDeriveResult other) { - for (Entry entry : 
other.getSlotIdToColumnStats().entrySet()) { - this.slotIdToColumnStats.put(entry.getKey(), entry.getValue().copy()); - } - return this; - } - public StatsDeriveResult copy() { return new StatsDeriveResult(this); } diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java new file mode 100644 index 000000000000000..3f9b2641b752240 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatsId.java @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.statistics.util.StatisticsUtil; + +import java.util.StringJoiner; + +public class StatsId { + + public final String id; + public final long catalogId; + public final long dbId; + public final long tblId; + public final long idxId; + + public final String colId; + + // nullable + public final String partId; + + public StatsId(ResultRow row) { + this.id = row.get(0); + this.catalogId = Long.parseLong(row.get(1)); + this.dbId = Long.parseLong(row.get(2)); + this.tblId = Long.parseLong(row.get(3)); + this.idxId = Long.parseLong(row.get(4)); + this.colId = row.get(5); + this.partId = row.get(6); + } + + public String toSQL() { + StringJoiner sj = new StringJoiner(","); + sj.add(StatisticsUtil.quote(id)); + sj.add(String.valueOf(catalogId)); + sj.add(String.valueOf(dbId)); + sj.add(String.valueOf(tblId)); + sj.add(String.valueOf(idxId)); + sj.add(StatisticsUtil.quote(colId)); + sj.add(StatisticsUtil.quote(partId)); + return sj.toString(); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java deleted file mode 100644 index 28d0c17b5610468..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatistic.java +++ /dev/null @@ -1,61 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.common.DdlException; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -public class TableStatistic { - - private static final Logger LOG = LogManager.getLogger(TableStatistic.class); - - public static TableStatistic UNKNOWN = new TableStatisticBuilder() - .setRowCount(0).setUpdateTime("NULL").setLastAnalyzeTimeInMs(0L) - .build(); - - public final long rowCount; - public final long lastAnalyzeTimeInMs; - public final String updateTime; - - public TableStatistic(long rowCount, long lastAnalyzeTimeInMs, String updateTime) { - this.rowCount = rowCount; - this.lastAnalyzeTimeInMs = lastAnalyzeTimeInMs; - this.updateTime = updateTime; - } - - // TODO: use thrift - public static TableStatistic fromResultRow(ResultRow resultRow) { - try { - TableStatisticBuilder tableStatisticBuilder = new TableStatisticBuilder(); - long rowCount = Long.parseLong(resultRow.getColumnValue("count")); - String updateTime = resultRow.getColumnValue("update_time"); - long lastAnalyzeTimeInMs = Long - .parseLong(resultRow.getColumnValue("last_analyze_time_in_ms")); - tableStatisticBuilder.setRowCount(rowCount); - tableStatisticBuilder.setLastAnalyzeTimeInMs(lastAnalyzeTimeInMs); - tableStatisticBuilder.setUpdateTime(updateTime); - return tableStatisticBuilder.build(); - } catch (DdlException e) { - LOG.warn("Failed to deserialize table statistics", e); - return TableStatistic.UNKNOWN; - } - } -} diff 
--git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java deleted file mode 100644 index ddb45b824cb1f87..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticBuilder.java +++ /dev/null @@ -1,51 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -package org.apache.doris.statistics; - -public class TableStatisticBuilder { - public long rowCount; - public long lastAnalyzeTimeInMs; - public String updateTime; - - public TableStatisticBuilder() { - } - - public TableStatisticBuilder(TableStatistic tableStatistic) { - this.rowCount = tableStatistic.rowCount; - this.updateTime = tableStatistic.updateTime; - } - - public TableStatisticBuilder setRowCount(long rowCount) { - this.rowCount = rowCount; - return this; - } - - public TableStatisticBuilder setLastAnalyzeTimeInMs(long lastAnalyzeTimeInMs) { - this.lastAnalyzeTimeInMs = lastAnalyzeTimeInMs; - return this; - } - - public TableStatisticBuilder setUpdateTime(String updateTime) { - this.updateTime = updateTime; - return this; - } - - public TableStatistic build() { - return new TableStatistic(rowCount, lastAnalyzeTimeInMs, updateTime); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java deleted file mode 100644 index 953bc9a42742b8a..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatisticsCacheLoader.java +++ /dev/null @@ -1,60 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. 
See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics; - -import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.TableIf; -import org.apache.doris.common.DdlException; - -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.Optional; - -public class TableStatisticsCacheLoader extends StatisticsCacheLoader> { - - private static final Logger LOG = LogManager.getLogger(TableStatisticsCacheLoader.class); - - @Override - protected Optional doLoad(StatisticsCacheKey key) { - try { - TableStatistic tableStatistic = StatisticsRepository.fetchTableLevelStats(key.tableId); - if (tableStatistic != TableStatistic.UNKNOWN) { - return Optional.of(tableStatistic); - } - } catch (DdlException e) { - LOG.debug("Fail to get table line number from table_statistics table. " - + "Will try to get from data source.", e); - } - // Get row count by call TableIf interface getRowCount - // when statistic table doesn't contain a record for this table. 
- try { - TableIf table = Env.getCurrentEnv().getCatalogMgr().getCatalog(key.catalogId) - .getDbOrDdlException(key.dbId).getTableOrAnalysisException(key.tableId); - long rowCount = table.getRowCount(); - long lastAnalyzeTimeInMs = System.currentTimeMillis(); - String updateTime = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date(lastAnalyzeTimeInMs)); - return Optional.of(new TableStatistic(rowCount, lastAnalyzeTimeInMs, updateTime)); - } catch (Exception e) { - LOG.warn(String.format("Fail to get row count for table %d", key.tableId), e); - } - return Optional.empty(); - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java new file mode 100644 index 000000000000000..17ca61e9da5c52e --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TableStatsMeta.java @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.statistics.AnalysisInfo.JobType; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentMap; +import java.util.concurrent.atomic.AtomicLong; +import java.util.stream.Collectors; + +public class TableStatsMeta implements Writable { + + @SerializedName("tblId") + public final long tblId; + + @SerializedName("idxId") + public final long idxId; + @SerializedName("updatedRows") + public final AtomicLong updatedRows = new AtomicLong(); + + // We would like to analyze tables which queried frequently with higher priority in the future. + @SerializedName("queriedTimes") + public final AtomicLong queriedTimes = new AtomicLong(); + + // Used for external table. + @SerializedName("rowCount") + public final long rowCount; + + @SerializedName("updateTime") + public long updatedTime; + + @SerializedName("colNameToColStatsMeta") + private ConcurrentMap colNameToColStatsMeta = new ConcurrentHashMap<>(); + + @SerializedName("trigger") + public JobType jobType; + + // It's necessary to store these fields separately from AnalysisInfo, since the lifecycle between AnalysisInfo + // and TableStats is quite different. 
+ public TableStatsMeta(long tblId, long rowCount, AnalysisInfo analyzedJob) { + this.tblId = tblId; + this.idxId = -1; + this.rowCount = rowCount; + updateByJob(analyzedJob); + } + + @Override + public void write(DataOutput out) throws IOException { + String json = GsonUtils.GSON.toJson(this); + Text.writeString(out, json); + } + + public static TableStatsMeta read(DataInput dataInput) throws IOException { + String json = Text.readString(dataInput); + TableStatsMeta tableStats = GsonUtils.GSON.fromJson(json, TableStatsMeta.class); + // Might be null counterintuitively, for compatible + if (tableStats.colNameToColStatsMeta == null) { + tableStats.colNameToColStatsMeta = new ConcurrentHashMap<>(); + } + return tableStats; + } + + public long findColumnLastUpdateTime(String colName) { + ColStatsMeta colStatsMeta = colNameToColStatsMeta.get(colName); + if (colStatsMeta == null) { + return 0; + } + return colStatsMeta.updatedTime; + } + + public ColStatsMeta findColumnStatsMeta(String colName) { + return colNameToColStatsMeta.get(colName); + } + + public void removeColumn(String colName) { + colNameToColStatsMeta.remove(colName); + } + + public Set analyzeColumns() { + return colNameToColStatsMeta.keySet(); + } + + public void reset() { + updatedTime = 0; + colNameToColStatsMeta.values().forEach(ColStatsMeta::clear); + } + + public void updateByJob(AnalysisInfo analyzedJob) { + updatedTime = System.currentTimeMillis(); + String colNameStr = analyzedJob.colName; + // colName field AnalyzeJob's format likes: "[col1, col2]", we need to remove brackets here + // TODO: Refactor this later + if (analyzedJob.colName.startsWith("[") && analyzedJob.colName.endsWith("]")) { + colNameStr = colNameStr.substring(1, colNameStr.length() - 1); + } + List cols = Arrays.stream(colNameStr.split(",")).map(String::trim).collect(Collectors.toList()); + for (String col : cols) { + ColStatsMeta colStatsMeta = colNameToColStatsMeta.get(col); + if (colStatsMeta == null) { + 
colNameToColStatsMeta.put(col, new ColStatsMeta(updatedTime, + analyzedJob.analysisMethod, analyzedJob.analysisType, analyzedJob.jobType, 0)); + } else { + colStatsMeta.updatedTime = updatedTime; + colStatsMeta.analysisType = analyzedJob.analysisType; + colStatsMeta.analysisMethod = analyzedJob.analysisMethod; + } + } + jobType = analyzedJob.jobType; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java new file mode 100644 index 000000000000000..d74b14267d1eca5 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/TaskStatusWrapper.java @@ -0,0 +1,33 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.statistics; + +public class TaskStatusWrapper { + + public final AnalysisInfo info; + public final AnalysisState taskState; + public final String message; + public final long time; + + public TaskStatusWrapper(AnalysisInfo info, AnalysisState taskState, String message, long time) { + this.info = info; + this.taskState = taskState; + this.message = message; + this.time = time; + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java index 09af38d830a709b..40669b6a9396ea9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQuery.java @@ -24,7 +24,6 @@ import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.UserIdentity; import org.apache.doris.catalog.Env; -import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.Config; import org.apache.doris.common.DdlException; @@ -38,7 +37,7 @@ import org.apache.doris.qe.OriginStatement; import org.apache.doris.qe.QeProcessorImpl; import org.apache.doris.qe.RowBatch; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.system.SystemInfoService; import org.apache.doris.thrift.TQueryOptions; import org.apache.doris.thrift.TResultBatch; @@ -50,9 +49,9 @@ import java.io.StringReader; import java.nio.ByteBuffer; +import java.util.ArrayList; import java.util.List; import java.util.UUID; -import java.util.stream.Collectors; /** * Execute SQL query statements internally(in FE). 
Internal-query mainly used for statistics module, @@ -87,7 +86,7 @@ public void setTimeout(int timeout) { * @return Result of the query statement * @throws Exception Errors in parsing or execution */ - public InternalQueryResult query() throws Exception { + public List query() throws Exception { // step1: mock connectContext buildContext(); @@ -180,14 +179,9 @@ private void execute() throws Exception { } } - private InternalQueryResult fetchResult() { + private List fetchResult() { List columns = stmt.getColLabels(); - List types = stmt.getResultExprs().stream() - .map(e -> e.getType().getPrimitiveType()) - .collect(Collectors.toList()); - - InternalQueryResult result = new InternalQueryResult(); - List resultRows = result.getResultRows(); + List resultRows = new ArrayList<>(); for (TResultBatch batch : resultBatches) { List rows = batch.getRows(); @@ -200,12 +194,11 @@ private InternalQueryResult fetchResult() { values.add(value); } - ResultRow resultRow = new ResultRow(columns, types, values); + ResultRow resultRow = new ResultRow(values); resultRows.add(resultRow); } } - - return result; + return resultRows; } public void cancel() { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java deleted file mode 100644 index e79198601075319..000000000000000 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/InternalQueryResult.java +++ /dev/null @@ -1,242 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics.util; - -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.common.DdlException; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * Readable results of internal SQL execution, - * providing some read operations. - */ -public class InternalQueryResult { - private final List resultRows = Lists.newArrayList(); - - public InternalQueryResult() { - } - - public List getResultRows() { - return resultRows; - } - - public static class ResultRow { - private final List columns; - private final List types; - private final List values; - - private final Map columnNameMap = Maps.newHashMap(); - private final Map columnIndexMap = Maps.newHashMap(); - - public ResultRow(List columns, List types, List values) { - this.columns = columns; - this.types = types; - this.values = values; - buildColumnNameMap(); - buildColumnIndexMap(); - } - - public List getColumns() { - return columns != null ? columns : Collections.emptyList(); - } - - public List getTypes() { - return types != null ? types : Collections.emptyList(); - } - - public List getValues() { - return values != null ? 
values : Collections.emptyList(); - } - - private void buildColumnNameMap() { - List columns = getColumns(); - for (int i = 0; i < columns.size(); i++) { - columnNameMap.put(columns.get(i), i); - } - } - - private void buildColumnIndexMap() { - List columns = getColumns(); - for (int i = 0; i < columns.size(); i++) { - columnIndexMap.put(i, columns.get(i)); - } - } - - public int getColumnIndex(String columnName) { - return columnNameMap.getOrDefault(columnName, -1); - } - - public String getColumnName(int index) throws DdlException { - List columns = getColumns(); - if (columnIndexMap.containsKey(index)) { - return columnIndexMap.get(index); - } else { - throw new DdlException("Index should be between 0 and " + columns.size()); - } - } - - public PrimitiveType getColumnType(String columnName) throws DdlException { - List types = getTypes(); - int index = getColumnIndex(columnName); - if (index == -1) { - throw new DdlException(String.format("The column name:[%s] does not exist.", columnName)); - } - return types.get(index); - } - - public PrimitiveType getColumnType(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - return types.get(index); - } else { - throw new DdlException("Index should be between 0 and " + types.size()); - } - } - - public String getColumnValue(String columnName) throws DdlException { - int index = getColumnIndex(columnName); - if (index == -1) { - throw new DdlException(String.format("The column name:[%s] does not exist.", columnName)); - } - return values.get(index); - } - - public String getColumnValueWithDefault(String columnName, String defaultVal) throws DdlException { - String val = getColumnValue(columnName); - return val == null ? 
defaultVal : val; - } - - public Object getColumnValue(int index) throws DdlException { - List columns = getColumns(); - if (index >= 0 && index < columns.size()) { - return values.get(index); - } else { - throw new DdlException("Index should be between 0 and " + columns.size()); - } - } - - public String getString(int index) throws DdlException { - List columns = getColumns(); - if (index >= 0 && index < columns.size()) { - return values.get(index); - } - throw new DdlException("Index should be between 0 and " + columns.size()); - } - - public int getInt(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - switch (type) { - case BOOLEAN: - case TINYINT: - case SMALLINT: - case INT: - case BIGINT: - return new Integer(value); - default: - throw new DdlException("Unable to convert field to int: " + value); - } - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public long getLong(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - switch (type) { - case TINYINT: - case SMALLINT: - case INT: - case BIGINT: - return Long.parseLong(value); - default: - throw new DdlException("Unable to convert field to long: " + value); - } - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public float getFloat(int index) throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - if (type == PrimitiveType.FLOAT) { - return Float.parseFloat(value); - } - throw new DdlException("Unable to convert field to float: " + value); - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - public double getDouble(int index) 
throws DdlException { - List types = getTypes(); - if (index >= 0 && index < types.size()) { - String value = values.get(index); - PrimitiveType type = types.get(index); - if (type == PrimitiveType.DOUBLE) { - return Double.parseDouble(value); - } - throw new DdlException("Unable to convert field to long: " + value); - } - throw new DdlException("Index should be between 0 and " + types.size()); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("ResultRow{ "); - if (values != null && values.size() > 0) { - List columns = getColumns(); - for (int i = 0; i < values.size(); i++) { - sb.append(columns.get(i)); - sb.append(":"); - sb.append(values.get(i)); - sb.append(" "); - } - } - sb.append("}"); - return sb.toString(); - } - } - - @Override - public String toString() { - if (resultRows.size() > 0) { - StringBuilder sb = new StringBuilder(); - sb.append("InternalQueryResult:\n"); - for (ResultRow resultRow : resultRows) { - sb.append(" - "); - sb.append(resultRow.toString()); - sb.append("\n"); - } - return sb.toString(); - } - return "InternalQueryResult{" + "resultRows=" + resultRows + '}'; - } -} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java new file mode 100644 index 000000000000000..5740c4e30885a3f --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/SimpleQueue.java @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics.util; + +import java.util.Collection; +import java.util.LinkedList; +import java.util.function.Function; + +// Any operation on this structure should be thread-safe +public class SimpleQueue extends LinkedList { + + private final long limit; + + private final Function offerFunc; + + private final Function evictFunc; + + + public SimpleQueue(long limit, Function offerFunc, Function evictFunc) { + this.limit = limit; + this.offerFunc = offerFunc; + this.evictFunc = evictFunc; + } + + @Override + public synchronized boolean offer(T analysisInfo) { + while (size() >= limit) { + remove(); + } + super.offer(analysisInfo); + offerFunc.apply(analysisInfo); + return true; + } + + @Override + public synchronized T remove() { + T analysisInfo = super.remove(); + evictFunc.apply(analysisInfo); + return analysisInfo; + } + + public SimpleQueue(long limit, Function offerFunc, Function evictFunc, Collection collection) { + this(limit, offerFunc, evictFunc); + if (collection != null) { + for (T e : collection) { + offer(e); + } + } + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 89a5ae1f3e68668..40ae13a0e0e293a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -25,10 +25,12 @@ import org.apache.doris.analysis.IntLiteral; import 
org.apache.doris.analysis.LargeIntLiteral; import org.apache.doris.analysis.LiteralExpr; +import org.apache.doris.analysis.SetType; import org.apache.doris.analysis.StatementBase; import org.apache.doris.analysis.StringLiteral; import org.apache.doris.analysis.TableName; import org.apache.doris.analysis.UserIdentity; +import org.apache.doris.analysis.VariableExpr; import org.apache.doris.catalog.ArrayType; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DatabaseIf; @@ -44,10 +46,12 @@ import org.apache.doris.catalog.TableIf; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.VariantType; +import org.apache.doris.catalog.external.ExternalTable; import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.Pair; import org.apache.doris.common.UserException; import org.apache.doris.datasource.CatalogIf; import org.apache.doris.datasource.HMSExternalCatalog; @@ -61,12 +65,13 @@ import org.apache.doris.qe.QueryState; import org.apache.doris.qe.SessionVariable; import org.apache.doris.qe.StmtExecutor; -import org.apache.doris.statistics.AnalysisInfo; +import org.apache.doris.qe.VariableMgr; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.ColumnStatisticBuilder; import org.apache.doris.statistics.Histogram; +import org.apache.doris.statistics.ResultRow; import org.apache.doris.statistics.StatisticConstants; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.system.Frontend; import org.apache.doris.system.SystemInfoService; import com.google.common.base.Preconditions; @@ -82,9 +87,12 @@ import org.apache.iceberg.types.Types; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.thrift.TException; +import java.net.InetSocketAddress; import 
java.text.SimpleDateFormat; +import java.time.LocalTime; +import java.time.format.DateTimeFormatter; +import java.time.format.DateTimeParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; @@ -94,6 +102,7 @@ import java.util.Map; import java.util.Objects; import java.util.Optional; +import java.util.Set; import java.util.StringJoiner; import java.util.function.Function; import java.util.stream.Collectors; @@ -102,7 +111,6 @@ public class StatisticsUtil { private static final Logger LOG = LogManager.getLogger(StatisticsUtil.class); private static final String ID_DELIMITER = "-"; - private static final String VALUES_DELIMITER = ","; private static final String TOTAL_SIZE = "totalSize"; private static final String NUM_ROWS = "numRows"; @@ -142,16 +150,6 @@ public static QueryState execUpdate(String sql) throws Exception { } } - public static List deserializeToAnalysisJob(List resultBatches) - throws TException { - if (CollectionUtils.isEmpty(resultBatches)) { - return Collections.emptyList(); - } - return resultBatches.stream() - .map(AnalysisInfo::fromResultRow) - .collect(Collectors.toList()); - } - public static ColumnStatistic deserializeToColumnStatistics(List resultBatches) throws Exception { if (CollectionUtils.isEmpty(resultBatches)) { @@ -166,15 +164,22 @@ public static List deserializeToHistogramStatistics(List r } public static AutoCloseConnectContext buildConnectContext() { + return buildConnectContext(false); + } + + public static AutoCloseConnectContext buildConnectContext(boolean limitScan) { ConnectContext connectContext = new ConnectContext(); SessionVariable sessionVariable = connectContext.getSessionVariable(); sessionVariable.internalSession = true; sessionVariable.setMaxExecMemByte(Config.statistics_sql_mem_limit_in_bytes); + sessionVariable.cpuResourceLimit = Config.cpu_resource_limit_per_analyze_task; sessionVariable.setEnableInsertStrict(true); + sessionVariable.enablePageCache = false; 
sessionVariable.parallelExecInstanceNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.parallelPipelineTaskNum = Config.statistics_sql_parallel_exec_instance_num; sessionVariable.setEnableNereidsPlanner(false); sessionVariable.enableProfile = false; + sessionVariable.enableScanRunSerial = limitScan; sessionVariable.queryTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; sessionVariable.insertTimeoutS = Config.analyze_task_timeout_in_hours * 60 * 60; sessionVariable.enableFileCache = false; @@ -216,7 +221,7 @@ public static LiteralExpr readableValue(Type type, String columnValue) throws An case DOUBLE: return new FloatLiteral(columnValue); case DECIMALV2: - //no need to check precision and scale, since V2 is fixed point + // no need to check precision and scale, since V2 is fixed point return new DecimalLiteral(columnValue); case DECIMAL32: case DECIMAL64: @@ -392,11 +397,12 @@ public static boolean statsTblAvailable() { .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, dbName, StatisticConstants.STATISTIC_TBL_NAME)); - statsTbls.add( - (OlapTable) StatisticsUtil - .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, - dbName, - StatisticConstants.HISTOGRAM_TBL_NAME)); + // uncomment it when hist is available for user. 
+ // statsTbls.add( + // (OlapTable) StatisticsUtil + // .findTable(InternalCatalog.INTERNAL_CATALOG_NAME, + // dbName, + // StatisticConstants.HISTOGRAM_TBL_NAME)); } catch (Throwable t) { return false; } @@ -430,6 +436,15 @@ public static Map getPartitionIdToName(TableIf table) { )); } + public static Set getPartitionIds(TableIf table) { + if (table instanceof OlapTable) { + return ((OlapTable) table).getPartitionIds().stream().map(String::valueOf).collect(Collectors.toSet()); + } else if (table instanceof ExternalTable) { + return table.getPartitionNames(); + } + throw new RuntimeException(String.format("Not supported Table %s", table.getClass().getName())); + } + public static String joinElementsToString(Collection values, String delimiter) { StringJoiner builder = new StringJoiner(delimiter); values.forEach(v -> builder.add(String.valueOf(v))); @@ -475,9 +490,9 @@ public static String replaceParams(String template, Map params) * when update_rows < row_count, the health degree is 100 (1 - update_rows row_count). * * @param updatedRows The number of rows updated by the table - * @return Health, the value range is [0, 100], the larger the value, * @param totalRows The current number of rows in the table - * the healthier the statistics of the table + * the healthier the statistics of the table + * @return Health, the value range is [0, 100], the larger the value, */ public static int getTableHealth(long totalRows, long updatedRows) { if (updatedRows >= totalRows) { @@ -491,19 +506,25 @@ public static int getTableHealth(long totalRows, long updatedRows) { /** * Estimate hive table row count. * First get it from remote table parameters. If not found, estimate it : totalSize/estimatedRowSize + * * @param table Hive HMSExternalTable to estimate row count. + * @param isInit Flag to indicate if this is called during init. To avoid recursively get schema. 
* @return estimated row count */ - public static long getHiveRowCount(HMSExternalTable table) { + public static long getHiveRowCount(HMSExternalTable table, boolean isInit) { Map parameters = table.getRemoteTable().getParameters(); if (parameters == null) { return -1; } // Table parameters contains row count, simply get and return it. if (parameters.containsKey(NUM_ROWS)) { - return Long.parseLong(parameters.get(NUM_ROWS)); + long rows = Long.parseLong(parameters.get(NUM_ROWS)); + // Sometimes, the NUM_ROWS in hms is 0 but actually is not. Need to check TOTAL_SIZE if NUM_ROWS is 0. + if (rows != 0) { + return rows; + } } - if (!parameters.containsKey(TOTAL_SIZE)) { + if (!parameters.containsKey(TOTAL_SIZE) || isInit) { return -1; } // Table parameters doesn't contain row count but contain total size. Estimate row count : totalSize/rowSize @@ -521,6 +542,7 @@ public static long getHiveRowCount(HMSExternalTable table) { /** * Estimate iceberg table row count. * Get the row count by adding all task file recordCount. + * * @param table Iceberg HMSExternalTable to estimate row count. * @return estimated row count */ @@ -544,6 +566,7 @@ public static long getIcebergRowCount(HMSExternalTable table) { /** * Estimate hive table row count : totalFileSize/estimatedRowSize + * * @param table Hive HMSExternalTable to estimate row count. * @return estimated row count */ @@ -618,6 +641,7 @@ public static long getRowCountFromFileList(HMSExternalTable table) { /** * Get Iceberg column statistics. + * * @param colName * @param table Iceberg table. * @return Optional Column statistic for the given column. 
@@ -626,8 +650,8 @@ public static Optional getIcebergColumnStats(String colName, or TableScan tableScan = table.newScan().includeColumnStats(); ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(); columnStatisticBuilder.setCount(0); - columnStatisticBuilder.setMaxValue(Double.MAX_VALUE); - columnStatisticBuilder.setMinValue(Double.MIN_VALUE); + columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); + columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); columnStatisticBuilder.setDataSize(0); columnStatisticBuilder.setAvgSizeByte(0); columnStatisticBuilder.setNumNulls(0); @@ -642,7 +666,7 @@ public static Optional getIcebergColumnStats(String colName, or } private static void processDataFile(DataFile dataFile, PartitionSpec partitionSpec, - String colName, ColumnStatisticBuilder columnStatisticBuilder) { + String colName, ColumnStatisticBuilder columnStatisticBuilder) { int colId = -1; for (Types.NestedField column : partitionSpec.schema().columns()) { if (column.name().equals(colName)) { @@ -678,4 +702,87 @@ public static void sleep(long millis) { // IGNORE } } + + public static String quote(String str) { + return "'" + str + "'"; + } + + public static boolean isMaster(Frontend frontend) { + InetSocketAddress socketAddress = new InetSocketAddress(frontend.getHost(), frontend.getEditLogPort()); + return Env.getCurrentEnv().getHaProtocol().getLeader().equals(socketAddress); + } + + public static String escapeSQL(String str) { + if (str == null) { + return null; + } + return org.apache.commons.lang3.StringUtils.replace(str, "'", "''"); + } + + public static boolean isExternalTable(String catalogName, String dbName, String tblName) { + TableIf table; + try { + table = StatisticsUtil.findTable(catalogName, dbName, tblName); + } catch (Throwable e) { + LOG.warn(e.getMessage()); + return false; + } + return table instanceof ExternalTable; + } + + public static boolean inAnalyzeTime(LocalTime now) { + try { + Pair range = 
findRangeFromGlobalSessionVar(); + if (range == null) { + return false; + } + LocalTime start = range.first; + LocalTime end = range.second; + if (start.isAfter(end) && (now.isAfter(start) || now.isBefore(end))) { + return true; + } else { + return now.isAfter(start) && now.isBefore(end); + } + } catch (DateTimeParseException e) { + LOG.warn("Parse analyze start/end time format fail", e); + return true; + } + } + + private static Pair findRangeFromGlobalSessionVar() { + try { + String startTime = + findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_START_TIME) + .fullAutoAnalyzeStartTime; + // For compatibility + if (StringUtils.isEmpty(startTime)) { + startTime = StatisticConstants.FULL_AUTO_ANALYZE_START_TIME; + } + String endTime = findRangeFromGlobalSessionVar(SessionVariable.FULL_AUTO_ANALYZE_END_TIME) + .fullAutoAnalyzeEndTime; + if (StringUtils.isEmpty(endTime)) { + endTime = StatisticConstants.FULL_AUTO_ANALYZE_END_TIME; + } + DateTimeFormatter timeFormatter = DateTimeFormatter.ofPattern("HH:mm:ss"); + return Pair.of(LocalTime.parse(startTime, timeFormatter), LocalTime.parse(endTime, timeFormatter)); + } catch (Exception e) { + return null; + } + } + + private static SessionVariable findRangeFromGlobalSessionVar(String varName) throws Exception { + SessionVariable sessionVariable = VariableMgr.newSessionVariable(); + VariableExpr variableExpr = new VariableExpr(varName, SetType.GLOBAL); + VariableMgr.getValue(sessionVariable, variableExpr); + return sessionVariable; + } + + public static boolean enableAutoAnalyze() { + try { + return findRangeFromGlobalSessionVar(SessionVariable.ENABLE_FULL_AUTO_ANALYZE).enableFullAutoAnalyze; + } catch (Exception e) { + LOG.warn("Fail to get value of enable auto analyze, return false by default", e); + } + return false; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java index feeb971b15ee694..a4062d2edff9c5a 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/jobs/joinorder/hypergraph/OtherJoinTest.java @@ -20,6 +20,7 @@ import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.datasets.tpch.TPCHTestBase; import org.apache.doris.nereids.trees.plans.Plan; +import org.apache.doris.nereids.trees.plans.logical.LogicalProject; import org.apache.doris.nereids.util.HyperGraphBuilder; import org.apache.doris.nereids.util.MemoTestUtils; import org.apache.doris.nereids.util.PlanChecker; @@ -32,23 +33,37 @@ public class OtherJoinTest extends TPCHTestBase { @Test - public void randomTest() { + public void test() { + for (int t = 3; t < 10; t++) { + for (int e = t - 1; e <= (t * (t - 1)) / 2; e++) { + for (int i = 0; i < 10; i++) { + System.out.println(String.valueOf(t) + " " + e + ": " + i); + randomTest(t, e); + } + } + } + } + + private void randomTest(int tableNum, int edgeNum) { HyperGraphBuilder hyperGraphBuilder = new HyperGraphBuilder(); Plan plan = hyperGraphBuilder - .randomBuildPlanWith(10, 20); - Set> res1 = hyperGraphBuilder.evaluate(plan); + .randomBuildPlanWith(tableNum, edgeNum); + plan = new LogicalProject(plan.getOutput(), plan); + Set> res1 = hyperGraphBuilder.evaluate(plan); CascadesContext cascadesContext = MemoTestUtils.createCascadesContext(connectContext, plan); hyperGraphBuilder.initStats(cascadesContext); Plan optimizedPlan = PlanChecker.from(cascadesContext) - .dpHypOptimize() - .getBestPlanTree(); + .dpHypOptimize() + .getBestPlanTree(); - Set> res2 = hyperGraphBuilder.evaluate(optimizedPlan); + Set> res2 = hyperGraphBuilder.evaluate(optimizedPlan); if (!res1.equals(res2)) { - System.out.println(res1); - System.out.println(res2); System.out.println(plan.treeString()); 
System.out.println(optimizedPlan.treeString()); + cascadesContext = MemoTestUtils.createCascadesContext(connectContext, plan); + PlanChecker.from(cascadesContext).dpHypOptimize().getBestPlanTree(); + System.out.println(res1); + System.out.println(res2); } Assertions.assertTrue(res1.equals(res2)); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java index 1fe5e5b0a0e6f6c..31affe06252bd81 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/stats/FilterEstimationTest.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.stats; +import org.apache.doris.analysis.IntLiteral; import org.apache.doris.nereids.trees.expressions.And; import org.apache.doris.nereids.trees.expressions.Cast; import org.apache.doris.nereids.trees.expressions.EqualTo; @@ -75,7 +76,7 @@ public void testOrNaN() { Statistics expected = filterEstimation.estimate(or, stat); Assertions.assertTrue( Precision.equals(expected.getRowCount(), 750, - 0.01)); + 0.01)); } // a > 500 and b < 100 @@ -132,12 +133,12 @@ public void testNotInNaN() { Map slotToColumnStat = new HashMap<>(); ColumnStatisticBuilder builder = new ColumnStatisticBuilder() .setNdv(500) - .setIsUnknown(true); + .setIsUnknown(false); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(notIn, stat); - Assertions.assertTrue(Precision.equals(666.666, expected.getRowCount(), 0.01)); + Assertions.assertTrue(Precision.equals(1000, expected.getRowCount(), 0.01)); } /** @@ -165,7 +166,6 @@ public void testRelatedAnd() { ColumnStatistic aStatsEst = result.findColumnStatistics(a); Assertions.assertEquals(100, aStatsEst.minValue); Assertions.assertEquals(200, 
aStatsEst.maxValue); - Assertions.assertEquals(1.0, aStatsEst.selectivity); Assertions.assertEquals(10, aStatsEst.ndv); } @@ -198,7 +198,7 @@ public void test1() { Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); Statistics expected = filterEstimation.estimate(or, stat); - Assertions.assertEquals(51, expected.getRowCount(), 0.1); + Assertions.assertEquals(51.9, expected.getRowCount(), 0.1); } // a > 500 and b < 100 or a > c @@ -418,7 +418,9 @@ public void test10() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1) - .setMaxValue(10); + .setMinExpr(new IntLiteral(1)) + .setMaxValue(10) + .setMaxExpr(new IntLiteral(10)); slotToColumnStat.put(a, builder.build()); Statistics stat = new Statistics(1000, slotToColumnStat); FilterEstimation filterEstimation = new FilterEstimation(); @@ -467,22 +469,19 @@ public void test12() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(1000) - .setMaxValue(10000) - .setSelectivity(1.0); + .setMaxValue(10000); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(200) - .setSelectivity(1.0); + .setMaxValue(200); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -504,7 +503,7 @@ public void test12() { * filter range has intersection with (c.min, c.max) * rows = 100 * a primary key, a.ndv reduced by 1/4 - * b normal field, b.ndv=20 => + * b normal field, b.ndv=20 * c.ndv = 10/40 * c.ndv */ @Test @@ -524,22 +523,19 @@ public void testFilterInsideMinMax() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setSelectivity(1.0); + .setMaxValue(100); ColumnStatisticBuilder builderB = new 
ColumnStatisticBuilder() .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -554,25 +550,21 @@ public void testFilterInsideMinMax() { Assertions.assertEquals(100, statsA.maxValue); ColumnStatistic statsB = estimated.findColumnStatistics(b); - Assertions.assertEquals(15.6, statsB.ndv, 0.1); + Assertions.assertEquals(20, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); - Assertions.assertEquals(1.0, statsB.selectivity); ColumnStatistic statsC = estimated.findColumnStatistics(c); Assertions.assertEquals(10, statsC.ndv); Assertions.assertEquals(10, statsC.minValue); Assertions.assertEquals(20, statsC.maxValue); - Assertions.assertEquals(1.0, statsC.selectivity); } /** * test filter estimation, c > 300, where 300 is out of c's range (0,200) * after filter - * c.selectivity=a.selectivity=b.selectivity = 0 * c.ndv=a.ndv=b.ndv=0 - * a.ndv = b.ndv = 0 */ @Test @@ -587,23 +579,23 @@ public void testFilterOutofMinMax() { .setNdv(1000) .setAvgSizeByte(4) .setNumNulls(0) - .setMinValue(10000) - .setMaxValue(1000) - .setSelectivity(1.0); + .setMinValue(1000) + .setMaxValue(10000) + .setCount(1000); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(500) - .setSelectivity(1.0); + .setCount(1000); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(100) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) .setMaxValue(200) - .setSelectivity(1.0); + .setCount(1000); slotToColumnStat.put(a, builderA.build()); 
slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -616,8 +608,8 @@ public void testFilterOutofMinMax() { Assertions.assertEquals(0, statsB.ndv); ColumnStatistic statsC = estimated.findColumnStatistics(c); Assertions.assertEquals(0, statsC.ndv); - Assertions.assertTrue(Double.isNaN(statsC.minValue)); - Assertions.assertTrue(Double.isNaN(statsC.maxValue)); + Assertions.assertTrue(Double.isInfinite(statsC.minValue)); + Assertions.assertTrue(Double.isInfinite(statsC.maxValue)); } /** @@ -660,22 +652,19 @@ public void testInPredicateEstimationForColumns() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(100) - .setSelectivity(1.0); + .setMaxValue(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setNdv(20) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -690,7 +679,7 @@ public void testInPredicateEstimationForColumns() { Assertions.assertEquals(5, statsA.ndv, 0.1); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(4.5, statsB.ndv, 0.1); + Assertions.assertEquals(5, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(2, statsC.ndv); @@ -732,7 +721,6 @@ public void testInPredicateEstimationForColumnsOutofRange() { .setNumNulls(0) .setMinValue(0) .setMaxValue(100) - .setSelectivity(1.0) .setCount(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setCount(100) @@ -740,16 +728,14 @@ public void testInPredicateEstimationForColumnsOutofRange() { 
.setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setCount(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -767,7 +753,7 @@ public void testInPredicateEstimationForColumnsOutofRange() { Assertions.assertEquals(5, statsA.ndv, 0.1); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(4.5, statsB.ndv, 0.1); + Assertions.assertEquals(5, statsB.ndv, 0.1); Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(2, statsC.ndv); @@ -804,7 +790,6 @@ public void testFilterEstimationForColumnsNotChanged() { .setNumNulls(0) .setMinValue(0) .setMaxValue(100) - .setSelectivity(1.0) .setCount(100); ColumnStatisticBuilder builderB = new ColumnStatisticBuilder() .setCount(100) @@ -812,16 +797,14 @@ public void testFilterEstimationForColumnsNotChanged() { .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(500) - .setSelectivity(1.0); + .setMaxValue(500); ColumnStatisticBuilder builderC = new ColumnStatisticBuilder() .setCount(100) .setNdv(40) .setAvgSizeByte(4) .setNumNulls(0) .setMinValue(0) - .setMaxValue(40) - .setSelectivity(1.0); + .setMaxValue(40); slotToColumnStat.put(a, builderA.build()); slotToColumnStat.put(b, builderB.build()); slotToColumnStat.put(c, builderC.build()); @@ -836,7 +819,7 @@ public void testFilterEstimationForColumnsNotChanged() { Assertions.assertEquals(75, statsA.ndv); Assertions.assertEquals(0, statsA.minValue); Assertions.assertEquals(100, statsA.maxValue); - Assertions.assertEquals(19.9, statsB.ndv, 0.1); + Assertions.assertEquals(20, statsB.ndv, 0.1); 
Assertions.assertEquals(0, statsB.minValue); Assertions.assertEquals(500, statsB.maxValue); Assertions.assertEquals(30, statsC.ndv); @@ -853,7 +836,6 @@ public void testBetweenCastFilter() { .setNumNulls(0) .setMaxValue(100) .setMinValue(0) - .setSelectivity(1.0) .setCount(100); DoubleLiteral begin = new DoubleLiteral(40.0); DoubleLiteral end = new DoubleLiteral(50.0); @@ -881,7 +863,6 @@ public void testDateRangeSelectivity() { .setNumNulls(0) .setMaxValue(to.getDouble()) .setMinValue(from.getDouble()) - .setSelectivity(1.0) .setCount(100); DateLiteral mid = new DateLiteral("1999-01-01"); GreaterThan greaterThan = new GreaterThan(a, mid); diff --git a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java index 612daae8739b8df..e33c28ae933950f 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java +++ b/fe/fe-core/src/test/java/org/apache/doris/nereids/util/HyperGraphBuilder.java @@ -17,6 +17,7 @@ package org.apache.doris.nereids.util; +import org.apache.doris.catalog.Env; import org.apache.doris.common.Pair; import org.apache.doris.nereids.CascadesContext; import org.apache.doris.nereids.jobs.joinorder.JoinOrderJob; @@ -26,6 +27,7 @@ import org.apache.doris.nereids.trees.expressions.EqualTo; import org.apache.doris.nereids.trees.expressions.Expression; import org.apache.doris.nereids.trees.expressions.Slot; +import org.apache.doris.nereids.trees.expressions.SlotReference; import org.apache.doris.nereids.trees.plans.JoinType; import org.apache.doris.nereids.trees.plans.Plan; import org.apache.doris.nereids.trees.plans.logical.LogicalJoin; @@ -35,6 +37,7 @@ import org.apache.doris.nereids.trees.plans.physical.PhysicalOlapScan; import org.apache.doris.statistics.ColumnStatistic; import org.apache.doris.statistics.Statistics; +import org.apache.doris.statistics.StatisticsCacheKey; import com.google.common.base.Preconditions; 
import com.google.common.collect.ImmutableList; @@ -58,14 +61,14 @@ public class HyperGraphBuilder { private final HashMap plans = new HashMap<>(); private final HashMap> schemas = new HashMap<>(); - private final ImmutableList fullJoinTypes = ImmutableList.of( + private ImmutableList fullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, JoinType.FULL_OUTER_JOIN ); - private final ImmutableList leftFullJoinTypes = ImmutableList.of( + private ImmutableList leftFullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, @@ -75,7 +78,7 @@ public class HyperGraphBuilder { JoinType.NULL_AWARE_LEFT_ANTI_JOIN ); - private final ImmutableList rightFullJoinTypes = ImmutableList.of( + private ImmutableList rightFullJoinTypes = ImmutableList.of( JoinType.INNER_JOIN, JoinType.LEFT_OUTER_JOIN, JoinType.RIGHT_OUTER_JOIN, @@ -84,12 +87,32 @@ public class HyperGraphBuilder { JoinType.RIGHT_ANTI_JOIN ); + public HyperGraphBuilder() {} + + public HyperGraphBuilder(Set validJoinType) { + fullJoinTypes = fullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + leftFullJoinTypes = leftFullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + rightFullJoinTypes = rightFullJoinTypes.stream() + .filter(validJoinType::contains) + .collect(ImmutableList.toImmutableList()); + } + public HyperGraph build() { assert plans.size() == 1 : "there are cross join"; Plan plan = plans.values().iterator().next(); return buildHyperGraph(plan); } + public Plan buildPlan() { + assert plans.size() == 1 : "there are cross join"; + Plan plan = plans.values().iterator().next(); + return plan; + } + public Plan buildJoinPlan() { assert plans.size() == 1 : "there are cross join"; Plan plan = plans.values().iterator().next(); @@ -166,9 +189,14 @@ public void initStats(CascadesContext context) { for (Group group : 
context.getMemo().getGroups()) { GroupExpression groupExpression = group.getLogicalExpression(); if (groupExpression.getPlan() instanceof LogicalOlapScan) { + LogicalOlapScan scan = (LogicalOlapScan) groupExpression.getPlan(); Statistics stats = injectRowcount((LogicalOlapScan) groupExpression.getPlan()); - groupExpression.setStatDerived(true); - group.setStatistics(stats); + for (Expression expr : stats.columnStatistics().keySet()) { + SlotReference slot = (SlotReference) expr; + Env.getCurrentEnv().getStatisticsCache().putCache( + new StatisticsCacheKey(scan.getTable().getId(), -1, slot.getName()), + stats.columnStatistics().get(expr)); + } } } } @@ -313,8 +341,8 @@ private Statistics injectRowcount(LogicalOlapScan scanPlan) { for (Slot slot : scanPlan.getOutput()) { slotIdToColumnStats.put(slot, new ColumnStatistic(count, count, null, 1, 0, 0, 0, - count, 1, null, null, true, null, - new Date().toString())); + count, null, null, true, null, + new Date().toString(), null)); } return new Statistics(count, slotIdToColumnStats); } @@ -364,7 +392,7 @@ private Expression makeCondition(int node1, int node2, BitSet bitSet) { return hashConjunts; } - public Set> evaluate(Plan plan) { + public Set> evaluate(Plan plan) { JoinEvaluator evaluator = new JoinEvaluator(rowCounts); Map> res = evaluator.evaluate(plan); int rowCount = 0; @@ -376,11 +404,12 @@ public Set> evaluate(Plan plan) { (slot1, slot2) -> String.CASE_INSENSITIVE_ORDER.compare(slot1.toString(), slot2.toString())) .collect(Collectors.toList()); - Set> tuples = new HashSet<>(); + Set> tuples = new HashSet<>(); + tuples.add(keySet.stream().map(s -> s.toString()).collect(Collectors.toList())); for (int i = 0; i < rowCount; i++) { - List tuple = new ArrayList<>(); + List tuple = new ArrayList<>(); for (Slot key : keySet) { - tuple.add(res.get(key).get(i)); + tuple.add(String.valueOf(res.get(key).get(i))); } tuples.add(tuple); } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java index 1955a0d9a3e3612..9624c20149828dd 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisJobTest.java @@ -26,7 +26,6 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.statistics.util.StatisticsUtil; import org.apache.doris.utframe.TestWithFeService; @@ -38,6 +37,7 @@ import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -62,13 +62,7 @@ protected void runBeforeAll() throws Exception { } @Test - public void testCreateAnalysisJob(@Mocked AnalysisTaskScheduler scheduler) throws Exception { - new Expectations() { - { - scheduler.schedule((BaseAnalysisTask) any); - times = 3; - } - }; + public void testCreateAnalysisJob() throws Exception { new MockUp() { @@ -101,7 +95,7 @@ public ConnectContext get() { } @Test - public void testJobExecution(@Mocked AnalysisTaskScheduler scheduler, @Mocked StmtExecutor stmtExecutor) + public void testJobExecution(@Mocked StmtExecutor stmtExecutor) throws Exception { new MockUp() { @@ -120,10 +114,16 @@ public void execUpdate(String sql) throws Exception { public void syncLoadColStats(long tableId, long idxId, String colName) { } }; - new Expectations() { - { - stmtExecutor.execute(); - times = 2; + new MockUp() { + + @Mock + public void execute() throws Exception { + + } + + @Mock + public List executeInternalQuery() { + return new ArrayList<>(); } }; HashMap> colToPartitions = Maps.newHashMap(); @@ -135,8 +135,15 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { 
.setAnalysisMethod(AnalysisMethod.FULL) .setAnalysisType(AnalysisType.FUNDAMENTALS) .setColToPartitions(colToPartitions) + .setState(AnalysisState.RUNNING) .build(); new OlapAnalysisTask(analysisJobInfo).doExecute(); + new Expectations() { + { + stmtExecutor.execute(); + times = 1; + } + }; } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java index 42f643a137d8b7b..196ac8ad9a056f3 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/AnalysisTaskExecutorTest.java @@ -24,13 +24,12 @@ import org.apache.doris.statistics.AnalysisInfo.AnalysisMode; import org.apache.doris.statistics.AnalysisInfo.AnalysisType; import org.apache.doris.statistics.AnalysisInfo.JobType; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; import org.apache.doris.utframe.TestWithFeService; import com.google.common.collect.Maps; +import mockit.Expectations; import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.jupiter.api.Test; import java.util.Collections; @@ -41,8 +40,6 @@ public class AnalysisTaskExecutorTest extends TestWithFeService { - @Mocked - AnalysisTaskScheduler analysisTaskScheduler; @Override protected void runBeforeAll() throws Exception { @@ -71,13 +68,7 @@ public void testExpiredJobCancellation() throws Exception { .build(); OlapAnalysisTask analysisJob = new OlapAnalysisTask(analysisJobInfo); - new MockUp() { - public synchronized BaseAnalysisTask getPendingTasks() { - return analysisJob; - } - }; - - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); BlockingQueue b = Deencapsulation.getField(analysisTaskExecutor, "taskQueue"); AnalysisTaskWrapper analysisTaskWrapper = new 
AnalysisTaskWrapper(analysisTaskExecutor, analysisJob); Deencapsulation.setField(analysisTaskWrapper, "startTime", 5); @@ -97,7 +88,12 @@ public List executeInternalQuery() { new MockUp() { @Mock - public void execSQL(String sql) throws Exception { + public void execSQLs(List sqls) throws Exception { + } + + @Mock + protected void executeWithExceptionOnFail(StmtExecutor stmtExecutor) throws Exception { + // DO NOTHING } }; @@ -108,7 +104,7 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { } }; - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); HashMap> colToPartitions = Maps.newHashMap(); colToPartitions.put("col1", Collections.singleton("t1")); AnalysisInfo analysisInfo = new AnalysisInfoBuilder().setJobId(0).setTaskId(0) @@ -117,19 +113,20 @@ public void syncLoadColStats(long tableId, long idxId, String colName) { .setAnalysisMode(AnalysisMode.FULL) .setAnalysisMethod(AnalysisMethod.FULL) .setAnalysisType(AnalysisType.FUNDAMENTALS) + .setState(AnalysisState.RUNNING) .setColToPartitions(colToPartitions) .build(); OlapAnalysisTask task = new OlapAnalysisTask(analysisInfo); - new MockUp() { - @Mock - public synchronized BaseAnalysisTask getPendingTasks() { - return task; - } - }; + new MockUp() { @Mock public void updateTaskStatus(AnalysisInfo info, AnalysisState jobState, String message, long time) {} }; - Deencapsulation.invoke(analysisTaskExecutor, "doFetchAndExecute"); + new Expectations() { + { + task.doExecute(); + } + }; + Deencapsulation.invoke(analysisTaskExecutor, "submitTask", task); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java index d6570ecebc5df82..77086723e27153c 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java +++ 
b/fe/fe-core/src/test/java/org/apache/doris/statistics/CacheTest.java @@ -25,8 +25,10 @@ import org.apache.doris.catalog.external.HMSExternalTable; import org.apache.doris.datasource.CatalogMgr; import org.apache.doris.datasource.HMSExternalCatalog; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; +import org.apache.doris.ha.FrontendNodeType; import org.apache.doris.statistics.util.StatisticsUtil; +import org.apache.doris.system.Frontend; +import org.apache.doris.thrift.TUpdateFollowerStatsCacheRequest; import org.apache.doris.utframe.TestWithFeService; import com.google.common.collect.Lists; @@ -90,60 +92,20 @@ public List execStatisticQuery(String sql) { } catch (InterruptedException e) { // ignore } - List colNames = new ArrayList<>(); - colNames.add("count"); - colNames.add("ndv"); - colNames.add("null_count"); - colNames.add("data_size_in_bytes"); - colNames.add("catalog_id"); - colNames.add("db_id"); - colNames.add("idx_id"); - colNames.add("tbl_id"); - colNames.add("col_id"); - colNames.add("min"); - colNames.add("max"); - colNames.add("part_id"); - colNames.add("update_time"); - List primitiveTypes = new ArrayList<>(); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.BIGINT); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - List values = new ArrayList<>(); - values.add("1"); - values.add("2"); - values.add("3"); - values.add("4"); - values.add("5"); - values.add("-1"); - values.add("6"); - values.add("7"); - values.add("8"); - values.add("9"); - values.add("10"); - 
values.add(null); - values.add(new Date().toString()); - ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values); - return Arrays.asList(resultRow); + return Arrays.asList(StatsMockUtil.mockResultRow(true)); } }; StatisticsCache statisticsCache = new StatisticsCache(); ColumnStatistic columnStatistic = statisticsCache.getColumnStatistics(-1, -1, 0, "col"); + // load not finished yet, should return unknown Assertions.assertTrue(columnStatistic.isUnKnown); + // wait 1 sec to ensure `execStatisticQuery` is finished as much as possible. Thread.sleep(1000); + // load has finished, return corresponding stats. columnStatistic = statisticsCache.getColumnStatistics(-1, -1, 0, "col"); - Assertions.assertEquals(1, columnStatistic.count); - Assertions.assertEquals(2, columnStatistic.ndv); - Assertions.assertEquals(10, columnStatistic.maxValue); + Assertions.assertEquals(7, columnStatistic.count); + Assertions.assertEquals(8, columnStatistic.ndv); + Assertions.assertEquals(11, columnStatistic.maxValue); } @Test @@ -159,11 +121,10 @@ public Histogram fromResultRow(ResultRow resultRow) { Type dataType = col.getType(); histogramBuilder.setDataType(dataType); + HistData histData = new HistData(resultRow); + histogramBuilder.setSampleRate(histData.sampleRate); - double sampleRate = Double.parseDouble(resultRow.getColumnValue("sample_rate")); - histogramBuilder.setSampleRate(sampleRate); - - String json = resultRow.getColumnValue("buckets"); + String json = histData.buckets; JsonObject jsonObj = JsonParser.parseString(json).getAsJsonObject(); int bucketNum = jsonObj.get("num_buckets").getAsInt(); @@ -202,28 +163,14 @@ public List execStatisticQuery(String sql) { } catch (InterruptedException e) { // ignore } - List colNames = new ArrayList<>(); - colNames.add("catalog_id"); - colNames.add("db_id"); - colNames.add("idx_id"); - colNames.add("tbl_id"); - colNames.add("col_id"); - colNames.add("sample_rate"); - colNames.add("buckets"); - List primitiveTypes = new 
ArrayList<>(); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); - primitiveTypes.add(PrimitiveType.VARCHAR); List values = new ArrayList<>(); values.add("1"); values.add("2"); values.add("3"); - values.add("-1"); values.add("4"); + values.add("-1"); + values.add("col"); + values.add(null); values.add("0.2"); String buckets = "{\"num_buckets\":5,\"buckets\":" + "[{\"lower\":\"2022-09-21 17:30:29\",\"upper\":\"2022-09-21 22:30:29\"," @@ -237,7 +184,8 @@ public List execStatisticQuery(String sql) { + "{\"lower\":\"2022-09-25 17:30:29\",\"upper\":\"2022-09-25 22:30:29\"," + "\"count\":9,\"pre_sum\":37,\"ndv\":1}]}"; values.add(buckets); - ResultRow resultRow = new ResultRow(colNames, primitiveTypes, values); + values.add(new Date().toString()); + ResultRow resultRow = new ResultRow(values); return Collections.singletonList(resultRow); } }; @@ -251,10 +199,10 @@ public List execStatisticQuery(String sql) { @Test public void testLoadFromMeta(@Mocked Env env, - @Mocked CatalogMgr mgr, - @Mocked HMSExternalCatalog catalog, - @Mocked HMSExternalDatabase db, - @Mocked HMSExternalTable table) throws Exception { + @Mocked CatalogMgr mgr, + @Mocked HMSExternalCatalog catalog, + @Mocked HMSExternalDatabase db, + @Mocked HMSExternalTable table) throws Exception { new MockUp() { @Mock @@ -290,8 +238,8 @@ public Env getCurrentEnv() { table.getColumnStatistic("col"); result = new ColumnStatistic(1, 2, - null, 3, 4, 5, 6, 7, 8, - null, null, false, null, new Date().toString()); + null, 3, 4, 5, 6, 7, + null, null, false, null, new Date().toString(), null); } }; StatisticsCache statisticsCache = new StatisticsCache(); @@ -306,4 +254,96 @@ public Env getCurrentEnv() { Assertions.assertEquals(6, columnStatistic.minValue); Assertions.assertEquals(7, 
columnStatistic.maxValue); } + + @Test + public void testSync1() throws Exception { + new MockUp() { + @Mock + public List loadColStats(long tableId, long idxId, String colName) { + List rows = new ArrayList<>(); + rows.add(StatsMockUtil.mockResultRow(true)); + rows.add(StatsMockUtil.mockResultRow(false)); + return rows; + } + + @Mock + public boolean isMaster(Frontend frontend) { + return frontend.getRole().equals(FrontendNodeType.MASTER); + } + }; + new MockUp() { + @Mock + public List getFrontends(FrontendNodeType nodeType) { + Frontend frontend1 = new Frontend(FrontendNodeType.MASTER, + "fe1", "localhost:1111", "localhost", 2222); + Frontend frontend2 = new Frontend(FrontendNodeType.FOLLOWER, + "fe1", "localhost:1112", "localhost", 2223); + List frontends = new ArrayList<>(); + frontends.add(frontend1); + frontends.add(frontend2); + return frontends; + } + }; + + new MockUp() { + @Mock + private void sendStats(Frontend frontend, + TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + // DO NONTHING + } + }; + StatisticsCache statisticsCache = new StatisticsCache(); + statisticsCache.syncLoadColStats(1L, 1L, "any"); + new Expectations() { + { + statisticsCache.sendStats((Frontend) any, (TUpdateFollowerStatsCacheRequest) any); + times = 1; + } + }; + } + + @Test + public void testSync2() throws Exception { + new MockUp() { + @Mock + + public ColumnStatistic fromResultRow(ResultRow row) { + return ColumnStatistic.UNKNOWN; + } + + @Mock + public ColumnStatistic fromResultRow(List row) { + return ColumnStatistic.UNKNOWN; + } + }; + new MockUp() { + @Mock + public List getFrontends(FrontendNodeType nodeType) { + Frontend frontend1 = new Frontend(FrontendNodeType.MASTER, + "fe1", "localhost:1111", "localhost", 2222); + Frontend frontend2 = new Frontend(FrontendNodeType.FOLLOWER, + "fe1", "localhost:1112", "localhost", 2223); + List frontends = new ArrayList<>(); + frontends.add(frontend1); + frontends.add(frontend2); + return frontends; + } + }; + + 
new MockUp() { + @Mock + private void sendStats(Frontend frontend, + TUpdateFollowerStatsCacheRequest updateFollowerStatsCacheRequest) { + // DO NOTHING + } + }; + StatisticsCache statisticsCache = new StatisticsCache(); + statisticsCache.syncLoadColStats(1L, 1L, "any"); + new Expectations() { + { + statisticsCache.sendStats((Frontend) any, (TUpdateFollowerStatsCacheRequest) any); + times = 0; + } + }; + } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java index d3d5245a81f8509..0660c994a127835 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/HistogramTaskTest.java @@ -30,7 +30,6 @@ import mockit.Mock; import mockit.MockUp; -import mockit.Mocked; import org.junit.FixMethodOrder; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; @@ -43,9 +42,6 @@ @FixMethodOrder(value = MethodSorters.NAME_ASCENDING) public class HistogramTaskTest extends TestWithFeService { - @Mocked - AnalysisTaskScheduler analysisTaskScheduler; - @Override protected void runBeforeAll() throws Exception { createDatabase("histogram_task_test"); @@ -96,7 +92,7 @@ public void test1TaskCreation() throws Exception { @Test public void test2TaskExecution() throws Exception { - AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(analysisTaskScheduler); + AnalysisTaskExecutor analysisTaskExecutor = new AnalysisTaskExecutor(1); AnalysisInfo analysisInfo = new AnalysisInfoBuilder() .setJobId(0).setTaskId(0).setCatalogName("internal") .setDbName(SystemInfoService.DEFAULT_CLUSTER + ":" + "histogram_task_test").setTblName("t1") @@ -107,17 +103,11 @@ public void test2TaskExecution() throws Exception { .build(); HistogramTask task = new HistogramTask(analysisInfo); - new MockUp() { - @Mock - public synchronized BaseAnalysisTask getPendingTasks() { - return 
task; - } - }; new MockUp() { @Mock public void updateTaskStatus(AnalysisInfo info, AnalysisState jobState, String message, long time) {} }; - Deencapsulation.invoke(analysisTaskExecutor, "doFetchAndExecute"); + Deencapsulation.invoke(analysisTaskExecutor, "submitTask", task); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java index 78872a547d4084f..a1ff5b135875222 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsDeriveResultTest.java @@ -29,7 +29,8 @@ public class StatsDeriveResultTest { public void testUpdateRowCountByLimit() { StatsDeriveResult stats = new StatsDeriveResult(100); ColumnStatistic a = new ColumnStatistic(100, 10, null, 1, 5, 10, - 1, 100, 0.5, null, null, false, null, new Date().toString()); + 1, 100, null, null, false, null, + new Date().toString(), null); Id id = new Id(1); stats.addColumnStats(id, a); StatsDeriveResult res = stats.updateByLimit(0); @@ -42,7 +43,6 @@ public void testUpdateRowCountByLimit() { Assertions.assertEquals(1, resColStats.dataSize); Assertions.assertEquals(1, resColStats.minValue); Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(0, resColStats.selectivity); Assertions.assertEquals(false, resColStats.isUnKnown); res = stats.updateByLimit(1); @@ -53,7 +53,6 @@ public void testUpdateRowCountByLimit() { Assertions.assertEquals(1, resColStats.dataSize); Assertions.assertEquals(1, resColStats.minValue); Assertions.assertEquals(100, resColStats.maxValue); - Assertions.assertEquals(0.05, resColStats.selectivity); Assertions.assertEquals(false, resColStats.isUnKnown); } } diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java new file mode 100644 index 
000000000000000..21035051ff86066 --- /dev/null +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.statistics; + +import java.util.ArrayList; +import java.util.List; + +public class StatsMockUtil { + + public static ResultRow mockResultRow(boolean col) { + List vals = new ArrayList() {{ + add("0"); + add("1"); + add("2"); + add("3"); + add("-1"); + add("5"); + if (col) { + add(null); + } else { + add("6"); + } + add("7"); + add("8"); + add("0"); + add("10"); + add("11"); + add("12"); + add(String.valueOf(System.currentTimeMillis())); + }}; + return new ResultRow(vals); + } +} diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java deleted file mode 100644 index 8d2518ae406dc5e..000000000000000 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/InternalQueryResultTest.java +++ /dev/null @@ -1,119 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -package org.apache.doris.statistics.util; - -import org.apache.doris.catalog.PrimitiveType; -import org.apache.doris.common.DdlException; -import org.apache.doris.statistics.util.InternalQueryResult.ResultRow; - -import org.junit.Assert; -import org.junit.Before; -import org.junit.Test; - -import java.util.Arrays; -import java.util.List; - - -public class InternalQueryResultTest { - private InternalQueryResult queryResult; - private InternalQueryResult.ResultRow resultRow; - - @Before - public void setUp() throws Exception { - List columns = Arrays.asList("c1", "c2", "c3", "c4", "c5"); - List types = Arrays.asList(PrimitiveType.STRING, - PrimitiveType.INT, PrimitiveType.FLOAT, - PrimitiveType.DOUBLE, PrimitiveType.BIGINT); - queryResult = new InternalQueryResult(); - List values = Arrays.asList("s1", "1000", "0.1", "0.0001", "1000000"); - resultRow = new ResultRow(columns, types, values); - } - - @Test - public void testGetColumnIndex() { - Assert.assertEquals(0, resultRow.getColumnIndex("c1")); - Assert.assertEquals(1, resultRow.getColumnIndex("c2")); - Assert.assertEquals(2, resultRow.getColumnIndex("c3")); - Assert.assertEquals(3, resultRow.getColumnIndex("c4")); - Assert.assertEquals(4, resultRow.getColumnIndex("c5")); - } - - @Test - public void testGetColumnName() 
throws Exception { - Assert.assertEquals("c1", resultRow.getColumnName(0)); - Assert.assertEquals("c2", resultRow.getColumnName(1)); - Assert.assertEquals("c3", resultRow.getColumnName(2)); - Assert.assertEquals("c4", resultRow.getColumnName(3)); - Assert.assertEquals("c5", resultRow.getColumnName(4)); - } - - @Test - public void testGetColumnTypeWithIndex() { - try { - Assert.assertEquals(PrimitiveType.STRING, resultRow.getColumnType(0)); - Assert.assertEquals(PrimitiveType.INT, resultRow.getColumnType(1)); - Assert.assertEquals(PrimitiveType.FLOAT, resultRow.getColumnType(2)); - Assert.assertEquals(PrimitiveType.DOUBLE, resultRow.getColumnType(3)); - Assert.assertEquals(PrimitiveType.BIGINT, resultRow.getColumnType(4)); - } catch (DdlException e) { - e.printStackTrace(); - Assert.fail(); - } - } - - @Test - public void testGetColumnTypeWithName() { - try { - Assert.assertEquals(PrimitiveType.STRING, resultRow.getColumnType("c1")); - Assert.assertEquals(PrimitiveType.INT, resultRow.getColumnType("c2")); - Assert.assertEquals(PrimitiveType.FLOAT, resultRow.getColumnType("c3")); - Assert.assertEquals(PrimitiveType.DOUBLE, resultRow.getColumnType("c4")); - Assert.assertEquals(PrimitiveType.BIGINT, resultRow.getColumnType("c5")); - } catch (DdlException e) { - e.printStackTrace(); - Assert.fail(); - } - } - - @Test - public void testGetColumnValueWithIndex() throws Exception { - Assert.assertEquals("s1", resultRow.getColumnValue(0).toString()); - Assert.assertEquals(1000, Integer.parseInt((String) resultRow.getColumnValue(1))); - Assert.assertEquals(0.1f, Float.parseFloat((String) resultRow.getColumnValue(2)), 0.0001); - Assert.assertEquals(0.0001, Double.parseDouble((String) resultRow.getColumnValue(3)), 0.0001); - Assert.assertEquals(1000000, Long.parseLong((String) resultRow.getColumnValue(4))); - } - - @Test - public void testGetColumnValueWithName() throws Exception { - Assert.assertEquals("s1", resultRow.getColumnValue(0).toString()); - Assert.assertEquals(1000, 
Integer.parseInt((String) resultRow.getColumnValue(1))); - Assert.assertEquals(0.1f, Float.parseFloat((String) resultRow.getColumnValue(2)), 0.0001); - Assert.assertEquals(0.0001, Double.parseDouble((String) resultRow.getColumnValue(3)), 0.0001); - Assert.assertEquals(1000000, Long.parseLong((String) resultRow.getColumnValue(4))); - } - - @Test - public void testGetTypeValue() throws Exception { - Assert.assertEquals("s1", resultRow.getString(0)); - Assert.assertEquals(1000, resultRow.getInt(1)); - Assert.assertEquals(0.1f, resultRow.getFloat(2), 0.0001); - Assert.assertEquals(0.0001, resultRow.getDouble(3), 0.0001); - Assert.assertEquals(1000000, resultRow.getLong(4)); - } -} diff --git a/gensrc/thrift/FrontendService.thrift b/gensrc/thrift/FrontendService.thrift index 5a3cf88db8f6eaf..37633aeb83658bb 100644 --- a/gensrc/thrift/FrontendService.thrift +++ b/gensrc/thrift/FrontendService.thrift @@ -1091,7 +1091,7 @@ struct TGetBinlogLagResult { struct TUpdateFollowerStatsCacheRequest { 1: optional string key; - 2: optional string colStats; + 2: list statsRows; } service FrontendService { diff --git a/regression-test/suites/statistics/analyze_stats.groovy b/regression-test/suites/statistics/analyze_stats.groovy index 3064a89d3f4db11..b5a39f5a43162fb 100644 --- a/regression-test/suites/statistics/analyze_stats.groovy +++ b/regression-test/suites/statistics/analyze_stats.groovy @@ -1,3 +1,5 @@ +import java.util.stream.Collectors + // Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. 
See the NOTICE file // distributed with this work for additional information @@ -19,6 +21,14 @@ suite("test_analyze") { String db = "regression_test_statistics" String tbl = "analyzetestlimited_duplicate_all" + sql """ + DROP DATABASE IF EXISTS `${db}` + """ + + sql """ + CREATE DATABASE `${db}` + """ + sql """ DROP TABLE IF EXISTS `${tbl}` """ @@ -86,7 +96,7 @@ suite("test_analyze") { sql """ SET enable_nereids_planner=true; - + """ sql """ SET enable_fallback_to_original_planner=false; @@ -98,7 +108,7 @@ suite("test_analyze") { Thread.sleep(1000 * 60) sql """ - SELECT COUNT(*) FROM ${tbl}; + SELECT COUNT(*) FROM ${tbl}; """ sql """ @@ -109,7 +119,7 @@ suite("test_analyze") { try { sql """ - SELECT COUNT(*) FROM ${tbl}; + SELECT * FROM ${tbl}; """ } catch (Exception e) { exception = e @@ -124,7 +134,7 @@ suite("test_analyze") { """ sql """ - SELECT COUNT(*) FROM ${tbl}; + SELECT COUNT(*) FROM ${tbl}; """ sql """ @@ -133,7 +143,7 @@ suite("test_analyze") { try { sql """ - SELECT COUNT(*) FROM ${tbl}; + SELECT COUNT(*) FROM ${tbl}; """ } catch (Exception e) { exception = e @@ -148,25 +158,25 @@ suite("test_analyze") { """ a_result_3 = sql """ - ANALYZE DATABASE ${db} WITH SAMPLE PERCENT 5 WITH AUTO + ANALYZE DATABASE ${db} WITH SAMPLE PERCENT 5 """ show_result = sql """ SHOW ANALYZE """ - def contains_expected_table = {r -> - for(int i = 0; i < r.size; i++) { - if (r[i][3] == "${tbl}" ) { + def contains_expected_table = { r -> + for (int i = 0; i < r.size; i++) { + if (r[i][3] == "${tbl}") { return true } } return false } - def stats_job_removed = {r, id -> - for(int i = 0; i < r.size; i++) { - if (r[i][0] == id ) { + def stats_job_removed = { r, id -> + for (int i = 0; i < r.size; i++) { + if (r[i][0] == id) { return false } } @@ -176,14 +186,14 @@ suite("test_analyze") { assert contains_expected_table(show_result) sql """ - DROP ANALYZE JOB ${a_result_3[0][4]} + DROP ANALYZE JOB ${a_result_3[0][0]} """ show_result = sql """ SHOW ANALYZE """ - assert 
stats_job_removed(show_result, a_result_3[0][4]) + assert stats_job_removed(show_result, a_result_3[0][0]) sql """ ANALYZE DATABASE ${db} WITH SAMPLE ROWS 5 WITH PERIOD 100000 @@ -224,8 +234,8 @@ suite("test_analyze") { SHOW COLUMN CACHED STATS analyze_partitioned_tbl_test(col1) """ - def expected_result = { r-> - for(int i = 0; i < r.size; i++) { + def expected_result = { r -> + for (int i = 0; i < r.size; i++) { if ((int) Double.parseDouble(r[i][1]) == 6) { return true } else { @@ -870,7 +880,7 @@ PARTITION `p599` VALUES IN (599) SHOW COLUMN CACHED STATS test_600_partition_table_analyze(id); """ - def expected_col_stats = { r, expected_value, idx -> + def expected_col_stats = { r, expected_value, idx -> return (int) Double.parseDouble(r[0][idx]) == expected_value } @@ -1030,7 +1040,7 @@ PARTITION `p599` VALUES IN (599) sql """ DROP TABLE IF EXISTS two_thousand_partition_table_test """ - + // check analyze table with thousand partition sql """ CREATE TABLE two_thousand_partition_table_test (col1 int(11451) not null) DUPLICATE KEY(col1) @@ -1049,5 +1059,49 @@ PARTITION `p599` VALUES IN (599) ANALYZE TABLE two_thousand_partition_table_test WITH SYNC; """ -} + // meta check + sql """ + CREATE TABLE `test_meta_management` ( + `col1` varchar(11451) NOT NULL, + `col2` int(11) NOT NULL, + `col3` int(11) NOT NULL + ) ENGINE=OLAP + DUPLICATE KEY(`col1`) + COMMENT 'OLAP' + DISTRIBUTED BY HASH(`col1`) BUCKETS 3 + PROPERTIES ( + "replication_allocation" = "tag.location.default: 1" + ); + """ + sql """insert into test_meta_management values(1, 2, 3);""" + sql """insert into test_meta_management values(4, 5, 6);""" + sql """insert into test_meta_management values(7, 1, 9);""" + sql """insert into test_meta_management values(3, 8, 2);""" + sql """insert into test_meta_management values(5, 2, 1);""" + sql """insert into test_meta_management values(41, 2, 3)""" + + sql """ANALYZE TABLE test_meta_management WITH SYNC""" + sql """DROP STATS test_meta_management(col1)""" + + def 
afterDropped = sql """SHOW TABLE STATS test_meta_management""" + def convert_col_list_str_to_java_collection = { cols -> + if (cols.startsWith("[") && cols.endsWith("]")) { + cols = cols.substring(1, cols.length() - 1); + } + return Arrays.stream(cols.split(",")).map(String::trim).collect(Collectors.toList()) + } + + def check_column = { r, expected -> + expected_result = convert_col_list_str_to_java_collection(expected) + actual_result = convert_col_list_str_to_java_collection(r[0][4]) + System.out.println(expected_result) + System.out.println(actual_result) + return expected_result.containsAll(actual_result) && actual_result.containsAll(expected_result) + } + assert check_column(afterDropped, "[col2, col3]") + sql """ANALYZE TABLE test_meta_management WITH SYNC""" + afterDropped = sql """SHOW TABLE STATS test_meta_management""" + assert check_column(afterDropped, "[col1, col2, col3]") + +}