From c8a598b08a12af05c3e0bf1453cebe8ba7712375 Mon Sep 17 00:00:00 2001 From: laihui <1353307710@qq.com> Date: Thu, 21 Mar 2024 11:52:40 +0800 Subject: [PATCH] self-adaption backoff timeout --- .../doris/common/InternalErrorCode.java | 4 ++- .../load/routineload/RoutineLoadJob.java | 13 ++++++++ .../load/routineload/RoutineLoadTaskInfo.java | 30 +++++++++++++++++++ 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java b/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java index 2bbd5c58efa02b..214f74a38f475e 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/InternalErrorCode.java @@ -34,7 +34,9 @@ public enum InternalErrorCode { MANUAL_STOP_ERR(101), TOO_MANY_FAILURE_ROWS_ERR(102), CREATE_TASKS_ERR(103), - TASKS_ABORT_ERR(104); + TASKS_ABORT_ERR(104), + CANNOT_RESUME_ERR(105), + TIMEOUT_TOO_MUCH(106); private long errCode; diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java index 0d9ae516351597..7dbffb27f23b5b 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadJob.java @@ -709,6 +709,18 @@ public void processTimeoutTasks() { // and after renew, the previous task is removed from routineLoadTaskInfoList, // so task can no longer be committed successfully. // the already committed task will not be handled here. + int timeoutBackOffCount = routineLoadTaskInfo.getTimeoutBackOffCount(); + if (timeoutBackOffCount > RoutineLoadTaskInfo.MAX_TIMEOUT_BACK_OFF_COUNT) { + try { + updateState(JobState.PAUSED, new ErrorReason(InternalErrorCode.TIMEOUT_TOO_MUCH, + "task " + routineLoadTaskInfo.getId() + " timeout too much"), false); + } catch (UserException e) { + LOG.warn("update job state to pause failed", e); + } + return; + } + routineLoadTaskInfo.setTimeoutBackOffCount(timeoutBackOffCount + 1); + routineLoadTaskInfo.setTimeoutMs((routineLoadTaskInfo.getTimeoutMs() << 1)); RoutineLoadTaskInfo newTask = unprotectRenewTask(routineLoadTaskInfo); Env.getCurrentEnv().getRoutineLoadTaskScheduler().addTaskInQueue(newTask); } @@ -1212,6 +1224,7 @@ private void executeTaskOnTxnStatusChanged(RoutineLoadTaskInfo routineLoadTaskIn } else if (checkCommitInfo(rlTaskTxnCommitAttachment, txnState, txnStatusChangeReason)) { // step2: update job progress updateProgress(rlTaskTxnCommitAttachment); + routineLoadTaskInfo.selfAdaptTimeout(rlTaskTxnCommitAttachment); } if (rlTaskTxnCommitAttachment != null && !Strings.isNullOrEmpty(rlTaskTxnCommitAttachment.getErrorLogUrl())) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java index 7a5312b2c8f760..500f1e27258713 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/RoutineLoadTaskInfo.java @@ -73,6 +73,9 @@ public abstract class RoutineLoadTaskInfo { protected boolean isMultiTable = false; + protected static final int MAX_TIMEOUT_BACK_OFF_COUNT = 3; + protected int timeoutBackOffCount = 0; + // this status will be set when corresponding transaction's status is changed. // so that user or other logic can know the status of the corresponding txn. protected TransactionStatus txnStatus = TransactionStatus.UNKNOWN; @@ -136,6 +139,10 @@ public void setLastScheduledTime(long lastScheduledTime) { this.lastScheduledTime = lastScheduledTime; } + public void setTimeoutMs(long timeoutMs) { + this.timeoutMs = timeoutMs; + } + public long getTimeoutMs() { return timeoutMs; } @@ -148,6 +155,14 @@ public TransactionStatus getTxnStatus() { return txnStatus; } + public void setTimeoutBackOffCount(int timeoutBackOffCount) { + this.timeoutBackOffCount = timeoutBackOffCount; + } + + public int getTimeoutBackOffCount() { + return timeoutBackOffCount; + } + public boolean isTimeout() { if (txnStatus == TransactionStatus.COMMITTED || txnStatus == TransactionStatus.VISIBLE) { // the corresponding txn is already finished, this task can not be treated as timeout. @@ -162,6 +177,21 @@ public boolean isTimeout() { return false; } + public void selfAdaptTimeout(RLTaskTxnCommitAttachment rlTaskTxnCommitAttachment) { + long taskExecutionTime = rlTaskTxnCommitAttachment.getTaskExecutionTimeMs(); + long timeoutMs = this.timeoutMs; + + while (this.timeoutBackOffCount > 0) { + timeoutMs = timeoutMs >> 1; + if (timeoutMs <= taskExecutionTime) { + this.timeoutMs = timeoutMs << 1; + return; + } + this.timeoutBackOffCount--; + } + this.timeoutMs = timeoutMs; + } + abstract TRoutineLoadTask createRoutineLoadTask() throws UserException; // begin the txn of this task