Browse Source

[Bug-7206] [MasterServer] fix process instance always running when a task times out (#7207)

* fix timeout

* add task timeout map to avoid repeated timeout event

* split task check list for retry and timeout

Co-authored-by: caishunfeng <534328519@qq.com>
3.0.0/version-upgrade
wind 3 years ago committed by GitHub
parent
commit
ff9bc806ac
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 11
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerService.java
  2. 101
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/StateWheelExecuteThread.java
  3. 45
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteThread.java
  4. 3
      dolphinscheduler-server/src/test/java/org/apache/dolphinscheduler/server/master/WorkflowExecuteThreadTest.java

11
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/MasterSchedulerService.java

@ -23,7 +23,6 @@ import org.apache.dolphinscheduler.common.thread.ThreadUtils;
import org.apache.dolphinscheduler.common.utils.NetUtils; import org.apache.dolphinscheduler.common.utils.NetUtils;
import org.apache.dolphinscheduler.common.utils.OSUtils; import org.apache.dolphinscheduler.common.utils.OSUtils;
import org.apache.dolphinscheduler.dao.entity.Command; import org.apache.dolphinscheduler.dao.entity.Command;
import org.apache.dolphinscheduler.dao.entity.ProcessDefinition;
import org.apache.dolphinscheduler.dao.entity.ProcessInstance; import org.apache.dolphinscheduler.dao.entity.ProcessInstance;
import org.apache.dolphinscheduler.dao.entity.TaskInstance; import org.apache.dolphinscheduler.dao.entity.TaskInstance;
import org.apache.dolphinscheduler.remote.NettyRemotingClient; import org.apache.dolphinscheduler.remote.NettyRemotingClient;
@ -41,7 +40,6 @@ import org.apache.commons.collections4.CollectionUtils;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch; import java.util.concurrent.CountDownLatch;
@ -117,10 +115,15 @@ public class MasterSchedulerService extends Thread {
ConcurrentHashMap<Integer, ProcessInstance> processTimeoutCheckList = new ConcurrentHashMap<>(); ConcurrentHashMap<Integer, ProcessInstance> processTimeoutCheckList = new ConcurrentHashMap<>();
/** /**
* task time out checkout list * task time out check list
*/ */
ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList = new ConcurrentHashMap<>(); ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList = new ConcurrentHashMap<>();
/**
* task retry check list
*/
ConcurrentHashMap<Integer, TaskInstance> taskRetryCheckList = new ConcurrentHashMap<>();
private StateWheelExecuteThread stateWheelExecuteThread; private StateWheelExecuteThread stateWheelExecuteThread;
/** /**
@ -134,6 +137,7 @@ public class MasterSchedulerService extends Thread {
stateWheelExecuteThread = new StateWheelExecuteThread(processTimeoutCheckList, stateWheelExecuteThread = new StateWheelExecuteThread(processTimeoutCheckList,
taskTimeoutCheckList, taskTimeoutCheckList,
taskRetryCheckList,
this.processInstanceExecCacheManager, this.processInstanceExecCacheManager,
masterConfig.getStateWheelInterval() * Constants.SLEEP_TIME_MILLIS); masterConfig.getStateWheelInterval() * Constants.SLEEP_TIME_MILLIS);
} }
@ -209,6 +213,7 @@ public class MasterSchedulerService extends Thread {
, processAlertManager , processAlertManager
, masterConfig , masterConfig
, taskTimeoutCheckList , taskTimeoutCheckList
, taskRetryCheckList
, taskProcessorFactory); , taskProcessorFactory);
this.processInstanceExecCacheManager.cache(processInstance.getId(), workflowExecuteThread); this.processInstanceExecCacheManager.cache(processInstance.getId(), workflowExecuteThread);

101
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/StateWheelExecuteThread.java

@ -43,18 +43,21 @@ public class StateWheelExecuteThread extends Thread {
private static final Logger logger = LoggerFactory.getLogger(StateWheelExecuteThread.class); private static final Logger logger = LoggerFactory.getLogger(StateWheelExecuteThread.class);
ConcurrentHashMap<Integer, ProcessInstance> processInstanceCheckList; private ConcurrentHashMap<Integer, ProcessInstance> processInstanceTimeoutCheckList;
ConcurrentHashMap<Integer, TaskInstance> taskInstanceCheckList; private ConcurrentHashMap<Integer, TaskInstance> taskInstanceTimeoutCheckList;
private ConcurrentHashMap<Integer, TaskInstance> taskInstanceRetryCheckList;
private ProcessInstanceExecCacheManager processInstanceExecCacheManager; private ProcessInstanceExecCacheManager processInstanceExecCacheManager;
private int stateCheckIntervalSecs; private int stateCheckIntervalSecs;
public StateWheelExecuteThread(ConcurrentHashMap<Integer, ProcessInstance> processInstances, public StateWheelExecuteThread(ConcurrentHashMap<Integer, ProcessInstance> processInstanceTimeoutCheckList,
ConcurrentHashMap<Integer, TaskInstance> taskInstances, ConcurrentHashMap<Integer, TaskInstance> taskInstanceTimeoutCheckList,
ConcurrentHashMap<Integer, TaskInstance> taskInstanceRetryCheckList,
ProcessInstanceExecCacheManager processInstanceExecCacheManager, ProcessInstanceExecCacheManager processInstanceExecCacheManager,
int stateCheckIntervalSecs) { int stateCheckIntervalSecs) {
this.processInstanceCheckList = processInstances; this.processInstanceTimeoutCheckList = processInstanceTimeoutCheckList;
this.taskInstanceCheckList = taskInstances; this.taskInstanceTimeoutCheckList = taskInstanceTimeoutCheckList;
this.taskInstanceRetryCheckList = taskInstanceRetryCheckList;
this.processInstanceExecCacheManager = processInstanceExecCacheManager; this.processInstanceExecCacheManager = processInstanceExecCacheManager;
this.stateCheckIntervalSecs = stateCheckIntervalSecs; this.stateCheckIntervalSecs = stateCheckIntervalSecs;
} }
@ -65,8 +68,9 @@ public class StateWheelExecuteThread extends Thread {
logger.info("state wheel thread start"); logger.info("state wheel thread start");
while (Stopper.isRunning()) { while (Stopper.isRunning()) {
try { try {
checkProcess(); checkTask4Timeout();
checkTask(); checkTask4Retry();
checkProcess4Timeout();
} catch (Exception e) { } catch (Exception e) {
logger.error("state wheel thread check error:", e); logger.error("state wheel thread check error:", e);
} }
@ -74,85 +78,96 @@ public class StateWheelExecuteThread extends Thread {
} }
} }
public boolean addProcess(ProcessInstance processInstance) { public void addProcess4TimeoutCheck(ProcessInstance processInstance) {
this.processInstanceCheckList.put(processInstance.getId(), processInstance); this.processInstanceTimeoutCheckList.put(processInstance.getId(), processInstance);
return true;
} }
public boolean addTask(TaskInstance taskInstance) { public void addTask4TimeoutCheck(TaskInstance taskInstance) {
this.taskInstanceCheckList.put(taskInstance.getId(), taskInstance); this.taskInstanceTimeoutCheckList.put(taskInstance.getId(), taskInstance);
return true; }
public void addTask4RetryCheck(TaskInstance taskInstance) {
this.taskInstanceRetryCheckList.put(taskInstance.getId(), taskInstance);
} }
private void checkTask() { public void checkTask4Timeout() {
if (taskInstanceCheckList.isEmpty()) { if (taskInstanceTimeoutCheckList.isEmpty()) {
return; return;
} }
for (TaskInstance taskInstance : taskInstanceTimeoutCheckList.values()) {
for (TaskInstance taskInstance : this.taskInstanceCheckList.values()) {
if (TimeoutFlag.OPEN == taskInstance.getTaskDefine().getTimeoutFlag()) { if (TimeoutFlag.OPEN == taskInstance.getTaskDefine().getTimeoutFlag()) {
long timeRemain = DateUtils.getRemainTime(taskInstance.getStartTime(), taskInstance.getTaskDefine().getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT); long timeRemain = DateUtils.getRemainTime(taskInstance.getStartTime(), taskInstance.getTaskDefine().getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT);
if (0 >= timeRemain && processTimeout(taskInstance)) { if (0 >= timeRemain) {
taskInstanceCheckList.remove(taskInstance.getId()); addTaskTimeoutEvent(taskInstance);
taskInstanceTimeoutCheckList.remove(taskInstance.getId());
} }
} }
}
}
private void checkTask4Retry() {
if (taskInstanceRetryCheckList.isEmpty()) {
return;
}
for (TaskInstance taskInstance : this.taskInstanceRetryCheckList.values()) {
if (taskInstance.taskCanRetry() && taskInstance.retryTaskIntervalOverTime()) { if (taskInstance.taskCanRetry() && taskInstance.retryTaskIntervalOverTime()) {
processDependCheck(taskInstance); addTaskStateChangeEvent(taskInstance);
taskInstanceCheckList.remove(taskInstance.getId()); taskInstanceRetryCheckList.remove(taskInstance.getId());
} }
if (taskInstance.isSubProcess() || taskInstance.isDependTask()) { if (taskInstance.isSubProcess() || taskInstance.isDependTask()) {
processDependCheck(taskInstance); addTaskStateChangeEvent(taskInstance);
} }
} }
} }
private void checkProcess() { private void checkProcess4Timeout() {
if (processInstanceCheckList.isEmpty()) { if (processInstanceTimeoutCheckList.isEmpty()) {
return; return;
} }
for (ProcessInstance processInstance : this.processInstanceCheckList.values()) { for (ProcessInstance processInstance : this.processInstanceTimeoutCheckList.values()) {
long timeRemain = DateUtils.getRemainTime(processInstance.getStartTime(), processInstance.getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT); long timeRemain = DateUtils.getRemainTime(processInstance.getStartTime(), processInstance.getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT);
if (0 <= timeRemain && processTimeout(processInstance)) { if (0 >= timeRemain) {
processInstanceCheckList.remove(processInstance.getId()); addProcessTimeoutEvent(processInstance);
processInstanceTimeoutCheckList.remove(processInstance.getId());
} }
} }
} }
private void putEvent(StateEvent stateEvent) { private boolean addTaskStateChangeEvent(TaskInstance taskInstance) {
if (!processInstanceExecCacheManager.contains(stateEvent.getProcessInstanceId())) {
return;
}
WorkflowExecuteThread workflowExecuteThread = this.processInstanceExecCacheManager.getByProcessInstanceId(stateEvent.getProcessInstanceId());
workflowExecuteThread.addStateEvent(stateEvent);
}
private boolean processDependCheck(TaskInstance taskInstance) {
StateEvent stateEvent = new StateEvent(); StateEvent stateEvent = new StateEvent();
stateEvent.setType(StateEventType.TASK_STATE_CHANGE); stateEvent.setType(StateEventType.TASK_STATE_CHANGE);
stateEvent.setProcessInstanceId(taskInstance.getProcessInstanceId()); stateEvent.setProcessInstanceId(taskInstance.getProcessInstanceId());
stateEvent.setTaskInstanceId(taskInstance.getId()); stateEvent.setTaskInstanceId(taskInstance.getId());
stateEvent.setExecutionStatus(ExecutionStatus.RUNNING_EXECUTION); stateEvent.setExecutionStatus(ExecutionStatus.RUNNING_EXECUTION);
putEvent(stateEvent); addEvent(stateEvent);
return true; return true;
} }
private boolean processTimeout(TaskInstance taskInstance) { private boolean addTaskTimeoutEvent(TaskInstance taskInstance) {
StateEvent stateEvent = new StateEvent(); StateEvent stateEvent = new StateEvent();
stateEvent.setType(StateEventType.TASK_TIMEOUT); stateEvent.setType(StateEventType.TASK_TIMEOUT);
stateEvent.setProcessInstanceId(taskInstance.getProcessInstanceId()); stateEvent.setProcessInstanceId(taskInstance.getProcessInstanceId());
stateEvent.setTaskInstanceId(taskInstance.getId()); stateEvent.setTaskInstanceId(taskInstance.getId());
putEvent(stateEvent); addEvent(stateEvent);
return true; return true;
} }
private boolean processTimeout(ProcessInstance processInstance) { private boolean addProcessTimeoutEvent(ProcessInstance processInstance) {
StateEvent stateEvent = new StateEvent(); StateEvent stateEvent = new StateEvent();
stateEvent.setType(StateEventType.PROCESS_TIMEOUT); stateEvent.setType(StateEventType.PROCESS_TIMEOUT);
stateEvent.setProcessInstanceId(processInstance.getId()); stateEvent.setProcessInstanceId(processInstance.getId());
putEvent(stateEvent); addEvent(stateEvent);
return true; return true;
} }
/**
 * Hands a state event to the workflow execute thread cached for the event's process instance.
 *
 * <p>The event is silently dropped when the cache holds no thread for that process instance.
 *
 * @param stateEvent the event to deliver; its process instance id selects the target thread
 */
private void addEvent(StateEvent stateEvent) {
    if (!processInstanceExecCacheManager.contains(stateEvent.getProcessInstanceId())) {
        return;
    }
    WorkflowExecuteThread workflowExecuteThread =
        this.processInstanceExecCacheManager.getByProcessInstanceId(stateEvent.getProcessInstanceId());
    // contains() and the lookup are two separate calls, so the entry may be removed in between.
    // NOTE(review): assumes getByProcessInstanceId returns null for a missing id - confirm.
    if (workflowExecuteThread == null) {
        return;
    }
    workflowExecuteThread.addStateEvent(stateEvent);
}
} }

45
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteThread.java

@ -205,6 +205,11 @@ public class WorkflowExecuteThread implements Runnable {
*/ */
private ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList; private ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList;
/**
* task retry check list
*/
private ConcurrentHashMap<Integer, TaskInstance> taskRetryCheckList;
/** /**
* state event queue * state event queue
*/ */
@ -232,14 +237,15 @@ public class WorkflowExecuteThread implements Runnable {
, ProcessAlertManager processAlertManager , ProcessAlertManager processAlertManager
, MasterConfig masterConfig , MasterConfig masterConfig
, ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList , ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList
, ConcurrentHashMap<Integer, TaskInstance> taskRetryCheckList
, TaskProcessorFactory taskProcessorFactory) { , TaskProcessorFactory taskProcessorFactory) {
this.processService = processService; this.processService = processService;
this.processInstance = processInstance; this.processInstance = processInstance;
this.masterConfig = masterConfig; this.masterConfig = masterConfig;
this.nettyExecutorManager = nettyExecutorManager; this.nettyExecutorManager = nettyExecutorManager;
this.processAlertManager = processAlertManager; this.processAlertManager = processAlertManager;
this.taskTimeoutCheckList = taskTimeoutCheckList; this.taskTimeoutCheckList = taskTimeoutCheckList;
this.taskRetryCheckList = taskRetryCheckList;
this.taskProcessorFactory = taskProcessorFactory; this.taskProcessorFactory = taskProcessorFactory;
} }
@ -378,11 +384,10 @@ public class WorkflowExecuteThread implements Runnable {
if (TaskTimeoutStrategy.FAILED == taskTimeoutStrategy) { if (TaskTimeoutStrategy.FAILED == taskTimeoutStrategy) {
ITaskProcessor taskProcessor = activeTaskProcessorMaps.get(stateEvent.getTaskInstanceId()); ITaskProcessor taskProcessor = activeTaskProcessorMaps.get(stateEvent.getTaskInstanceId());
taskProcessor.action(TaskAction.TIMEOUT); taskProcessor.action(TaskAction.TIMEOUT);
return false;
} else { } else {
processAlertManager.sendTaskTimeoutAlert(processInstance, taskInstance, taskInstance.getTaskDefine()); processAlertManager.sendTaskTimeoutAlert(processInstance, taskInstance, taskInstance.getTaskDefine());
return true;
} }
return true;
} }
private boolean processTimeout() { private boolean processTimeout() {
@ -415,7 +420,7 @@ public class WorkflowExecuteThread implements Runnable {
this.stateEvents.add(nextEvent); this.stateEvents.add(nextEvent);
} else { } else {
ProcessInstance processInstance = this.processService.findProcessInstanceById(nextTaskInstance.getProcessInstanceId()); ProcessInstance processInstance = this.processService.findProcessInstanceById(nextTaskInstance.getProcessInstanceId());
this.processService.sendStartTask2Master(processInstance,nextTaskInstance.getId(), this.processService.sendStartTask2Master(processInstance, nextTaskInstance.getId(),
org.apache.dolphinscheduler.remote.command.CommandType.TASK_WAKEUP_EVENT_REQUEST); org.apache.dolphinscheduler.remote.command.CommandType.TASK_WAKEUP_EVENT_REQUEST);
} }
} }
@ -450,6 +455,7 @@ public class WorkflowExecuteThread implements Runnable {
task.getMaxRetryTimes(), task.getMaxRetryTimes(),
task.getRetryInterval()); task.getRetryInterval());
this.addTimeoutCheck(task); this.addTimeoutCheck(task);
this.addRetryCheck(task);
} else { } else {
submitStandByTask(); submitStandByTask();
} }
@ -459,6 +465,7 @@ public class WorkflowExecuteThread implements Runnable {
completeTaskMap.put(Long.toString(task.getTaskCode()), task.getId()); completeTaskMap.put(Long.toString(task.getTaskCode()), task.getId());
activeTaskProcessorMaps.remove(task.getId()); activeTaskProcessorMaps.remove(task.getId());
taskTimeoutCheckList.remove(task.getId()); taskTimeoutCheckList.remove(task.getId());
taskRetryCheckList.remove(task.getId());
if (task.getState().typeIsSuccess()) { if (task.getState().typeIsSuccess()) {
processInstance.setVarPool(task.getVarPool()); processInstance.setVarPool(task.getVarPool());
@ -826,6 +833,7 @@ public class WorkflowExecuteThread implements Runnable {
taskProcessor.run(); taskProcessor.run();
addTimeoutCheck(taskInstance); addTimeoutCheck(taskInstance);
addRetryCheck(taskInstance);
if (taskProcessor.taskState().typeIsFinished()) { if (taskProcessor.taskState().typeIsFinished()) {
StateEvent stateEvent = new StateEvent(); StateEvent stateEvent = new StateEvent();
@ -867,13 +875,30 @@ public class WorkflowExecuteThread implements Runnable {
logger.error("taskDefinition is null, taskId:{}", taskInstance.getId()); logger.error("taskDefinition is null, taskId:{}", taskInstance.getId());
return; return;
} }
if (TimeoutFlag.OPEN == taskDefinition.getTimeoutFlag()) {
if (TimeoutFlag.OPEN == taskDefinition.getTimeoutFlag() || taskInstance.taskCanRetry()) {
this.taskTimeoutCheckList.put(taskInstance.getId(), taskInstance); this.taskTimeoutCheckList.put(taskInstance.getId(), taskInstance);
} else { }
if (taskInstance.isDependTask() || taskInstance.isSubProcess()) { if (taskInstance.isDependTask() || taskInstance.isSubProcess()) {
this.taskTimeoutCheckList.put(taskInstance.getId(), taskInstance); this.taskTimeoutCheckList.put(taskInstance.getId(), taskInstance);
} }
}
private void addRetryCheck(TaskInstance taskInstance) {
if (taskRetryCheckList.containsKey(taskInstance.getId())) {
return;
}
TaskDefinition taskDefinition = taskInstance.getTaskDefine();
if (taskDefinition == null) {
logger.error("taskDefinition is null, taskId:{}", taskInstance.getId());
return;
}
if (taskInstance.taskCanRetry()) {
this.taskRetryCheckList.put(taskInstance.getId(), taskInstance);
}
if (taskInstance.isDependTask() || taskInstance.isSubProcess()) {
this.taskRetryCheckList.put(taskInstance.getId(), taskInstance);
} }
} }

3
dolphinscheduler-server/src/test/java/org/apache/dolphinscheduler/server/master/WorkflowExecuteThreadTest.java

@ -108,7 +108,8 @@ public class WorkflowExecuteThreadTest {
Mockito.when(processInstance.getProcessDefinition()).thenReturn(processDefinition); Mockito.when(processInstance.getProcessDefinition()).thenReturn(processDefinition);
ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList = new ConcurrentHashMap<>(); ConcurrentHashMap<Integer, TaskInstance> taskTimeoutCheckList = new ConcurrentHashMap<>();
workflowExecuteThread = PowerMockito.spy(new WorkflowExecuteThread(processInstance, processService, null, null, config, taskTimeoutCheckList, taskProcessorFactory)); ConcurrentHashMap<Integer, TaskInstance> taskRetryCheckList = new ConcurrentHashMap<>();
workflowExecuteThread = PowerMockito.spy(new WorkflowExecuteThread(processInstance, processService, null, null, config, taskTimeoutCheckList, taskRetryCheckList, taskProcessorFactory));
// prepareProcess init dag // prepareProcess init dag
Field dag = WorkflowExecuteThread.class.getDeclaredField("dag"); Field dag = WorkflowExecuteThread.class.getDeclaredField("dag");
dag.setAccessible(true); dag.setAccessible(true);

Loading…
Cancel
Save