Browse Source

Catch exceptions when checking state in StateWheelExecuteThread (#10908)

* Catch exceptions when checking state

(cherry picked from commit 2a67866718)
3.0.0/version-upgrade
Wenjun Ruan 2 years ago
parent
commit
527ee472fb
  1. dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/StateWheelExecuteThread.java (36 lines changed)
  2. dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java (5 lines changed)
  3. dolphinscheduler-master/src/main/resources/application.yaml (2 lines changed)
  4. dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/process/ProcessServiceImpl.java (2 lines changed)
  5. dolphinscheduler-service/src/test/java/org/apache/dolphinscheduler/service/process/ProcessServiceTest.java (5 lines changed)
  6. dolphinscheduler-standalone-server/src/main/resources/application.yaml (2 lines changed)

36
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/StateWheelExecuteThread.java

@ -134,9 +134,13 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
return; return;
} }
for (Integer processInstanceId : processInstanceTimeoutCheckList) { for (Integer processInstanceId : processInstanceTimeoutCheckList) {
WorkflowExecuteRunnable workflowExecuteThread = processInstanceExecCacheManager.getByProcessInstanceId(processInstanceId); try {
LoggerUtils.setWorkflowInstanceIdMDC(processInstanceId);
WorkflowExecuteRunnable workflowExecuteThread = processInstanceExecCacheManager.getByProcessInstanceId(
processInstanceId);
if (workflowExecuteThread == null) { if (workflowExecuteThread == null) {
logger.warn("Check workflow timeout failed, can not find workflowExecuteThread from cache manager, will remove this workflowInstance from check list"); logger.warn(
"Check workflow timeout failed, can not find workflowExecuteThread from cache manager, will remove this workflowInstance from check list");
processInstanceTimeoutCheckList.remove(processInstanceId); processInstanceTimeoutCheckList.remove(processInstanceId);
continue; continue;
} }
@ -145,13 +149,20 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
logger.warn("Check workflow timeout failed, the workflowInstance is null"); logger.warn("Check workflow timeout failed, the workflowInstance is null");
continue; continue;
} }
long timeRemain = DateUtils.getRemainTime(processInstance.getStartTime(), (long) processInstance.getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT); long timeRemain = DateUtils.getRemainTime(processInstance.getStartTime(),
(long) processInstance.getTimeout()
* Constants.SEC_2_MINUTES_TIME_UNIT);
if (timeRemain < 0) { if (timeRemain < 0) {
logger.info("Workflow instance timeout, adding timeout event"); logger.info("Workflow instance timeout, adding timeout event");
addProcessTimeoutEvent(processInstance); addProcessTimeoutEvent(processInstance);
processInstanceTimeoutCheckList.remove(processInstance.getId()); processInstanceTimeoutCheckList.remove(processInstance.getId());
logger.info("Workflow instance timeout, added timeout event"); logger.info("Workflow instance timeout, added timeout event");
} }
} catch (Exception ex) {
logger.error("Check workflow instance timeout error");
} finally {
LoggerUtils.removeWorkflowInstanceIdMDC();
}
} }
} }
@ -243,20 +254,26 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
} }
Optional<TaskInstance> taskInstanceOptional = workflowExecuteThread.getActiveTaskInstanceByTaskCode(taskCode); Optional<TaskInstance> taskInstanceOptional = workflowExecuteThread.getActiveTaskInstanceByTaskCode(taskCode);
if (!taskInstanceOptional.isPresent()) { if (!taskInstanceOptional.isPresent()) {
logger.warn("Check task instance timeout failed, can not get taskInstance from workflowExecuteThread, taskCode: {}" logger.warn(
+ "will remove this check task", taskCode); "Check task instance timeout failed, can not get taskInstance from workflowExecuteThread, taskCode: {}"
+ "will remove this check task",
taskCode);
taskInstanceTimeoutCheckList.remove(taskInstanceKey); taskInstanceTimeoutCheckList.remove(taskInstanceKey);
continue; continue;
} }
TaskInstance taskInstance = taskInstanceOptional.get(); TaskInstance taskInstance = taskInstanceOptional.get();
if (TimeoutFlag.OPEN == taskInstance.getTaskDefine().getTimeoutFlag()) { if (TimeoutFlag.OPEN == taskInstance.getTaskDefine().getTimeoutFlag()) {
long timeRemain = DateUtils.getRemainTime(taskInstance.getStartTime(), (long) taskInstance.getTaskDefine().getTimeout() * Constants.SEC_2_MINUTES_TIME_UNIT); long timeRemain = DateUtils.getRemainTime(taskInstance.getStartTime(),
(long) taskInstance.getTaskDefine().getTimeout()
* Constants.SEC_2_MINUTES_TIME_UNIT);
if (timeRemain < 0) { if (timeRemain < 0) {
logger.info("Task instance is timeout, adding task timeout event and remove the check"); logger.info("Task instance is timeout, adding task timeout event and remove the check");
addTaskTimeoutEvent(taskInstance); addTaskTimeoutEvent(taskInstance);
taskInstanceTimeoutCheckList.remove(taskInstanceKey); taskInstanceTimeoutCheckList.remove(taskInstanceKey);
} }
} }
} catch (Exception ex) {
logger.error("Check task timeout error, taskInstanceKey: {}", taskInstanceKey, ex);
} finally { } finally {
LoggerUtils.removeWorkflowInstanceIdMDC(); LoggerUtils.removeWorkflowInstanceIdMDC();
} }
@ -277,7 +294,8 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
WorkflowExecuteRunnable workflowExecuteThread = processInstanceExecCacheManager.getByProcessInstanceId(processInstanceId); WorkflowExecuteRunnable workflowExecuteThread = processInstanceExecCacheManager.getByProcessInstanceId(processInstanceId);
if (workflowExecuteThread == null) { if (workflowExecuteThread == null) {
logger.warn("Task instance retry check failed, can not find workflowExecuteThread from cache manager, " logger.warn(
"Task instance retry check failed, can not find workflowExecuteThread from cache manager, "
+ "will remove this check task"); + "will remove this check task");
taskInstanceRetryCheckList.remove(taskInstanceKey); taskInstanceRetryCheckList.remove(taskInstanceKey);
continue; continue;
@ -315,6 +333,8 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
addTaskRetryEvent(taskInstance); addTaskRetryEvent(taskInstance);
taskInstanceRetryCheckList.remove(taskInstanceKey); taskInstanceRetryCheckList.remove(taskInstanceKey);
} }
} catch (Exception ex) {
logger.error("Check task retry error, taskInstanceKey: {}", taskInstanceKey, ex);
} finally { } finally {
LoggerUtils.removeWorkflowInstanceIdMDC(); LoggerUtils.removeWorkflowInstanceIdMDC();
} }
@ -349,6 +369,8 @@ public class StateWheelExecuteThread extends BaseDaemonThread {
continue; continue;
} }
addTaskStateChangeEvent(taskInstance); addTaskStateChangeEvent(taskInstance);
} catch (Exception ex) {
logger.error("Task state check error, taskInstanceKey: {}", taskInstanceKey, ex);
} finally { } finally {
LoggerUtils.removeWorkflowInstanceIdMDC(); LoggerUtils.removeWorkflowInstanceIdMDC();
} }

5
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java

@ -552,8 +552,9 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> {
} }
public Optional<TaskInstance> getActiveTaskInstanceByTaskCode(long taskCode) { public Optional<TaskInstance> getActiveTaskInstanceByTaskCode(long taskCode) {
if (activeTaskProcessorMaps.containsKey(taskCode)) { Integer taskInstanceId = validTaskMap.get(taskCode);
return Optional.ofNullable(activeTaskProcessorMaps.get(taskCode).taskInstance()); if (taskInstanceId != null) {
return Optional.ofNullable(taskInstanceMap.get(taskInstanceId));
} }
return Optional.empty(); return Optional.empty();
} }

2
dolphinscheduler-master/src/main/resources/application.yaml

@ -102,7 +102,7 @@ master:
task-commit-retry-times: 5 task-commit-retry-times: 5
# master commit task interval # master commit task interval
task-commit-interval: 1s task-commit-interval: 1s
state-wheel-interval: 5 state-wheel-interval: 5s
# master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 # master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2
max-cpu-load-avg: -1 max-cpu-load-avg: -1
# master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G

2
dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/process/ProcessServiceImpl.java

@ -909,7 +909,7 @@ public class ProcessServiceImpl implements ProcessService {
command.getProcessDefinitionVersion()); command.getProcessDefinitionVersion());
if (processDefinition == null) { if (processDefinition == null) {
logger.error("cannot find the work process define! define code : {}", command.getProcessDefinitionCode()); logger.error("cannot find the work process define! define code : {}", command.getProcessDefinitionCode());
return null; throw new IllegalArgumentException("Cannot find the process definition for this workflowInstance");
} }
Map<String, String> cmdParam = JSONUtils.toMap(command.getCommandParam()); Map<String, String> cmdParam = JSONUtils.toMap(command.getCommandParam());
int processInstanceId = command.getProcessInstanceId(); int processInstanceId = command.getProcessInstanceId();

5
dolphinscheduler-service/src/test/java/org/apache/dolphinscheduler/service/process/ProcessServiceTest.java

@ -292,7 +292,12 @@ public class ProcessServiceTest {
+ "\":\"111\",\"" + "\":\"111\",\""
+ CMD_PARAM_SUB_PROCESS_DEFINE_CODE + CMD_PARAM_SUB_PROCESS_DEFINE_CODE
+ "\":\"222\"}"); + "\":\"222\"}");
try {
Assert.assertNull(processService.handleCommand(host, command)); Assert.assertNull(processService.handleCommand(host, command));
} catch (IllegalArgumentException illegalArgumentException) {
// assert throw illegalArgumentException here since the definition is null
Assert.assertTrue(true);
}
int definitionVersion = 1; int definitionVersion = 1;
long definitionCode = 123; long definitionCode = 123;

2
dolphinscheduler-standalone-server/src/main/resources/application.yaml

@ -120,7 +120,7 @@ master:
task-commit-retry-times: 5 task-commit-retry-times: 5
# master commit task interval # master commit task interval
task-commit-interval: 1s task-commit-interval: 1s
state-wheel-interval: 5 state-wheel-interval: 5s
# master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 # master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2
max-cpu-load-avg: -1 max-cpu-load-avg: -1
# master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G

Loading…
Cancel
Save