Browse Source

cherry-pick [Fix]Solve the deadlock problem caused by queuing #13191

3.1.9-release
sssqhai 2 years ago committed by zhuangchong
parent
commit
81a6057fd3
  1. 33
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java
  2. 5
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java
  3. 8
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java
  4. 14
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java

33
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandleFailure.java

@ -0,0 +1,33 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.server.master.event;
/**
 * Signals that handling a state event failed in a way that can be recovered from.
 * When this exception is caught, the event is moved to the tail of the event queue
 * instead of being dropped.
 */
public class StateEventHandleFailure extends Exception {

    /** Explicit serial version, since {@link Exception} is {@link java.io.Serializable}. */
    private static final long serialVersionUID = 1L;

    /**
     * Creates a failure with a description only.
     *
     * @param message description of why handling the state event failed
     */
    public StateEventHandleFailure(String message) {
        super(message);
    }

    /**
     * Creates a failure that wraps the underlying cause.
     *
     * @param message   description of why handling the state event failed
     * @param throwable the underlying cause, preserved for diagnostics
     */
    public StateEventHandleFailure(String message, Throwable throwable) {
        super(message, throwable);
    }
}

5
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/StateEventHandler.java

@ -28,9 +28,10 @@ public interface StateEventHandler {
* @param stateEvent given state event.
* @throws StateEventHandleException this exception means it can be recovered.
* @throws StateEventHandleError this exception means it cannot be recovered, so the event need to drop.
* @throws StateEventHandleFailure this exception means it can be recovered; the event will be moved to the tail of the queue.
*/
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable, StateEvent stateEvent)
throws StateEventHandleException, StateEventHandleError;
boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
StateEvent stateEvent) throws StateEventHandleException, StateEventHandleError, StateEventHandleFailure;
StateEventType getEventType();
}

8
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/event/TaskWaitTaskGroupStateHandler.java

@ -31,12 +31,10 @@ public class TaskWaitTaskGroupStateHandler implements StateEventHandler {
@Override
public boolean handleStateEvent(WorkflowExecuteRunnable workflowExecuteRunnable,
StateEvent stateEvent) {
StateEvent stateEvent) throws StateEventHandleFailure {
logger.info("Handle task instance wait task group event, taskInstanceId: {}", stateEvent.getTaskInstanceId());
if (workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) {
logger.info("Success wake up task instance, taskInstanceId: {}", stateEvent.getTaskInstanceId());
} else {
logger.info("Failed to wake up task instance, taskInstanceId: {}", stateEvent.getTaskInstanceId());
if (!workflowExecuteRunnable.checkForceStartAndWakeUp(stateEvent)) {
throw new StateEventHandleFailure("Task state event handle failed due to robing taskGroup resource failed");
}
return true;
}

14
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/runner/WorkflowExecuteRunnable.java

@ -70,6 +70,7 @@ import org.apache.dolphinscheduler.server.master.dispatch.executor.NettyExecutor
import org.apache.dolphinscheduler.server.master.event.StateEvent;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleError;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleException;
import org.apache.dolphinscheduler.server.master.event.StateEventHandleFailure;
import org.apache.dolphinscheduler.server.master.event.StateEventHandler;
import org.apache.dolphinscheduler.server.master.event.StateEventHandlerManager;
import org.apache.dolphinscheduler.server.master.event.TaskStateEvent;
@ -292,19 +293,26 @@ public class WorkflowExecuteRunnable implements Callable<WorkflowSubmitStatue> {
} catch (StateEventHandleError stateEventHandleError) {
logger.error("State event handle error, will remove this event: {}", stateEvent, stateEventHandleError);
this.stateEvents.remove(stateEvent);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS_SHORT);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (StateEventHandleException stateEventHandleException) {
logger.error("State event handle error, will retry this event: {}",
stateEvent,
stateEventHandleException);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS_SHORT);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (StateEventHandleFailure stateEventHandleFailure) {
logger.error("State event handle failed, will move event to the tail: {}",
stateEvent,
stateEventHandleFailure);
this.stateEvents.remove(stateEvent);
this.stateEvents.offer(stateEvent);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} catch (Exception e) {
// We catch the exception here because, if handling the state event fails, the event still remains
// in the stateEvents queue.
logger.error("State event handle error, get a unknown exception, will retry this event: {}",
stateEvent,
e);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS_SHORT);
ThreadUtils.sleep(Constants.SLEEP_TIME_MILLIS);
} finally {
LoggerUtils.removeWorkflowAndTaskInstanceIdMDC();
}

Loading…
Cancel
Save