From 3b980cb06a22fc1a2e917547c98bb8a4a307f90e Mon Sep 17 00:00:00 2001 From: hokie-chan <88499013+hokie-chan@users.noreply.github.com> Date: Tue, 3 Jan 2023 19:24:11 +0800 Subject: [PATCH] [fix][worker][bug] master/worker crash when registry recover from SUSPENDED to RECONNECTED (#13328) --- .../registry/MasterWaitingStrategy.java | 28 +++++++++++-------- .../registry/WorkerWaitingStrategy.java | 28 +++++++++++-------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterWaitingStrategy.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterWaitingStrategy.java index a007e221b8..4556500db6 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterWaitingStrategy.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterWaitingStrategy.java @@ -93,18 +93,22 @@ public class MasterWaitingStrategy implements MasterConnectStrategy { @Override public void reconnect() { - try { - ServerLifeCycleManager.recoverFromWaiting(); - reStartMasterResource(); - // reopen the resource - logger.info("Recover from waiting success, the current server status is {}", - ServerLifeCycleManager.getServerStatus()); - } catch (Exception e) { - String errorMessage = - String.format("Recover from waiting failed, the current server status is %s, will stop the server", - ServerLifeCycleManager.getServerStatus()); - logger.error(errorMessage, e); - registryClient.getStoppable().stop(errorMessage); + if (ServerLifeCycleManager.isRunning()) { + logger.info("no need to reconnect, as the current server status is running"); + } else { + try { + ServerLifeCycleManager.recoverFromWaiting(); + reStartMasterResource(); + logger.info("Recover from waiting success, the current server status is {}", + ServerLifeCycleManager.getServerStatus()); + } catch (Exception e) { + String errorMessage = + String.format( + "Recover from waiting failed, the current server status is %s, will stop the server", + ServerLifeCycleManager.getServerStatus()); + logger.error(errorMessage, e); + registryClient.getStoppable().stop(errorMessage); + } } } diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerWaitingStrategy.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerWaitingStrategy.java index 1549385fe9..24a609b74f 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerWaitingStrategy.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/registry/WorkerWaitingStrategy.java @@ -93,19 +93,23 @@ public class WorkerWaitingStrategy implements WorkerConnectStrategy { @Override public void reconnect() { - try { - ServerLifeCycleManager.recoverFromWaiting(); - reStartWorkerResource(); - logger.info("Recover from waiting success, the current server status is {}", - ServerLifeCycleManager.getServerStatus()); - } catch (Exception e) { - String errorMessage = - String.format("Recover from waiting failed, the current server status is %s, will stop the server", - ServerLifeCycleManager.getServerStatus()); - logger.error(errorMessage, e); - registryClient.getStoppable().stop(errorMessage); + if (ServerLifeCycleManager.isRunning()) { + logger.info("no need to reconnect, as the current server status is running"); + } else { + try { + ServerLifeCycleManager.recoverFromWaiting(); + reStartWorkerResource(); + logger.info("Recover from waiting success, the current server status is {}", + ServerLifeCycleManager.getServerStatus()); + } catch (Exception e) { + String errorMessage = + String.format( + "Recover from waiting failed, the current server status is %s, will stop the server", + ServerLifeCycleManager.getServerStatus()); + logger.error(errorMessage, e); + registryClient.getStoppable().stop(errorMessage); + } } - } @Override