Browse Source
* Add a waiting strategy so that the master/worker can recover from a lost registry connection * Throw an exception when the ZooKeeper registry fails to start because the thread was interrupted (3.1.0-release)
Wenjun Ruan
2 years ago
committed by
GitHub
60 changed files with 1744 additions and 950 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,29 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.common.lifecycle; |
||||
|
||||
/**
 * Checked exception used to report a failure in the server life cycle.
 */
public class ServerLifeCycleException extends Exception {

    /**
     * Creates an exception that carries only a description.
     *
     * @param message description of the failure
     */
    public ServerLifeCycleException(String message) {
        super(message);
    }

    /**
     * Creates an exception that carries a description and the underlying cause.
     *
     * @param message description of the failure
     * @param cause   the exception that triggered this one
     */
    public ServerLifeCycleException(String message, Throwable cause) {
        super(message, cause);
    }
}
@ -0,0 +1,75 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.common.lifecycle; |
||||
|
||||
import lombok.experimental.UtilityClass; |
||||
|
||||
@UtilityClass |
||||
public class ServerLifeCycleManager { |
||||
|
||||
private static volatile ServerStatus serverStatus = ServerStatus.RUNNING; |
||||
|
||||
public static boolean isRunning() { |
||||
return serverStatus == ServerStatus.RUNNING; |
||||
} |
||||
|
||||
public static boolean isStopped() { |
||||
return serverStatus == ServerStatus.STOPPED; |
||||
} |
||||
|
||||
public static ServerStatus getServerStatus() { |
||||
return serverStatus; |
||||
} |
||||
|
||||
/** |
||||
* Change the current server state to {@link ServerStatus#WAITING}, only {@link ServerStatus#RUNNING} can change to {@link ServerStatus#WAITING}. |
||||
* |
||||
* @throws ServerLifeCycleException if change failed. |
||||
*/ |
||||
public static synchronized void toWaiting() throws ServerLifeCycleException { |
||||
if (isStopped()) { |
||||
throw new ServerLifeCycleException("The current server is already stopped, cannot change to waiting"); |
||||
} |
||||
|
||||
if (serverStatus != ServerStatus.RUNNING) { |
||||
throw new ServerLifeCycleException("The current server is not at running status, cannot change to waiting"); |
||||
} |
||||
serverStatus = ServerStatus.WAITING; |
||||
} |
||||
|
||||
/** |
||||
* Recover from {@link ServerStatus#WAITING} to {@link ServerStatus#RUNNING}. |
||||
* |
||||
* @throws ServerLifeCycleException if change failed |
||||
*/ |
||||
public static synchronized void recoverFromWaiting() throws ServerLifeCycleException { |
||||
if (serverStatus != ServerStatus.WAITING) { |
||||
throw new ServerLifeCycleException("The current server status is not waiting, cannot recover form waiting"); |
||||
} |
||||
serverStatus = ServerStatus.RUNNING; |
||||
} |
||||
|
||||
public static synchronized boolean toStopped() { |
||||
if (serverStatus == ServerStatus.STOPPED) { |
||||
return false; |
||||
} |
||||
serverStatus = ServerStatus.STOPPED; |
||||
return true; |
||||
} |
||||
|
||||
} |
@ -0,0 +1,45 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.common.lifecycle; |
||||
|
||||
/**
 * Status of a server (master or worker). Each constant carries a numeric code and a description.
 */
public enum ServerStatus {

    RUNNING(0, "The current server is running"),
    WAITING(1, "The current server is waiting, this means it cannot work"),
    STOPPED(2, "The current server is stopped"),
    ;

    private final int code;
    private final String desc;

    ServerStatus(int statusCode, String description) {
        code = statusCode;
        desc = description;
    }

    /**
     * @return the numeric code of this status
     */
    public int getCode() {
        return this.code;
    }

    /**
     * @return the human-readable description of this status
     */
    public String getDesc() {
        return this.desc;
    }
}
@ -1,58 +0,0 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.common.thread; |
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean; |
||||
|
||||
import lombok.experimental.UtilityClass; |
||||
|
||||
/** |
||||
* If the process closes, a signal is placed as true, and all threads get this flag to stop working. |
||||
*/ |
||||
@UtilityClass |
||||
public class Stopper { |
||||
|
||||
private static final AtomicBoolean stoppedSignal = new AtomicBoolean(false); |
||||
|
||||
/** |
||||
* Return the flag if the Server is stopped. |
||||
* |
||||
* @return True, if the server is stopped; False, the server is still running. |
||||
*/ |
||||
public static boolean isStopped() { |
||||
return stoppedSignal.get(); |
||||
} |
||||
|
||||
/** |
||||
* Return the flag if the Server is stopped. |
||||
* |
||||
* @return True, if the server is running, False, the server is stopped. |
||||
*/ |
||||
public static boolean isRunning() { |
||||
return !stoppedSignal.get(); |
||||
} |
||||
|
||||
/** |
||||
* Stop the server |
||||
* |
||||
* @return True, if the server stopped success. False, if the server is already stopped. |
||||
*/ |
||||
public static boolean stop() { |
||||
return stoppedSignal.compareAndSet(false, true); |
||||
} |
||||
} |
@ -0,0 +1,24 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.master.registry; |
||||
|
||||
import org.apache.dolphinscheduler.registry.api.ConnectStrategy; |
||||
|
||||
public interface MasterConnectStrategy extends ConnectStrategy { |
||||
|
||||
} |
@ -0,0 +1,58 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.master.registry; |
||||
|
||||
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||
import org.apache.dolphinscheduler.server.master.config.MasterConfig; |
||||
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||
import org.slf4j.Logger; |
||||
import org.slf4j.LoggerFactory; |
||||
import org.springframework.beans.factory.annotation.Autowired; |
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||
import org.springframework.stereotype.Service; |
||||
|
||||
/** |
||||
* This strategy will stop the master server, when disconnected from {@link org.apache.dolphinscheduler.registry.api.Registry}. |
||||
*/ |
||||
@Service |
||||
@ConditionalOnProperty(prefix = "master.registry-disconnect-strategy", name = "strategy", havingValue = "stop", matchIfMissing = true) |
||||
public class MasterStopStrategy implements MasterConnectStrategy { |
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(MasterStopStrategy.class); |
||||
|
||||
@Autowired |
||||
private RegistryClient registryClient; |
||||
@Autowired |
||||
private MasterConfig masterConfig; |
||||
|
||||
@Override |
||||
public void disconnect() { |
||||
registryClient.getStoppable() |
||||
.stop("Master disconnected from registry, will stop myself due to the stop strategy"); |
||||
} |
||||
|
||||
@Override |
||||
public void reconnect() { |
||||
logger.warn("The current connect strategy is stop, so the master will not reconnect to registry"); |
||||
} |
||||
|
||||
@Override |
||||
public StrategyType getStrategyType() { |
||||
return StrategyType.STOP; |
||||
} |
||||
} |
@ -0,0 +1,134 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.master.registry; |
||||
|
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleException; |
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerStatus; |
||||
import org.apache.dolphinscheduler.registry.api.Registry; |
||||
import org.apache.dolphinscheduler.registry.api.RegistryException; |
||||
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||
import org.apache.dolphinscheduler.server.master.cache.ProcessInstanceExecCacheManager; |
||||
import org.apache.dolphinscheduler.server.master.config.MasterConfig; |
||||
import org.apache.dolphinscheduler.server.master.event.WorkflowEventQueue; |
||||
import org.apache.dolphinscheduler.server.master.rpc.MasterRPCServer; |
||||
import org.apache.dolphinscheduler.server.master.runner.StateWheelExecuteThread; |
||||
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||
import org.slf4j.Logger; |
||||
import org.slf4j.LoggerFactory; |
||||
import org.springframework.beans.factory.annotation.Autowired; |
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||
import org.springframework.stereotype.Service; |
||||
|
||||
import java.time.Duration; |
||||
|
||||
/** |
||||
* This strategy will change the server status to {@link ServerStatus#WAITING} when disconnect from {@link Registry}. |
||||
*/ |
||||
@Service |
||||
@ConditionalOnProperty(prefix = "master.registry-disconnect-strategy", name = "strategy", havingValue = "waiting") |
||||
public class MasterWaitingStrategy implements MasterConnectStrategy { |
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(MasterWaitingStrategy.class); |
||||
|
||||
@Autowired |
||||
private MasterConfig masterConfig; |
||||
@Autowired |
||||
private RegistryClient registryClient; |
||||
@Autowired |
||||
private MasterRPCServer masterRPCServer; |
||||
@Autowired |
||||
private WorkflowEventQueue workflowEventQueue; |
||||
@Autowired |
||||
private ProcessInstanceExecCacheManager processInstanceExecCacheManager; |
||||
@Autowired |
||||
private StateWheelExecuteThread stateWheelExecuteThread; |
||||
|
||||
@Override |
||||
public void disconnect() { |
||||
try { |
||||
ServerLifeCycleManager.toWaiting(); |
||||
// todo: clear the current resource
|
||||
clearMasterResource(); |
||||
Duration maxWaitingTime = masterConfig.getRegistryDisconnectStrategy().getMaxWaitingTime(); |
||||
try { |
||||
logger.info("Master disconnect from registry will try to reconnect in {} s", |
||||
maxWaitingTime.getSeconds()); |
||||
registryClient.connectUntilTimeout(maxWaitingTime); |
||||
} catch (RegistryException ex) { |
||||
throw new ServerLifeCycleException( |
||||
String.format("Waiting to reconnect to registry in %s failed", maxWaitingTime), ex); |
||||
} |
||||
} catch (ServerLifeCycleException e) { |
||||
String errorMessage = String.format( |
||||
"Disconnect from registry and change the current status to waiting error, the current server state is %s, will stop the current server", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
logger.error(errorMessage, e); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} catch (RegistryException ex) { |
||||
String errorMessage = "Disconnect from registry and waiting to reconnect failed, will stop the server"; |
||||
logger.error(errorMessage, ex); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} catch (Exception ex) { |
||||
String errorMessage = "Disconnect from registry and get an unknown exception, will stop the server"; |
||||
logger.error(errorMessage, ex); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reconnect() { |
||||
try { |
||||
ServerLifeCycleManager.recoverFromWaiting(); |
||||
reStartMasterResource(); |
||||
// reopen the resource
|
||||
logger.info("Recover from waiting success, the current server status is {}", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
} catch (Exception e) { |
||||
String errorMessage = |
||||
String.format("Recover from waiting failed, the current server status is %s, will stop the server", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
logger.error(errorMessage, e); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public StrategyType getStrategyType() { |
||||
return StrategyType.WAITING; |
||||
} |
||||
|
||||
private void clearMasterResource() { |
||||
// close the worker resource, if close failed should stop the worker server
|
||||
masterRPCServer.close(); |
||||
logger.warn("Master closed RPC server due to lost registry connection"); |
||||
workflowEventQueue.clearWorkflowEventQueue(); |
||||
logger.warn("Master clear workflow event queue due to lost registry connection"); |
||||
processInstanceExecCacheManager.clearCache(); |
||||
logger.warn("Master clear process instance cache due to lost registry connection"); |
||||
stateWheelExecuteThread.clearAllTasks(); |
||||
logger.warn("Master clear all state wheel task due to lost registry connection"); |
||||
|
||||
} |
||||
|
||||
private void reStartMasterResource() { |
||||
// reopen the resource, if reopen failed should stop the worker server
|
||||
masterRPCServer.start(); |
||||
logger.warn("Master restarted RPC server due to reconnect to registry"); |
||||
} |
||||
} |
@ -0,0 +1,31 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.registry.api; |
||||
|
||||
/** |
||||
* This interface defined a method to be executed when the server disconnected from registry. |
||||
*/ |
||||
public interface ConnectStrategy { |
||||
|
||||
void disconnect(); |
||||
|
||||
void reconnect(); |
||||
|
||||
StrategyType getStrategyType(); |
||||
|
||||
} |
@ -0,0 +1,31 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.registry.api; |
||||
|
||||
import lombok.Data; |
||||
|
||||
import java.time.Duration; |
||||
|
||||
@Data |
||||
public class ConnectStrategyProperties { |
||||
|
||||
private StrategyType strategy = StrategyType.STOP; |
||||
|
||||
private Duration maxWaitingTime = Duration.ofSeconds(0); |
||||
|
||||
} |
@ -0,0 +1,25 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.registry.api; |
||||
|
||||
/**
 * Types of strategy a server can apply when it is disconnected from the registry.
 */
public enum StrategyType {

    /** Type reported by the stop strategies. */
    STOP,

    /** Type reported by the waiting strategies. */
    WAITING,
    ;
}
@ -0,0 +1,24 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.worker.registry; |
||||
|
||||
import org.apache.dolphinscheduler.registry.api.ConnectStrategy; |
||||
|
||||
public interface WorkerConnectStrategy extends ConnectStrategy { |
||||
|
||||
} |
@ -0,0 +1,61 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.worker.registry; |
||||
|
||||
import lombok.NonNull; |
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||
import org.apache.dolphinscheduler.registry.api.ConnectionListener; |
||||
import org.apache.dolphinscheduler.registry.api.ConnectionState; |
||||
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||
import org.slf4j.Logger; |
||||
import org.slf4j.LoggerFactory; |
||||
|
||||
public class WorkerConnectionStateListener implements ConnectionListener { |
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(WorkerConnectionStateListener.class); |
||||
private final WorkerConfig workerConfig; |
||||
private final RegistryClient registryClient; |
||||
private final WorkerConnectStrategy workerConnectStrategy; |
||||
|
||||
public WorkerConnectionStateListener(@NonNull WorkerConfig workerConfig, |
||||
@NonNull RegistryClient registryClient, |
||||
@NonNull WorkerConnectStrategy workerConnectStrategy) { |
||||
this.workerConfig = workerConfig; |
||||
this.registryClient = registryClient; |
||||
this.workerConnectStrategy = workerConnectStrategy; |
||||
} |
||||
|
||||
@Override |
||||
public void onUpdate(ConnectionState state) { |
||||
logger.info("Worker received a {} event from registry, the current server state is {}", state, |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
switch (state) { |
||||
case CONNECTED: |
||||
break; |
||||
case SUSPENDED: |
||||
break; |
||||
case RECONNECTED: |
||||
workerConnectStrategy.reconnect(); |
||||
break; |
||||
case DISCONNECTED: |
||||
workerConnectStrategy.disconnect(); |
||||
default: |
||||
} |
||||
} |
||||
} |
@ -0,0 +1,55 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.worker.registry; |
||||
|
||||
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||
import org.slf4j.Logger; |
||||
import org.slf4j.LoggerFactory; |
||||
import org.springframework.beans.factory.annotation.Autowired; |
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||
import org.springframework.stereotype.Service; |
||||
|
||||
@Service |
||||
@ConditionalOnProperty(prefix = "worker.registry-disconnect-strategy", name = "strategy", havingValue = "stop", matchIfMissing = true) |
||||
public class WorkerStopStrategy implements WorkerConnectStrategy { |
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(WorkerStopStrategy.class); |
||||
|
||||
@Autowired |
||||
public RegistryClient registryClient; |
||||
@Autowired |
||||
private WorkerConfig workerConfig; |
||||
|
||||
@Override |
||||
public void disconnect() { |
||||
registryClient.getStoppable() |
||||
.stop("Worker disconnected from registry, will stop myself due to the stop strategy"); |
||||
} |
||||
|
||||
@Override |
||||
public void reconnect() { |
||||
logger.warn("The current connect strategy is stop, so the worker will not reconnect to registry"); |
||||
} |
||||
|
||||
@Override |
||||
public StrategyType getStrategyType() { |
||||
return StrategyType.STOP; |
||||
} |
||||
} |
@ -0,0 +1,135 @@
|
||||
/* |
||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||
* contributor license agreements. See the NOTICE file distributed with |
||||
* this work for additional information regarding copyright ownership. |
||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||
* (the "License"); you may not use this file except in compliance with |
||||
* the License. You may obtain a copy of the License at |
||||
* |
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
* |
||||
* Unless required by applicable law or agreed to in writing, software |
||||
* distributed under the License is distributed on an "AS IS" BASIS, |
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||
* See the License for the specific language governing permissions and |
||||
* limitations under the License. |
||||
*/ |
||||
|
||||
package org.apache.dolphinscheduler.server.worker.registry; |
||||
|
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleException; |
||||
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||
import org.apache.dolphinscheduler.registry.api.RegistryException; |
||||
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||
import org.apache.dolphinscheduler.server.worker.message.MessageRetryRunner; |
||||
import org.apache.dolphinscheduler.server.worker.rpc.WorkerRpcClient; |
||||
import org.apache.dolphinscheduler.server.worker.rpc.WorkerRpcServer; |
||||
import org.apache.dolphinscheduler.server.worker.runner.WorkerManagerThread; |
||||
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||
import org.slf4j.Logger; |
||||
import org.slf4j.LoggerFactory; |
||||
import org.springframework.beans.factory.annotation.Autowired; |
||||
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||
import org.springframework.stereotype.Service; |
||||
|
||||
import java.time.Duration; |
||||
|
||||
@Service |
||||
@ConditionalOnProperty(prefix = "worker.registry-disconnect-strategy", name = "strategy", havingValue = "waiting") |
||||
public class WorkerWaitingStrategy implements WorkerConnectStrategy { |
||||
|
||||
private final Logger logger = LoggerFactory.getLogger(WorkerWaitingStrategy.class); |
||||
|
||||
@Autowired |
||||
private WorkerConfig workerConfig; |
||||
|
||||
@Autowired |
||||
private RegistryClient registryClient; |
||||
|
||||
@Autowired |
||||
private WorkerRpcServer workerRpcServer; |
||||
|
||||
@Autowired |
||||
private WorkerRpcClient workerRpcClient; |
||||
|
||||
@Autowired |
||||
private MessageRetryRunner messageRetryRunner; |
||||
|
||||
@Autowired |
||||
private WorkerManagerThread workerManagerThread; |
||||
|
||||
@Override |
||||
public void disconnect() { |
||||
try { |
||||
ServerLifeCycleManager.toWaiting(); |
||||
clearWorkerResource(); |
||||
Duration maxWaitingTime = workerConfig.getRegistryDisconnectStrategy().getMaxWaitingTime(); |
||||
try { |
||||
logger.info("Worker disconnect from registry will try to reconnect in {} s", |
||||
maxWaitingTime.getSeconds()); |
||||
registryClient.connectUntilTimeout(maxWaitingTime); |
||||
} catch (RegistryException ex) { |
||||
throw new ServerLifeCycleException( |
||||
String.format("Waiting to reconnect to registry in %s failed", maxWaitingTime), ex); |
||||
} |
||||
} catch (ServerLifeCycleException e) { |
||||
String errorMessage = String.format( |
||||
"Disconnect from registry and change the current status to waiting error, the current server state is %s, will stop the current server", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
logger.error(errorMessage, e); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} catch (RegistryException ex) { |
||||
String errorMessage = "Disconnect from registry and waiting to reconnect failed, will stop the server"; |
||||
logger.error(errorMessage, ex); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} catch (Exception ex) { |
||||
String errorMessage = "Disconnect from registry and get an unknown exception, will stop the server"; |
||||
logger.error(errorMessage, ex); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} |
||||
} |
||||
|
||||
@Override |
||||
public void reconnect() { |
||||
try { |
||||
ServerLifeCycleManager.recoverFromWaiting(); |
||||
reStartWorkerResource(); |
||||
logger.info("Recover from waiting success, the current server status is {}", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
} catch (Exception e) { |
||||
String errorMessage = |
||||
String.format("Recover from waiting failed, the current server status is %s, will stop the server", |
||||
ServerLifeCycleManager.getServerStatus()); |
||||
logger.error(errorMessage, e); |
||||
registryClient.getStoppable().stop(errorMessage); |
||||
} |
||||
|
||||
} |
||||
|
||||
@Override |
||||
public StrategyType getStrategyType() { |
||||
return StrategyType.WAITING; |
||||
} |
||||
|
||||
private void clearWorkerResource() { |
||||
// close the worker resource, if close failed should stop the worker server
|
||||
workerRpcServer.close(); |
||||
logger.warn("Worker server close the RPC server due to lost connection from registry"); |
||||
workerRpcClient.close(); |
||||
logger.warn("Worker server close the RPC client due to lost connection from registry"); |
||||
workerManagerThread.clearTask(); |
||||
logger.warn("Worker server clear the tasks due to lost connection from registry"); |
||||
messageRetryRunner.clearMessage(); |
||||
logger.warn("Worker server clear the retry message due to lost connection from registry"); |
||||
|
||||
} |
||||
|
||||
private void reStartWorkerResource() { |
||||
// reopen the resource, if reopen failed should stop the worker server
|
||||
workerRpcServer.start(); |
||||
logger.warn("Worker server restart PRC server due to reconnect to registry"); |
||||
workerRpcClient.start(); |
||||
logger.warn("Worker server restart PRC client due to reconnect to registry"); |
||||
} |
||||
} |
Loading…
Reference in new issue