Browse Source
* Add waiting strategy so that master/worker can recover from a lost registry connection * Throw exception when ZooKeeper registry start fails due to being interrupted 3.1.0-release
Wenjun Ruan
2 years ago
committed by
GitHub
60 changed files with 1744 additions and 950 deletions
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,29 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.common.lifecycle; |
||||||
|
|
||||||
|
/**
 * Checked exception signalling a failure in the server life cycle, for example a status
 * transition that is not allowed from the current status.
 */
public class ServerLifeCycleException extends Exception {

    /** Explicit serial version so the serialized form does not depend on compiler-generated ids. */
    private static final long serialVersionUID = 1L;

    /**
     * Creates an exception that only carries a description of the failure.
     *
     * @param message description of the failed life-cycle operation
     */
    public ServerLifeCycleException(String message) {
        super(message);
    }

    /**
     * Creates an exception that keeps the underlying cause.
     *
     * @param message   description of the failed life-cycle operation
     * @param throwable the original failure, preserved as the cause
     */
    public ServerLifeCycleException(String message, Throwable throwable) {
        super(message, throwable);
    }
}
@ -0,0 +1,75 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.common.lifecycle; |
||||||
|
|
||||||
|
import lombok.experimental.UtilityClass; |
||||||
|
|
||||||
|
@UtilityClass |
||||||
|
public class ServerLifeCycleManager { |
||||||
|
|
||||||
|
private static volatile ServerStatus serverStatus = ServerStatus.RUNNING; |
||||||
|
|
||||||
|
public static boolean isRunning() { |
||||||
|
return serverStatus == ServerStatus.RUNNING; |
||||||
|
} |
||||||
|
|
||||||
|
public static boolean isStopped() { |
||||||
|
return serverStatus == ServerStatus.STOPPED; |
||||||
|
} |
||||||
|
|
||||||
|
public static ServerStatus getServerStatus() { |
||||||
|
return serverStatus; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Change the current server state to {@link ServerStatus#WAITING}, only {@link ServerStatus#RUNNING} can change to {@link ServerStatus#WAITING}. |
||||||
|
* |
||||||
|
* @throws ServerLifeCycleException if change failed. |
||||||
|
*/ |
||||||
|
public static synchronized void toWaiting() throws ServerLifeCycleException { |
||||||
|
if (isStopped()) { |
||||||
|
throw new ServerLifeCycleException("The current server is already stopped, cannot change to waiting"); |
||||||
|
} |
||||||
|
|
||||||
|
if (serverStatus != ServerStatus.RUNNING) { |
||||||
|
throw new ServerLifeCycleException("The current server is not at running status, cannot change to waiting"); |
||||||
|
} |
||||||
|
serverStatus = ServerStatus.WAITING; |
||||||
|
} |
||||||
|
|
||||||
|
/** |
||||||
|
* Recover from {@link ServerStatus#WAITING} to {@link ServerStatus#RUNNING}. |
||||||
|
* |
||||||
|
* @throws ServerLifeCycleException if change failed |
||||||
|
*/ |
||||||
|
public static synchronized void recoverFromWaiting() throws ServerLifeCycleException { |
||||||
|
if (serverStatus != ServerStatus.WAITING) { |
||||||
|
throw new ServerLifeCycleException("The current server status is not waiting, cannot recover form waiting"); |
||||||
|
} |
||||||
|
serverStatus = ServerStatus.RUNNING; |
||||||
|
} |
||||||
|
|
||||||
|
public static synchronized boolean toStopped() { |
||||||
|
if (serverStatus == ServerStatus.STOPPED) { |
||||||
|
return false; |
||||||
|
} |
||||||
|
serverStatus = ServerStatus.STOPPED; |
||||||
|
return true; |
||||||
|
} |
||||||
|
|
||||||
|
} |
@ -0,0 +1,45 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.common.lifecycle; |
||||||
|
|
||||||
|
/**
 * Life-cycle status of a server; it applies to both master and worker.
 * Each status carries a numeric code and a human-readable description.
 */
public enum ServerStatus {

    RUNNING(0, "The current server is running"),
    WAITING(1, "The current server is waiting, this means it cannot work"),
    STOPPED(2, "The current server is stopped");

    private final int code;
    private final String desc;

    ServerStatus(int statusCode, String description) {
        code = statusCode;
        desc = description;
    }

    /**
     * @return the numeric code of this status
     */
    public int getCode() {
        return this.code;
    }

    /**
     * @return the description of this status
     */
    public String getDesc() {
        return this.desc;
    }
}
@ -1,58 +0,0 @@ |
|||||||
/* |
|
||||||
* Licensed to the Apache Software Foundation (ASF) under one or more |
|
||||||
* contributor license agreements. See the NOTICE file distributed with |
|
||||||
* this work for additional information regarding copyright ownership. |
|
||||||
* The ASF licenses this file to You under the Apache License, Version 2.0 |
|
||||||
* (the "License"); you may not use this file except in compliance with |
|
||||||
* the License. You may obtain a copy of the License at |
|
||||||
* |
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
* |
|
||||||
* Unless required by applicable law or agreed to in writing, software |
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS, |
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|
||||||
* See the License for the specific language governing permissions and |
|
||||||
* limitations under the License. |
|
||||||
*/ |
|
||||||
|
|
||||||
package org.apache.dolphinscheduler.common.thread; |
|
||||||
|
|
||||||
import java.util.concurrent.atomic.AtomicBoolean; |
|
||||||
|
|
||||||
import lombok.experimental.UtilityClass; |
|
||||||
|
|
||||||
/** |
|
||||||
* If the process closes, a signal is placed as true, and all threads get this flag to stop working. |
|
||||||
*/ |
|
||||||
@UtilityClass |
|
||||||
public class Stopper { |
|
||||||
|
|
||||||
private static final AtomicBoolean stoppedSignal = new AtomicBoolean(false); |
|
||||||
|
|
||||||
/** |
|
||||||
* Return the flag if the Server is stopped. |
|
||||||
* |
|
||||||
* @return True, if the server is stopped; False, the server is still running. |
|
||||||
*/ |
|
||||||
public static boolean isStopped() { |
|
||||||
return stoppedSignal.get(); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Return the flag if the Server is stopped. |
|
||||||
* |
|
||||||
* @return True, if the server is running, False, the server is stopped. |
|
||||||
*/ |
|
||||||
public static boolean isRunning() { |
|
||||||
return !stoppedSignal.get(); |
|
||||||
} |
|
||||||
|
|
||||||
/** |
|
||||||
* Stop the server |
|
||||||
* |
|
||||||
* @return True, if the server stopped success. False, if the server is already stopped. |
|
||||||
*/ |
|
||||||
public static boolean stop() { |
|
||||||
return stoppedSignal.compareAndSet(false, true); |
|
||||||
} |
|
||||||
} |
|
@ -0,0 +1,24 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.master.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.registry.api.ConnectStrategy; |
||||||
|
|
||||||
|
public interface MasterConnectStrategy extends ConnectStrategy { |
||||||
|
|
||||||
|
} |
@ -0,0 +1,58 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.master.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||||
|
import org.apache.dolphinscheduler.server.master.config.MasterConfig; |
||||||
|
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||||
|
import org.slf4j.Logger; |
||||||
|
import org.slf4j.LoggerFactory; |
||||||
|
import org.springframework.beans.factory.annotation.Autowired; |
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||||
|
import org.springframework.stereotype.Service; |
||||||
|
|
||||||
|
/** |
||||||
|
* This strategy will stop the master server, when disconnected from {@link org.apache.dolphinscheduler.registry.api.Registry}. |
||||||
|
*/ |
||||||
|
@Service |
||||||
|
@ConditionalOnProperty(prefix = "master.registry-disconnect-strategy", name = "strategy", havingValue = "stop", matchIfMissing = true) |
||||||
|
public class MasterStopStrategy implements MasterConnectStrategy { |
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(MasterStopStrategy.class); |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private RegistryClient registryClient; |
||||||
|
@Autowired |
||||||
|
private MasterConfig masterConfig; |
||||||
|
|
||||||
|
@Override |
||||||
|
public void disconnect() { |
||||||
|
registryClient.getStoppable() |
||||||
|
.stop("Master disconnected from registry, will stop myself due to the stop strategy"); |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public void reconnect() { |
||||||
|
logger.warn("The current connect strategy is stop, so the master will not reconnect to registry"); |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public StrategyType getStrategyType() { |
||||||
|
return StrategyType.STOP; |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,134 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.master.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleException; |
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerStatus; |
||||||
|
import org.apache.dolphinscheduler.registry.api.Registry; |
||||||
|
import org.apache.dolphinscheduler.registry.api.RegistryException; |
||||||
|
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||||
|
import org.apache.dolphinscheduler.server.master.cache.ProcessInstanceExecCacheManager; |
||||||
|
import org.apache.dolphinscheduler.server.master.config.MasterConfig; |
||||||
|
import org.apache.dolphinscheduler.server.master.event.WorkflowEventQueue; |
||||||
|
import org.apache.dolphinscheduler.server.master.rpc.MasterRPCServer; |
||||||
|
import org.apache.dolphinscheduler.server.master.runner.StateWheelExecuteThread; |
||||||
|
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||||
|
import org.slf4j.Logger; |
||||||
|
import org.slf4j.LoggerFactory; |
||||||
|
import org.springframework.beans.factory.annotation.Autowired; |
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||||
|
import org.springframework.stereotype.Service; |
||||||
|
|
||||||
|
import java.time.Duration; |
||||||
|
|
||||||
|
/** |
||||||
|
* This strategy will change the server status to {@link ServerStatus#WAITING} when disconnect from {@link Registry}. |
||||||
|
*/ |
||||||
|
@Service |
||||||
|
@ConditionalOnProperty(prefix = "master.registry-disconnect-strategy", name = "strategy", havingValue = "waiting") |
||||||
|
public class MasterWaitingStrategy implements MasterConnectStrategy { |
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(MasterWaitingStrategy.class); |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private MasterConfig masterConfig; |
||||||
|
@Autowired |
||||||
|
private RegistryClient registryClient; |
||||||
|
@Autowired |
||||||
|
private MasterRPCServer masterRPCServer; |
||||||
|
@Autowired |
||||||
|
private WorkflowEventQueue workflowEventQueue; |
||||||
|
@Autowired |
||||||
|
private ProcessInstanceExecCacheManager processInstanceExecCacheManager; |
||||||
|
@Autowired |
||||||
|
private StateWheelExecuteThread stateWheelExecuteThread; |
||||||
|
|
||||||
|
@Override |
||||||
|
public void disconnect() { |
||||||
|
try { |
||||||
|
ServerLifeCycleManager.toWaiting(); |
||||||
|
// todo: clear the current resource
|
||||||
|
clearMasterResource(); |
||||||
|
Duration maxWaitingTime = masterConfig.getRegistryDisconnectStrategy().getMaxWaitingTime(); |
||||||
|
try { |
||||||
|
logger.info("Master disconnect from registry will try to reconnect in {} s", |
||||||
|
maxWaitingTime.getSeconds()); |
||||||
|
registryClient.connectUntilTimeout(maxWaitingTime); |
||||||
|
} catch (RegistryException ex) { |
||||||
|
throw new ServerLifeCycleException( |
||||||
|
String.format("Waiting to reconnect to registry in %s failed", maxWaitingTime), ex); |
||||||
|
} |
||||||
|
} catch (ServerLifeCycleException e) { |
||||||
|
String errorMessage = String.format( |
||||||
|
"Disconnect from registry and change the current status to waiting error, the current server state is %s, will stop the current server", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
logger.error(errorMessage, e); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} catch (RegistryException ex) { |
||||||
|
String errorMessage = "Disconnect from registry and waiting to reconnect failed, will stop the server"; |
||||||
|
logger.error(errorMessage, ex); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} catch (Exception ex) { |
||||||
|
String errorMessage = "Disconnect from registry and get an unknown exception, will stop the server"; |
||||||
|
logger.error(errorMessage, ex); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public void reconnect() { |
||||||
|
try { |
||||||
|
ServerLifeCycleManager.recoverFromWaiting(); |
||||||
|
reStartMasterResource(); |
||||||
|
// reopen the resource
|
||||||
|
logger.info("Recover from waiting success, the current server status is {}", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
} catch (Exception e) { |
||||||
|
String errorMessage = |
||||||
|
String.format("Recover from waiting failed, the current server status is %s, will stop the server", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
logger.error(errorMessage, e); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public StrategyType getStrategyType() { |
||||||
|
return StrategyType.WAITING; |
||||||
|
} |
||||||
|
|
||||||
|
private void clearMasterResource() { |
||||||
|
// close the worker resource, if close failed should stop the worker server
|
||||||
|
masterRPCServer.close(); |
||||||
|
logger.warn("Master closed RPC server due to lost registry connection"); |
||||||
|
workflowEventQueue.clearWorkflowEventQueue(); |
||||||
|
logger.warn("Master clear workflow event queue due to lost registry connection"); |
||||||
|
processInstanceExecCacheManager.clearCache(); |
||||||
|
logger.warn("Master clear process instance cache due to lost registry connection"); |
||||||
|
stateWheelExecuteThread.clearAllTasks(); |
||||||
|
logger.warn("Master clear all state wheel task due to lost registry connection"); |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
private void reStartMasterResource() { |
||||||
|
// reopen the resource, if reopen failed should stop the worker server
|
||||||
|
masterRPCServer.start(); |
||||||
|
logger.warn("Master restarted RPC server due to reconnect to registry"); |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,31 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.registry.api; |
||||||
|
|
||||||
|
/** |
||||||
|
* This interface defined a method to be executed when the server disconnected from registry. |
||||||
|
*/ |
||||||
|
public interface ConnectStrategy { |
||||||
|
|
||||||
|
void disconnect(); |
||||||
|
|
||||||
|
void reconnect(); |
||||||
|
|
||||||
|
StrategyType getStrategyType(); |
||||||
|
|
||||||
|
} |
@ -0,0 +1,31 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.registry.api; |
||||||
|
|
||||||
|
import lombok.Data; |
||||||
|
|
||||||
|
import java.time.Duration; |
||||||
|
|
||||||
|
@Data |
||||||
|
public class ConnectStrategyProperties { |
||||||
|
|
||||||
|
private StrategyType strategy = StrategyType.STOP; |
||||||
|
|
||||||
|
private Duration maxWaitingTime = Duration.ofSeconds(0); |
||||||
|
|
||||||
|
} |
@ -0,0 +1,25 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.registry.api; |
||||||
|
|
||||||
|
/**
 * Types of strategy a server can apply when it is disconnected from the registry.
 */
public enum StrategyType {

    /** Type reported by the stop strategies. */
    STOP,

    /** Type reported by the waiting strategies. */
    WAITING;
}
@ -0,0 +1,24 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.worker.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.registry.api.ConnectStrategy; |
||||||
|
|
||||||
|
public interface WorkerConnectStrategy extends ConnectStrategy { |
||||||
|
|
||||||
|
} |
@ -0,0 +1,61 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.worker.registry; |
||||||
|
|
||||||
|
import lombok.NonNull; |
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||||
|
import org.apache.dolphinscheduler.registry.api.ConnectionListener; |
||||||
|
import org.apache.dolphinscheduler.registry.api.ConnectionState; |
||||||
|
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||||
|
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||||
|
import org.slf4j.Logger; |
||||||
|
import org.slf4j.LoggerFactory; |
||||||
|
|
||||||
|
public class WorkerConnectionStateListener implements ConnectionListener { |
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(WorkerConnectionStateListener.class); |
||||||
|
private final WorkerConfig workerConfig; |
||||||
|
private final RegistryClient registryClient; |
||||||
|
private final WorkerConnectStrategy workerConnectStrategy; |
||||||
|
|
||||||
|
public WorkerConnectionStateListener(@NonNull WorkerConfig workerConfig, |
||||||
|
@NonNull RegistryClient registryClient, |
||||||
|
@NonNull WorkerConnectStrategy workerConnectStrategy) { |
||||||
|
this.workerConfig = workerConfig; |
||||||
|
this.registryClient = registryClient; |
||||||
|
this.workerConnectStrategy = workerConnectStrategy; |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public void onUpdate(ConnectionState state) { |
||||||
|
logger.info("Worker received a {} event from registry, the current server state is {}", state, |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
switch (state) { |
||||||
|
case CONNECTED: |
||||||
|
break; |
||||||
|
case SUSPENDED: |
||||||
|
break; |
||||||
|
case RECONNECTED: |
||||||
|
workerConnectStrategy.reconnect(); |
||||||
|
break; |
||||||
|
case DISCONNECTED: |
||||||
|
workerConnectStrategy.disconnect(); |
||||||
|
default: |
||||||
|
} |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,55 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.worker.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||||
|
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||||
|
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||||
|
import org.slf4j.Logger; |
||||||
|
import org.slf4j.LoggerFactory; |
||||||
|
import org.springframework.beans.factory.annotation.Autowired; |
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||||
|
import org.springframework.stereotype.Service; |
||||||
|
|
||||||
|
@Service |
||||||
|
@ConditionalOnProperty(prefix = "worker.registry-disconnect-strategy", name = "strategy", havingValue = "stop", matchIfMissing = true) |
||||||
|
public class WorkerStopStrategy implements WorkerConnectStrategy { |
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(WorkerStopStrategy.class); |
||||||
|
|
||||||
|
@Autowired |
||||||
|
public RegistryClient registryClient; |
||||||
|
@Autowired |
||||||
|
private WorkerConfig workerConfig; |
||||||
|
|
||||||
|
@Override |
||||||
|
public void disconnect() { |
||||||
|
registryClient.getStoppable() |
||||||
|
.stop("Worker disconnected from registry, will stop myself due to the stop strategy"); |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public void reconnect() { |
||||||
|
logger.warn("The current connect strategy is stop, so the worker will not reconnect to registry"); |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public StrategyType getStrategyType() { |
||||||
|
return StrategyType.STOP; |
||||||
|
} |
||||||
|
} |
@ -0,0 +1,135 @@ |
|||||||
|
/* |
||||||
|
* Licensed to the Apache Software Foundation (ASF) under one or more |
||||||
|
* contributor license agreements. See the NOTICE file distributed with |
||||||
|
* this work for additional information regarding copyright ownership. |
||||||
|
* The ASF licenses this file to You under the Apache License, Version 2.0 |
||||||
|
* (the "License"); you may not use this file except in compliance with |
||||||
|
* the License. You may obtain a copy of the License at |
||||||
|
* |
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
* |
||||||
|
* Unless required by applicable law or agreed to in writing, software |
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS, |
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
||||||
|
* See the License for the specific language governing permissions and |
||||||
|
* limitations under the License. |
||||||
|
*/ |
||||||
|
|
||||||
|
package org.apache.dolphinscheduler.server.worker.registry; |
||||||
|
|
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleException; |
||||||
|
import org.apache.dolphinscheduler.common.lifecycle.ServerLifeCycleManager; |
||||||
|
import org.apache.dolphinscheduler.registry.api.RegistryException; |
||||||
|
import org.apache.dolphinscheduler.registry.api.StrategyType; |
||||||
|
import org.apache.dolphinscheduler.server.worker.config.WorkerConfig; |
||||||
|
import org.apache.dolphinscheduler.server.worker.message.MessageRetryRunner; |
||||||
|
import org.apache.dolphinscheduler.server.worker.rpc.WorkerRpcClient; |
||||||
|
import org.apache.dolphinscheduler.server.worker.rpc.WorkerRpcServer; |
||||||
|
import org.apache.dolphinscheduler.server.worker.runner.WorkerManagerThread; |
||||||
|
import org.apache.dolphinscheduler.service.registry.RegistryClient; |
||||||
|
import org.slf4j.Logger; |
||||||
|
import org.slf4j.LoggerFactory; |
||||||
|
import org.springframework.beans.factory.annotation.Autowired; |
||||||
|
import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; |
||||||
|
import org.springframework.stereotype.Service; |
||||||
|
|
||||||
|
import java.time.Duration; |
||||||
|
|
||||||
|
@Service |
||||||
|
@ConditionalOnProperty(prefix = "worker.registry-disconnect-strategy", name = "strategy", havingValue = "waiting") |
||||||
|
public class WorkerWaitingStrategy implements WorkerConnectStrategy { |
||||||
|
|
||||||
|
private final Logger logger = LoggerFactory.getLogger(WorkerWaitingStrategy.class); |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private WorkerConfig workerConfig; |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private RegistryClient registryClient; |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private WorkerRpcServer workerRpcServer; |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private WorkerRpcClient workerRpcClient; |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private MessageRetryRunner messageRetryRunner; |
||||||
|
|
||||||
|
@Autowired |
||||||
|
private WorkerManagerThread workerManagerThread; |
||||||
|
|
||||||
|
@Override |
||||||
|
public void disconnect() { |
||||||
|
try { |
||||||
|
ServerLifeCycleManager.toWaiting(); |
||||||
|
clearWorkerResource(); |
||||||
|
Duration maxWaitingTime = workerConfig.getRegistryDisconnectStrategy().getMaxWaitingTime(); |
||||||
|
try { |
||||||
|
logger.info("Worker disconnect from registry will try to reconnect in {} s", |
||||||
|
maxWaitingTime.getSeconds()); |
||||||
|
registryClient.connectUntilTimeout(maxWaitingTime); |
||||||
|
} catch (RegistryException ex) { |
||||||
|
throw new ServerLifeCycleException( |
||||||
|
String.format("Waiting to reconnect to registry in %s failed", maxWaitingTime), ex); |
||||||
|
} |
||||||
|
} catch (ServerLifeCycleException e) { |
||||||
|
String errorMessage = String.format( |
||||||
|
"Disconnect from registry and change the current status to waiting error, the current server state is %s, will stop the current server", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
logger.error(errorMessage, e); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} catch (RegistryException ex) { |
||||||
|
String errorMessage = "Disconnect from registry and waiting to reconnect failed, will stop the server"; |
||||||
|
logger.error(errorMessage, ex); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} catch (Exception ex) { |
||||||
|
String errorMessage = "Disconnect from registry and get an unknown exception, will stop the server"; |
||||||
|
logger.error(errorMessage, ex); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} |
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public void reconnect() { |
||||||
|
try { |
||||||
|
ServerLifeCycleManager.recoverFromWaiting(); |
||||||
|
reStartWorkerResource(); |
||||||
|
logger.info("Recover from waiting success, the current server status is {}", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
} catch (Exception e) { |
||||||
|
String errorMessage = |
||||||
|
String.format("Recover from waiting failed, the current server status is %s, will stop the server", |
||||||
|
ServerLifeCycleManager.getServerStatus()); |
||||||
|
logger.error(errorMessage, e); |
||||||
|
registryClient.getStoppable().stop(errorMessage); |
||||||
|
} |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
@Override |
||||||
|
public StrategyType getStrategyType() { |
||||||
|
return StrategyType.WAITING; |
||||||
|
} |
||||||
|
|
||||||
|
private void clearWorkerResource() { |
||||||
|
// close the worker resource, if close failed should stop the worker server
|
||||||
|
workerRpcServer.close(); |
||||||
|
logger.warn("Worker server close the RPC server due to lost connection from registry"); |
||||||
|
workerRpcClient.close(); |
||||||
|
logger.warn("Worker server close the RPC client due to lost connection from registry"); |
||||||
|
workerManagerThread.clearTask(); |
||||||
|
logger.warn("Worker server clear the tasks due to lost connection from registry"); |
||||||
|
messageRetryRunner.clearMessage(); |
||||||
|
logger.warn("Worker server clear the retry message due to lost connection from registry"); |
||||||
|
|
||||||
|
} |
||||||
|
|
||||||
|
private void reStartWorkerResource() { |
||||||
|
// reopen the resource, if reopen failed should stop the worker server
|
||||||
|
workerRpcServer.start(); |
||||||
|
logger.warn("Worker server restart PRC server due to reconnect to registry"); |
||||||
|
workerRpcClient.start(); |
||||||
|
logger.warn("Worker server restart PRC client due to reconnect to registry"); |
||||||
|
} |
||||||
|
} |
Loading…
Reference in new issue