@ -20,6 +20,7 @@ import org.apache.commons.lang.StringUtils;
import org.apache.curator.framework.CuratorFramework ;
import org.apache.curator.framework.CuratorFramework ;
import org.apache.curator.framework.recipes.cache.TreeCacheEvent ;
import org.apache.curator.framework.recipes.cache.TreeCacheEvent ;
import org.apache.curator.framework.recipes.locks.InterProcessMutex ;
import org.apache.curator.framework.recipes.locks.InterProcessMutex ;
import org.apache.dolphinscheduler.common.Constants ;
import org.apache.dolphinscheduler.common.Constants ;
import org.apache.dolphinscheduler.common.enums.ExecutionStatus ;
import org.apache.dolphinscheduler.common.enums.ExecutionStatus ;
import org.apache.dolphinscheduler.common.enums.ZKNodeType ;
import org.apache.dolphinscheduler.common.enums.ZKNodeType ;
@ -33,6 +34,7 @@ import org.apache.dolphinscheduler.server.entity.TaskExecutionContext;
import org.apache.dolphinscheduler.server.utils.ProcessUtils ;
import org.apache.dolphinscheduler.server.utils.ProcessUtils ;
import org.apache.dolphinscheduler.service.process.ProcessService ;
import org.apache.dolphinscheduler.service.process.ProcessService ;
import org.apache.dolphinscheduler.service.zk.AbstractZKClient ;
import org.apache.dolphinscheduler.service.zk.AbstractZKClient ;
import org.slf4j.Logger ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.slf4j.LoggerFactory ;
import org.springframework.beans.factory.annotation.Autowired ;
import org.springframework.beans.factory.annotation.Autowired ;
@ -45,309 +47,309 @@ import static org.apache.dolphinscheduler.common.Constants.SLEEP_TIME_MILLIS;
/ * *
/ * *
* zookeeper master client
* zookeeper master client
*
* < p >
* single instance
* single instance
* /
* /
@Component
@Component
public class ZKMasterClient extends AbstractZKClient {
public class ZKMasterClient extends AbstractZKClient {
/ * *
/ * *
* logger
* logger
* /
* /
private static final Logger logger = LoggerFactory . getLogger ( ZKMasterClient . class ) ;
private static final Logger logger = LoggerFactory . getLogger ( ZKMasterClient . class ) ;
/ * *
/ * *
* process service
* process service
* /
* /
@Autowired
@Autowired
private ProcessService processService ;
private ProcessService processService ;
public void start ( ) {
public void start ( ) {
InterProcessMutex mutex = null ;
InterProcessMutex mutex = null ;
try {
try {
// create distributed lock with the root node path of the lock space as /dolphinscheduler/lock/failover/master
// create distributed lock with the root node path of the lock space as /dolphinscheduler/lock/failover/master
String znodeLock = getMasterStartUpLockPath ( ) ;
String znodeLock = getMasterStartUpLockPath ( ) ;
mutex = new InterProcessMutex ( getZkClient ( ) , znodeLock ) ;
mutex = new InterProcessMutex ( getZkClient ( ) , znodeLock ) ;
mutex . acquire ( ) ;
mutex . acquire ( ) ;
// init system znode
// init system znode
this . initSystemZNode ( ) ;
this . initSystemZNode ( ) ;
while ( ! checkZKNodeExists ( NetUtils . getHost ( ) , ZKNodeType . MASTER ) ) {
while ( ! checkZKNodeExists ( NetUtils . getHost ( ) , ZKNodeType . MASTER ) ) {
ThreadUtils . sleep ( SLEEP_TIME_MILLIS ) ;
ThreadUtils . sleep ( SLEEP_TIME_MILLIS ) ;
}
}
// self tolerant
// self tolerant
if ( getActiveMasterNum ( ) = = 1 ) {
if ( getActiveMasterNum ( ) = = 1 ) {
failoverWorker ( null , true ) ;
failoverWork er ( null , true ) ;
failoverMast er ( null ) ;
failoverMaster ( null ) ;
}
}
} catch ( Exception e ) {
} catch ( Exception e ) {
logger . error ( "master start up exception" , e ) ;
logger . error ( "master start up exception" , e ) ;
} finally {
} finally {
releaseMutex ( mutex ) ;
releaseMutex ( mutex ) ;
}
}
}
}
@Override
@Override
public void close ( ) {
public void close ( ) {
super . close ( ) ;
super . close ( ) ;
}
}
/ * *
/ * *
* handle path events that this class cares about
* handle path events that this class cares about
*
* @param client zkClient
* @param client zkClient
* @param event path event
* @param event path event
* @param path zk path
* @param path zk path
* /
* /
@Override
@Override
protected void dataChanged ( CuratorFramework client , TreeCacheEvent event , String path ) {
protected void dataChanged ( CuratorFramework client , TreeCacheEvent event , String path ) {
//monitor master
//monitor master
if ( path . startsWith ( getZNodeParentPath ( ZKNodeType . MASTER ) + Constants . SINGLE_SLASH ) ) {
if ( path . startsWith ( getZNodeParentPath ( ZKNodeType . MASTER ) + Constants . SINGLE_SLASH ) ) {
handleMasterEvent ( event , path ) ;
handleMasterEvent ( event , path ) ;
} else if ( path . startsWith ( getZNodeParentPath ( ZKNodeType . WORKER ) + Constants . SINGLE_SLASH ) ) {
} else if ( path . startsWith ( getZNodeParentPath ( ZKNodeType . WORKER ) + Constants . SINGLE_SLASH ) ) {
//monitor worker
//monitor worker
handleWorkerEvent ( event , path ) ;
handleWorkerEvent ( event , path ) ;
}
}
}
}
/ * *
/ * *
* remove zookeeper node path
* remove zookeeper node path
*
*
* @param path zookeeper node path
* @param path zookeeper node path
* @param zkNodeType zookeeper node type
* @param zkNodeType zookeeper node type
* @param failover is failover
* @param failover is failover
* /
* /
private void removeZKNodePath ( String path , ZKNodeType zkNodeType , boolean failover ) {
private void removeZKNodePath ( String path , ZKNodeType zkNodeType , boolean failover ) {
logger . info ( "{} node deleted : {}" , zkNodeType . toString ( ) , path ) ;
logger . info ( "{} node deleted : {}" , zkNodeType . toString ( ) , path ) ;
InterProcessMutex mutex = null ;
InterProcessMutex mutex = null ;
try {
try {
String failoverPath = getFailoverLockPath ( zkNodeType ) ;
String failoverPath = getFailoverLockPath ( zkNodeType ) ;
// create a distributed lock
// create a distributed lock
mutex = new InterProcessMutex ( getZkClient ( ) , failoverPath ) ;
mutex = new InterProcessMutex ( getZkClient ( ) , failoverPath ) ;
mutex . acquire ( ) ;
mutex . acquire ( ) ;
String serverHost = getHostByEventDataPath ( path ) ;
String serverHost = getHostByEventDataPath ( path ) ;
// handle dead server
// handle dead server
handleDeadServer ( path , zkNodeType , Constants . ADD_ZK_OP ) ;
handleDeadServer ( path , zkNodeType , Constants . ADD_ZK_OP ) ;
//failover server
//failover server
if ( failover ) {
if ( failover ) {
failoverServerWhenDown ( serverHost , zkNodeType ) ;
failoverServerWhenDown ( serverHost , zkNodeType ) ;
}
}
} catch ( Exception e ) {
} catch ( Exception e ) {
logger . error ( "{} server failover failed." , zkNodeType . toString ( ) ) ;
logger . error ( "{} server failover failed." , zkNodeType . toString ( ) ) ;
logger . error ( "failover exception " , e ) ;
logger . error ( "failover exception " , e ) ;
}
} finally {
finally {
releaseMutex ( mutex ) ;
releaseMutex ( mutex ) ;
}
}
}
}
/ * *
/ * *
* failover server when server down
* failover server when server down
*
*
* @param serverHost server host
* @param serverHost server host
* @param zkNodeType zookeeper node type
* @param zkNodeType zookeeper node type
* @throws Exception exception
* @throws Exception exception
* /
* /
private void failoverServerWhenDown ( String serverHost , ZKNodeType zkNodeType ) throws Exception {
private void failoverServerWhenDown ( String serverHost , ZKNodeType zkNodeType ) throws Exception {
if ( StringUtils . isEmpty ( serverHost ) ) {
if ( StringUtils . isEmpty ( serverHost ) | | serverHost . startsWith ( NetUtils . getHost ( ) ) ) {
return ;
return ;
}
}
switch ( zkNodeType ) {
switch ( zkNodeType ) {
case MASTER :
case MASTER :
failoverMaster ( serverHost ) ;
failoverMaster ( serverHost ) ;
break ;
break ;
case WORKER :
case WORKER :
failoverWorker ( serverHost , true ) ;
failoverWorker ( serverHost , true ) ;
break ;
default :
default :
break ;
break ;
}
}
}
}
/ * *
/ * *
* get failover lock path
* get failover lock path
*
*
* @param zkNodeType zookeeper node type
* @param zkNodeType zookeeper node type
* @return fail over lock path
* @return fail over lock path
* /
* /
private String getFailoverLockPath ( ZKNodeType zkNodeType ) {
private String getFailoverLockPath ( ZKNodeType zkNodeType ) {
switch ( zkNodeType ) {
switch ( zkNodeType ) {
case MASTER :
case MASTER :
return getMasterFailoverLockPath ( ) ;
return getMasterFailoverLockPath ( ) ;
case WORKER :
case WORKER :
return getWorkerFailoverLockPath ( ) ;
return getWorkerFailoverLockPath ( ) ;
default :
default :
return "" ;
return "" ;
}
}
}
}
/ * *
/ * *
* monitor master
* monitor master
* @param event event
*
* @param path path
* @param event event
* /
* @param path path
public void handleMasterEvent ( TreeCacheEvent event , String path ) {
* /
switch ( event . getType ( ) ) {
public void handleMasterEvent ( TreeCacheEvent event , String path ) {
case NODE_ADDED :
switch ( event . getType ( ) ) {
logger . info ( "master node added : {}" , path ) ;
case NODE_ADDED :
break ;
logger . info ( "master node added : {}" , path ) ;
case NODE_REMOVED :
break ;
removeZKNodePath ( path , ZKNodeType . MASTER , true ) ;
case NODE_REMOVED :
break ;
removeZKNodePath ( path , ZKNodeType . MASTER , true ) ;
default :
break ;
break ;
default :
}
break ;
}
}
}
/ * *
* monitor worker
/ * *
* @param event event
* monitor worker
* @param path path
*
* /
* @param event event
public void handleWorkerEvent ( TreeCacheEvent event , String path ) {
* @param path path
switch ( event . getType ( ) ) {
* /
case NODE_ADDED :
public void handleWorkerEvent ( TreeCacheEvent event , String path ) {
logger . info ( "worker node added : {}" , path ) ;
switch ( event . getType ( ) ) {
break ;
case NODE_ADDED :
case NODE_REMOVED :
logger . info ( "worker node added : {}" , path ) ;
logger . info ( "worker node deleted : {}" , path ) ;
break ;
removeZKNodePath ( path , ZKNodeType . WORKER , true ) ;
case NODE_REMOVED :
break ;
logger . info ( "worker node deleted : {}" , path ) ;
default :
removeZKNodePath ( path , ZKNodeType . WORKER , true ) ;
break ;
break ;
}
default :
}
break ;
}
/ * *
}
* task needs failover if task start before worker starts
*
/ * *
* @param taskInstance task in stanc e
* task needs failover if task start befor e worker starts
* @return true if task instance need fail over
*
* /
* @param taskInstance task instance
private boolean checkTaskInstanceNeedFailover ( TaskInstance taskInstance ) throws Exception {
* @return true if task instance need fail over
* /
boolean taskNeedFailover = true ;
private boolean checkTaskInstanceNeedFailover ( TaskInstance taskInstance ) throws Exception {
//now no host will execute this task instance,so no need to failover the task
boolean taskNeedFailover = true ;
if ( taskInstance . getHost ( ) = = null ) {
return false ;
//now no host will execute this task instance,so no need to failover the task
}
if ( taskInstance . getHost ( ) = = null ) {
return false ;
// if the worker node exists in zookeeper, we must check the task starts after the worker
}
if ( checkZKNodeExists ( taskInstance . getHost ( ) , ZKNodeType . WORKER ) ) {
//if task start after worker starts, there is no need to failover the task.
// if the worker node exists in zookeeper, we must check the task starts after the worker
if ( checkTaskAfterWorkerStart ( taskInstance ) ) {
if ( checkZKNodeExists ( taskInstance . getHost ( ) , ZKNodeType . WORKER ) ) {
taskNeedFailover = false ;
//if task start after worker starts, there is no need to failover the task.
}
if ( checkTaskAfterWorkerStart ( taskInstance ) ) {
}
taskNeedFailover = false ;
return taskNeedFailover ;
}
}
}
return taskNeedFailover ;
/ * *
}
* check task start after the worker server starts .
*
/ * *
* @param taskInstance task instance
* check task start after the worker server starts .
* @return true if task instance start time after worker server start date
*
* /
* @param taskInstance task instance
private boolean checkTaskAfterWorkerStart ( TaskInstance taskInstance ) {
* @return true if task instance start time after worker server start date
if ( StringUtils . isEmpty ( taskInstance . getHost ( ) ) ) {
* /
return false ;
private boolean checkTaskAfterWorkerStart ( TaskInstance taskInstance ) {
}
if ( StringUtils . isEmpty ( taskInstance . getHost ( ) ) ) {
Date workerServerStartDate = null ;
return false ;
List < Server > workerServers = getServersList ( ZKNodeType . WORKER ) ;
}
for ( Server workerServer : workerServers ) {
Date workerServerStartDate = null ;
if ( taskInstance . getHost ( ) . equals ( workerServer . getHost ( ) + Constants . COLON + workerServer . getPort ( ) ) ) {
List < Server > workerServers = getServersList ( ZKNodeType . WORKER ) ;
workerServerStartDate = workerServer . getCreateTime ( ) ;
for ( Server workerServer : workerServers ) {
break ;
if ( taskInstance . getHost ( ) . equals ( workerServer . getHost ( ) + Constants . COLON + workerServer . getPort ( ) ) ) {
}
workerServerStartDate = workerServer . getCreateTime ( ) ;
}
break ;
}
if ( workerServerStartDate ! = null ) {
}
return taskInstance . getStartTime ( ) . after ( workerServerStartDate ) ;
if ( workerServerStartDate ! = null ) {
} else {
return taskInstance . getStartTime ( ) . after ( workerServerStartDate ) ;
return false ;
}
}
return false ;
}
}
/ * *
/ * *
* failover worker tasks
* failover worker tasks
*
*
* 1 . kill yarn job if there are yarn jobs in tasks .
* 1 . kill yarn job if there are yarn jobs in tasks .
* 2 . change task state from running to need failover .
* 2 . change task state from running to need failover .
* 3 . failover all tasks when workerHost is null
* 3 . failover all tasks when workerHost is null
* @param workerHost worker host
* @param workerHost worker host
* /
* /
/ * *
/ * *
* failover worker tasks
* failover worker tasks
*
* < p >
* 1 . kill yarn job if there are yarn jobs in tasks .
* 1 . kill yarn job if there are yarn jobs in tasks .
* 2 . change task state from running to need failover .
* 2 . change task state from running to need failover .
* 3 . failover all tasks when workerHost is null
* 3 . failover all tasks when workerHost is null
* @param workerHost worker host
*
* @param needCheckWorkerAlive need check worker alive
* @param workerHost worker host
* @throws Exception exception
* @param needCheckWorkerAlive need check worker alive
* /
* @throws Exception exception
private void failoverWorker ( String workerHost , boolean needCheckWorkerAlive ) throws Exception {
* /
logger . info ( "start worker[{}] failover ..." , workerHost ) ;
private void failoverWorker ( String workerHost , boolean needCheckWorkerAlive ) throws Exception {
logger . info ( "start worker[{}] failover ..." , workerHost ) ;
List < TaskInstance > needFailoverTaskInstanceList = processService . queryNeedFailoverTaskInstances ( workerHost ) ;
for ( TaskInstance taskInstance : needFailoverTaskInstanceList ) {
List < TaskInstance > needFailoverTaskInstanceList = processService . queryNeedFailoverTaskInstances ( workerHost ) ;
if ( needCheckWorkerAlive ) {
for ( TaskInstance taskInstance : needFailoverTaskInstanceList ) {
if ( ! checkTaskInstanceNeedFailover ( taskInstance ) ) {
if ( needCheckWorkerAlive ) {
continue ;
if ( ! checkTaskInstanceNeedFailover ( taskInstance ) ) {
}
continue ;
}
}
}
ProcessInstance processInstance = processService . findProcessInstanceDetailById ( taskInstance . getProcessInstanceId ( ) ) ;
if ( processInstance ! = null ) {
ProcessInstance processInstance = processService . findProcessInstanceDetailById ( taskInstance . getProcessInstanceId ( ) ) ;
taskInstance . setProcessInstance ( processInstance ) ;
if ( processInstance ! = null ) {
}
taskInstance . setProcessInstance ( processInstance ) ;
}
TaskExecutionContext taskExecutionContext = TaskExecutionContextBuilder . get ( )
. buildTaskInstanceRelatedInfo ( taskInstance )
TaskExecutionContext taskExecutionContext = TaskExecutionContextBuilder . get ( )
. buildProcessInstanceRelatedInfo ( process Instance )
. buildTaskInstanceRelatedInfo ( task Instance )
. create ( ) ;
. buildProcessInstanceRelatedInfo ( processInstance )
// only kill yarn job if exists , the local thread has exited
. create ( ) ;
ProcessUtils . killYarnJob ( taskExecutionContext ) ;
// only kill yarn job if exists , the local thread has exited
ProcessUtils . killYarnJob ( taskExecutionContext ) ;
taskInstance . setState ( ExecutionStatus . NEED_FAULT_TOLERANCE ) ;
processService . saveTaskInstance ( taskInstance ) ;
taskInstance . setState ( ExecutionStatus . NEED_FAULT_TOLERANCE ) ;
}
processService . saveTaskInstance ( taskInstance ) ;
logger . info ( "end worker[{}] failover ..." , workerHost ) ;
}
}
logger . info ( "end worker[{}] failover ..." , workerHost ) ;
}
/ * *
* failover master tasks
/ * *
*
* failover master tasks
* @param masterHost master host
*
* /
* @param masterHost master host
private void failoverMaster ( String masterHost ) {
* /
logger . info ( "start master failover ..." ) ;
private void failoverMaster ( String masterHost ) {
logger . info ( "start master failover ..." ) ;
List < ProcessInstance > needFailoverProcessInstanceList = processService . queryNeedFailoverProcessInstances ( masterHost ) ;
List < ProcessInstance > needFailoverProcessInstanceList = processService . queryNeedFailoverProcessInstances ( masterHost ) ;
//updateProcessInstance host is null and insert into command
for ( ProcessInstance processInstance : needFailoverProcessInstanceList ) {
//updateProcessInstance host is null and insert into command
if ( Constants . NULL . equals ( processInstance . getHost ( ) ) ) {
for ( ProcessInstance processInstance : needFailoverProcessInstanceList ) {
continue ;
if ( Constants . NULL . equals ( processInstance . getHost ( ) ) ) {
}
continue ;
processService . processNeedFailoverProcessInstances ( processInstance ) ;
}
}
processService . processNeedFailoverProcessInstances ( processInstance ) ;
}
logger . info ( "master failover end" ) ;
}
logger . info ( "master failover end" ) ;
}
public InterProcessMutex blockAcquireMutex ( ) throws Exception {
InterProcessMutex mutex = new InterProcessMutex ( getZkClient ( ) , getMasterLockPath ( ) ) ;
public InterProcessMutex blockAcquireMutex ( ) throws Exception {
mutex . acquire ( ) ;
InterProcessMutex mutex = new InterProcessMutex ( getZkClient ( ) , getMasterLockPath ( ) ) ;
return mutex ;
mutex . acquire ( ) ;
}
return mutex ;
}
}
}