Browse Source

worker fault tolerance modify (#2212)

* 1, master persistent task
2. extract  master and worker communication model

* 1, master persistent task
2. extract  master and worker communication model

* 1, master persistent task
2. extract  master and worker communication model

* add license

* modify javadoc error

* TaskExecutionContext create modify

* buildAckCommand taskInstanceId not set modify

* java doc error modify

* add comment

* ExecutorManager interface add generic type

* add TaskInstanceCacheManager receive Worker report result

* TaskInstance setExecutePath

* add TaskInstanceCacheManager to receive Worker Task result report

* TaskInstanceCacheManager add remove method

* add license

* add dispatcht task method

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* AbstractCommandExecutor remove db access

* taskInstanceCache is null ,need load from db

* taskInstanceCache is null ,need load from db

* taskInstanceCache is null ,need load from db

* 1,worker TaskPros use TaskExecutionContext replase
2,Master kill Task , KillTaskProcessor modify

* worker remove db

* ShellTask modify

* master persistence processId and appIds

* master persistence processId and appIds

* master add kill task logic

* master add kill task logic

* master add kill task logic

* javadoc error modify

* remove chinese log

* executeDirectly method add Override

* remote module modify

* TaskKillResponseProcessor command type modify

* create buildKillCommand

* host add host:port format

* host add host:port format

* TaskAckProcessor modify

* TaskAckProcessor modify

* task prioriry refator

* remove ITaskQueue

* task prioriry refator

* remove ITaskQueue

* TaskPriority refactor

* remove logs

* WorkerServer refactor

* MasterSchedulerService modify

* WorkerConfig listen port modify

* modify master and worker listen port

* cancelTaskInstance set TaskExecutionContext host,logPath,executePath

* cancelTaskInstance set TaskExecutionContext host,logPath,executePath

* Encapsulate the parameters required by sqltask

* 1,Encapsulate the parameters required by sqltask
2,SQLTask optimization

* AbstractTask modify

* ProcedureTask optimization

* MasterSchedulerService modify

* TaskUpdateQueueConsumer modify

* test

* DataxTask process run debug

* DataxTask process run debug

* add protobuf dependency,MR、Spark task etc need this

* TaskUpdateQueueConsumer modify

* TaskExecutionContextBuilder set TaskInstance workgroup

* WorkerGroupService queryAllGroup modify
query available work group

* 1,get workergroup from zk modify
2,SpringConnectionFactory repeat load modify

* master and worker register ip  use OSUtils.getHost()

* ProcessInstance host set ip:port format

* worker fault tolerance modify

* Constants and .env modify

Co-authored-by: qiaozhanwei <qiaozhanwei@analysys.com.cn>
pull/2/head
qiaozhanwei 5 years ago committed by GitHub
parent
commit
055bb28de4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 8
      dolphinscheduler-api/src/main/java/org/apache/dolphinscheduler/api/controller/LoggerController.java
  2. 124
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/Constants.java
  3. 1
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/builder/TaskExecutionContextBuilder.java
  4. 5
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/utils/ProcessUtils.java
  5. 3
      dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java
  6. 42
      dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java
  7. 2
      dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/ZookeeperCachedOperator.java

8
dolphinscheduler-api/src/main/java/org/apache/dolphinscheduler/api/controller/LoggerController.java

@ -60,14 +60,14 @@ public class LoggerController extends BaseController {
*/
@ApiOperation(value = "queryLog", notes= "QUERY_TASK_INSTANCE_LOG_NOTES")
@ApiImplicitParams({
@ApiImplicitParam(name = "taskInstId", value = "TASK_ID", dataType = "Int", example = "100"),
@ApiImplicitParam(name = "taskInstanceId", value = "TASK_ID", dataType = "Int", example = "100"),
@ApiImplicitParam(name = "skipLineNum", value = "SKIP_LINE_NUM", dataType ="Int", example = "100"),
@ApiImplicitParam(name = "limit", value = "LIMIT", dataType ="Int", example = "100")
})
@GetMapping(value = "/detail")
@ResponseStatus(HttpStatus.OK)
public Result queryLog(@ApiIgnore @RequestAttribute(value = Constants.SESSION_USER) User loginUser,
@RequestParam(value = "taskInstId") int taskInstanceId,
@RequestParam(value = "taskInstanceId") int taskInstanceId,
@RequestParam(value = "skipLineNum") int skipNum,
@RequestParam(value = "limit") int limit) {
try {
@ -91,12 +91,12 @@ public class LoggerController extends BaseController {
*/
@ApiOperation(value = "downloadTaskLog", notes= "DOWNLOAD_TASK_INSTANCE_LOG_NOTES")
@ApiImplicitParams({
@ApiImplicitParam(name = "taskInstId", value = "TASK_ID",dataType = "Int", example = "100")
@ApiImplicitParam(name = "taskInstanceId", value = "TASK_ID",dataType = "Int", example = "100")
})
@GetMapping(value = "/download-log")
@ResponseBody
public ResponseEntity downloadTaskLog(@ApiIgnore @RequestAttribute(value = Constants.SESSION_USER) User loginUser,
@RequestParam(value = "taskInstId") int taskInstanceId) {
@RequestParam(value = "taskInstanceId") int taskInstanceId) {
try {
byte[] logBytes = loggerService.getLogBytes(taskInstanceId);
return ResponseEntity

124
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/Constants.java

@ -118,20 +118,15 @@ public final class Constants {
*/
public static final String RES_UPLOAD_STARTUP_TYPE = "res.upload.startup.type";
/**
* zookeeper quorum
*/
public static final String ZOOKEEPER_QUORUM = "zookeeper.quorum";
/**
* MasterServer directory registered in zookeeper
*/
public static final String ZOOKEEPER_DOLPHINSCHEDULER_MASTERS = "/masters";
public static final String ZOOKEEPER_DOLPHINSCHEDULER_MASTERS = "/nodes/masters";
/**
* WorkerServer directory registered in zookeeper
*/
public static final String ZOOKEEPER_DOLPHINSCHEDULER_WORKERS = "/workers";
public static final String ZOOKEEPER_DOLPHINSCHEDULER_WORKERS = "/nodes/worker";
/**
* all servers directory registered in zookeeper
@ -143,10 +138,6 @@ public final class Constants {
*/
public static final String ZOOKEEPER_DOLPHINSCHEDULER_LOCK_MASTERS = "/lock/masters";
/**
* WorkerServer lock directory registered in zookeeper
*/
public static final String ZOOKEEPER_DOLPHINSCHEDULER_LOCK_WORKERS = "/lock/workers";
/**
* MasterServer failover directory registered in zookeeper
@ -163,10 +154,6 @@ public final class Constants {
*/
public static final String ZOOKEEPER_DOLPHINSCHEDULER_LOCK_FAILOVER_STARTUP_MASTERS = "/lock/failover/startup-masters";
/**
* need send warn times when master server or worker server failover
*/
public static final int DOLPHINSCHEDULER_WARN_TIMES_FAILOVER = 3;
/**
* comma ,
@ -203,37 +190,6 @@ public final class Constants {
*/
public static final String EQUAL_SIGN = "=";
/**
* ZOOKEEPER_SESSION_TIMEOUT
*/
public static final String ZOOKEEPER_SESSION_TIMEOUT = "zookeeper.session.timeout";
public static final String ZOOKEEPER_CONNECTION_TIMEOUT = "zookeeper.connection.timeout";
public static final String ZOOKEEPER_RETRY_SLEEP = "zookeeper.retry.sleep";
public static final String ZOOKEEPER_RETRY_BASE_SLEEP = "zookeeper.retry.base.sleep";
public static final String ZOOKEEPER_RETRY_MAX_SLEEP = "zookeeper.retry.max.sleep";
public static final String ZOOKEEPER_RETRY_MAXTIME = "zookeeper.retry.maxtime";
public static final String MASTER_HEARTBEAT_INTERVAL = "master.heartbeat.interval";
public static final String MASTER_EXEC_THREADS = "master.exec.threads";
public static final String MASTER_EXEC_TASK_THREADS = "master.exec.task.number";
public static final String MASTER_COMMIT_RETRY_TIMES = "master.task.commit.retryTimes";
public static final String MASTER_COMMIT_RETRY_INTERVAL = "master.task.commit.interval";
public static final String WORKER_EXEC_THREADS = "worker.exec.threads";
public static final String WORKER_HEARTBEAT_INTERVAL = "worker.heartbeat.interval";
public static final String WORKER_FETCH_TASK_NUM = "worker.fetch.task.num";
public static final String WORKER_MAX_CPULOAD_AVG = "worker.max.cpuload.avg";
@ -244,17 +200,6 @@ public final class Constants {
public static final String MASTER_RESERVED_MEMORY = "master.reserved.memory";
/**
* dolphinscheduler tasks queue
*/
public static final String DOLPHINSCHEDULER_TASKS_QUEUE = "tasks_queue";
/**
* dolphinscheduler need kill tasks queue
*/
public static final String DOLPHINSCHEDULER_TASKS_KILL = "tasks_kill";
public static final String ZOOKEEPER_DOLPHINSCHEDULER_ROOT = "zookeeper.dolphinscheduler.root";
public static final String SCHEDULER_QUEUE_IMPL = "dolphinscheduler.queue.impl";
@ -350,26 +295,6 @@ public final class Constants {
public static final int MAX_TASK_TIMEOUT = 24 * 3600;
/**
* heartbeat threads number
*/
public static final int DEFAUL_WORKER_HEARTBEAT_THREAD_NUM = 1;
/**
* heartbeat interval
*/
public static final int DEFAULT_WORKER_HEARTBEAT_INTERVAL = 60;
/**
* worker fetch task number
*/
public static final int DEFAULT_WORKER_FETCH_TASK_NUM = 1;
/**
* worker execute threads number
*/
public static final int DEFAULT_WORKER_EXEC_THREAD_NUM = 10;
/**
* master cpu load
*/
@ -391,16 +316,6 @@ public final class Constants {
public static final double DEFAULT_WORKER_RESERVED_MEMORY = OSUtils.totalMemorySize() / 10;
/**
* master execute threads number
*/
public static final int DEFAULT_MASTER_EXEC_THREAD_NUM = 100;
/**
* default master concurrent task execute num
*/
public static final int DEFAULT_MASTER_TASK_EXEC_NUM = 20;
/**
* default log cache rows num,output when reach the number
@ -408,33 +323,11 @@ public final class Constants {
public static final int DEFAULT_LOG_ROWS_NUM = 4 * 16;
/**
* log flush intervaloutput when reach the interval
* log flush interval?output when reach the interval
*/
public static final int DEFAULT_LOG_FLUSH_INTERVAL = 1000;
/**
* default master heartbeat thread number
*/
public static final int DEFAULT_MASTER_HEARTBEAT_THREAD_NUM = 1;
/**
* default master heartbeat interval
*/
public static final int DEFAULT_MASTER_HEARTBEAT_INTERVAL = 60;
/**
* default master commit retry times
*/
public static final int DEFAULT_MASTER_COMMIT_RETRY_TIMES = 5;
/**
* default master commit retry interval
*/
public static final int DEFAULT_MASTER_COMMIT_RETRY_INTERVAL = 3000;
/**
* time unit secong to minutes
*/
@ -805,7 +698,6 @@ public final class Constants {
public static final String ALIAS = "alias";
public static final String CONTENT = "content";
public static final String DEPENDENT_SPLIT = ":||";
public static final String DEPENDENT_ALL = "ALL";
/**
@ -864,7 +756,7 @@ public final class Constants {
*/
public static final String HIVE_CONF = "hiveconf:";
//flink 任务
//flink ??
public static final String FLINK_YARN_CLUSTER = "yarn-cluster";
public static final String FLINK_RUN_MODE = "-m";
public static final String FLINK_YARN_SLOT = "-ys";
@ -899,26 +791,20 @@ public final class Constants {
/**
* data total
* 数据总数
*/
public static final String COUNT = "count";
/**
* page size
* 每页数据条数
*/
public static final String PAGE_SIZE = "pageSize";
/**
* current page no
* 当前页码
*/
public static final String PAGE_NUMBER = "pageNo";
/**
* result
*/
public static final String RESULT = "result";
/**
*

1
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/builder/TaskExecutionContextBuilder.java

@ -49,6 +49,7 @@ public class TaskExecutionContextBuilder {
taskExecutionContext.setExecutePath(taskInstance.getExecutePath());
taskExecutionContext.setTaskJson(taskInstance.getTaskJson());
taskExecutionContext.setWorkerGroup(taskInstance.getWorkerGroup());
taskExecutionContext.setHost(taskInstance.getHost());
return this;
}

5
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/utils/ProcessUtils.java

@ -22,6 +22,7 @@ import org.apache.dolphinscheduler.common.utils.LoggerUtils;
import org.apache.dolphinscheduler.common.utils.OSUtils;
import org.apache.dolphinscheduler.common.utils.StringUtils;
import org.apache.commons.io.FileUtils;
import org.apache.dolphinscheduler.remote.utils.Host;
import org.apache.dolphinscheduler.server.entity.TaskExecutionContext;
import org.apache.dolphinscheduler.service.log.LogClientService;
import org.slf4j.Logger;
@ -379,7 +380,9 @@ public class ProcessUtils {
String log = null;
try {
logClient = new LogClientService();
log = logClient.viewLog(taskExecutionContext.getHost(), Constants.RPC_PORT, taskExecutionContext.getLogPath());
log = logClient.viewLog(Host.of(taskExecutionContext.getHost()).getIp(),
Constants.RPC_PORT,
taskExecutionContext.getLogPath());
} finally {
if(logClient != null){
logClient.close();

3
dolphinscheduler-server/src/main/java/org/apache/dolphinscheduler/server/zk/ZKMasterClient.java

@ -71,7 +71,7 @@ public class ZKMasterClient extends AbstractZKClient {
// init system znode
this.initSystemZNode();
// check if fault tolerance is requiredfailure and tolerance
// check if fault tolerance is required?failure and tolerance
if (getActiveMasterNum() == 1) {
failoverWorker(null, true);
failoverMaster(null);
@ -304,7 +304,6 @@ public class ZKMasterClient extends AbstractZKClient {
TaskExecutionContext taskExecutionContext = TaskExecutionContextBuilder.get()
.buildTaskInstanceRelatedInfo(taskInstance)
.buildProcessInstanceRelatedInfo(processInstance)
.buildProcessDefinitionRelatedInfo(null)
.create();
// only kill yarn job if exists , the local thread has exited
ProcessUtils.killYarnJob(taskExecutionContext);

42
dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/AbstractZKClient.java

@ -38,29 +38,13 @@ public abstract class AbstractZKClient extends ZookeeperCachedOperator {
private static final Logger logger = LoggerFactory.getLogger(AbstractZKClient.class);
/**
* check dead server or not , if dead, stop self
*
* @param zNode node path
* @param serverType master or worker prefix
* @return true if not exists
* @throws Exception errors
* remove dead server by host
* @param host host
* @param serverType serverType
* @throws Exception
*/
protected boolean checkIsDeadServer(String zNode, String serverType) throws Exception{
//ip_sequenceno
String[] zNodesPath = zNode.split("\\/");
String ipSeqNo = zNodesPath[zNodesPath.length - 1];
String type = serverType.equals(MASTER_PREFIX) ? MASTER_PREFIX : WORKER_PREFIX;
String deadServerPath = getDeadZNodeParentPath() + SINGLE_SLASH + type + UNDERLINE + ipSeqNo;
if(!isExisted(zNode) || isExisted(deadServerPath)){
return true;
}
return false;
}
public void removeDeadServerByHost(String host, String serverType) throws Exception {
List<String> deadServers = super.getChildrenKeys(getDeadZNodeParentPath());
for(String serverPath : deadServers){
@ -72,22 +56,6 @@ public abstract class AbstractZKClient extends ZookeeperCachedOperator {
}
}
/**
* create zookeeper path according the zk node type.
* @param zkNodeType zookeeper node type
* @return register zookeeper path
* @throws Exception
*/
private String createZNodePath(ZKNodeType zkNodeType, String host) throws Exception {
// specify the format of stored data in ZK nodes
String heartbeatZKInfo = ResInfo.getHeartBeatInfo(new Date());
// create temporary sequence nodes for master znode
String registerPath= getZNodeParentPath(zkNodeType) + SINGLE_SLASH + host;
super.persistEphemeral(registerPath, heartbeatZKInfo);
logger.info("register {} node {} success" , zkNodeType.toString(), registerPath);
return registerPath;
}
/**
* opType(add): if find dead server , then add to zk deadServerPath

2
dolphinscheduler-service/src/main/java/org/apache/dolphinscheduler/service/zk/ZookeeperCachedOperator.java

@ -39,7 +39,7 @@ public class ZookeeperCachedOperator extends ZookeeperOperator {
*/
@Override
protected void registerListener() {
treeCache = new TreeCache(zkClient, getZookeeperConfig().getDsRoot());
treeCache = new TreeCache(zkClient, getZookeeperConfig().getDsRoot() + "/nodes");
logger.info("add listener to zk path: {}", getZookeeperConfig().getDsRoot());
try {
treeCache.start();

Loading…
Cancel
Save