Browse Source

Use percentage to represent memory/cpu usage (#13896)

3.2.0-release
Wenjun Ruan 2 years ago committed by GitHub
parent
commit
61a689aa5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 4
      deploy/kubernetes/dolphinscheduler/values.yaml
  2. 8
      docs/docs/en/architecture/configuration.md
  3. 8
      docs/docs/zh/architecture/configuration.md
  4. 5
      dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java
  5. 1
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java
  6. 2
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java
  7. 1
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java
  8. 69
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java
  9. 16
      dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java
  10. 10
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java
  11. 6
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java
  12. 6
      dolphinscheduler-master/src/main/resources/application.yaml
  13. 16
      dolphinscheduler-standalone-server/src/main/resources/application.yaml
  14. 2
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java
  15. 30
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java
  16. 6
      dolphinscheduler-worker/src/main/resources/application.yaml

4
deploy/kubernetes/dolphinscheduler/values.yaml

@ -310,7 +310,7 @@ master:
MASTER_TASK_COMMIT_RETRYTIMES: "5" MASTER_TASK_COMMIT_RETRYTIMES: "5"
MASTER_TASK_COMMIT_INTERVAL: "1s" MASTER_TASK_COMMIT_INTERVAL: "1s"
MASTER_STATE_WHEEL_INTERVAL: "5s" MASTER_STATE_WHEEL_INTERVAL: "5s"
MASTER_MAX_CPU_LOAD_AVG: "-1" MASTER_MAX_CPU_LOAD_AVG: "1"
MASTER_RESERVED_MEMORY: "0.3" MASTER_RESERVED_MEMORY: "0.3"
MASTER_FAILOVER_INTERVAL: "10m" MASTER_FAILOVER_INTERVAL: "10m"
MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER: "true" MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER: "true"
@ -382,7 +382,7 @@ worker:
storageClassName: "-" storageClassName: "-"
storage: "20Gi" storage: "20Gi"
env: env:
WORKER_MAX_CPU_LOAD_AVG: "-1" WORKER_MAX_CPU_LOAD_AVG: "1"
WORKER_RESERVED_MEMORY: "0.3" WORKER_RESERVED_MEMORY: "0.3"
WORKER_EXEC_THREADS: "100" WORKER_EXEC_THREADS: "100"
WORKER_HEARTBEAT_INTERVAL: "10s" WORKER_HEARTBEAT_INTERVAL: "10s"

8
docs/docs/en/architecture/configuration.md

@ -273,8 +273,8 @@ Location: `master-server/conf/application.yaml`
|master.task-commit-retry-times|5|master commit task retry times| |master.task-commit-retry-times|5|master commit task retry times|
|master.task-commit-interval|1000|master commit task interval, the unit is millisecond| |master.task-commit-interval|1000|master commit task interval, the unit is millisecond|
|master.state-wheel-interval|5|time to check status| |master.state-wheel-interval|5|time to check status|
|master.max-cpu-load-avg|-1|master max CPU load avg, only higher than the system CPU load average, master server can schedule. default value -1: the number of CPU cores * 2| |master.max-cpu-load-avg|1|master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu|
|master.reserved-memory|0.3|master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G| |master.reserved-memory|0.3|master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule.|
|master.failover-interval|10|failover interval, the unit is minute| |master.failover-interval|10|failover interval, the unit is minute|
|master.kill-application-when-task-failover|true|whether to kill yarn/k8s application when failover taskInstance| |master.kill-application-when-task-failover|true|whether to kill yarn/k8s application when failover taskInstance|
|master.registry-disconnect-strategy.strategy|stop|Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting| |master.registry-disconnect-strategy.strategy|stop|Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting|
@ -292,8 +292,8 @@ Location: `worker-server/conf/application.yaml`
|worker.heartbeat-interval|10|worker-service heartbeat interval, the unit is second| |worker.heartbeat-interval|10|worker-service heartbeat interval, the unit is second|
|worker.host-weight|100|worker host weight to dispatch tasks| |worker.host-weight|100|worker host weight to dispatch tasks|
|worker.tenant-auto-create|true|tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true.| |worker.tenant-auto-create|true|tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true.|
|worker.max-cpu-load-avg|-1|worker max CPU load avg, only higher than the system CPU load average, worker server can be dispatched tasks. default value -1: the number of CPU cores * 2| |worker.max-cpu-load-avg|1|worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu.|
|worker.reserved-memory|0.3|worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G| |worker.reserved-memory|0.3|worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task.|
|worker.alert-listen-host|localhost|the alert listen host of worker| |worker.alert-listen-host|localhost|the alert listen host of worker|
|worker.alert-listen-port|50052|the alert listen port of worker| |worker.alert-listen-port|50052|the alert listen port of worker|
|worker.registry-disconnect-strategy.strategy|stop|Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting| |worker.registry-disconnect-strategy.strategy|stop|Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting|

8
docs/docs/zh/architecture/configuration.md

@ -268,8 +268,8 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
|master.task-commit-retry-times|5|任务重试次数| |master.task-commit-retry-times|5|任务重试次数|
|master.task-commit-interval|1000|任务提交间隔,单位为毫秒| |master.task-commit-interval|1000|任务提交间隔,单位为毫秒|
|master.state-wheel-interval|5|轮询检查状态时间| |master.state-wheel-interval|5|轮询检查状态时间|
|master.max-cpu-load-avg|-1|master最大cpuload均值,只有高于系统cpuload均值时,master服务才能调度任务. 默认值为-1: cpu cores * 2| |master.max-cpu-load-avg|1|master最大cpuload均值,只有高于系统cpuload均值时,master服务才能调度任务. 默认值为1: 会使用100%的CPU|
|master.reserved-memory|0.3|master预留内存,只有低于系统可用内存时,master服务才能调度任务,单位为G| |master.reserved-memory|0.3|master预留内存,只有低于系统可用内存时,master服务才能调度任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流|
|master.failover-interval|10|failover间隔,单位为分钟| |master.failover-interval|10|failover间隔,单位为分钟|
|master.kill-application-when-task-failover|true|当任务实例failover时,是否kill掉yarn或k8s application| |master.kill-application-when-task-failover|true|当任务实例failover时,是否kill掉yarn或k8s application|
|master.registry-disconnect-strategy.strategy|stop|当Master与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting| |master.registry-disconnect-strategy.strategy|stop|当Master与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting|
@ -286,8 +286,8 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
|worker.heartbeat-interval|10|worker心跳间隔,单位为秒| |worker.heartbeat-interval|10|worker心跳间隔,单位为秒|
|worker.host-weight|100|派发任务时,worker主机的权重| |worker.host-weight|100|派发任务时,worker主机的权重|
|worker.tenant-auto-create|true|租户对应于系统的用户,由worker提交作业.如果系统没有该用户,则在参数worker.tenant.auto.create为true后自动创建。| |worker.tenant-auto-create|true|租户对应于系统的用户,由worker提交作业.如果系统没有该用户,则在参数worker.tenant.auto.create为true后自动创建。|
|worker.max-cpu-load-avg|-1|worker最大cpuload均值,只有高于系统cpuload均值时,worker服务才能被派发任务. 默认值为-1: cpu cores * 2| |worker.max-cpu-load-avg|1|worker最大cpuload均值,只有高于系统cpuload均值时,worker服务才能被派发任务. 默认值为1: 会使用100%的CPU|
|worker.reserved-memory|0.3|worker预留内存,只有低于系统可用内存时,worker服务才能被派发任务,单位为G| |worker.reserved-memory|0.3|worker预留内存,只有低于系统可用内存时,worker服务才能被派发任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流|
|worker.alert-listen-host|localhost|alert监听host| |worker.alert-listen-host|localhost|alert监听host|
|worker.alert-listen-port|50052|alert监听端口| |worker.alert-listen-port|50052|alert监听端口|
|worker.registry-disconnect-strategy.strategy|stop|当Worker与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting| |worker.registry-disconnect-strategy.strategy|stop|当Worker与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting|

5
dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java

@ -56,9 +56,8 @@ public class AlertHeartbeatTask extends BaseHeartBeatTask<AlertServerHeartBeat>
.processId(processId) .processId(processId)
.startupTime(startupTime) .startupTime(startupTime)
.reportTime(System.currentTimeMillis()) .reportTime(System.currentTimeMillis())
.cpuUsage(OSUtils.cpuUsage()) .cpuUsage(OSUtils.cpuUsagePercentage())
.memoryUsage(OSUtils.memoryUsage()) .memoryUsage(OSUtils.memoryUsagePercentage())
.loadAverage(OSUtils.loadAverage())
.availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize())
.alertServerAddress(alertConfig.getAlertServerAddress()) .alertServerAddress(alertConfig.getAlertServerAddress())
.build(); .build();

1
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java

@ -33,7 +33,6 @@ public class AlertServerHeartBeat implements HeartBeat {
private long reportTime; private long reportTime;
private double cpuUsage; private double cpuUsage;
private double memoryUsage; private double memoryUsage;
private double loadAverage;
private double availablePhysicalMemorySize; private double availablePhysicalMemorySize;
private String alertServerAddress; private String alertServerAddress;
} }

2
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java

@ -32,9 +32,7 @@ public class MasterHeartBeat implements HeartBeat {
private long reportTime; private long reportTime;
private double cpuUsage; private double cpuUsage;
private double memoryUsage; private double memoryUsage;
private double loadAverage;
private double availablePhysicalMemorySize; private double availablePhysicalMemorySize;
private double maxCpuloadAvg;
private double reservedMemory; private double reservedMemory;
private double diskAvailable; private double diskAvailable;
private int processId; private int processId;

1
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java

@ -34,7 +34,6 @@ public class WorkerHeartBeat implements HeartBeat {
private double memoryUsage; private double memoryUsage;
private double loadAverage; private double loadAverage;
private double availablePhysicalMemorySize; private double availablePhysicalMemorySize;
private double maxCpuloadAvg;
private double reservedMemory; private double reservedMemory;
private double diskAvailable; private double diskAvailable;
private int serverStatus; private int serverStatus;

69
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java

@ -34,7 +34,6 @@ import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStreamReader; import java.io.InputStreamReader;
import java.lang.management.ManagementFactory; import java.lang.management.ManagementFactory;
import java.lang.management.OperatingSystemMXBean;
import java.lang.management.RuntimeMXBean; import java.lang.management.RuntimeMXBean;
import java.math.RoundingMode; import java.math.RoundingMode;
import java.text.DecimalFormat; import java.text.DecimalFormat;
@ -58,14 +57,15 @@ public class OSUtils {
/** /**
* return -1 when the function can not get hardware env info * return -1 when the function can not get hardware env info
* e.g {@link OSUtils#loadAverage()} {@link OSUtils#cpuUsage()} * e.g {@link OSUtils#cpuUsagePercentage()}
*/ */
public static final double NEGATIVE_ONE = -1; public static final double NEGATIVE_ONE = -1;
private static final HardwareAbstractionLayer hal = SI.getHardware(); private static final HardwareAbstractionLayer hal = SI.getHardware();
private static long[] prevTicks = new long[CentralProcessor.TickType.values().length]; private static long[] prevTicks = new long[CentralProcessor.TickType.values().length];
private static long prevTickTime = 0L; private static long prevTickTime = 0L;
private static double cpuUsage = 0.0D; private static volatile double cpuUsage = 0.0D;
private static final double TOTAL_MEMORY = hal.getMemory().getTotal() / 1024.0 / 1024 / 1024;
private OSUtils() { private OSUtils() {
throw new UnsupportedOperationException("Construct OSUtils"); throw new UnsupportedOperationException("Construct OSUtils");
@ -77,21 +77,6 @@ public class OSUtils {
*/ */
private static final Pattern PATTERN = Pattern.compile("\\s+"); private static final Pattern PATTERN = Pattern.compile("\\s+");
/**
* get memory usage
* Keep 2 decimal
*
* @return percent %
*/
public static double memoryUsage() {
GlobalMemory memory = hal.getMemory();
double memoryUsage = (memory.getTotal() - memory.getAvailable()) * 1.0 / memory.getTotal();
DecimalFormat df = new DecimalFormat(TWO_DECIMAL);
df.setRoundingMode(RoundingMode.HALF_UP);
return Double.parseDouble(df.format(memoryUsage));
}
/** /**
* get disk usage * get disk usage
* Keep 2 decimal * Keep 2 decimal
@ -125,34 +110,12 @@ public class OSUtils {
return Double.parseDouble(df.format(availablePhysicalMemorySize)); return Double.parseDouble(df.format(availablePhysicalMemorySize));
} }
/**
* load average
*
* @return load average
*/
public static double loadAverage() {
double loadAverage;
try {
OperatingSystemMXBean osBean = ManagementFactory.getPlatformMXBean(OperatingSystemMXBean.class);
loadAverage = osBean.getSystemLoadAverage();
} catch (Exception e) {
log.error("get operation system load average exception, try another method ", e);
loadAverage = hal.getProcessor().getSystemLoadAverage(1)[0];
if (Double.isNaN(loadAverage)) {
return NEGATIVE_ONE;
}
}
DecimalFormat df = new DecimalFormat(TWO_DECIMAL);
df.setRoundingMode(RoundingMode.HALF_UP);
return Double.parseDouble(df.format(loadAverage));
}
/** /**
* get cpu usage * get cpu usage
* *
* @return cpu usage * @return cpu usage
*/ */
public static double cpuUsage() { public static double cpuUsagePercentage() {
CentralProcessor processor = hal.getProcessor(); CentralProcessor processor = hal.getProcessor();
// Check if > ~ 0.95 seconds since last tick count. // Check if > ~ 0.95 seconds since last tick count.
@ -173,6 +136,10 @@ public class OSUtils {
return Double.parseDouble(df.format(cpuUsage)); return Double.parseDouble(df.format(cpuUsage));
} }
public static double memoryUsagePercentage() {
return (TOTAL_MEMORY - availablePhysicalMemorySize()) / TOTAL_MEMORY;
}
public static List<String> getUserList() { public static List<String> getUserList() {
try { try {
if (SystemUtils.IS_OS_MAC) { if (SystemUtils.IS_OS_MAC) {
@ -466,23 +433,25 @@ public class OSUtils {
/** /**
* Check memory and cpu usage is overload the given thredshod. * Check memory and cpu usage is overload the given thredshod.
* *
* @param maxCpuLoadAvg maxCpuLoadAvg * @param maxCpuLoadAvgThreshold maxCpuLoadAvg
* @param reservedMemory reservedMemory * @param reservedMemoryThreshold reservedMemory
* @return True, if the cpu or memory exceed the given thredshod. * @return True, if the cpu or memory exceed the given thredshod.
*/ */
public static Boolean isOverload(double maxCpuLoadAvg, double reservedMemory) { public static Boolean isOverload(double maxCpuLoadAvgThreshold, double reservedMemoryThreshold) {
// system load average // system load average
double loadAverage = loadAverage(); double freeCPUPercentage = 1 - cpuUsagePercentage();
// system available physical memory // system available physical memory
double availablePhysicalMemorySize = availablePhysicalMemorySize(); double freeMemoryPercentage = 1 - memoryUsagePercentage();
if (loadAverage > maxCpuLoadAvg) { if (freeCPUPercentage > maxCpuLoadAvgThreshold) {
log.warn("Current cpu load average {} is too high, max.cpuLoad.avg={}", loadAverage, maxCpuLoadAvg); log.warn("Current cpu load average {} is too high, max.cpuLoad.avg={}", freeCPUPercentage,
maxCpuLoadAvgThreshold);
return true; return true;
} }
if (availablePhysicalMemorySize < reservedMemory) { if (freeMemoryPercentage < reservedMemoryThreshold) {
log.warn( log.warn(
"Current available memory {}G is too low, reserved.memory={}G", maxCpuLoadAvg, reservedMemory); "Current available memory percentage{} is too low, reserved.memory={}", freeMemoryPercentage,
reservedMemoryThreshold);
return true; return true;
} }
return false; return false;

16
dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java

@ -34,13 +34,6 @@ public class OSUtilsTest {
private static Logger logger = LoggerFactory.getLogger(OSUtilsTest.class); private static Logger logger = LoggerFactory.getLogger(OSUtilsTest.class);
@Test
public void memoryUsage() {
double memoryUsage = OSUtils.memoryUsage();
logger.info("memoryUsage : {}", memoryUsage);
Assertions.assertTrue(memoryUsage >= 0.0);
}
@Test @Test
public void diskAvailable() { public void diskAvailable() {
double diskAvailable = OSUtils.diskAvailable(); double diskAvailable = OSUtils.diskAvailable();
@ -48,16 +41,9 @@ public class OSUtilsTest {
Assertions.assertTrue(diskAvailable >= 0.0); Assertions.assertTrue(diskAvailable >= 0.0);
} }
@Test
public void loadAverage() {
double loadAverage = OSUtils.loadAverage();
logger.info("loadAverage : {}", loadAverage);
Assertions.assertTrue(loadAverage >= 0.0);
}
@Test @Test
public void cpuUsage() { public void cpuUsage() {
double cpuUsage = OSUtils.cpuUsage(); double cpuUsage = OSUtils.cpuUsagePercentage();
logger.info("cpuUsage : {}", cpuUsage); logger.info("cpuUsage : {}", cpuUsage);
Assertions.assertTrue(cpuUsage >= 0.0); Assertions.assertTrue(cpuUsage >= 0.0);
} }

10
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java

@ -84,8 +84,8 @@ public class MasterConfig implements Validator {
* state wheel check interval, if this value is bigger, may increase the delay of task/processInstance. * state wheel check interval, if this value is bigger, may increase the delay of task/processInstance.
*/ */
private Duration stateWheelInterval = Duration.ofMillis(5); private Duration stateWheelInterval = Duration.ofMillis(5);
private double maxCpuLoadAvg = -1; private double maxCpuLoadAvg = 1;
private double reservedMemory = 0.3; private double reservedMemory = 0.1;
private Duration failoverInterval = Duration.ofMinutes(10); private Duration failoverInterval = Duration.ofMinutes(10);
private boolean killApplicationWhenTaskFailover = true; private boolean killApplicationWhenTaskFailover = true;
private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties();
@ -137,8 +137,12 @@ public class MasterConfig implements Validator {
errors.rejectValue("failover-interval", null, "should be a valid duration"); errors.rejectValue("failover-interval", null, "should be a valid duration");
} }
if (masterConfig.getMaxCpuLoadAvg() <= 0) { if (masterConfig.getMaxCpuLoadAvg() <= 0) {
masterConfig.setMaxCpuLoadAvg(Runtime.getRuntime().availableProcessors() * 2); masterConfig.setMaxCpuLoadAvg(100);
} }
if (masterConfig.getReservedMemory() <= 0) {
masterConfig.setReservedMemory(100);
}
if (masterConfig.getWorkerGroupRefreshInterval().getSeconds() < 10) { if (masterConfig.getWorkerGroupRefreshInterval().getSeconds() < 10) {
errors.rejectValue("worker-group-refresh-interval", null, "should >= 10s"); errors.rejectValue("worker-group-refresh-interval", null, "should >= 10s");
} }

6
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java

@ -53,12 +53,10 @@ public class MasterHeartBeatTask extends BaseHeartBeatTask<MasterHeartBeat> {
return MasterHeartBeat.builder() return MasterHeartBeat.builder()
.startupTime(ServerLifeCycleManager.getServerStartupTime()) .startupTime(ServerLifeCycleManager.getServerStartupTime())
.reportTime(System.currentTimeMillis()) .reportTime(System.currentTimeMillis())
.cpuUsage(OSUtils.cpuUsage()) .cpuUsage(OSUtils.cpuUsagePercentage())
.loadAverage(OSUtils.loadAverage())
.availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize())
.maxCpuloadAvg(masterConfig.getMaxCpuLoadAvg())
.reservedMemory(masterConfig.getReservedMemory()) .reservedMemory(masterConfig.getReservedMemory())
.memoryUsage(OSUtils.memoryUsage()) .memoryUsage(OSUtils.memoryUsagePercentage())
.diskAvailable(OSUtils.diskAvailable()) .diskAvailable(OSUtils.diskAvailable())
.processId(processId) .processId(processId)
.build(); .build();

6
dolphinscheduler-master/src/main/resources/application.yaml

@ -102,9 +102,9 @@ master:
# master commit task interval # master commit task interval
task-commit-interval: 1s task-commit-interval: 1s
state-wheel-interval: 5s state-wheel-interval: 5s
# master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu
max-cpu-load-avg: -1 max-cpu-load-avg: 1
# master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule.
reserved-memory: 0.3 reserved-memory: 0.3
# failover interval, the unit is minute # failover interval, the unit is minute
failover-interval: 10m failover-interval: 10m

16
dolphinscheduler-standalone-server/src/main/resources/application.yaml

@ -140,10 +140,10 @@ master:
# master commit task interval # master commit task interval
task-commit-interval: 1s task-commit-interval: 1s
state-wheel-interval: 5s state-wheel-interval: 5s
# master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu
max-cpu-load-avg: 80 max-cpu-load-avg: 1
# master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule.
reserved-memory: 0.1 reserved-memory: 0.3
# failover interval # failover interval
failover-interval: 10m failover-interval: 10m
# kill yarn/k8s application when failover taskInstance, default true # kill yarn/k8s application when failover taskInstance, default true
@ -163,10 +163,10 @@ worker:
tenant-auto-create: true tenant-auto-create: true
#Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants. #Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants.
tenant-distributed-user: false tenant-distributed-user: false
# worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value -1: the number of cpu cores * 2 # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu.
max-cpu-load-avg: 80 max-cpu-load-avg: 1
# worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task.
reserved-memory: 0.1 reserved-memory: 0.3
task-execute-threads-full-policy: REJECT task-execute-threads-full-policy: REJECT
alert: alert:

2
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java

@ -47,7 +47,7 @@ public class WorkerConfig implements Validator {
private boolean tenantAutoCreate = true; private boolean tenantAutoCreate = true;
private boolean tenantDistributedUser = false; private boolean tenantDistributedUser = false;
private int maxCpuLoadAvg = -1; private int maxCpuLoadAvg = -1;
private double reservedMemory = 0.3; private double reservedMemory = 0.1;
private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties();
/** /**

30
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java

@ -53,23 +53,21 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask<WorkerHeartBeat> {
@Override @Override
public WorkerHeartBeat getHeartBeat() { public WorkerHeartBeat getHeartBeat() {
double loadAverage = OSUtils.loadAverage(); double cpuUsagePercentage = OSUtils.cpuUsagePercentage();
double cpuUsage = OSUtils.cpuUsage(); int maxCpuUsePercentage = workerConfig.getMaxCpuLoadAvg();
int maxCpuLoadAvg = workerConfig.getMaxCpuLoadAvg();
double reservedMemory = workerConfig.getReservedMemory(); double reservedMemory = workerConfig.getReservedMemory();
double availablePhysicalMemorySize = OSUtils.availablePhysicalMemorySize(); double memoryUsagePercentage = OSUtils.memoryUsagePercentage();
int execThreads = workerConfig.getExecThreads(); int execThreads = workerConfig.getExecThreads();
int serverStatus = getServerStatus(loadAverage, maxCpuLoadAvg, availablePhysicalMemorySize, reservedMemory, int serverStatus =
getServerStatus(cpuUsagePercentage, maxCpuUsePercentage, memoryUsagePercentage, reservedMemory,
execThreads, this.workerWaitingTaskCount.get()); execThreads, this.workerWaitingTaskCount.get());
return WorkerHeartBeat.builder() return WorkerHeartBeat.builder()
.startupTime(ServerLifeCycleManager.getServerStartupTime()) .startupTime(ServerLifeCycleManager.getServerStartupTime())
.reportTime(System.currentTimeMillis()) .reportTime(System.currentTimeMillis())
.cpuUsage(cpuUsage) .cpuUsage(cpuUsagePercentage)
.loadAverage(loadAverage) .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize())
.availablePhysicalMemorySize(availablePhysicalMemorySize) .memoryUsage(OSUtils.memoryUsagePercentage())
.maxCpuloadAvg(maxCpuLoadAvg)
.memoryUsage(OSUtils.memoryUsage())
.reservedMemory(reservedMemory) .reservedMemory(reservedMemory)
.diskAvailable(OSUtils.diskAvailable()) .diskAvailable(OSUtils.diskAvailable())
.processId(processId) .processId(processId)
@ -90,16 +88,16 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask<WorkerHeartBeat> {
workerRegistryPath, workerHeartBeatJson); workerRegistryPath, workerHeartBeatJson);
} }
public int getServerStatus(double loadAverage, public int getServerStatus(double cpuUsagePercentage,
double maxCpuloadAvg, double maxCpuUsePercentage,
double availablePhysicalMemorySize, double memoryUsagePercentage,
double reservedMemory, double reservedMemory,
int workerExecThreadCount, int workerExecThreadCount,
int workerWaitingTaskCount) { int workerWaitingTaskCount) {
if (loadAverage > maxCpuloadAvg || availablePhysicalMemorySize < reservedMemory) { if (cpuUsagePercentage > maxCpuUsePercentage || memoryUsagePercentage < reservedMemory) {
log.warn( log.warn(
"current cpu load average {} is too high or available memory {}G is too low, under max.cpuload.avg={} and reserved.memory={}G", "current cpu load average {} is higher than {} max.cpuload.avg or available memory {} is lower than {} reserved.memory={}",
loadAverage, availablePhysicalMemorySize, maxCpuloadAvg, reservedMemory); cpuUsagePercentage, maxCpuUsePercentage, memoryUsagePercentage, reservedMemory);
return Constants.ABNORMAL_NODE_STATUS; return Constants.ABNORMAL_NODE_STATUS;
} else if (workerWaitingTaskCount > workerExecThreadCount) { } else if (workerWaitingTaskCount > workerExecThreadCount) {
log.warn("current waiting task count {} is large than worker thread count {}, worker is busy", log.warn("current waiting task count {} is large than worker thread count {}, worker is busy",

6
dolphinscheduler-worker/src/main/resources/application.yaml

@ -48,9 +48,9 @@ worker:
tenant-auto-create: true tenant-auto-create: true
#Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants. #Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants.
tenant-distributed-user: false tenant-distributed-user: false
# worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value -1: the number of cpu cores * 2 # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu.
max-cpu-load-avg: -1 max-cpu-load-avg: 1
# worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task.
reserved-memory: 0.3 reserved-memory: 0.3
registry-disconnect-strategy: registry-disconnect-strategy:
# The disconnect strategy: stop, waiting # The disconnect strategy: stop, waiting

Loading…
Cancel
Save