From 61a689aa5aaea5cbb29235dfaaa629804f4b4464 Mon Sep 17 00:00:00 2001 From: Wenjun Ruan Date: Thu, 13 Apr 2023 15:48:20 +0800 Subject: [PATCH] Use percentage to represent memory/cpu usage (#13896) --- .../kubernetes/dolphinscheduler/values.yaml | 4 +- docs/docs/en/architecture/configuration.md | 8 +-- docs/docs/zh/architecture/configuration.md | 8 +-- .../alert/registry/AlertHeartbeatTask.java | 5 +- .../common/model/AlertServerHeartBeat.java | 1 - .../common/model/MasterHeartBeat.java | 2 - .../common/model/WorkerHeartBeat.java | 1 - .../common/utils/OSUtils.java | 69 +++++-------------- .../common/os/OSUtilsTest.java | 16 +---- .../server/master/config/MasterConfig.java | 10 ++- .../master/task/MasterHeartBeatTask.java | 6 +- .../src/main/resources/application.yaml | 6 +- .../src/main/resources/application.yaml | 16 ++--- .../server/worker/config/WorkerConfig.java | 2 +- .../worker/task/WorkerHeartBeatTask.java | 32 ++++----- .../src/main/resources/application.yaml | 6 +- 16 files changed, 71 insertions(+), 121 deletions(-) diff --git a/deploy/kubernetes/dolphinscheduler/values.yaml b/deploy/kubernetes/dolphinscheduler/values.yaml index 73d739adc2..6f163b589b 100644 --- a/deploy/kubernetes/dolphinscheduler/values.yaml +++ b/deploy/kubernetes/dolphinscheduler/values.yaml @@ -310,7 +310,7 @@ master: MASTER_TASK_COMMIT_RETRYTIMES: "5" MASTER_TASK_COMMIT_INTERVAL: "1s" MASTER_STATE_WHEEL_INTERVAL: "5s" - MASTER_MAX_CPU_LOAD_AVG: "-1" + MASTER_MAX_CPU_LOAD_AVG: "1" MASTER_RESERVED_MEMORY: "0.3" MASTER_FAILOVER_INTERVAL: "10m" MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER: "true" @@ -382,7 +382,7 @@ worker: storageClassName: "-" storage: "20Gi" env: - WORKER_MAX_CPU_LOAD_AVG: "-1" + WORKER_MAX_CPU_LOAD_AVG: "1" WORKER_RESERVED_MEMORY: "0.3" WORKER_EXEC_THREADS: "100" WORKER_HEARTBEAT_INTERVAL: "10s" diff --git a/docs/docs/en/architecture/configuration.md b/docs/docs/en/architecture/configuration.md index a03e7d6b8b..4a03a58139 100644 --- a/docs/docs/en/architecture/configuration.md +++ b/docs/docs/en/architecture/configuration.md @@ -273,8 +273,8 @@ Location: `master-server/conf/application.yaml` |master.task-commit-retry-times|5|master commit task retry times| |master.task-commit-interval|1000|master commit task interval, the unit is millisecond| |master.state-wheel-interval|5|time to check status| -|master.max-cpu-load-avg|-1|master max CPU load avg, only higher than the system CPU load average, master server can schedule. default value -1: the number of CPU cores * 2| -|master.reserved-memory|0.3|master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G| +|master.max-cpu-load-avg|1|master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu| +|master.reserved-memory|0.3|master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule.| |master.failover-interval|10|failover interval, the unit is minute| |master.kill-application-when-task-failover|true|whether to kill yarn/k8s application when failover taskInstance| |master.registry-disconnect-strategy.strategy|stop|Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting| @@ -292,8 +292,8 @@ Location: `worker-server/conf/application.yaml` |worker.heartbeat-interval|10|worker-service heartbeat interval, the unit is second| |worker.host-weight|100|worker host weight to dispatch tasks| |worker.tenant-auto-create|true|tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true.| -|worker.max-cpu-load-avg|-1|worker max CPU load avg, only higher than the system CPU load average, worker server can be dispatched tasks. default value -1: the number of CPU cores * 2| -|worker.reserved-memory|0.3|worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G| +|worker.max-cpu-load-avg|1|worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu.| +|worker.reserved-memory|0.3|worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task.| |worker.alert-listen-host|localhost|the alert listen host of worker| |worker.alert-listen-port|50052|the alert listen port of worker| |worker.registry-disconnect-strategy.strategy|stop|Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting| diff --git a/docs/docs/zh/architecture/configuration.md b/docs/docs/zh/architecture/configuration.md index d61f3a1834..9ae5e7bfab 100644 --- a/docs/docs/zh/architecture/configuration.md +++ b/docs/docs/zh/architecture/configuration.md @@ -268,8 +268,8 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId |master.task-commit-retry-times|5|任务重试次数| |master.task-commit-interval|1000|任务提交间隔,单位为毫秒| |master.state-wheel-interval|5|轮询检查状态时间| -|master.max-cpu-load-avg|-1|master最大cpuload均值,只有高于系统cpuload均值时,master服务才能调度任务. 默认值为-1: cpu cores * 2| -|master.reserved-memory|0.3|master预留内存,只有低于系统可用内存时,master服务才能调度任务,单位为G| +|master.max-cpu-load-avg|1|master最大cpuload均值,只有高于系统cpuload均值时,master服务才能调度任务. 默认值为1: 会使用100%的CPU| +|master.reserved-memory|0.3|master预留内存,只有低于系统可用内存时,master服务才能调度任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流| |master.failover-interval|10|failover间隔,单位为分钟| |master.kill-application-when-task-failover|true|当任务实例failover时,是否kill掉yarn或k8s application| |master.registry-disconnect-strategy.strategy|stop|当Master与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting| @@ -286,8 +286,8 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId |worker.heartbeat-interval|10|worker心跳间隔,单位为秒| |worker.host-weight|100|派发任务时,worker主机的权重| |worker.tenant-auto-create|true|租户对应于系统的用户,由worker提交作业.如果系统没有该用户,则在参数worker.tenant.auto.create为true后自动创建。| -|worker.max-cpu-load-avg|-1|worker最大cpuload均值,只有高于系统cpuload均值时,worker服务才能被派发任务. 默认值为-1: cpu cores * 2| -|worker.reserved-memory|0.3|worker预留内存,只有低于系统可用内存时,worker服务才能被派发任务,单位为G| +|worker.max-cpu-load-avg|1|worker最大cpuload均值,只有高于系统cpuload均值时,worker服务才能被派发任务. 默认值为1: 会使用100%的CPU| +|worker.reserved-memory|0.3|worker预留内存,只有低于系统可用内存时,worker服务才能被派发任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流| |worker.alert-listen-host|localhost|alert监听host| |worker.alert-listen-port|50052|alert监听端口| |worker.registry-disconnect-strategy.strategy|stop|当Worker与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting| diff --git a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java index a5a0056d84..31b30ae42b 100644 --- a/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java +++ b/dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java @@ -56,9 +56,8 @@ public class AlertHeartbeatTask extends BaseHeartBeatTask .processId(processId) .startupTime(startupTime) .reportTime(System.currentTimeMillis()) - .cpuUsage(OSUtils.cpuUsage()) - .memoryUsage(OSUtils.memoryUsage()) - .loadAverage(OSUtils.loadAverage()) + .cpuUsage(OSUtils.cpuUsagePercentage()) + .memoryUsage(OSUtils.memoryUsagePercentage()) .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) .alertServerAddress(alertConfig.getAlertServerAddress()) .build(); diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java index a89c903187..4d3610b7fe 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java @@ -33,7 +33,6 @@ public class AlertServerHeartBeat implements HeartBeat { private long reportTime; private double cpuUsage; private double memoryUsage; - private double loadAverage; private double availablePhysicalMemorySize; private String alertServerAddress; } diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java index e9552f507d..b6086b9bbe 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java @@ -32,9 +32,7 @@ public class MasterHeartBeat implements HeartBeat { private long reportTime; private double cpuUsage; private double memoryUsage; - private double loadAverage; private double availablePhysicalMemorySize; - private double maxCpuloadAvg; private double reservedMemory; private double diskAvailable; private int processId; diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java index 4bc5c044bc..dcacd75c6c 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java @@ -34,7 +34,6 @@ public class WorkerHeartBeat implements HeartBeat { private double memoryUsage; private double loadAverage; private double availablePhysicalMemorySize; - private double maxCpuloadAvg; private double reservedMemory; private double diskAvailable; private int serverStatus; diff --git a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java index e95886ba48..678abff0c6 100644 --- a/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java +++ b/dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/utils/OSUtils.java @@ -34,7 +34,6 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStreamReader; import java.lang.management.ManagementFactory; -import java.lang.management.OperatingSystemMXBean; import java.lang.management.RuntimeMXBean; import java.math.RoundingMode; import java.text.DecimalFormat; @@ -58,14 +57,15 @@ public class OSUtils { /** * return -1 when the function can not get hardware env info - * e.g {@link OSUtils#loadAverage()} {@link OSUtils#cpuUsage()} + * e.g {@link OSUtils#cpuUsagePercentage()} */ public static final double NEGATIVE_ONE = -1; private static final HardwareAbstractionLayer hal = SI.getHardware(); private static long[] prevTicks = new long[CentralProcessor.TickType.values().length]; private static long prevTickTime = 0L; - private static double cpuUsage = 0.0D; + private static volatile double cpuUsage = 0.0D; + private static final double TOTAL_MEMORY = hal.getMemory().getTotal() / 1024.0 / 1024 / 1024; private OSUtils() { throw new UnsupportedOperationException("Construct OSUtils"); @@ -77,21 +77,6 @@ public class OSUtils { */ private static final Pattern PATTERN = Pattern.compile("\\s+"); - /** - * get memory usage - * Keep 2 decimal - * - * @return percent % - */ - public static double memoryUsage() { - GlobalMemory memory = hal.getMemory(); - double memoryUsage = (memory.getTotal() - memory.getAvailable()) * 1.0 / memory.getTotal(); - - DecimalFormat df = new DecimalFormat(TWO_DECIMAL); - df.setRoundingMode(RoundingMode.HALF_UP); - return Double.parseDouble(df.format(memoryUsage)); - } - /** * get disk usage * Keep 2 decimal @@ -125,34 +110,12 @@ public class OSUtils { return Double.parseDouble(df.format(availablePhysicalMemorySize)); } - /** - * load average - * - * @return load average - */ - public static double loadAverage() { - double loadAverage; - try { - OperatingSystemMXBean osBean = ManagementFactory.getPlatformMXBean(OperatingSystemMXBean.class); - loadAverage = osBean.getSystemLoadAverage(); - } catch (Exception e) { - log.error("get operation system load average exception, try another method ", e); - loadAverage = hal.getProcessor().getSystemLoadAverage(1)[0]; - if (Double.isNaN(loadAverage)) { - return NEGATIVE_ONE; - } - } - DecimalFormat df = new DecimalFormat(TWO_DECIMAL); - df.setRoundingMode(RoundingMode.HALF_UP); - return Double.parseDouble(df.format(loadAverage)); - } - /** * get cpu usage * * @return cpu usage */ - public static double cpuUsage() { + public static double cpuUsagePercentage() { CentralProcessor processor = hal.getProcessor(); // Check if > ~ 0.95 seconds since last tick count. @@ -173,6 +136,10 @@ public class OSUtils { return Double.parseDouble(df.format(cpuUsage)); } + public static double memoryUsagePercentage() { + return (TOTAL_MEMORY - availablePhysicalMemorySize()) / TOTAL_MEMORY; + } + public static List getUserList() { try { if (SystemUtils.IS_OS_MAC) { @@ -466,23 +433,25 @@ public class OSUtils { /** * Check memory and cpu usage is overload the given thredshod. * - * @param maxCpuLoadAvg maxCpuLoadAvg - * @param reservedMemory reservedMemory + * @param maxCpuLoadAvgThreshold maxCpuLoadAvg + * @param reservedMemoryThreshold reservedMemory * @return True, if the cpu or memory exceed the given thredshod. */ - public static Boolean isOverload(double maxCpuLoadAvg, double reservedMemory) { + public static Boolean isOverload(double maxCpuLoadAvgThreshold, double reservedMemoryThreshold) { // system load average - double loadAverage = loadAverage(); + double freeCPUPercentage = 1 - cpuUsagePercentage(); // system available physical memory - double availablePhysicalMemorySize = availablePhysicalMemorySize(); - if (loadAverage > maxCpuLoadAvg) { - log.warn("Current cpu load average {} is too high, max.cpuLoad.avg={}", loadAverage, maxCpuLoadAvg); + double freeMemoryPercentage = 1 - memoryUsagePercentage(); + if (freeCPUPercentage > maxCpuLoadAvgThreshold) { + log.warn("Current cpu load average {} is too high, max.cpuLoad.avg={}", freeCPUPercentage, + maxCpuLoadAvgThreshold); return true; } - if (availablePhysicalMemorySize < reservedMemory) { + if (freeMemoryPercentage < reservedMemoryThreshold) { log.warn( - "Current available memory {}G is too low, reserved.memory={}G", maxCpuLoadAvg, reservedMemory); + "Current available memory percentage{} is too low, reserved.memory={}", freeMemoryPercentage, + reservedMemoryThreshold); return true; } return false; diff --git a/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java b/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java index 561f2a73f2..4c37e99dc1 100644 --- a/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java +++ b/dolphinscheduler-common/src/test/java/org/apache/dolphinscheduler/common/os/OSUtilsTest.java @@ -34,13 +34,6 @@ public class OSUtilsTest { private static Logger logger = LoggerFactory.getLogger(OSUtilsTest.class); - @Test - public void memoryUsage() { - double memoryUsage = OSUtils.memoryUsage(); - logger.info("memoryUsage : {}", memoryUsage); - Assertions.assertTrue(memoryUsage >= 0.0); - } - @Test public void diskAvailable() { double diskAvailable = OSUtils.diskAvailable(); @@ -48,16 +41,9 @@ public class OSUtilsTest { Assertions.assertTrue(diskAvailable >= 0.0); } - @Test - public void loadAverage() { - double loadAverage = OSUtils.loadAverage(); - logger.info("loadAverage : {}", loadAverage); - Assertions.assertTrue(loadAverage >= 0.0); - } - @Test public void cpuUsage() { - double cpuUsage = OSUtils.cpuUsage(); + double cpuUsage = OSUtils.cpuUsagePercentage(); logger.info("cpuUsage : {}", cpuUsage); Assertions.assertTrue(cpuUsage >= 0.0); } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java index 5cbdd4d310..b878b2a4fe 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterConfig.java @@ -84,8 +84,8 @@ public class MasterConfig implements Validator { * state wheel check interval, if this value is bigger, may increase the delay of task/processInstance. */ private Duration stateWheelInterval = Duration.ofMillis(5); - private double maxCpuLoadAvg = -1; - private double reservedMemory = 0.3; + private double maxCpuLoadAvg = 1; + private double reservedMemory = 0.1; private Duration failoverInterval = Duration.ofMinutes(10); private boolean killApplicationWhenTaskFailover = true; private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); @@ -137,8 +137,12 @@ public class MasterConfig implements Validator { errors.rejectValue("failover-interval", null, "should be a valid duration"); } if (masterConfig.getMaxCpuLoadAvg() <= 0) { - masterConfig.setMaxCpuLoadAvg(Runtime.getRuntime().availableProcessors() * 2); + masterConfig.setMaxCpuLoadAvg(100); } + if (masterConfig.getReservedMemory() <= 0) { + masterConfig.setReservedMemory(100); + } + if (masterConfig.getWorkerGroupRefreshInterval().getSeconds() < 10) { errors.rejectValue("worker-group-refresh-interval", null, "should >= 10s"); } diff --git a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java index cacad78c3b..a28b026192 100644 --- a/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java +++ b/dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java @@ -53,12 +53,10 @@ public class MasterHeartBeatTask extends BaseHeartBeatTask { return MasterHeartBeat.builder() .startupTime(ServerLifeCycleManager.getServerStartupTime()) .reportTime(System.currentTimeMillis()) - .cpuUsage(OSUtils.cpuUsage()) - .loadAverage(OSUtils.loadAverage()) + .cpuUsage(OSUtils.cpuUsagePercentage()) .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) - .maxCpuloadAvg(masterConfig.getMaxCpuLoadAvg()) .reservedMemory(masterConfig.getReservedMemory()) - .memoryUsage(OSUtils.memoryUsage()) + .memoryUsage(OSUtils.memoryUsagePercentage()) .diskAvailable(OSUtils.diskAvailable()) .processId(processId) .build(); diff --git a/dolphinscheduler-master/src/main/resources/application.yaml b/dolphinscheduler-master/src/main/resources/application.yaml index e333b566b6..bab902fb59 100644 --- a/dolphinscheduler-master/src/main/resources/application.yaml +++ b/dolphinscheduler-master/src/main/resources/application.yaml @@ -102,9 +102,9 @@ master: # master commit task interval task-commit-interval: 1s state-wheel-interval: 5s - # master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 - max-cpu-load-avg: -1 - # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G + # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu + max-cpu-load-avg: 1 + # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule. reserved-memory: 0.3 # failover interval, the unit is minute failover-interval: 10m diff --git a/dolphinscheduler-standalone-server/src/main/resources/application.yaml b/dolphinscheduler-standalone-server/src/main/resources/application.yaml index 404b30ad69..02666512b5 100644 --- a/dolphinscheduler-standalone-server/src/main/resources/application.yaml +++ b/dolphinscheduler-standalone-server/src/main/resources/application.yaml @@ -140,10 +140,10 @@ master: # master commit task interval task-commit-interval: 1s state-wheel-interval: 5s - # master max cpuload avg, only higher than the system cpu load average, master server can schedule. default value -1: the number of cpu cores * 2 - max-cpu-load-avg: 80 - # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, the unit is G - reserved-memory: 0.1 + # master max cpuload avg percentage, only higher than the system cpu load average, master server can schedule. default value 1: will use 100% cpu + max-cpu-load-avg: 1 + # master reserved memory, only lower than system available memory, master server can schedule. default value 0.3, only the available memory is higher than 30%, master server can schedule. + reserved-memory: 0.3 # failover interval failover-interval: 10m # kill yarn/k8s application when failover taskInstance, default true @@ -163,10 +163,10 @@ worker: tenant-auto-create: true #Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants. tenant-distributed-user: false - # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value -1: the number of cpu cores * 2 - max-cpu-load-avg: 80 - # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G - reserved-memory: 0.1 + # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu. + max-cpu-load-avg: 1 + # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task. + reserved-memory: 0.3 task-execute-threads-full-policy: REJECT alert: diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java index 7154dc3878..211c4691b4 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerConfig.java @@ -47,7 +47,7 @@ public class WorkerConfig implements Validator { private boolean tenantAutoCreate = true; private boolean tenantDistributedUser = false; private int maxCpuLoadAvg = -1; - private double reservedMemory = 0.3; + private double reservedMemory = 0.1; private ConnectStrategyProperties registryDisconnectStrategy = new ConnectStrategyProperties(); /** diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java index d1a732f0e1..3a52ea5ff9 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java @@ -53,23 +53,21 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask { @Override public WorkerHeartBeat getHeartBeat() { - double loadAverage = OSUtils.loadAverage(); - double cpuUsage = OSUtils.cpuUsage(); - int maxCpuLoadAvg = workerConfig.getMaxCpuLoadAvg(); + double cpuUsagePercentage = OSUtils.cpuUsagePercentage(); + int maxCpuUsePercentage = workerConfig.getMaxCpuLoadAvg(); double reservedMemory = workerConfig.getReservedMemory(); - double availablePhysicalMemorySize = OSUtils.availablePhysicalMemorySize(); + double memoryUsagePercentage = OSUtils.memoryUsagePercentage(); int execThreads = workerConfig.getExecThreads(); - int serverStatus = getServerStatus(loadAverage, maxCpuLoadAvg, availablePhysicalMemorySize, reservedMemory, - execThreads, this.workerWaitingTaskCount.get()); + int serverStatus = + getServerStatus(cpuUsagePercentage, maxCpuUsePercentage, memoryUsagePercentage, reservedMemory, + execThreads, this.workerWaitingTaskCount.get()); return WorkerHeartBeat.builder() .startupTime(ServerLifeCycleManager.getServerStartupTime()) .reportTime(System.currentTimeMillis()) - .cpuUsage(cpuUsage) - .loadAverage(loadAverage) - .availablePhysicalMemorySize(availablePhysicalMemorySize) - .maxCpuloadAvg(maxCpuLoadAvg) - .memoryUsage(OSUtils.memoryUsage()) + .cpuUsage(cpuUsagePercentage) + .availablePhysicalMemorySize(OSUtils.availablePhysicalMemorySize()) + .memoryUsage(OSUtils.memoryUsagePercentage()) .reservedMemory(reservedMemory) .diskAvailable(OSUtils.diskAvailable()) .processId(processId) @@ -90,16 +88,16 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask { workerRegistryPath, workerHeartBeatJson); } - public int getServerStatus(double loadAverage, - double maxCpuloadAvg, - double availablePhysicalMemorySize, + public int getServerStatus(double cpuUsagePercentage, + double maxCpuUsePercentage, + double memoryUsagePercentage, double reservedMemory, int workerExecThreadCount, int workerWaitingTaskCount) { - if (loadAverage > maxCpuloadAvg || availablePhysicalMemorySize < reservedMemory) { + if (cpuUsagePercentage > maxCpuUsePercentage || memoryUsagePercentage < reservedMemory) { log.warn( - "current cpu load average {} is too high or available memory {}G is too low, under max.cpuload.avg={} and reserved.memory={}G", - loadAverage, availablePhysicalMemorySize, maxCpuloadAvg, reservedMemory); + "current cpu load average {} is higher than {} max.cpuload.avg or available memory {} is lower than {} reserved.memory={}", + cpuUsagePercentage, maxCpuUsePercentage, memoryUsagePercentage, reservedMemory); return Constants.ABNORMAL_NODE_STATUS; } else if (workerWaitingTaskCount > workerExecThreadCount) { log.warn("current waiting task count {} is large than worker thread count {}, worker is busy", diff --git a/dolphinscheduler-worker/src/main/resources/application.yaml b/dolphinscheduler-worker/src/main/resources/application.yaml index 21fbf4c293..d8587d48f0 100644 --- a/dolphinscheduler-worker/src/main/resources/application.yaml +++ b/dolphinscheduler-worker/src/main/resources/application.yaml @@ -48,9 +48,9 @@ worker: tenant-auto-create: true #Scenes to be used for distributed users.For example,users created by FreeIpa are stored in LDAP.This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants. tenant-distributed-user: false - # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value -1: the number of cpu cores * 2 - max-cpu-load-avg: -1 - # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, the unit is G + # worker max cpuload avg, only higher than the system cpu load average, worker server can be dispatched tasks. default value 1: will use 100% cpu. + max-cpu-load-avg: 1 + # worker reserved memory, only lower than system available memory, worker server can be dispatched tasks. default value 0.3, only the available memory is higher than 30%, worker server can receive task. reserved-memory: 0.3 registry-disconnect-strategy: # The disconnect strategy: stop, waiting