Browse Source

Split cpuUsage to systemCpuUsage and jvmCpuUsage (#15803)

3.2.2-release-bak
Wenjun Ruan 8 months ago committed by GitHub
parent
commit
66df5d4b90
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 8
      deploy/kubernetes/dolphinscheduler/README.md
  2. 16
      deploy/kubernetes/dolphinscheduler/values.yaml
  3. 208
      docs/docs/en/architecture/configuration.md
  4. 112
      docs/docs/zh/architecture/configuration.md
  5. 3
      dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java
  6. 28
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java
  7. 46
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/BaseHeartBeat.java
  8. 4
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/HeartBeat.java
  9. 28
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java
  10. 29
      dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java
  11. 2
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/MasterServer.java
  12. 50
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtection.java
  13. 2
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/metrics/MasterServerMetrics.java
  14. 3
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java
  15. 4
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/ServerNodeManager.java
  16. 3
      dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java
  17. 8
      dolphinscheduler-master/src/main/resources/application.yaml
  18. 25
      dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/config/MasterConfigTest.java
  19. 3
      dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtectionTest.java
  20. 164
      dolphinscheduler-master/src/test/resources/application.yaml
  21. 6
      dolphinscheduler-master/src/test/resources/logback.xml
  22. 67
      dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/BaseServerLoadProtection.java
  23. 11
      dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/DefaultMetricsProvider.java
  24. 24
      dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/ServerLoadProtection.java
  25. 3
      dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/SystemMetrics.java
  26. 16
      dolphinscheduler-standalone-server/src/main/resources/application.yaml
  27. 2
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java
  28. 50
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java
  29. 3
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java
  30. 8
      dolphinscheduler-worker/src/main/resources/application.yaml
  31. 3
      dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtectionTest.java

8
deploy/kubernetes/dolphinscheduler/README.md

@ -200,9 +200,9 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst
| master.env.MASTER_KILL_APPLICATION_WHEN_HANDLE_FAILOVER | string | `"true"` | Master kill application when handle failover |
| master.env.MASTER_MAX_HEARTBEAT_INTERVAL | string | `"10s"` | Master max heartbeat interval |
| master.env.MASTER_SERVER_LOAD_PROTECTION_ENABLED | bool | `false` | If set true, will open master overload protection |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max jvm cpu usage, when the master's jvm cpu usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max system cpu usage, when the master's system cpu usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow. |
| master.env.MASTER_STATE_WHEEL_INTERVAL | string | `"5s"` | master state wheel interval, the unit is second |
| master.env.MASTER_TASK_COMMIT_INTERVAL | string | `"1s"` | master commit task interval, the unit is second |
@ -301,9 +301,9 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst
| worker.env.WORKER_HOST_WEIGHT | string | `"100"` | Worker host weight to dispatch tasks |
| worker.env.WORKER_MAX_HEARTBEAT_INTERVAL | string | `"10s"` | Worker heartbeat interval |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_ENABLED | bool | `false` | If set true, will open worker overload protection |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max cpu usage, when the worker's cpu usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_DISK_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max jvm memory usage, when the worker's jvm memory usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max jvm cpu usage, when the worker's jvm cpu usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_CPU_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max system cpu usage, when the worker's system cpu usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS | float | `0.7` | Worker max system memory usage, when the worker's system memory usage is smaller than this value, worker server can be dispatched tasks. |
| worker.env.WORKER_TENANT_CONFIG_AUTO_CREATE_TENANT_ENABLED | bool | `true` | tenant corresponds to the user of the system, which is used by the worker to submit the job. If system does not have this user, it will be automatically created after the parameter worker.tenant.auto.create is true. |
| worker.env.WORKER_TENANT_CONFIG_DISTRIBUTED_TENANT | bool | `false` | Scenes to be used for distributed users. For example, users created by FreeIpa are stored in LDAP. This parameter only applies to Linux, When this parameter is true, worker.tenant.auto.create has no effect and will not automatically create tenants. |

16
deploy/kubernetes/dolphinscheduler/values.yaml

@ -508,10 +508,10 @@ master:
MASTER_STATE_WHEEL_INTERVAL: "5s"
# -- If set true, will open master overload protection
MASTER_SERVER_LOAD_PROTECTION_ENABLED: false
# -- Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow.
MASTER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow.
MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Master max system cpu usage, when the master's system cpu usage is smaller than this value, master server can execute workflow.
MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Master max jvm cpu usage, when the master's jvm cpu usage is smaller than this value, master server can execute workflow.
MASTER_SERVER_LOAD_PROTECTION_MAX_JVM_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow.
MASTER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow.
@ -629,10 +629,10 @@ worker:
env:
# -- If set true, will open worker overload protection
WORKER_SERVER_LOAD_PROTECTION_ENABLED: false
# -- Worker max cpu usage, when the worker's cpu usage is smaller than this value, worker server can be dispatched tasks.
WORKER_SERVER_LOAD_PROTECTION_MAX_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Worker max jvm memory usage, when the worker's jvm memory usage is smaller than this value, worker server can be dispatched tasks.
WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Worker max system cpu usage, when the worker's system cpu usage is smaller than this value, worker server can be dispatched tasks.
WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Worker max jvm cpu usage, when the worker's jvm cpu usage is smaller than this value, worker server can be dispatched tasks.
WORKER_SERVER_LOAD_PROTECTION_MAX_JVM_CPU_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Worker max system memory usage, when the worker's system memory usage is smaller than this value, worker server can be dispatched tasks.
WORKER_SERVER_LOAD_PROTECTION_MAX_SYSTEM_MEMORY_USAGE_PERCENTAGE_THRESHOLDS: 0.7
# -- Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks.

208
docs/docs/en/architecture/configuration.md

@ -110,7 +110,8 @@ The directory structure of DolphinScheduler is as follows:
dolphinscheduler-daemon.sh is responsible for DolphinScheduler startup and shutdown.
Essentially, start-all.sh or stop-all.sh startup and shutdown the cluster via dolphinscheduler-daemon.sh.
Currently, DolphinScheduler just makes a basic config, remember to config further JVM options based on your practical situation of resources.
Currently, DolphinScheduler only provides a basic configuration; remember to configure further JVM options based on
your actual resources.
Default simplified parameters are:
@ -128,43 +129,46 @@ export DOLPHINSCHEDULER_OPTS="
"
```
> "-XX:DisableExplicitGC" is not recommended because it may lead to memory leaks (DolphinScheduler depends on Netty to communicate).
> Adding "-Djava.net.preferIPv6Addresses=true" makes it use IPv6 addresses, and adding "-Djava.net.preferIPv4Addresses=true" makes it use IPv4 addresses; if neither parameter is set, IPv4 or IPv6 will be used.
> "-XX:DisableExplicitGC" is not recommended because it may lead to memory leaks (DolphinScheduler depends on Netty to
> communicate).
> Adding "-Djava.net.preferIPv6Addresses=true" makes it use IPv6 addresses, and adding "-Djava.net.preferIPv4Addresses=true"
> makes it use IPv4 addresses; if neither parameter is set, IPv4 or IPv6 will be used.
### Database connection related configuration
DolphinScheduler uses Spring Hikari to manage database connections, configuration file location:
|Service| Configuration file |
|--|--|
|Master Server | `master-server/conf/application.yaml`|
|Api Server| `api-server/conf/application.yaml`|
|Worker Server| `worker-server/conf/application.yaml`|
|Alert Server| `alert-server/conf/application.yaml`|
| Service | Configuration file |
|---------------|---------------------------------------|
| Master Server | `master-server/conf/application.yaml` |
| Api Server | `api-server/conf/application.yaml` |
| Worker Server | `worker-server/conf/application.yaml` |
| Alert Server | `alert-server/conf/application.yaml` |
The default configuration is as follows:
|Parameters | Default value| Description|
|--|--|--|
|spring.datasource.driver-class-name| org.postgresql.Driver |datasource driver|
|spring.datasource.url| jdbc:postgresql://127.0.0.1:5432/dolphinscheduler |datasource connection url|
|spring.datasource.username|root|datasource username|
|spring.datasource.password|root|datasource password|
|spring.datasource.hikari.connection-test-query|select 1|validate connection by running the SQL|
|spring.datasource.hikari.minimum-idle| 5| minimum connection pool size number|
|spring.datasource.hikari.auto-commit|true|whether auto commit|
|spring.datasource.hikari.pool-name|DolphinScheduler|name of the connection pool|
|spring.datasource.hikari.maximum-pool-size|50| maximum connection pool size number|
|spring.datasource.hikari.connection-timeout|30000|connection timeout|
|spring.datasource.hikari.idle-timeout|600000|Maximum idle connection survival time|
|spring.datasource.hikari.leak-detection-threshold|0|Connection leak detection threshold|
|spring.datasource.hikari.initialization-fail-timeout|1|Connection pool initialization failed timeout|
| Parameters | Default value | Description |
|------------------------------------------------------|---------------------------------------------------|-----------------------------------------------|
| spring.datasource.driver-class-name | org.postgresql.Driver | datasource driver |
| spring.datasource.url | jdbc:postgresql://127.0.0.1:5432/dolphinscheduler | datasource connection url |
| spring.datasource.username | root | datasource username |
| spring.datasource.password | root | datasource password |
| spring.datasource.hikari.connection-test-query | select 1 | validate connection by running the SQL |
| spring.datasource.hikari.minimum-idle | 5 | minimum connection pool size number |
| spring.datasource.hikari.auto-commit | true | whether auto commit |
| spring.datasource.hikari.pool-name | DolphinScheduler | name of the connection pool |
| spring.datasource.hikari.maximum-pool-size | 50 | maximum connection pool size number |
| spring.datasource.hikari.connection-timeout | 30000 | connection timeout |
| spring.datasource.hikari.idle-timeout | 600000 | Maximum idle connection survival time |
| spring.datasource.hikari.leak-detection-threshold | 0 | Connection leak detection threshold |
| spring.datasource.hikari.initialization-fail-timeout | 1 | Connection pool initialization failed timeout |
Note that DolphinScheduler also supports database configuration through `bin/env/dolphinscheduler_env.sh`.
### Zookeeper related configuration
DolphinScheduler uses Zookeeper for cluster management, fault tolerance, event monitoring and other functions. Configuration file location:
DolphinScheduler uses Zookeeper for cluster management, fault tolerance, event monitoring and other functions.
Configuration file location:
|Service| Configuration file |
|--|--|
|Master Server | `master-server/conf/application.yaml`|
@ -173,17 +177,17 @@ DolphinScheduler uses Zookeeper for cluster management, fault tolerance, event m
The default configuration is as follows:
|Parameters | Default value| Description|
|--|--|--|
|registry.zookeeper.namespace|dolphinscheduler|namespace of zookeeper|
|registry.zookeeper.connect-string|localhost:2181| the connection string of zookeeper|
|registry.zookeeper.retry-policy.base-sleep-time|60ms|time to wait between subsequent retries|
|registry.zookeeper.retry-policy.max-sleep|300ms|maximum time to wait between subsequent retries|
|registry.zookeeper.retry-policy.max-retries|5|maximum retry times|
|registry.zookeeper.session-timeout|30s|session timeout|
|registry.zookeeper.connection-timeout|30s|connection timeout|
|registry.zookeeper.block-until-connected|600ms|waiting time to block until the connection succeeds|
|registry.zookeeper.digest|{username}:{password}|digest of zookeeper to access znode, works only when acl is enabled, for more details please check [Apache Zookeeper doc](https://zookeeper.apache.org/doc/r3.4.14/zookeeperAdmin.html) |
| Parameters | Default value | Description |
|-------------------------------------------------|-----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| registry.zookeeper.namespace | dolphinscheduler | namespace of zookeeper |
| registry.zookeeper.connect-string | localhost:2181 | the connection string of zookeeper |
| registry.zookeeper.retry-policy.base-sleep-time | 60ms | time to wait between subsequent retries |
| registry.zookeeper.retry-policy.max-sleep | 300ms | maximum time to wait between subsequent retries |
| registry.zookeeper.retry-policy.max-retries | 5 | maximum retry times |
| registry.zookeeper.session-timeout | 30s | session timeout |
| registry.zookeeper.connection-timeout | 30s | connection timeout |
| registry.zookeeper.block-until-connected | 600ms | waiting time to block until the connection succeeds |
| registry.zookeeper.digest | {username}:{password} | digest of zookeeper to access znode, works only when acl is enabled, for more details please check [Apache Zookeeper doc](https://zookeeper.apache.org/doc/r3.4.14/zookeeperAdmin.html) |
Note that DolphinScheduler also supports zookeeper related configuration through `bin/env/dolphinscheduler_env.sh`.
@ -191,12 +195,12 @@ Note that DolphinScheduler also supports zookeeper related configuration through
Currently, common.properties mainly configures Hadoop,s3a related configurations. Configuration file location:
|Service| Configuration file |
|--|--|
|Master Server | `master-server/conf/common.properties`|
|Api Server| `api-server/conf/common.properties`|
|Worker Server| `worker-server/conf/common.properties`|
|Alert Server| `alert-server/conf/common.properties`|
| Service | Configuration file |
|---------------|----------------------------------------|
| Master Server | `master-server/conf/common.properties` |
| Api Server | `api-server/conf/common.properties` |
| Worker Server | `worker-server/conf/common.properties` |
| Alert Server | `alert-server/conf/common.properties` |
The default configuration is as follows:
@ -237,43 +241,43 @@ The default configuration is as follows:
Location: `api-server/conf/application.yaml`
|Parameters | Default value| Description|
|--|--|--|
|server.port|12345|api service communication port|
|server.servlet.session.timeout|120m|session timeout|
|server.servlet.context-path|/dolphinscheduler/ |request path|
|spring.servlet.multipart.max-file-size|1024MB|maximum file size|
|spring.servlet.multipart.max-request-size|1024MB|maximum request size|
|server.jetty.max-http-post-size|5000000|jetty maximum post size|
|spring.banner.charset|UTF-8|message encoding|
|spring.jackson.time-zone|UTC|time zone|
|spring.jackson.date-format|"yyyy-MM-dd HH:mm:ss"|time format|
|spring.messages.basename|i18n/messages|i18n config|
|security.authentication.type|PASSWORD|authentication type|
|security.authentication.ldap.user.admin|read-only-admin|admin user account when you log-in with LDAP|
|security.authentication.ldap.urls|ldap://ldap.forumsys.com:389/|LDAP urls|
|security.authentication.ldap.base.dn|dc=example,dc=com|LDAP base dn|
|security.authentication.ldap.username|cn=read-only-admin,dc=example,dc=com|LDAP username|
|security.authentication.ldap.password|password|LDAP password|
|security.authentication.ldap.user.identity-attribute|uid|LDAP user identity attribute|
|security.authentication.ldap.user.email-attribute|mail|LDAP user email attribute|
|security.authentication.ldap.user.not-exist-action|CREATE|action when ldap user is not exist,default value: CREATE. Optional values include(CREATE,DENY)|
|security.authentication.ldap.ssl.enable|false|LDAP ssl switch|
|security.authentication.ldap.ssl.trust-store|ldapkeystore.jks|LDAP jks file absolute path|
|security.authentication.ldap.ssl.trust-store-password|password|LDAP jks password|
|security.authentication.casdoor.user.admin||admin user account when you log-in with Casdoor|
|casdoor.endpoint||Casdoor server url|
|casdoor.client-id||id in Casdoor|
|casdoor.client-secret||secret in Casdoor|
|casdoor.certificate||certificate in Casdoor|
|casdoor.organization-name||organization name in Casdoor|
|casdoor.application-name||application name in Casdoor|
|casdoor.redirect-url||dolphinscheduler login url|
|api.traffic.control.global.switch|false|traffic control global switch|
|api.traffic.control.max-global-qps-rate|300|global max request number per second|
|api.traffic.control.tenant-switch|false|traffic control tenant switch|
|api.traffic.control.default-tenant-qps-rate|10|default tenant max request number per second|
|api.traffic.control.customize-tenant-qps-rate||customize tenant max request number per second|
| Parameters | Default value | Description |
|-------------------------------------------------------|--------------------------------------|------------------------------------------------------------------------------------------------|
| server.port | 12345 | api service communication port |
| server.servlet.session.timeout | 120m | session timeout |
| server.servlet.context-path | /dolphinscheduler/ | request path |
| spring.servlet.multipart.max-file-size | 1024MB | maximum file size |
| spring.servlet.multipart.max-request-size | 1024MB | maximum request size |
| server.jetty.max-http-post-size | 5000000 | jetty maximum post size |
| spring.banner.charset | UTF-8 | message encoding |
| spring.jackson.time-zone | UTC | time zone |
| spring.jackson.date-format | "yyyy-MM-dd HH:mm:ss" | time format |
| spring.messages.basename | i18n/messages | i18n config |
| security.authentication.type | PASSWORD | authentication type |
| security.authentication.ldap.user.admin | read-only-admin | admin user account when you log-in with LDAP |
| security.authentication.ldap.urls | ldap://ldap.forumsys.com:389/ | LDAP urls |
| security.authentication.ldap.base.dn | dc=example,dc=com | LDAP base dn |
| security.authentication.ldap.username | cn=read-only-admin,dc=example,dc=com | LDAP username |
| security.authentication.ldap.password | password | LDAP password |
| security.authentication.ldap.user.identity-attribute | uid | LDAP user identity attribute |
| security.authentication.ldap.user.email-attribute | mail | LDAP user email attribute |
| security.authentication.ldap.user.not-exist-action | CREATE | action when the LDAP user does not exist, default value: CREATE. Optional values include (CREATE, DENY) |
| security.authentication.ldap.ssl.enable | false | LDAP ssl switch |
| security.authentication.ldap.ssl.trust-store | ldapkeystore.jks | LDAP jks file absolute path |
| security.authentication.ldap.ssl.trust-store-password | password | LDAP jks password |
| security.authentication.casdoor.user.admin | | admin user account when you log-in with Casdoor |
| casdoor.endpoint | | Casdoor server url |
| casdoor.client-id | | id in Casdoor |
| casdoor.client-secret | | secret in Casdoor |
| casdoor.certificate | | certificate in Casdoor |
| casdoor.organization-name | | organization name in Casdoor |
| casdoor.application-name | | application name in Casdoor |
| casdoor.redirect-url | | dolphinscheduler login url |
| api.traffic.control.global.switch | false | traffic control global switch |
| api.traffic.control.max-global-qps-rate | 300 | global max request number per second |
| api.traffic.control.tenant-switch | false | traffic control tenant switch |
| api.traffic.control.default-tenant-qps-rate | 10 | default tenant max request number per second |
| api.traffic.control.customize-tenant-qps-rate | | customize tenant max request number per second |
### Master Server related configuration
@ -292,9 +296,9 @@ Location: `master-server/conf/application.yaml`
| master.task-commit-interval | 1000 | master commit task interval, the unit is millisecond |
| master.state-wheel-interval | 5 | time to check status |
| master.server-load-protection.enabled | true | If set true, will open master overload protection |
| master.server-load-protection.max-cpu-usage-percentage-thresholds | 0.7 | Master max cpu usage, when the master's cpu usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-jvm-memory-usage-percentage-thresholds | 0.7 | Master max JVM memory usage, when the master's jvm memory usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Master max System memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-system-cpu-usage-percentage-thresholds | 0.7 | Master max system cpu usage, when the master's system cpu usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-jvm-cpu-usage-percentage-thresholds | 0.7 | Master max JVM cpu usage, when the master's jvm cpu usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Master max system memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow. |
| master.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow. |
| master.failover-interval | 10 | failover interval, the unit is minute |
| master.kill-application-when-task-failover | true | whether to kill yarn/k8s application when failover taskInstance |
@ -307,16 +311,16 @@ Location: `master-server/conf/application.yaml`
Location: `worker-server/conf/application.yaml`
| Parameters | Default value | Description |
|--------------------------------------------------------------------------------|---------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------|---------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| worker.listen-port | 1234 | worker-service listen port |
| worker.exec-threads | 100 | worker-service execute thread number, used to limit the number of task instances in parallel |
| worker.max-heartbeat-interval | 10s | worker-service max heartbeat interval |
| worker.host-weight | 100 | worker host weight to dispatch tasks |
| worker.server-load-protection.enabled | true | If set true will open worker overload protection |
| worker.max-cpu-usage-percentage-thresholds.max-cpu-usage-percentage-thresholds | 0.7 | Master max cpu usage, when the master's cpu usage is smaller then this value, master server can execute workflow. |
| worker.server-load-protection.max-jvm-memory-usage-percentage-thresholds | 0.7 | Master max JVM memory usage , when the master's jvm memory usage is smaller then this value, master server can execute workflow. |
| worker.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Master max System memory usage , when the master's system memory usage is smaller then this value, master server can execute workflow. |
| worker.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | Master max disk usage , when the master's disk usage is smaller then this value, master server can execute workflow. |
| worker.server-load-protection.max-system-cpu-usage-percentage-thresholds | 0.7 | Worker max system cpu usage, when the worker's system cpu usage is smaller than this value, worker server can be dispatched tasks. |
| worker.server-load-protection.max-jvm-cpu-usage-percentage-thresholds | 0.7 | Worker max JVM cpu usage, when the worker's jvm cpu usage is smaller than this value, worker server can be dispatched tasks. |
| worker.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | Worker max system memory usage, when the worker's system memory usage is smaller than this value, worker server can be dispatched tasks. |
| worker.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | Worker max disk usage, when the worker's disk usage is smaller than this value, worker server can be dispatched tasks. |
| worker.registry-disconnect-strategy.strategy | stop | Used when the worker disconnect from registry, default value: stop. Optional values include stop, waiting |
| worker.registry-disconnect-strategy.max-waiting-time | 100s | Used when the worker disconnect from registry, and the disconnect strategy is waiting, this config means the worker will waiting to reconnect to registry in given times, and after the waiting times, if the worker still cannot connect to registry, will stop itself, if the value is 0s, will wait infinitely |
| worker.task-execute-threads-full-policy | REJECT | If REJECT, when the task waiting in the worker reaches exec-threads, it will reject the received task and the Master will redispatch it; If CONTINUE, it will put the task into the worker's execution queue and wait for a free thread to start execution |
@ -337,10 +341,10 @@ Location: `alert-server/conf/application.yaml`
This part describes quartz configs and configure them based on your practical situation and resources.
|Service| Configuration file |
|--|--|
|Master Server | `master-server/conf/application.yaml`|
|Api Server| `api-server/conf/application.yaml`|
| Service | Configuration file |
|---------------|---------------------------------------|
| Master Server | `master-server/conf/application.yaml` |
| Api Server | `api-server/conf/application.yaml` |
The default configuration is as follows:
@ -358,7 +362,8 @@ The default configuration is as follows:
| spring.quartz.properties.org.quartz.jobStore.driverDelegateClass | org.quartz.impl.jdbcjobstore.PostgreSQLDelegate |
| spring.quartz.properties.org.quartz.jobStore.clusterCheckinInterval | 5000 |
The above configuration items are the same in *Master Server* and *Api Server*, but their *Quartz Scheduler* threadpool configuration is different.
The above configuration items are the same in *Master Server* and *Api Server*, but their *Quartz Scheduler* threadpool
configuration is different.
The default quartz threadpool configuration in *Master Server* is as follows:
@ -369,7 +374,8 @@ The default quartz threadpool configuration in *Master Server* is as follows:
| spring.quartz.properties.org.quartz.threadPool.threadPriority | 5 |
| spring.quartz.properties.org.quartz.threadPool.class | org.quartz.simpl.SimpleThreadPool |
Since *Api Server* will not start *Quartz Scheduler* instance, as a client only, therefore it's threadpool is configured as `QuartzZeroSizeThreadPool` which has zero thread;
Since *Api Server* does not start a *Quartz Scheduler* instance and acts only as a client, its threadpool is configured
as `QuartzZeroSizeThreadPool`, which has zero threads.
The default configuration is as follows:
| Parameters | Default value |
@ -378,7 +384,8 @@ The default configuration is as follows:
### dolphinscheduler_env.sh [load environment variables configs]
When using shell to commit tasks, DolphinScheduler will export environment variables from `bin/env/dolphinscheduler_env.sh`. The
When using shell to commit tasks, DolphinScheduler will export environment variables
from `bin/env/dolphinscheduler_env.sh`. The
main configuration items include `JAVA_HOME` and other environment paths.
```bash
@ -406,9 +413,10 @@ export FLINK_ENV_JAVA_OPTS="-javaagent:${DOLPHINSCHEDULER_HOME}/tools/libs/aspec
### Log related configuration
|Service| Configuration file |
|--|--|
|Master Server | `master-server/conf/logback-spring.xml`|
|Api Server| `api-server/conf/logback-spring.xml`|
|Worker Server| `worker-server/conf/logback-spring.xml`|
|Alert Server| `alert-server/conf/logback-spring.xml`|
| Service | Configuration file |
|---------------|-----------------------------------------|
| Master Server | `master-server/conf/logback-spring.xml` |
| Api Server | `api-server/conf/logback-spring.xml` |
| Worker Server | `worker-server/conf/logback-spring.xml` |
| Alert Server | `alert-server/conf/logback-spring.xml` |

112
docs/docs/zh/architecture/configuration.md

@ -130,38 +130,40 @@ export DOLPHINSCHEDULER_OPTS="
> 不建议设置"-XX:DisableExplicitGC" , DolphinScheduler使用Netty进行通讯,设置该参数,可能会导致内存泄漏.
>
>> 如果设置"-Djava.net.preferIPv6Addresses=true" 将会使用ipv6的IP地址, 如果设置"-Djava.net.preferIPv4Addresses=true"将会使用ipv4的IP地址, 如果都不设置,将会随机使用ipv4或者ipv6.
>> 如果设置"-Djava.net.preferIPv6Addresses=true" 将会使用ipv6的IP地址, 如果设置"-Djava.net.preferIPv4Addresses=true"
>> 将会使用ipv4的IP地址, 如果都不设置,将会随机使用ipv4或者ipv6.
## 数据库连接相关配置
在DolphinScheduler中使用Spring Hikari对数据库连接进行管理,配置文件位置:
|服务名称| 配置文件 |
|--|--|
|Master Server | `master-server/conf/application.yaml`|
|Api Server| `api-server/conf/application.yaml`|
|Worker Server| `worker-server/conf/application.yaml`|
|Alert Server| `alert-server/conf/application.yaml`|
| 服务名称 | 配置文件 |
|---------------|---------------------------------------|
| Master Server | `master-server/conf/application.yaml` |
| Api Server | `api-server/conf/application.yaml` |
| Worker Server | `worker-server/conf/application.yaml` |
| Alert Server | `alert-server/conf/application.yaml` |
默认配置如下:
|参数 | 默认值| 描述|
|--|--|--|
|spring.datasource.driver-class-name| org.postgresql.Driver |数据库驱动|
|spring.datasource.url| jdbc:postgresql://127.0.0.1:5432/dolphinscheduler |数据库连接地址|
|spring.datasource.username|root|数据库用户名|
|spring.datasource.password|root|数据库密码|
|spring.datasource.hikari.connection-test-query|select 1|检测连接是否有效的sql|
|spring.datasource.hikari.minimum-idle| 5|最小空闲连接池数量|
|spring.datasource.hikari.auto-commit|true|是否自动提交|
|spring.datasource.hikari.pool-name|DolphinScheduler|连接池名称|
|spring.datasource.hikari.maximum-pool-size|50|连接池最大连接数|
|spring.datasource.hikari.connection-timeout|30000|连接超时时长|
|spring.datasource.hikari.idle-timeout|600000|空闲连接存活最大时间|
|spring.datasource.hikari.leak-detection-threshold|0|连接泄露检测阈值|
|spring.datasource.hikari.initialization-fail-timeout|1|连接池初始化失败timeout|
DolphinScheduler同样可以通过设置环境变量进行数据库连接相关的配置, 将以上小写字母转成大写并把`.`换成`_`作为环境变量名, 设置值即可。
| 参数 | 默认值 | 描述 |
|------------------------------------------------------|---------------------------------------------------|-----------------|
| spring.datasource.driver-class-name | org.postgresql.Driver | 数据库驱动 |
| spring.datasource.url | jdbc:postgresql://127.0.0.1:5432/dolphinscheduler | 数据库连接地址 |
| spring.datasource.username | root | 数据库用户名 |
| spring.datasource.password | root | 数据库密码 |
| spring.datasource.hikari.connection-test-query | select 1 | 检测连接是否有效的sql |
| spring.datasource.hikari.minimum-idle | 5 | 最小空闲连接池数量 |
| spring.datasource.hikari.auto-commit | true | 是否自动提交 |
| spring.datasource.hikari.pool-name | DolphinScheduler | 连接池名称 |
| spring.datasource.hikari.maximum-pool-size | 50 | 连接池最大连接数 |
| spring.datasource.hikari.connection-timeout | 30000 | 连接超时时长 |
| spring.datasource.hikari.idle-timeout | 600000 | 空闲连接存活最大时间 |
| spring.datasource.hikari.leak-detection-threshold | 0 | 连接泄露检测阈值 |
| spring.datasource.hikari.initialization-fail-timeout | 1 | 连接池初始化失败timeout |
DolphinScheduler同样可以通过设置环境变量进行数据库连接相关的配置, 将以上小写字母转成大写并把`.`换成`_`作为环境变量名,
设置值即可。
## Zookeeper相关配置
@ -174,17 +176,17 @@ DolphinScheduler使用Zookeeper进行集群管理、容错、事件监听等功
默认配置如下:
|参数 |默认值| 描述|
|--|--|--|
|registry.zookeeper.namespace|dolphinscheduler|Zookeeper集群使用的namespace|
|registry.zookeeper.connect-string|localhost:2181| Zookeeper集群连接信息|
|registry.zookeeper.retry-policy.base-sleep-time|60ms|基本重试时间差|
|registry.zookeeper.retry-policy.max-sleep|300ms|最大重试时间|
|registry.zookeeper.retry-policy.max-retries|5|最大重试次数|
|registry.zookeeper.session-timeout|30s|session超时时间|
|registry.zookeeper.connection-timeout|30s|连接超时时间|
|registry.zookeeper.block-until-connected|600ms|阻塞直到连接成功的等待时间|
|registry.zookeeper.digest|{用户名:密码}|如果zookeeper打开了acl,则需要填写认证信息访问znode,认证信息格式为{用户名}:{密码}。关于Zookeeper ACL详见[https://zookeeper.apache.org/doc/r3.4.14/zookeeperAdmin.html](Apache Zookeeper官方文档)|
| 参数 | 默认值 | 描述 |
|-------------------------------------------------|------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
| registry.zookeeper.namespace | dolphinscheduler | Zookeeper集群使用的namespace |
| registry.zookeeper.connect-string | localhost:2181 | Zookeeper集群连接信息 |
| registry.zookeeper.retry-policy.base-sleep-time | 60ms | 基本重试时间差 |
| registry.zookeeper.retry-policy.max-sleep | 300ms | 最大重试时间 |
| registry.zookeeper.retry-policy.max-retries | 5 | 最大重试次数 |
| registry.zookeeper.session-timeout | 30s | session超时时间 |
| registry.zookeeper.connection-timeout | 30s | 连接超时时间 |
| registry.zookeeper.block-until-connected | 600ms | 阻塞直到连接成功的等待时间 |
| registry.zookeeper.digest | {用户名:密码} | 如果zookeeper打开了acl,则需要填写认证信息访问znode,认证信息格式为{用户名}:{密码}。关于Zookeeper ACL详见[Apache Zookeeper官方文档](https://zookeeper.apache.org/doc/r3.4.14/zookeeperAdmin.html) |
DolphinScheduler同样可以通过`bin/env/dolphinscheduler_env.sh`进行Zookeeper相关的配置。
@ -201,7 +203,7 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
默认配置如下:
| 参数 | 默认值 | 描述 |
|-----------------------------------------------|--------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|-----------------------------------------------|--------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| data.basedir.path | /tmp/dolphinscheduler | 本地工作目录,用于存放临时文件 |
| resource.storage.type | NONE | 资源文件存储类型: HDFS,S3,OSS,GCS,ABS,NONE |
| resource.upload.path | /dolphinscheduler | 资源文件存储路径 |
@ -280,7 +282,7 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
位置:`master-server/conf/application.yaml`
| 参数 | 默认值 | 描述 |
|--------------------------------------------------------|--------------|-----------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------|--------------|-----------------------------------------------------------------------------------------|
| master.listen-port | 5678 | master监听端口 |
| master.fetch-command-num | 10 | master拉取command数量 |
| master.pre-exec-threads | 10 | master准备执行任务的数量,用于限制并行的command |
@ -291,8 +293,11 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
| master.task-commit-retry-times | 5 | 任务重试次数 |
| master.task-commit-interval | 1000 | 任务提交间隔,单位为毫秒 |
| master.state-wheel-interval | 5 | 轮询检查状态时间 |
| master.max-cpu-load-avg | 1 | master最大cpuload均值,只有高于系统cpuload均值时,master服务才能调度任务. 默认值为1: 会使用100%的CPU |
| master.reserved-memory | 0.3 | master预留内存,只有低于系统可用内存时,master服务才能调度任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流 |
| master.server-load-protection.enabled | true | 是否开启系统保护策略 |
| master.server-load-protection.max-system-cpu-usage-percentage-thresholds | 0.7 | master最大系统cpu使用值,只有当前系统cpu使用值低于最大系统cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统CPU |
| master.server-load-protection.max-jvm-cpu-usage-percentage-thresholds | 0.7 | master最大JVM cpu使用值,只有当前JVM cpu使用值低于最大JVM cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的JVM CPU |
| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | master最大系统内存使用值,只有当前系统内存使用值低于最大系统内存使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统内存 |
| master.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | master最大系统磁盘使用值,只有当前系统磁盘使用值低于最大系统磁盘使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统磁盘空间 |
| master.failover-interval | 10 | failover间隔,单位为分钟 |
| master.kill-application-when-task-failover | true | 当任务实例failover时,是否kill掉yarn或k8s application |
| master.registry-disconnect-strategy.strategy | stop | 当Master与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting |
@ -305,14 +310,17 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
位置:`worker-server/conf/application.yaml`
| 参数 | 默认值 | 描述 |
|------------------------------------------------------|-----------|-------------------------------------------------------------------------------------------------------------------------------------------|
|-----------------------------------------------------------------------------|-----------|-------------------------------------------------------------------------------------------------------------------------------------------|
| worker.listen-port | 1234 | worker监听端口 |
| worker.exec-threads | 100 | worker工作线程数量,用于限制并行的任务实例数量 |
| worker.max-heartbeat-interval | 10s | worker最大心跳间隔 |
| worker.host-weight | 100 | 派发任务时,worker主机的权重 |
| worker.tenant-auto-create | true | 租户对应于系统的用户,由worker提交作业.如果系统没有该用户,则在参数`worker.tenant-auto-create`为true后自动创建。 |
| worker.max-cpu-load-avg | 1 | worker最大cpuload均值,只有高于系统cpuload均值时,worker服务才能被派发任务. 默认值为1: 会使用100%的CPU |
| worker.reserved-memory | 0.3 | worker预留内存,只有低于系统可用内存时,worker服务才能被派发任务. 默认值为0.3:当系统内存低于30%时会停止调度新的工作流 |
| worker.server-load-protection.enabled | true | 是否开启系统保护策略 |
| worker.server-load-protection.max-system-cpu-usage-percentage-thresholds | 0.7 | worker最大系统cpu使用值,只有当前系统cpu使用值低于最大系统cpu使用值,worker服务才能接收任务. 默认值为0.7: 会使用70%的操作系统CPU |
| worker.server-load-protection.max-jvm-cpu-usage-percentage-thresholds | 0.7 | worker最大JVM cpu使用值,只有当前JVM cpu使用值低于最大JVM cpu使用值,worker服务才能接收任务. 默认值为0.7: 会使用70%的JVM CPU |
| worker.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7 | worker最大系统内存使用值,只有当前系统内存使用值低于最大系统内存使用值,worker服务才能接收任务. 默认值为0.7: 会使用70%的操作系统内存 |
| worker.server-load-protection.max-disk-usage-percentage-thresholds | 0.7 | worker最大系统磁盘使用值,只有当前系统磁盘使用值低于最大系统磁盘使用值,worker服务才能接收任务. 默认值为0.7: 会使用70%的操作系统磁盘空间 |
| worker.alert-listen-host | localhost | alert监听host |
| worker.alert-listen-port | 50052 | alert监听端口 |
| worker.registry-disconnect-strategy.strategy | stop | 当Worker与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括: stop, waiting |
@ -366,7 +374,9 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
| spring.quartz.properties.org.quartz.threadPool.threadPriority | 5 |
| spring.quartz.properties.org.quartz.threadPool.class | org.quartz.simpl.SimpleThreadPool |
因为*Api Server*不会启动*Quartz Scheduler*实例,只会作为Scheduler客户端使用,因此它的Quartz线程池将会使用`QuartzZeroSizeThreadPool`。`QuartzZeroSizeThreadPool`不会启动任何线程。具体的默认配置如下:
因为*Api Server*不会启动*Quartz Scheduler*
实例,只会作为Scheduler客户端使用,因此它的Quartz线程池将会使用`QuartzZeroSizeThreadPool`。`QuartzZeroSizeThreadPool`
不会启动任何线程。具体的默认配置如下:
| Parameters | Default value |
|------------------------------------------------------|-----------------------------------------------------------------------|
@ -374,7 +384,8 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
## dolphinscheduler_env.sh [环境变量配置]
通过类似shell方式提交任务的的时候,会加载该配置文件中的环境变量到主机中。涉及到的 `JAVA_HOME` 任务类型的环境配置,其中任务类型主要有: Shell任务、Python任务、Spark任务、Flink任务、Datax任务等等。
通过类似shell方式提交任务的时候,会加载该配置文件中的环境变量到主机中。涉及到的 `JAVA_HOME`
任务类型的环境配置,其中任务类型主要有: Shell任务、Python任务、Spark任务、Flink任务、Datax任务等等。
```bash
# JAVA_HOME, will use it to start DolphinScheduler server
@ -401,9 +412,10 @@ export FLINK_ENV_JAVA_OPTS="-javaagent:${DOLPHINSCHEDULER_HOME}/tools/libs/aspec
## 日志相关配置
|服务名称| 配置文件 |
|--|--|
|Master Server | `master-server/conf/logback-spring.xml`|
|Api Server| `api-server/conf/logback-spring.xml`|
|Worker Server| `worker-server/conf/logback-spring.xml`|
|Alert Server| `alert-server/conf/logback-spring.xml`|
| 服务名称 | 配置文件 |
|---------------|-----------------------------------------|
| Master Server | `master-server/conf/logback-spring.xml` |
| Api Server | `api-server/conf/logback-spring.xml` |
| Worker Server | `worker-server/conf/logback-spring.xml` |
| Alert Server | `alert-server/conf/logback-spring.xml` |

3
dolphinscheduler-alert/dolphinscheduler-alert-server/src/main/java/org/apache/dolphinscheduler/alert/registry/AlertHeartbeatTask.java

@ -65,7 +65,8 @@ public class AlertHeartbeatTask extends BaseHeartBeatTask<AlertServerHeartBeat>
.processId(processId)
.startupTime(startupTime)
.reportTime(System.currentTimeMillis())
.cpuUsage(systemMetrics.getTotalCpuUsedPercentage())
.jvmCpuUsage(systemMetrics.getJvmCpuUsagePercentage())
.cpuUsage(systemMetrics.getSystemCpuUsagePercentage())
.memoryUsage(systemMetrics.getSystemMemoryUsedPercentage())
.jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage())
.serverStatus(ServerStatus.NORMAL)

28
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/AlertServerHeartBeat.java

@ -17,33 +17,11 @@
package org.apache.dolphinscheduler.common.model;
import org.apache.dolphinscheduler.common.enums.ServerStatus;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class AlertServerHeartBeat implements HeartBeat {
private int processId;
private long startupTime;
private long reportTime;
private double cpuUsage;
private double memoryUsage;
private double jvmMemoryUsage;
private ServerStatus serverStatus;
private String host;
private int port;
public class AlertServerHeartBeat extends BaseHeartBeat implements HeartBeat {
@Override
public ServerStatus getServerStatus() {
return serverStatus;
}
}

46
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/BaseHeartBeat.java

@ -0,0 +1,46 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.common.model;
import org.apache.dolphinscheduler.common.enums.ServerStatus;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
/**
 * Common heartbeat payload extended by {@code AlertServerHeartBeat}, {@code MasterHeartBeat}
 * and {@code WorkerHeartBeat}.
 *
 * <p>Holds the identity, timing and resource-usage fields that every server reports.
 * {@code @SuperBuilder} is used (rather than {@code @Builder}) so that subclasses can expose
 * these inherited fields on their own builders.
 *
 * <p>NOTE(review): the usage fields look like 0-1 fractions, since the load-protection
 * thresholds they are compared against default to 0.7 - confirm against {@code SystemMetrics}.
 */
@Data
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class BaseHeartBeat implements HeartBeat {
// Process id of the reporting server.
protected int processId;
// Time the server started; presumably epoch milliseconds like reportTime - TODO confirm.
protected long startupTime;
// Time this heartbeat was produced; heartbeat tasks fill it from System.currentTimeMillis().
protected long reportTime;
// CPU usage of the JVM process; filled from SystemMetrics#getJvmCpuUsagePercentage().
protected double jvmCpuUsage;
// CPU usage of the whole system; filled from SystemMetrics#getSystemCpuUsagePercentage().
protected double cpuUsage;
// JVM memory usage; filled from SystemMetrics#getJvmMemoryUsedPercentage().
protected double jvmMemoryUsage;
// System memory usage; filled from SystemMetrics#getSystemMemoryUsedPercentage().
protected double memoryUsage;
// Disk usage; filled from SystemMetrics#getDiskUsedPercentage().
protected double diskUsage;
// Current server status (e.g. NORMAL or BUSY).
protected ServerStatus serverStatus;
// Host of the reporting server; backs HeartBeat#getHost().
protected String host;
// Port of the reporting server; backs HeartBeat#getPort().
protected int port;
}

4
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/HeartBeat.java

@ -21,10 +21,6 @@ import org.apache.dolphinscheduler.common.enums.ServerStatus;
public interface HeartBeat {
String getHost();
ServerStatus getServerStatus();
int getPort();
}

28
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/MasterHeartBeat.java

@ -17,33 +17,11 @@
package org.apache.dolphinscheduler.common.model;
import org.apache.dolphinscheduler.common.enums.ServerStatus;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class MasterHeartBeat implements HeartBeat {
private long startupTime;
private long reportTime;
private double cpuUsage;
private double jvmMemoryUsage;
private double memoryUsage;
private double diskUsage;
private ServerStatus serverStatus;
private int processId;
private String host;
private int port;
public class MasterHeartBeat extends BaseHeartBeat implements HeartBeat {
@Override
public ServerStatus getServerStatus() {
return serverStatus;
}
}

29
dolphinscheduler-common/src/main/java/org/apache/dolphinscheduler/common/model/WorkerHeartBeat.java

@ -17,37 +17,18 @@
package org.apache.dolphinscheduler.common.model;
import org.apache.dolphinscheduler.common.enums.ServerStatus;
import lombok.AllArgsConstructor;
import lombok.Builder;
import lombok.Data;
import lombok.EqualsAndHashCode;
import lombok.NoArgsConstructor;
import lombok.experimental.SuperBuilder;
@Data
@Builder
@EqualsAndHashCode(callSuper = true)
@SuperBuilder
@NoArgsConstructor
@AllArgsConstructor
public class WorkerHeartBeat implements HeartBeat {
private long startupTime;
private long reportTime;
private double cpuUsage;
private double jvmMemoryUsage;
private double memoryUsage;
private double diskUsage;
private ServerStatus serverStatus;
private int processId;
private String host;
private int port;
public class WorkerHeartBeat extends BaseHeartBeat implements HeartBeat {
private int workerHostWeight; // worker host weight
private int threadPoolUsage; // worker waiting task count
@Override
public ServerStatus getServerStatus() {
return serverStatus;
}
}

2
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/MasterServer.java

@ -123,7 +123,7 @@ public class MasterServer implements IStoppable {
MasterServerMetrics.registerMasterCpuUsageGauge(() -> {
SystemMetrics systemMetrics = metricsProvider.getSystemMetrics();
return systemMetrics.getTotalCpuUsedPercentage();
return systemMetrics.getSystemCpuUsagePercentage();
});
MasterServerMetrics.registerMasterMemoryAvailableGauge(() -> {
SystemMetrics systemMetrics = metricsProvider.getSystemMetrics();

50
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtection.java

@ -17,57 +17,11 @@
package org.apache.dolphinscheduler.server.master.config;
import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;
import org.apache.dolphinscheduler.meter.metrics.BaseServerLoadProtection;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Slf4j
@Data
@NoArgsConstructor
@AllArgsConstructor
public class MasterServerLoadProtection {
private boolean enabled = true;
private double maxCpuUsagePercentageThresholds = 0.7;
private double maxJVMMemoryUsagePercentageThresholds = 0.7;
private double maxSystemMemoryUsagePercentageThresholds = 0.7;
private double maxDiskUsagePercentageThresholds = 0.7;
public boolean isOverload(SystemMetrics systemMetrics) {
if (!enabled) {
return false;
}
if (systemMetrics.getTotalCpuUsedPercentage() > maxCpuUsagePercentageThresholds) {
log.info(
"Master OverLoad: the TotalCpuUsedPercentage: {} is over then the MaxCpuUsagePercentageThresholds {}",
systemMetrics.getTotalCpuUsedPercentage(), maxCpuUsagePercentageThresholds);
return true;
}
if (systemMetrics.getJvmMemoryUsedPercentage() > maxJVMMemoryUsagePercentageThresholds) {
log.info(
"Master OverLoad: the JvmMemoryUsedPercentage: {} is over then the MaxJVMMemoryUsagePercentageThresholds {}",
systemMetrics.getJvmMemoryUsedPercentage(), maxCpuUsagePercentageThresholds);
return true;
}
if (systemMetrics.getDiskUsedPercentage() > maxDiskUsagePercentageThresholds) {
log.info("Master OverLoad: the DiskUsedPercentage: {} is over then the MaxDiskUsagePercentageThresholds {}",
systemMetrics.getDiskUsedPercentage(), maxCpuUsagePercentageThresholds);
return true;
}
if (systemMetrics.getSystemMemoryUsedPercentage() > maxSystemMemoryUsagePercentageThresholds) {
log.info(
"Master OverLoad: the SystemMemoryUsedPercentage: {} is over then the MaxSystemMemoryUsagePercentageThresholds {}",
systemMetrics.getSystemMemoryUsedPercentage(), maxSystemMemoryUsagePercentageThresholds);
return true;
}
return false;
}
public class MasterServerLoadProtection extends BaseServerLoadProtection {
}

2
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/metrics/MasterServerMetrics.java

@ -51,7 +51,7 @@ public class MasterServerMetrics {
public void registerMasterCpuUsageGauge(Supplier<Number> supplier) {
Gauge.builder("ds.master.cpu.usage", supplier)
.description("worker cpu usage")
.description("master cpu usage")
.register(Metrics.globalRegistry);
}

3
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/MasterSlotManager.java

@ -70,7 +70,8 @@ public class MasterSlotManager {
public void notify(Map<String, MasterHeartBeat> masterNodeInfo) {
List<Server> serverList = masterNodeInfo.values().stream()
.filter(heartBeat -> !heartBeat.getServerStatus().equals(ServerStatus.BUSY))
.map(this::convertHeartBeatToServer).collect(Collectors.toList());
.map(this::convertHeartBeatToServer)
.collect(Collectors.toList());
syncMasterNodes(serverList);
}

4
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/registry/ServerNodeManager.java

@ -249,7 +249,9 @@ public class ServerNodeManager implements InitializingBean {
try {
Map<String, String> workerNodeMaps = registryClient.getServerMaps(RegistryNodeType.WORKER);
for (Map.Entry<String, String> entry : workerNodeMaps.entrySet()) {
workerNodeInfo.put(entry.getKey(), JSONUtils.parseObject(entry.getValue(), WorkerHeartBeat.class));
String nodeAddress = entry.getKey();
WorkerHeartBeat workerHeartBeat = JSONUtils.parseObject(entry.getValue(), WorkerHeartBeat.class);
workerNodeInfo.put(nodeAddress, workerHeartBeat);
}
} finally {
workerGroupWriteLock.unlock();

3
dolphinscheduler-master/src/main/java/org/apache/dolphinscheduler/server/master/task/MasterHeartBeatTask.java

@ -64,7 +64,8 @@ public class MasterHeartBeatTask extends BaseHeartBeatTask<MasterHeartBeat> {
return MasterHeartBeat.builder()
.startupTime(ServerLifeCycleManager.getServerStartupTime())
.reportTime(System.currentTimeMillis())
.cpuUsage(systemMetrics.getTotalCpuUsedPercentage())
.jvmCpuUsage(systemMetrics.getJvmCpuUsagePercentage())
.cpuUsage(systemMetrics.getSystemCpuUsagePercentage())
.jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage())
.memoryUsage(systemMetrics.getSystemMemoryUsedPercentage())
.diskUsage(systemMetrics.getDiskUsedPercentage())

8
dolphinscheduler-master/src/main/resources/application.yaml

@ -122,10 +122,10 @@ master:
server-load-protection:
# If set true, will open master overload protection
enabled: true
# Master max cpu usage, when the master's cpu usage is smaller then this value, master server can execute workflow.
max-cpu-usage-percentage-thresholds: 0.7
# Master max JVM memory usage , when the master's jvm memory usage is smaller then this value, master server can execute workflow.
max-jvm-memory-usage-percentage-thresholds: 0.7
# Master max system cpu usage, when the master's system cpu usage is smaller than this value, master server can execute workflow.
max-system-cpu-usage-percentage-thresholds: 0.7
# Master max jvm cpu usage, when the master's jvm cpu usage is smaller than this value, master server can execute workflow.
max-jvm-cpu-usage-percentage-thresholds: 0.7
# Master max system memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow.
max-system-memory-usage-percentage-thresholds: 0.7
# Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow.

25
dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/config/MasterConfigTest.java

@ -17,16 +17,15 @@
package org.apache.dolphinscheduler.server.master.config;
import org.junit.jupiter.api.Assertions;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.junit.jupiter.SpringExtension;
@ActiveProfiles("master")
@ExtendWith(SpringExtension.class)
@AutoConfigureMockMvc
@SpringBootTest(classes = MasterConfig.class)
public class MasterConfigTest {
@ -36,6 +35,18 @@ public class MasterConfigTest {
@Test
public void getMasterDispatchTaskNumber() {
int masterDispatchTaskNumber = masterConfig.getDispatchTaskNumber();
Assertions.assertEquals(3, masterDispatchTaskNumber);
assertEquals(30, masterDispatchTaskNumber);
}
/**
 * Verifies that the server-load-protection settings are bound from the test
 * {@code application.yaml}: the enabled flag plus the system CPU, JVM CPU, system memory and
 * disk thresholds, each configured as 0.77 there.
 */
@Test
public void getServerLoadProtection() {
MasterServerLoadProtection serverLoadProtection = masterConfig.getServerLoadProtection();
assertTrue(serverLoadProtection.isEnabled());
// One assertion per configured threshold; the JVM CPU check was previously duplicated.
assertEquals(0.77, serverLoadProtection.getMaxSystemCpuUsagePercentageThresholds());
assertEquals(0.77, serverLoadProtection.getMaxJvmCpuUsagePercentageThresholds());
assertEquals(0.77, serverLoadProtection.getMaxSystemMemoryUsagePercentageThresholds());
assertEquals(0.77, serverLoadProtection.getMaxDiskUsagePercentageThresholds());
}
}

3
dolphinscheduler-master/src/test/java/org/apache/dolphinscheduler/server/master/config/MasterServerLoadProtectionTest.java

@ -30,7 +30,8 @@ class MasterServerLoadProtectionTest {
SystemMetrics systemMetrics = SystemMetrics.builder()
.jvmMemoryUsedPercentage(0.71)
.systemMemoryUsedPercentage(0.71)
.totalCpuUsedPercentage(0.71)
.systemCpuUsagePercentage(0.71)
.jvmCpuUsagePercentage(0.71)
.diskUsedPercentage(0.71)
.build();
masterServerLoadProtection.setEnabled(false);

164
dolphinscheduler-master/src/test/resources/application.yaml

@ -0,0 +1,164 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
spring:
banner:
charset: UTF-8
jackson:
time-zone: UTC
date-format: "yyyy-MM-dd HH:mm:ss"
cache:
# default enable cache, you can disable by `type: none`
type: none
cache-names:
- tenant
- user
- processDefinition
- processTaskRelation
- taskDefinition
caffeine:
spec: maximumSize=100,expireAfterWrite=300s,recordStats
datasource:
driver-class-name: org.postgresql.Driver
url: jdbc:postgresql://127.0.0.1:5432/dolphinscheduler
username: root
password: root
hikari:
connection-test-query: select 1
minimum-idle: 5
auto-commit: true
validation-timeout: 3000
pool-name: DolphinScheduler
maximum-pool-size: 50
connection-timeout: 30000
idle-timeout: 600000
leak-detection-threshold: 0
initialization-fail-timeout: 1
quartz:
job-store-type: jdbc
jdbc:
initialize-schema: never
properties:
org.quartz.threadPool.threadPriority: 5
org.quartz.jobStore.isClustered: true
org.quartz.jobStore.class: org.springframework.scheduling.quartz.LocalDataSourceJobStore
org.quartz.scheduler.instanceId: AUTO
org.quartz.jobStore.tablePrefix: QRTZ_
org.quartz.jobStore.acquireTriggersWithinLock: true
org.quartz.scheduler.instanceName: DolphinScheduler
org.quartz.threadPool.class: org.quartz.simpl.SimpleThreadPool
org.quartz.jobStore.useProperties: false
org.quartz.threadPool.makeThreadsDaemons: true
org.quartz.threadPool.threadCount: 25
org.quartz.jobStore.misfireThreshold: 60000
org.quartz.scheduler.batchTriggerAcquisitionMaxCount: 1
org.quartz.scheduler.makeSchedulerThreadDaemon: true
org.quartz.jobStore.driverDelegateClass: org.quartz.impl.jdbcjobstore.PostgreSQLDelegate
org.quartz.jobStore.clusterCheckinInterval: 5000
# Mybatis-plus configuration, you don't need to change it
mybatis-plus:
mapper-locations: classpath:org/apache/dolphinscheduler/dao/mapper/*Mapper.xml
type-aliases-package: org.apache.dolphinscheduler.dao.entity
configuration:
cache-enabled: false
call-setters-on-nulls: true
map-underscore-to-camel-case: true
jdbc-type-for-null: NULL
global-config:
db-config:
id-type: auto
banner: false
registry:
type: zookeeper
zookeeper:
namespace: dolphinscheduler
connect-string: localhost:2181
retry-policy:
base-sleep-time: 60ms
max-sleep: 300ms
max-retries: 5
session-timeout: 30s
connection-timeout: 9s
block-until-connected: 600ms
digest: ~
master:
listen-port: 5678
# master fetch command num
fetch-command-num: 10
# master prepare execute thread number to limit handle commands in parallel
pre-exec-threads: 10
# master execute thread number to limit process instances in parallel
exec-threads: 100
# master dispatch task number per batch, if all the tasks dispatch failed in a batch, will sleep 1s.
dispatch-task-number: 30
# master host selector to select a suitable worker, default value: LowerWeight. Optional values include random, round_robin, lower_weight
host-selector: lower_weight
# master heartbeat interval
max-heartbeat-interval: 10s
# master commit task retry times
task-commit-retry-times: 5
# master commit task interval
task-commit-interval: 1s
state-wheel-interval: 5s
server-load-protection:
# If set true, will open master overload protection
enabled: true
# Master max system cpu usage, when the master's system cpu usage is smaller than this value, master server can execute workflow.
max-system-cpu-usage-percentage-thresholds: 0.77
# Master max jvm cpu usage, when the master's jvm cpu usage is smaller than this value, master server can execute workflow.
max-jvm-cpu-usage-percentage-thresholds: 0.77
# Master max system memory usage, when the master's system memory usage is smaller than this value, master server can execute workflow.
max-system-memory-usage-percentage-thresholds: 0.77
# Master max disk usage, when the master's disk usage is smaller than this value, master server can execute workflow.
max-disk-usage-percentage-thresholds: 0.77
# failover interval, the unit is minute
failover-interval: 10m
# kill yarn / k8s application when failover taskInstance, default true
kill-application-when-task-failover: true
registry-disconnect-strategy:
# The disconnect strategy: stop, waiting
strategy: waiting
# The max waiting time to reconnect to registry if you set the strategy to waiting
max-waiting-time: 100s
worker-group-refresh-interval: 10s
server:
port: 5679
management:
endpoints:
web:
exposure:
include: health,metrics,prometheus
endpoint:
health:
enabled: true
show-details: always
health:
db:
enabled: true
defaults:
enabled: false
metrics:
tags:
application: ${spring.application.name}
metrics:
enabled: true

6
dolphinscheduler-master/src/test/resources/logback.xml

@ -66,12 +66,6 @@
</appender>
<root level="INFO">
<if condition="${DOCKER:-false}">
<then>
<appender-ref ref="STDOUT"/>
</then>
</if>
<appender-ref ref="TASKLOGFILE"/>
<appender-ref ref="MASTERLOGFILE"/>
</root>
</configuration>

67
dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/BaseServerLoadProtection.java

@ -0,0 +1,67 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.meter.metrics;
import lombok.Data;
import lombok.extern.slf4j.Slf4j;
/**
 * Threshold-based {@link ServerLoadProtection}.
 *
 * <p>Each threshold is compared directly against the matching value read from
 * {@link SystemMetrics}; the server is reported as overloaded as soon as one
 * value is strictly greater than its threshold. Every threshold defaults to
 * {@code 0.7} and protection is enabled by default.
 */
@Slf4j
@Data
public class BaseServerLoadProtection implements ServerLoadProtection {

    /** When {@code false}, {@link #isOverload(SystemMetrics)} always returns {@code false}. */
    protected boolean enabled = true;

    /** Upper bound for {@code SystemMetrics#getSystemCpuUsagePercentage()}. */
    protected double maxSystemCpuUsagePercentageThresholds = 0.7;

    /** Upper bound for {@code SystemMetrics#getJvmCpuUsagePercentage()}. */
    protected double maxJvmCpuUsagePercentageThresholds = 0.7;

    /** Upper bound for {@code SystemMetrics#getSystemMemoryUsedPercentage()}. */
    protected double maxSystemMemoryUsagePercentageThresholds = 0.7;

    /** Upper bound for {@code SystemMetrics#getDiskUsedPercentage()}. */
    protected double maxDiskUsagePercentageThresholds = 0.7;

    /**
     * Checks the metrics against the configured thresholds in a fixed order:
     * system CPU, JVM CPU, disk, then system memory. Evaluation stops at the
     * first exceeded threshold, which is logged at INFO level.
     *
     * @param systemMetrics the metrics to evaluate
     * @return {@code true} if protection is enabled and at least one metric is
     *         strictly greater than its threshold; {@code false} otherwise
     */
    @Override
    public boolean isOverload(SystemMetrics systemMetrics) {
        // Short-circuit evaluation keeps both the check order and the lazy metric reads.
        return enabled
                && (exceeds(
                        "OverLoad: the system cpu usage: {} is over then the maxSystemCpuUsagePercentageThresholds {}",
                        systemMetrics.getSystemCpuUsagePercentage(),
                        maxSystemCpuUsagePercentageThresholds)
                        || exceeds(
                                "OverLoad: the jvm cpu usage: {} is over then the maxJvmCpuUsagePercentageThresholds {}",
                                systemMetrics.getJvmCpuUsagePercentage(),
                                maxJvmCpuUsagePercentageThresholds)
                        || exceeds(
                                "OverLoad: the DiskUsedPercentage: {} is over then the maxDiskUsagePercentageThresholds {}",
                                systemMetrics.getDiskUsedPercentage(),
                                maxDiskUsagePercentageThresholds)
                        || exceeds(
                                "OverLoad: the SystemMemoryUsedPercentage: {} is over then the maxSystemMemoryUsagePercentageThresholds {}",
                                systemMetrics.getSystemMemoryUsedPercentage(),
                                maxSystemMemoryUsagePercentageThresholds));
    }

    /**
     * Logs {@code logTemplate} with both values and returns {@code true} when
     * {@code actual} is strictly greater than {@code limit}.
     */
    private static boolean exceeds(String logTemplate, double actual, double limit) {
        if (actual > limit) {
            log.info(logTemplate, actual, limit);
            return true;
        }
        return false;
    }
}

11
dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/DefaultMetricsProvider.java

@ -19,7 +19,6 @@ package org.apache.dolphinscheduler.meter.metrics;
import org.apache.dolphinscheduler.common.utils.OSUtils;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;
import io.micrometer.core.instrument.MeterRegistry;
@ -27,8 +26,11 @@ import io.micrometer.core.instrument.MeterRegistry;
@Component
public class DefaultMetricsProvider implements MetricsProvider {
@Autowired
private MeterRegistry meterRegistry;
private final MeterRegistry meterRegistry;
public DefaultMetricsProvider(MeterRegistry meterRegistry) {
this.meterRegistry = meterRegistry;
}
private SystemMetrics systemMetrics;
@ -53,8 +55,7 @@ public class DefaultMetricsProvider implements MetricsProvider {
systemMetrics = SystemMetrics.builder()
.systemCpuUsagePercentage(systemCpuUsage)
.processCpuUsagePercentage(processCpuUsage)
.totalCpuUsedPercentage(systemCpuUsage + processCpuUsage)
.jvmCpuUsagePercentage(processCpuUsage)
.jvmMemoryUsed(jvmMemoryUsed)
.jvmMemoryMax(jvmMemoryMax)
.jvmMemoryUsedPercentage(jvmMemoryUsed / jvmMemoryMax)

24
dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/ServerLoadProtection.java

@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.dolphinscheduler.meter.metrics;
/**
 * Contract for deciding, from a set of system metrics, whether a server
 * should be treated as overloaded.
 */
public interface ServerLoadProtection {
    /**
     * Evaluates the supplied metrics.
     *
     * @param systemMetrics the metrics to evaluate
     * @return {@code true} if the server is considered overloaded, {@code false} otherwise
     */
    boolean isOverload(SystemMetrics systemMetrics);
}

3
dolphinscheduler-meter/src/main/java/org/apache/dolphinscheduler/meter/metrics/SystemMetrics.java

@ -30,8 +30,7 @@ public class SystemMetrics {
// CPU
private double systemCpuUsagePercentage;
private double processCpuUsagePercentage;
private double totalCpuUsedPercentage;
private double jvmCpuUsagePercentage;
// JVM-Memory
// todo: get pod memory usage

16
dolphinscheduler-standalone-server/src/main/resources/application.yaml

@ -190,10 +190,10 @@ master:
state-wheel-interval: 5s
server-load-protection:
enabled: true
# Master max cpu usage, when the master's cpu usage is smaller then this value, master server can execute workflow.
max-cpu-usage-percentage-thresholds: 0.9
# Master max JVM memory usage , when the master's jvm memory usage is smaller then this value, master server can execute workflow.
max-jvm-memory-usage-percentage-thresholds: 0.9
# Master max system cpu usage, when the master's system cpu usage is smaller than this value, the master server can execute workflows.
max-system-cpu-usage-percentage-thresholds: 0.9
# Master max jvm cpu usage, when the master's jvm cpu usage is smaller than this value, the master server can execute workflows.
max-jvm-cpu-usage-percentage-thresholds: 0.9
# Master max system memory usage, when the master's system memory usage is smaller than this value, the master server can execute workflows.
max-system-memory-usage-percentage-thresholds: 0.9
# Master max disk usage, when the master's disk usage is smaller than this value, the master server can execute workflows.
@ -215,10 +215,10 @@ worker:
host-weight: 100
server-load-protection:
enabled: true
# Worker max cpu usage, when the worker's cpu usage is smaller then this value, worker server can be dispatched tasks.
max-cpu-usage-percentage-thresholds: 0.9
# Worker max JVM memory usage , when the worker's jvm memory usage is smaller then this value, worker server can be dispatched tasks.
max-jvm-memory-usage-percentage-thresholds: 0.9
# Worker max system cpu usage, when the worker's system cpu usage is smaller than this value, the worker server can be dispatched tasks.
max-system-cpu-usage-percentage-thresholds: 0.9
# Worker max jvm cpu usage, when the worker's jvm cpu usage is smaller than this value, the worker server can be dispatched tasks.
max-jvm-cpu-usage-percentage-thresholds: 0.9
# Worker max system memory usage, when the worker's system memory usage is smaller than this value, the worker server can be dispatched tasks.
max-system-memory-usage-percentage-thresholds: 0.9
# Worker max disk usage, when the worker's disk usage is smaller than this value, the worker server can be dispatched tasks.

2
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/WorkerServer.java

@ -94,7 +94,7 @@ public class WorkerServer implements IStoppable {
WorkerServerMetrics.registerWorkerCpuUsageGauge(() -> {
SystemMetrics systemMetrics = metricsProvider.getSystemMetrics();
return systemMetrics.getTotalCpuUsedPercentage();
return systemMetrics.getSystemCpuUsagePercentage();
});
WorkerServerMetrics.registerWorkerMemoryAvailableGauge(() -> {
SystemMetrics systemMetrics = metricsProvider.getSystemMetrics();

50
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtection.java

@ -17,57 +17,11 @@
package org.apache.dolphinscheduler.server.worker.config;
import org.apache.dolphinscheduler.meter.metrics.SystemMetrics;
import org.apache.dolphinscheduler.meter.metrics.BaseServerLoadProtection;
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import lombok.extern.slf4j.Slf4j;
@Data
@Slf4j
@NoArgsConstructor
@AllArgsConstructor
public class WorkerServerLoadProtection {
private boolean enabled = true;
private double maxCpuUsagePercentageThresholds = 0.7;
private double maxJVMMemoryUsagePercentageThresholds = 0.7;
private double maxSystemMemoryUsagePercentageThresholds = 0.7;
private double maxDiskUsagePercentageThresholds = 0.7;
public boolean isOverload(SystemMetrics systemMetrics) {
if (!enabled) {
return false;
}
if (systemMetrics.getTotalCpuUsedPercentage() > maxCpuUsagePercentageThresholds) {
log.info(
"Worker OverLoad: the TotalCpuUsedPercentage: {} is over then the MaxCpuUsagePercentageThresholds {}",
systemMetrics.getTotalCpuUsedPercentage(), maxCpuUsagePercentageThresholds);
return true;
}
if (systemMetrics.getJvmMemoryUsedPercentage() > maxJVMMemoryUsagePercentageThresholds) {
log.info(
"Worker OverLoad: the JvmMemoryUsedPercentage: {} is over then the maxCpuUsagePercentageThresholds {}",
systemMetrics.getJvmMemoryUsedPercentage(), maxJVMMemoryUsagePercentageThresholds);
return true;
}
if (systemMetrics.getDiskUsedPercentage() > maxDiskUsagePercentageThresholds) {
log.info("Worker OverLoad: the DiskUsedPercentage: {} is over then the MaxCpuUsagePercentageThresholds {}",
systemMetrics.getDiskUsedPercentage(), maxDiskUsagePercentageThresholds);
return true;
}
if (systemMetrics.getSystemMemoryUsedPercentage() > maxSystemMemoryUsagePercentageThresholds) {
log.info(
"Worker OverLoad: the SystemMemoryUsedPercentage: {} is over then the MaxSystemMemoryUsagePercentageThresholds {}",
systemMetrics.getSystemMemoryUsedPercentage(), maxSystemMemoryUsagePercentageThresholds);
return true;
}
return false;
}
public class WorkerServerLoadProtection extends BaseServerLoadProtection {
}

3
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/task/WorkerHeartBeatTask.java

@ -65,7 +65,8 @@ public class WorkerHeartBeatTask extends BaseHeartBeatTask<WorkerHeartBeat> {
return WorkerHeartBeat.builder()
.startupTime(ServerLifeCycleManager.getServerStartupTime())
.reportTime(System.currentTimeMillis())
.cpuUsage(systemMetrics.getTotalCpuUsedPercentage())
.jvmCpuUsage(systemMetrics.getJvmCpuUsagePercentage())
.cpuUsage(systemMetrics.getSystemCpuUsagePercentage())
.jvmMemoryUsage(systemMetrics.getJvmMemoryUsedPercentage())
.memoryUsage(systemMetrics.getSystemMemoryUsedPercentage())
.diskUsage(systemMetrics.getDiskUsedPercentage())

8
dolphinscheduler-worker/src/main/resources/application.yaml

@ -50,10 +50,10 @@ worker:
server-load-protection:
# If set to true, worker overload protection is enabled
enabled: true
# Worker max cpu usage, when the worker's cpu usage is smaller then this value, worker server can be dispatched tasks.
max-cpu-usage-percentage-thresholds: 0.7
# Worker max jvm memory usage , when the worker's jvm memory usage is smaller then this value, worker server can be dispatched tasks.
max-jvm-memory-usage-percentage-thresholds: 0.7
# Worker max system cpu usage, when the worker's system cpu usage is smaller than this value, the worker server can be dispatched tasks.
max-system-cpu-usage-percentage-thresholds: 0.7
# Worker max jvm cpu usage, when the worker's jvm cpu usage is smaller than this value, the worker server can be dispatched tasks.
max-jvm-cpu-usage-percentage-thresholds: 0.7
# Worker max system memory usage, when the worker's system memory usage is smaller than this value, the worker server can be dispatched tasks.
max-system-memory-usage-percentage-thresholds: 0.7
# Worker max disk usage, when the worker's disk usage is smaller than this value, the worker server can be dispatched tasks.

3
dolphinscheduler-worker/src/test/java/org/apache/dolphinscheduler/server/worker/config/WorkerServerLoadProtectionTest.java

@ -30,7 +30,8 @@ class WorkerServerLoadProtectionTest {
SystemMetrics systemMetrics = SystemMetrics.builder()
.jvmMemoryUsedPercentage(0.71)
.systemMemoryUsedPercentage(0.71)
.totalCpuUsedPercentage(0.71)
.systemCpuUsagePercentage(0.71)
.jvmCpuUsagePercentage(0.71)
.diskUsedPercentage(0.71)
.build();
workerServerLoadProtection.setEnabled(false);

Loading…
Cancel
Save