Browse Source

fix: data quality may fail in docker mode (#15563)

dev_wenjun_refactorMaster
Jay Chung 10 months ago committed by GitHub
parent
commit
91d56f4860
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
  1. 2
      deploy/kubernetes/dolphinscheduler/README.md
  2. 2
      deploy/kubernetes/dolphinscheduler/values.yaml
  3. 2
      docs/docs/en/architecture/configuration.md
  4. 2
      docs/docs/en/guide/data-quality.md
  5. 7
      docs/docs/en/guide/resource/configuration.md
  6. 1
      docs/docs/en/guide/upgrade/incompatible.md
  7. 2
      docs/docs/zh/architecture/configuration.md
  8. 2
      docs/docs/zh/guide/data-quality.md
  9. 7
      docs/docs/zh/guide/resource/configuration.md
  10. 7
      dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties
  11. 7
      dolphinscheduler-common/src/main/resources/common.properties
  12. 7
      dolphinscheduler-common/src/test/resources/common.properties
  13. 29
      dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java
  14. 7
      dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties
  15. 1
      dolphinscheduler-standalone-server/src/main/docker/Dockerfile
  16. 4
      dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java
  17. 7
      dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties

2
deploy/kubernetes/dolphinscheduler/README.md

@ -121,7 +121,7 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst
| conf.common."alert.rpc.port" | int | `50052` | rpc port | | conf.common."alert.rpc.port" | int | `50052` | rpc port |
| conf.common."appId.collect" | string | `"log"` | way to collect applicationId: log, aop | | conf.common."appId.collect" | string | `"log"` | way to collect applicationId: log, aop |
| conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh | | conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh |
| conf.common."data-quality.jar.name" | string | `"dolphinscheduler-data-quality-dev-SNAPSHOT.jar"` | data quality option | | conf.common."data-quality.jar.dir" | string | `nil` | data quality option |
| conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local directory path, please make sure the directory exists and have read write permissions | | conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local directory path, please make sure the directory exists and have read write permissions |
| conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable | | conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable |
| conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt | | conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt |

2
deploy/kubernetes/dolphinscheduler/values.yaml

@ -328,7 +328,7 @@ conf:
datasource.encryption.salt: '!@#$%^&*' datasource.encryption.salt: '!@#$%^&*'
# -- data quality option # -- data quality option
data-quality.jar.name: dolphinscheduler-data-quality-dev-SNAPSHOT.jar data-quality.jar.dir:
# -- Whether hive SQL is executed in the same session # -- Whether hive SQL is executed in the same session
support.hive.oneSession: false support.hive.oneSession: false

2
docs/docs/en/architecture/configuration.md

@ -226,7 +226,7 @@ The default configuration is as follows:
| yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | job history status url of yarn | | yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | job history status url of yarn |
| datasource.encryption.enable | false | whether to enable datasource encryption | | datasource.encryption.enable | false | whether to enable datasource encryption |
| datasource.encryption.salt | !@#$%^&* | the salt of the datasource encryption | | datasource.encryption.salt | !@#$%^&* | the salt of the datasource encryption |
| data-quality.jar.name | dolphinscheduler-data-quality-dev-SNAPSHOT.jar | the jar of data quality | | data-quality.jar.dir | | the jar of data quality |
| support.hive.oneSession | false | specify whether hive SQL is executed in the same session | | support.hive.oneSession | false | specify whether hive SQL is executed in the same session |
| sudo.enable | true | whether to enable sudo | | sudo.enable | true | whether to enable sudo |
| alert.rpc.port | 50052 | the RPC port of Alert Server | | alert.rpc.port | 50052 | the RPC port of Alert Server |

2
docs/docs/en/guide/data-quality.md

@ -12,7 +12,7 @@ The execution logic of the data quality task is as follows:
- The current data quality task result is stored in the `t_ds_dq_execute_result` table of `dolphinscheduler` - The current data quality task result is stored in the `t_ds_dq_execute_result` table of `dolphinscheduler`
`Worker` sends the task result to `Master`, after `Master` receives `TaskResponse`, it will judge whether the task type is `DataQualityTask`, if so, it will read the corresponding result from `t_ds_dq_execute_result` according to `taskInstanceId`, and then The result is judged according to the check mode, operator and threshold configured by the user. `Worker` sends the task result to `Master`, after `Master` receives `TaskResponse`, it will judge whether the task type is `DataQualityTask`, if so, it will read the corresponding result from `t_ds_dq_execute_result` according to `taskInstanceId`, and then The result is judged according to the check mode, operator and threshold configured by the user.
- If the result is a failure, the corresponding operation, alarm or interruption will be performed according to the failure policy configured by the user. - If the result is a failure, the corresponding operation, alarm or interruption will be performed according to the failure policy configured by the user.
- If you package `data-quality` separately, remember to modify the package name to be consistent with `data-quality.jar.name` in `common.properties` with attribute name `data-quality.jar.name` - If you package `data-quality` separately, remember to modify the package name to be consistent with `data-quality.jar.dir` in `common.properties` with attribute name `data-quality.jar.dir`
- If the old version is upgraded and used, you need to execute the `sql` update script to initialize the database before running. - If the old version is upgraded and used, you need to execute the `sql` update script to initialize the database before running.
- `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` was built with no dependencies. If a `JDBC` driver is required, you can set the `--jars` parameter in the `node settings` `Option Parameters`, e.g. `--jars /lib/jars/mysql-connector-java-8.0.16.jar`. - `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` was built with no dependencies. If a `JDBC` driver is required, you can set the `--jars` parameter in the `node settings` `Option Parameters`, e.g. `--jars /lib/jars/mysql-connector-java-8.0.16.jar`.
- Currently only `MySQL`, `PostgreSQL` and `HIVE` data sources have been tested, other data sources have not been tested yet. - Currently only `MySQL`, `PostgreSQL` and `HIVE` data sources have been tested, other data sources have not been tested yet.

7
docs/docs/en/guide/resource/configuration.md

@ -152,9 +152,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

1
docs/docs/en/guide/upgrade/incompatible.md

@ -11,6 +11,7 @@ This document records the incompatible updates between each version. You need to
* Change the default unix shell executor from sh to bash ([#12180](https://github.com/apache/dolphinscheduler/pull/12180)). * Change the default unix shell executor from sh to bash ([#12180](https://github.com/apache/dolphinscheduler/pull/12180)).
* Remove `deleteSource` in `download()` of `StorageOperate` ([#14084](https://github.com/apache/dolphinscheduler/pull/14084)) * Remove `deleteSource` in `download()` of `StorageOperate` ([#14084](https://github.com/apache/dolphinscheduler/pull/14084))
* Remove default key for attribute `data-quality.jar.name` in `common.properties` ([#15551](https://github.com/apache/dolphinscheduler/pull/15551)) * Remove default key for attribute `data-quality.jar.name` in `common.properties` ([#15551](https://github.com/apache/dolphinscheduler/pull/15551))
* Rename attribute `data-quality.jar.name` to `data-quality.jar.dir` in `common.properties` and represent for directory ([#15563](https://github.com/apache/dolphinscheduler/pull/15563))
## 3.2.0 ## 3.2.0

2
docs/docs/zh/architecture/configuration.md

@ -226,7 +226,7 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
| yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | yarn的作业历史状态URL | | yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | yarn的作业历史状态URL |
| datasource.encryption.enable | false | 是否启用datasource 加密 | | datasource.encryption.enable | false | 是否启用datasource 加密 |
| datasource.encryption.salt | !@#$%^&* | datasource加密使用的salt | | datasource.encryption.salt | !@#$%^&* | datasource加密使用的salt |
| data-quality.jar.name | dolphinscheduler-data-quality-dev-SNAPSHOT.jar | 配置数据质量使用的jar包 | | data-quality.jar.dir | | 配置数据质量使用的jar包 |
| support.hive.oneSession | false | 设置hive SQL是否在同一个session中执行 | | support.hive.oneSession | false | 设置hive SQL是否在同一个session中执行 |
| sudo.enable | true | 是否开启sudo | | sudo.enable | true | 是否开启sudo |
| alert.rpc.port | 50052 | Alert Server的RPC端口 | | alert.rpc.port | 50052 | Alert Server的RPC端口 |

2
docs/docs/zh/guide/data-quality.md

@ -13,7 +13,7 @@
> >
## 注意事项 ## 注意事项
- 如果单独打包`data-quality`的话,记得修改包名和`data-quality.jar.name`一致,配置内容在 `common.properties` 中的 `data-quality.jar.name` - 如果单独打包`data-quality`的话,记得修改包路径和`data-quality.jar.dir`一致,配置内容在 `common.properties` 中的 `data-quality.jar.dir`
- 如果是老版本升级使用,运行之前需要先执行`SQL`更新脚本进行数据库初始化。 - 如果是老版本升级使用,运行之前需要先执行`SQL`更新脚本进行数据库初始化。
- 当前 `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` 是瘦包,不包含任何 `JDBC` 驱动。 - 当前 `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` 是瘦包,不包含任何 `JDBC` 驱动。
如果有 `JDBC` 驱动需要,可以在`节点设置` `选项参数`处设置 `--jars` 参数, 如果有 `JDBC` 驱动需要,可以在`节点设置` `选项参数`处设置 `--jars` 参数,

7
docs/docs/zh/guide/resource/configuration.md

@ -156,9 +156,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

7
dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties

@ -84,9 +84,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

7
dolphinscheduler-common/src/main/resources/common.properties

@ -120,9 +120,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

7
dolphinscheduler-common/src/test/resources/common.properties

@ -115,9 +115,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

29
dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java

@ -18,7 +18,7 @@
package org.apache.dolphinscheduler.plugin.datasource.api.utils; package org.apache.dolphinscheduler.plugin.datasource.api.utils;
import static org.apache.dolphinscheduler.common.constants.Constants.RESOURCE_STORAGE_TYPE; import static org.apache.dolphinscheduler.common.constants.Constants.RESOURCE_STORAGE_TYPE;
import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.DATA_QUALITY_JAR_NAME; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.DATA_QUALITY_JAR_DIR;
import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION;
import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION_STARTUP_STATE; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION_STARTUP_STATE;
import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.JAVA_SECURITY_KRB5_CONF; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.JAVA_SECURITY_KRB5_CONF;
@ -133,14 +133,28 @@ public class CommonUtils {
} }
public static String getDataQualityJarPath() { public static String getDataQualityJarPath() {
String dqsJarPath = PropertyUtils.getString(DATA_QUALITY_JAR_NAME); log.info("Trying to get data quality jar in path");
String dqJarDir = PropertyUtils.getString(DATA_QUALITY_JAR_DIR);
if (StringUtils.isNotEmpty(dqJarDir)) {
log.info(
"Configuration data-quality.jar.dir is not empty, will try to get data quality jar from directory {}",
dqJarDir);
getDataQualityJarPathFromPath(dqJarDir).ifPresent(jarName -> DEFAULT_DATA_QUALITY_JAR_PATH = jarName);
}
if (StringUtils.isEmpty(DEFAULT_DATA_QUALITY_JAR_PATH)) {
log.info("data quality jar path is empty, will try to auto discover it from build-in rules.");
getDefaultDataQualityJarPath();
}
if (StringUtils.isEmpty(dqsJarPath)) { if (StringUtils.isEmpty(DEFAULT_DATA_QUALITY_JAR_PATH)) {
log.info("data quality jar path is empty, will try to get it from data quality jar name"); log.error(
return getDefaultDataQualityJarPath(); "Can not find data quality jar in both configuration and auto discover, please check your configuration or report a bug.");
throw new RuntimeException("data quality jar path is empty");
} }
return dqsJarPath; return DEFAULT_DATA_QUALITY_JAR_PATH;
} }
private static String getDefaultDataQualityJarPath() { private static String getDefaultDataQualityJarPath() {
@ -173,7 +187,7 @@ public class CommonUtils {
log.info("Try to get data quality jar from path {}", path); log.info("Try to get data quality jar from path {}", path);
File[] jars = new File(path).listFiles(); File[] jars = new File(path).listFiles();
if (jars == null) { if (jars == null) {
log.warn("No data quality related jar found from path {}", path); log.warn("No any files find given path {}", path);
return Optional.empty(); return Optional.empty();
} }
for (File jar : jars) { for (File jar : jars) {
@ -181,6 +195,7 @@ public class CommonUtils {
return Optional.of(jar.getAbsolutePath()); return Optional.of(jar.getAbsolutePath());
} }
} }
log.warn("No data quality related jar found from path {}", path);
return Optional.empty(); return Optional.empty();
} }

7
dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties

@ -95,9 +95,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality option, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

1
dolphinscheduler-standalone-server/src/main/docker/Dockerfile

@ -20,6 +20,7 @@ FROM eclipse-temurin:8-jdk
ENV DOCKER true ENV DOCKER true
ENV TZ Asia/Shanghai ENV TZ Asia/Shanghai
ENV DOLPHINSCHEDULER_HOME /opt/dolphinscheduler ENV DOLPHINSCHEDULER_HOME /opt/dolphinscheduler
ENV DATA_QUALITY_JAR_DIR /opt/dolphinscheduler/libs/worker-server
RUN apt update ; \ RUN apt update ; \
apt install -y sudo ; \ apt install -y sudo ; \

4
dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java

@ -358,9 +358,9 @@ public class TaskConstants {
public static final String RESOURCE_UPLOAD_PATH = "resource.storage.upload.base.path"; public static final String RESOURCE_UPLOAD_PATH = "resource.storage.upload.base.path";
/** /**
* data.quality.jar.name * data.quality.jar.dir
*/ */
public static final String DATA_QUALITY_JAR_NAME = "data-quality.jar.name"; public static final String DATA_QUALITY_JAR_DIR = "data-quality.jar.dir";
public static final String TASK_TYPE_CONDITIONS = "CONDITIONS"; public static final String TASK_TYPE_CONDITIONS = "CONDITIONS";

7
dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties

@ -84,9 +84,10 @@ datasource.encryption.enable=false
# datasource encryption salt # datasource encryption salt
datasource.encryption.salt=!@#$%^&* datasource.encryption.salt=!@#$%^&*
# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory # data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in
# if you re-build it alone, or auto discovery mechanism fail # data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server
data-quality.jar.name= # libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`).
data-quality.jar.dir=
#data-quality.error.output.path=/tmp/data-quality-error-data #data-quality.error.output.path=/tmp/data-quality-error-data

Loading…
Cancel
Save