From 91d56f48601279eeb7de1a057e5c4425f86f2198 Mon Sep 17 00:00:00 2001 From: Jay Chung Date: Mon, 5 Feb 2024 17:14:59 +0800 Subject: [PATCH] fix: data quality may fail in docker mode (#15563) --- deploy/kubernetes/dolphinscheduler/README.md | 2 +- .../kubernetes/dolphinscheduler/values.yaml | 2 +- docs/docs/en/architecture/configuration.md | 2 +- docs/docs/en/guide/data-quality.md | 2 +- docs/docs/en/guide/resource/configuration.md | 7 +++-- docs/docs/en/guide/upgrade/incompatible.md | 1 + docs/docs/zh/architecture/configuration.md | 2 +- docs/docs/zh/guide/data-quality.md | 2 +- docs/docs/zh/guide/resource/configuration.md | 7 +++-- .../docker/file-manage/common.properties | 7 +++-- .../src/main/resources/common.properties | 7 +++-- .../src/test/resources/common.properties | 7 +++-- .../datasource/api/utils/CommonUtils.java | 29 ++++++++++++++----- .../docker/file-manage/common.properties | 7 +++-- .../src/main/docker/Dockerfile | 1 + .../plugin/task/api/TaskConstants.java | 4 +-- .../src/test/resources/common.properties | 7 +++-- 17 files changed, 60 insertions(+), 36 deletions(-) diff --git a/deploy/kubernetes/dolphinscheduler/README.md b/deploy/kubernetes/dolphinscheduler/README.md index 6c04c4085d..5659605b95 100644 --- a/deploy/kubernetes/dolphinscheduler/README.md +++ b/deploy/kubernetes/dolphinscheduler/README.md @@ -121,7 +121,7 @@ Please refer to the [Quick Start in Kubernetes](../../../docs/docs/en/guide/inst | conf.common."alert.rpc.port" | int | `50052` | rpc port | | conf.common."appId.collect" | string | `"log"` | way to collect applicationId: log, aop | | conf.common."conda.path" | string | `"/opt/anaconda3/etc/profile.d/conda.sh"` | set path of conda.sh | -| conf.common."data-quality.jar.name" | string | `"dolphinscheduler-data-quality-dev-SNAPSHOT.jar"` | data quality option | +| conf.common."data-quality.jar.dir" | string | `nil` | data quality option | | conf.common."data.basedir.path" | string | `"/tmp/dolphinscheduler"` | user data local 
directory path, please make sure the directory exists and have read write permissions | | conf.common."datasource.encryption.enable" | bool | `false` | datasource encryption enable | | conf.common."datasource.encryption.salt" | string | `"!@#$%^&*"` | datasource encryption salt | diff --git a/deploy/kubernetes/dolphinscheduler/values.yaml b/deploy/kubernetes/dolphinscheduler/values.yaml index 6effdf15ac..a8d9a34875 100644 --- a/deploy/kubernetes/dolphinscheduler/values.yaml +++ b/deploy/kubernetes/dolphinscheduler/values.yaml @@ -328,7 +328,7 @@ conf: datasource.encryption.salt: '!@#$%^&*' # -- data quality option - data-quality.jar.name: dolphinscheduler-data-quality-dev-SNAPSHOT.jar + data-quality.jar.dir: # -- Whether hive SQL is executed in the same session support.hive.oneSession: false diff --git a/docs/docs/en/architecture/configuration.md b/docs/docs/en/architecture/configuration.md index f4ab1435d5..b9a26b865c 100644 --- a/docs/docs/en/architecture/configuration.md +++ b/docs/docs/en/architecture/configuration.md @@ -226,7 +226,7 @@ The default configuration is as follows: | yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | job history status url of yarn | | datasource.encryption.enable | false | whether to enable datasource encryption | | datasource.encryption.salt | !@#$%^&* | the salt of the datasource encryption | -| data-quality.jar.name | dolphinscheduler-data-quality-dev-SNAPSHOT.jar | the jar of data quality | +| data-quality.jar.dir | | the directory containing the data quality jar (empty means auto discovery) | | support.hive.oneSession | false | specify whether hive SQL is executed in the same session | | sudo.enable | true | whether to enable sudo | | alert.rpc.port | 50052 | the RPC port of Alert Server | diff --git a/docs/docs/en/guide/data-quality.md b/docs/docs/en/guide/data-quality.md index f6aa7a06b2..dca777d76f 100644 --- a/docs/docs/en/guide/data-quality.md +++ b/docs/docs/en/guide/data-quality.md @@ -12,7 +12,7 @@ The execution logic of the data 
quality task is as follows: - The current data quality task result is stored in the `t_ds_dq_execute_result` table of `dolphinscheduler` `Worker` sends the task result to `Master`, after `Master` receives `TaskResponse`, it will judge whether the task type is `DataQualityTask`, if so, it will read the corresponding result from `t_ds_dq_execute_result` according to `taskInstanceId`, and then The result is judged according to the check mode, operator and threshold configured by the user. - If the result is a failure, the corresponding operation, alarm or interruption will be performed according to the failure policy configured by the user. -- If you package `data-quality` separately, remember to modify the package name to be consistent with `data-quality.jar.name` in `common.properties` with attribute name `data-quality.jar.name` +- If you package `data-quality` separately, remember to put the jar into the directory configured by the attribute `data-quality.jar.dir` in `common.properties`, and make sure the jar name starts with `dolphinscheduler-data-quality` - If the old version is upgraded and used, you need to execute the `sql` update script to initialize the database before running. - `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` was built with no dependencies. If a `JDBC` driver is required, you can set the `-jars` parameter in the `node settings` `Option Parameters`, e.g. `--jars /lib/jars/mysql-connector-java-8.0.16.jar`. - Currently only `MySQL`, `PostgreSQL` and `HIVE` data sources have been tested, other data sources have not been tested yet. diff --git a/docs/docs/en/guide/resource/configuration.md b/docs/docs/en/guide/resource/configuration.md index 42e1925e89..67c68a22c2 100644 --- a/docs/docs/en/guide/resource/configuration.md +++ b/docs/docs/en/guide/resource/configuration.md @@ -152,9 +152,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. 
You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). +data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/docs/docs/en/guide/upgrade/incompatible.md b/docs/docs/en/guide/upgrade/incompatible.md index 4580a7d13d..f45af712c3 100644 --- a/docs/docs/en/guide/upgrade/incompatible.md +++ b/docs/docs/en/guide/upgrade/incompatible.md @@ -11,6 +11,7 @@ This document records the incompatible updates between each version. You need to * Change the default unix shell executor from sh to bash ([#12180](https://github.com/apache/dolphinscheduler/pull/12180)). 
* Remove `deleteSource` in `download()` of `StorageOperate` ([#14084](https://github.com/apache/dolphinscheduler/pull/14084)) * Remove default key for attribute `data-quality.jar.name` in `common.properties` ([#15551](https://github.com/apache/dolphinscheduler/pull/15551)) +* Rename attribute `data-quality.jar.name` to `data-quality.jar.dir` in `common.properties`; it now specifies the directory containing the data quality jar instead of the jar name ([#15563](https://github.com/apache/dolphinscheduler/pull/15563)) ## 3.2.0 diff --git a/docs/docs/zh/architecture/configuration.md b/docs/docs/zh/architecture/configuration.md index 65113b76a1..0b3ea9bc5b 100644 --- a/docs/docs/zh/architecture/configuration.md +++ b/docs/docs/zh/architecture/configuration.md @@ -226,7 +226,7 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId | yarn.job.history.status.address | http://ds1:19888/ws/v1/history/mapreduce/jobs/%s | yarn的作业历史状态URL | | datasource.encryption.enable | false | 是否启用datasource 加密 | | datasource.encryption.salt | !@#$%^&* | datasource加密使用的salt | -| data-quality.jar.name | dolphinscheduler-data-quality-dev-SNAPSHOT.jar | 配置数据质量使用的jar包 | +| data-quality.jar.dir | | 配置数据质量使用的jar包所在目录 | | support.hive.oneSession | false | 设置hive SQL是否在同一个session中执行 | | sudo.enable | true | 是否开启sudo | | alert.rpc.port | 50052 | Alert Server的RPC端口 | diff --git a/docs/docs/zh/guide/data-quality.md b/docs/docs/zh/guide/data-quality.md index 2a098a3216..17b2a55cb2 100644 --- a/docs/docs/zh/guide/data-quality.md +++ b/docs/docs/zh/guide/data-quality.md @@ -13,7 +13,7 @@ > ## 注意事项 -- 如果单独打包`data-quality`的话,记得修改包名和`data-quality.jar.name`一致,配置内容在 `common.properties` 中的 `data-quality.jar.name` +- 如果单独打包`data-quality`的话,记得修改包路径和`data-quality.jar.dir`一致,配置内容在 `common.properties` 中的 `data-quality.jar.dir` - 如果是老版本升级使用,运行之前需要先执行`SQL`更新脚本进行数据库初始化。 - 当前 `dolphinscheduler-data-quality-dev-SNAPSHOT.jar` 是瘦包,不包含任何 `JDBC` 驱动。 如果有 `JDBC` 驱动需要,可以在`节点设置` `选项参数`处设置 `--jars` 参数, diff --git a/docs/docs/zh/guide/resource/configuration.md 
b/docs/docs/zh/guide/resource/configuration.md index 57d0935e09..739d6fb30c 100644 --- a/docs/docs/zh/guide/resource/configuration.md +++ b/docs/docs/zh/guide/resource/configuration.md @@ -156,9 +156,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). +data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties b/dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties index d43e55e822..96879cc272 100644 --- a/dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties +++ b/dolphinscheduler-api-test/dolphinscheduler-api-test-case/src/test/resources/docker/file-manage/common.properties @@ -84,9 +84,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. 
You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). +data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/dolphinscheduler-common/src/main/resources/common.properties b/dolphinscheduler-common/src/main/resources/common.properties index 451a0f734c..669d3dfef3 100644 --- a/dolphinscheduler-common/src/main/resources/common.properties +++ b/dolphinscheduler-common/src/main/resources/common.properties @@ -120,9 +120,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). 
+data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/dolphinscheduler-common/src/test/resources/common.properties b/dolphinscheduler-common/src/test/resources/common.properties index 107977df7f..7f66a32a23 100644 --- a/dolphinscheduler-common/src/test/resources/common.properties +++ b/dolphinscheduler-common/src/test/resources/common.properties @@ -115,9 +115,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). 
+data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java b/dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java index a4e64594c0..1c24785c2f 100644 --- a/dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java +++ b/dolphinscheduler-datasource-plugin/dolphinscheduler-datasource-api/src/main/java/org/apache/dolphinscheduler/plugin/datasource/api/utils/CommonUtils.java @@ -18,7 +18,7 @@ package org.apache.dolphinscheduler.plugin.datasource.api.utils; import static org.apache.dolphinscheduler.common.constants.Constants.RESOURCE_STORAGE_TYPE; -import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.DATA_QUALITY_JAR_NAME; +import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.DATA_QUALITY_JAR_DIR; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.HADOOP_SECURITY_AUTHENTICATION_STARTUP_STATE; import static org.apache.dolphinscheduler.plugin.task.api.TaskConstants.JAVA_SECURITY_KRB5_CONF; @@ -133,14 +133,28 @@ public class CommonUtils { } public static String getDataQualityJarPath() { - String dqsJarPath = PropertyUtils.getString(DATA_QUALITY_JAR_NAME); + log.info("Trying to get data quality jar in path"); + String dqJarDir = PropertyUtils.getString(DATA_QUALITY_JAR_DIR); + + if (StringUtils.isNotEmpty(dqJarDir)) { + log.info( + "Configuration data-quality.jar.dir is not empty, will try to get data quality jar from directory {}", + dqJarDir); + getDataQualityJarPathFromPath(dqJarDir).ifPresent(jarName -> 
DEFAULT_DATA_QUALITY_JAR_PATH = jarName); + } + + if (StringUtils.isEmpty(DEFAULT_DATA_QUALITY_JAR_PATH)) { + log.info("data quality jar path is empty, will try to auto discover it from build-in rules."); + getDefaultDataQualityJarPath(); + } - if (StringUtils.isEmpty(dqsJarPath)) { - log.info("data quality jar path is empty, will try to get it from data quality jar name"); - return getDefaultDataQualityJarPath(); + if (StringUtils.isEmpty(DEFAULT_DATA_QUALITY_JAR_PATH)) { + log.error( + "Can not find data quality jar in both configuration and auto discover, please check your configuration or report a bug."); + throw new RuntimeException("data quality jar path is empty"); } - return dqsJarPath; + return DEFAULT_DATA_QUALITY_JAR_PATH; } private static String getDefaultDataQualityJarPath() { @@ -173,7 +187,7 @@ public class CommonUtils { log.info("Try to get data quality jar from path {}", path); File[] jars = new File(path).listFiles(); if (jars == null) { - log.warn("No data quality related jar found from path {}", path); + log.warn("No any files find given path {}", path); return Optional.empty(); } for (File jar : jars) { @@ -181,6 +195,7 @@ public class CommonUtils { return Optional.of(jar.getAbsolutePath()); } } + log.warn("No data quality related jar found from path {}", path); return Optional.empty(); } diff --git a/dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties b/dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties index b5f61011b3..7583b3293a 100644 --- a/dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties +++ b/dolphinscheduler-e2e/dolphinscheduler-e2e-case/src/test/resources/docker/file-manage/common.properties @@ -95,9 +95,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality option, it would auto discovery from libs directory. 
You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). +data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data diff --git a/dolphinscheduler-standalone-server/src/main/docker/Dockerfile b/dolphinscheduler-standalone-server/src/main/docker/Dockerfile index 40158ff8e1..6eebf04162 100644 --- a/dolphinscheduler-standalone-server/src/main/docker/Dockerfile +++ b/dolphinscheduler-standalone-server/src/main/docker/Dockerfile @@ -20,6 +20,7 @@ FROM eclipse-temurin:8-jdk ENV DOCKER true ENV TZ Asia/Shanghai ENV DOLPHINSCHEDULER_HOME /opt/dolphinscheduler +ENV DATA_QUALITY_JAR_DIR /opt/dolphinscheduler/libs/worker-server RUN apt update ; \ apt install -y sudo ; \ diff --git a/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java b/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java index fe8d9a77bf..43734416e7 100644 --- a/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java +++ b/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/main/java/org/apache/dolphinscheduler/plugin/task/api/TaskConstants.java @@ -358,9 +358,9 @@ public class TaskConstants { public static final String RESOURCE_UPLOAD_PATH = "resource.storage.upload.base.path"; /** - * data.quality.jar.name + * data-quality.jar.dir */ - public static final String 
DATA_QUALITY_JAR_NAME = "data-quality.jar.name"; + public static final String DATA_QUALITY_JAR_DIR = "data-quality.jar.dir"; public static final String TASK_TYPE_CONDITIONS = "CONDITIONS"; diff --git a/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties b/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties index 9855d855e9..402112263f 100644 --- a/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties +++ b/dolphinscheduler-task-plugin/dolphinscheduler-task-api/src/test/resources/common.properties @@ -84,9 +84,10 @@ datasource.encryption.enable=false # datasource encryption salt datasource.encryption.salt=!@#$%^&* -# data quality absolute path, it would auto discovery from libs directory. You can also specific the jar name in libs directory -# if you re-build it alone, or auto discovery mechanism fail -data-quality.jar.name= +# data quality jar directory path, it would auto discovery data quality jar from this given dir. You should keep it empty if you do not change anything in +# data-quality, it will auto discovery by dolphinscheduler itself. Change it only if you want to use your own data-quality jar and it is not in worker-server +# libs directory (but make sure your jar name starts with `dolphinscheduler-data-quality`). +data-quality.jar.dir= #data-quality.error.output.path=/tmp/data-quality-error-data