From 71f016851093c18376e3bb301bf6e41dee215706 Mon Sep 17 00:00:00 2001 From: Eric Gao Date: Wed, 6 Jul 2022 15:56:39 +0800 Subject: [PATCH] [Feature] Enable users to create python env from requirements.txt (#10658) --- docs/docs/en/guide/task/jupyter.md | 52 ++++++++++++++++- docs/docs/zh/guide/task/jupyter.md | 49 ++++++++++++++++ .../dolphinscheduler/spi/utils/DateUtils.java | 8 +++ .../plugin/task/jupyter/JupyterConstants.java | 27 +++++++++ .../plugin/task/jupyter/JupyterTask.java | 20 ++++++- .../plugin/task/jupyter/JupyterTaskTest.java | 56 +++++++++++++++++-- 6 files changed, 204 insertions(+), 8 deletions(-) diff --git a/docs/docs/en/guide/task/jupyter.md b/docs/docs/en/guide/task/jupyter.md index 648c42a651..9cf769634a 100644 --- a/docs/docs/en/guide/task/jupyter.md +++ b/docs/docs/en/guide/task/jupyter.md @@ -26,7 +26,8 @@ Click [here](https://docs.conda.io/en/latest/) for more information about `conda 1. Use [Conda-Pack](https://conda.github.io/conda-pack/) to pack your conda environment into `tarball`. 2. Upload packed conda environment to `resource center`. -3. Select your packed conda environment as `resource` in your `jupyter task`, e.g. `jupyter_env.tar.gz`. +3. Set `condaEnvName` as the name of your packed conda environment in your `jupyter task`, e.g. `jupyter_env.tar.gz`. +4. Select your packed conda environment as `resource` in your `jupyter task`, e.g. `jupyter_env.tar.gz`. > NOTE: Make sure you follow the [Conda-Pack](https://conda.github.io/conda-pack/) official instructions. > If you unpack your packed conda environment, the directory structure should be the same as below: @@ -46,6 +47,55 @@ Click [here](https://docs.conda.io/en/latest/) for more information about `conda > `Jupyter Task Plugin` uses `source` command to activate your packed conda environment. > If you are concerned about using `source`, choose other options to manage your python dependency. +### Construct From Requirements + +1. Upload or create a `.txt` file of requirements with your python dependencies in `Resource Center`. +2. Set `condaEnvName` as the name of your file of requirements in your `jupyter task`, e.g. `requirements.txt`. +3. Select your file of requirements as `resource` in your `jupyter task`, e.g. `requirements.txt`. + +Here is an example file of requirements, from which `jupyter task plugin` will automatically +construct your python dependencies, run your python code and finally tear down the environment: + +```text +fastjsonschema==2.15.3 +fonttools==4.33.3 +geojson==2.5.0 +identify==2.4.11 +idna==3.3 +importlib-metadata==4.11.3 +importlib-resources==5.7.1 +ipykernel==5.5.6 +ipython==8.2.0 +ipython-genutils==0.2.0 +jedi==0.18.1 +Jinja2==3.1.1 +json5==0.9.6 +jsonschema==4.4.0 +jupyter-client==7.3.0 +jupyter-core==4.10.0 +jupyter-server==1.17.0 +jupyterlab==3.3.4 +jupyterlab-pygments==0.2.2 +jupyterlab-server==2.13.0 +kiwisolver==1.4.2 +MarkupSafe==2.1.1 +matplotlib==3.5.2 +matplotlib-inline==0.1.3 +mistune==0.8.4 +nbclassic==0.3.7 +nbclient==0.6.0 +nbconvert==6.5.0 +nbformat==5.3.0 +nest-asyncio==1.5.5 +notebook==6.4.11 +notebook-shim==0.1.0 +numpy==1.22.3 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +papermill==2.3.4 +``` + ## Create Task - Click `Project Management-Project Name-Workflow Definition`, and click the `Create Workflow` button to enter the DAG editing page. diff --git a/docs/docs/zh/guide/task/jupyter.md b/docs/docs/zh/guide/task/jupyter.md index 1698630aed..59ac4628a9 100644 --- a/docs/docs/zh/guide/task/jupyter.md +++ b/docs/docs/zh/guide/task/jupyter.md @@ -45,6 +45,55 @@ > `Jupyter任务插件`使用`source`命令激活您打包的conda环境。 > 若您对使用`source`命令有安全性上的担忧,请使用其他方法管理您的python依赖。 +### 由依赖需求文本文件临时构建 + +1. 在`资源中心`创建或上传`.txt`格式的python依赖需求文本文件。 +2. 将`jupyter任务`中的`condaEnvName`参数设置成您的python依赖需求文本文件,如`requirements.txt`。 +3. 在您`jupyter任务`的`资源`中选取您的python依赖需求文本文件,如`requirements.txt`。 + +如下是一个依赖需求文本文件的样例,通过该文件,`jupyter任务插件`会自动构建您的python依赖,并执行您的python代码, +执行完成后会自动释放临时构建的环境。 + +```text +fastjsonschema==2.15.3 +fonttools==4.33.3 +geojson==2.5.0 +identify==2.4.11 +idna==3.3 +importlib-metadata==4.11.3 +importlib-resources==5.7.1 +ipykernel==5.5.6 +ipython==8.2.0 +ipython-genutils==0.2.0 +jedi==0.18.1 +Jinja2==3.1.1 +json5==0.9.6 +jsonschema==4.4.0 +jupyter-client==7.3.0 +jupyter-core==4.10.0 +jupyter-server==1.17.0 +jupyterlab==3.3.4 +jupyterlab-pygments==0.2.2 +jupyterlab-server==2.13.0 +kiwisolver==1.4.2 +MarkupSafe==2.1.1 +matplotlib==3.5.2 +matplotlib-inline==0.1.3 +mistune==0.8.4 +nbclassic==0.3.7 +nbclient==0.6.0 +nbconvert==6.5.0 +nbformat==5.3.0 +nest-asyncio==1.5.5 +notebook==6.4.11 +notebook-shim==0.1.0 +numpy==1.22.3 +packaging==21.3 +pandas==1.4.2 +pandocfilters==1.5.0 +papermill==2.3.4 +``` + ## 创建任务 - 点击项目管理-项目名称-工作流定义,点击"创建工作流"按钮,进入DAG编辑页面。 diff --git a/dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java b/dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java index 4f4a8e7a17..695e70a7ba 100644 --- a/dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java +++ b/dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java @@ -429,4 +429,12 @@ public class DateUtils { } return TimeZone.getTimeZone(timezoneId); } + + /** + * get timestamp in String + * PowerMock 2.0.9 fails to mock System.currentTimeMillis(), this method helps in UT + */ + public static String getTimestampString() { + return String.valueOf(System.currentTimeMillis()); + } } diff --git a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java index 8b4069c048..7585311eb3 100644 --- a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java +++ b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java @@ -23,6 +23,16 @@ public class JupyterConstants { throw new IllegalStateException("Utility class"); } + /** + * execution flag, ignore errors and keep executing till the end + */ + public static final String EXECUTION_FLAG = "set +e"; + + /** + * new line symbol + */ + public static final String NEW_LINE_SYMBOL = "\n"; + /** * conda init */ @@ -40,11 +50,28 @@ public class JupyterConstants { "tar -xzf %s -C jupyter_env && " + "source jupyter_env/bin/activate"; + /** + * create and activate tmp conda env from txt + */ + public static final String CREATE_ENV_FROM_TXT = "conda create -n jupyter-tmp-env-%s -y && " + + "conda activate jupyter-tmp-env-%s && " + + "pip install -r %s"; + + /** + * remove tmp conda env + */ + public static final String REMOVE_ENV = "conda deactivate && conda remove --name jupyter-tmp-env-%s --all -y"; + /** * file suffix tar.gz */ public static final String TAR_SUFFIX = ".tar.gz"; + /** + * file suffix .txt + */ + public static final String TXT_SUFFIX = ".txt"; + /** * jointer to combine two command */ diff --git a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java index cec72c9601..0ce6052bdd 100644 --- a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java +++ b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java @@ -28,6 +28,7 @@ import org.apache.dolphinscheduler.plugin.task.api.parameters.AbstractParameters import org.apache.dolphinscheduler.plugin.task.api.parser.ParamUtils; import org.apache.dolphinscheduler.plugin.task.api.parser.ParameterUtils; import org.apache.dolphinscheduler.plugin.task.api.utils.MapUtils; +import org.apache.dolphinscheduler.spi.utils.DateUtils; import org.apache.dolphinscheduler.spi.utils.JSONUtils; import org.apache.dolphinscheduler.spi.utils.PropertyUtils; import org.apache.dolphinscheduler.spi.utils.StringUtils; @@ -104,12 +105,20 @@ public class JupyterTask extends AbstractTaskExecutor { */ List args = new ArrayList<>(); final String condaPath = PropertyUtils.getString(TaskConstants.CONDA_PATH); + final String timestamp = DateUtils.getTimestampString(); + String condaEnvName = jupyterParameters.getCondaEnvName(); + if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) { + args.add(JupyterConstants.EXECUTION_FLAG); + args.add(JupyterConstants.NEW_LINE_SYMBOL); + } + args.add(JupyterConstants.CONDA_INIT); args.add(condaPath); args.add(JupyterConstants.JOINTER); - String condaEnvName = jupyterParameters.getCondaEnvName(); if (condaEnvName.endsWith(JupyterConstants.TAR_SUFFIX)) { args.add(String.format(JupyterConstants.CREATE_ENV_FROM_TAR, condaEnvName)); + } else if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) { + args.add(String.format(JupyterConstants.CREATE_ENV_FROM_TXT, timestamp, timestamp, condaEnvName)); } else { args.add(JupyterConstants.CONDA_ACTIVATE); args.add(jupyterParameters.getCondaEnvName()); @@ -126,10 +135,17 @@ public class JupyterTask extends AbstractTaskExecutor { // populate jupyter options args.addAll(populateJupyterOptions()); + // remove tmp conda env, if created from requirements.txt + if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) { + args.add(JupyterConstants.NEW_LINE_SYMBOL); + args.add(String.format(JupyterConstants.REMOVE_ENV, timestamp)); + } + // replace placeholder, and combining local and global parameters Map paramsMap = taskExecutionContext.getPrepareParamsMap(); - String command = ParameterUtils.convertParameterPlaceholders(String.join(" ", args), ParamUtils.convert(paramsMap)); + String command = ParameterUtils + .convertParameterPlaceholders(String.join(" ", args), ParamUtils.convert(paramsMap)); logger.info("jupyter task command: {}", command); diff --git a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java index c55aa4935a..3007a4ee25 100644 --- a/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java +++ b/dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java @@ -19,8 +19,8 @@ package org.apache.dolphinscheduler.plugin.task.jupyter; import org.apache.dolphinscheduler.plugin.task.api.TaskExecutionContext; +import org.apache.dolphinscheduler.spi.utils.DateUtils; import org.apache.dolphinscheduler.spi.utils.JSONUtils; - import org.apache.dolphinscheduler.spi.utils.PropertyUtils; import org.junit.Assert; import org.junit.Test; @@ -30,16 +30,15 @@ import org.powermock.core.classloader.annotations.PowerMockIgnore; import org.powermock.core.classloader.annotations.PrepareForTest; import org.powermock.core.classloader.annotations.SuppressStaticInitializationFor; import org.powermock.modules.junit4.PowerMockRunner; -import org.apache.dolphinscheduler.plugin.task.api.TaskConstants; - import static org.mockito.ArgumentMatchers.any; import static org.powermock.api.mockito.PowerMockito.spy; import static org.powermock.api.mockito.PowerMockito.when; @RunWith(PowerMockRunner.class) @PrepareForTest({ - JSONUtils.class, - PropertyUtils.class, + JSONUtils.class, + PropertyUtils.class, + DateUtils.class }) @PowerMockIgnore({"javax.*"}) @SuppressStaticInitializationFor("org.apache.dolphinscheduler.spi.utils.PropertyUtils") @@ -99,6 +98,39 @@ public class JupyterTaskTest { "--progress-bar"); } + @Test + public void testBuildJupyterCommandWithRequirements() throws Exception { + String parameters = buildJupyterCommandWithRequirements(); + TaskExecutionContext taskExecutionContext = PowerMockito.mock(TaskExecutionContext.class); + when(taskExecutionContext.getTaskParams()).thenReturn(parameters); + PowerMockito.mockStatic(PropertyUtils.class); + when(PropertyUtils.getString(any())).thenReturn("/opt/anaconda3/etc/profile.d/conda.sh"); + PowerMockito.mockStatic(DateUtils.class); + when(DateUtils.getTimestampString()).thenReturn("123456789"); + JupyterTask jupyterTask = spy(new JupyterTask(taskExecutionContext)); + jupyterTask.init(); + Assert.assertEquals(jupyterTask.buildCommand(), + "set +e \n " + + "source /opt/anaconda3/etc/profile.d/conda.sh && " + + "conda create -n jupyter-tmp-env-123456789 -y && " + + "conda activate jupyter-tmp-env-123456789 && " + + "pip install -r requirements.txt && " + + "papermill " + + "/test/input_note.ipynb " + + "/test/output_note.ipynb " + + "--parameters city Shanghai " + + "--parameters factor 0.01 " + + "--kernel python3 " + + "--engine default_engine " + + "--execution-timeout 10 " + + "--start-timeout 3 " + + "--version " + + "--inject-paths " + + "--progress-bar \n " + + "conda deactivate && conda remove --name jupyter-tmp-env-123456789 --all -y" + ); + } + private String buildJupyterCommandWithLocalEnv() { JupyterParameters jupyterParameters = new JupyterParameters(); jupyterParameters.setCondaEnvName("jupyter-lab"); @@ -127,4 +159,18 @@ public class JupyterTaskTest { return JSONUtils.toJsonString(jupyterParameters); } + private String buildJupyterCommandWithRequirements() { + JupyterParameters jupyterParameters = new JupyterParameters(); + jupyterParameters.setCondaEnvName("requirements.txt"); + jupyterParameters.setInputNotePath("/test/input_note.ipynb"); + jupyterParameters.setOutputNotePath("/test/output_note.ipynb"); + jupyterParameters.setParameters("{\"city\": \"Shanghai\", \"factor\": \"0.01\"}"); + jupyterParameters.setKernel("python3"); + jupyterParameters.setEngine("default_engine"); + jupyterParameters.setExecutionTimeout("10"); + jupyterParameters.setStartTimeout("3"); + jupyterParameters.setOthers("--version"); + return JSONUtils.toJsonString(jupyterParameters); + } + }