Browse Source

[Feature] Enable users to create python env from requirements.txt (#10658)

3.1.0-release
Eric Gao 2 years ago committed by GitHub
parent
commit
71f0168510
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 52
      docs/docs/en/guide/task/jupyter.md
  2. 49
      docs/docs/zh/guide/task/jupyter.md
  3. 8
      dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java
  4. 27
      dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java
  5. 20
      dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java
  6. 56
      dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java

52
docs/docs/en/guide/task/jupyter.md

@ -26,7 +26,8 @@ Click [here](https://docs.conda.io/en/latest/) for more information about `conda
1. Use [Conda-Pack](https://conda.github.io/conda-pack/) to pack your conda environment into `tarball`.
2. Upload packed conda environment to `resource center`.
3. Select your packed conda environment as `resource` in your `jupyter task`, e.g. `jupyter_env.tar.gz`.
3. Set `condaEnvName` as the name of your packed conda environment in your `jupyter task`, e.g. `jupyter_env.tar.gz`.
4. Select your packed conda environment as `resource` in your `jupyter task`, e.g. `jupyter_env.tar.gz`.
> NOTE: Make sure you follow the [Conda-Pack](https://conda.github.io/conda-pack/) official instructions.
> If you unpack your packed conda environment, the directory structure should be the same as below:
@ -46,6 +47,55 @@ Click [here](https://docs.conda.io/en/latest/) for more information about `conda
> `Jupyter Task Plugin` uses `source` command to activate your packed conda environment.
> If you are concerned about using `source`, choose other options to manage your python dependency.
### Construct From Requirements
1. Upload or create a `.txt` file of requirements with your python dependencies in `Resource Center`.
2. Set `condaEnvName` as the name of your file of requirements in your `jupyter task`, e.g. `requirements.txt`.
3. Select your file of requirements as `resource` in your `jupyter task`, e.g. `requirements.txt`.
Here is an example file of requirements, from which `jupyter task plugin` will automatically
construct your python dependencies, run your python code and finally tear down the environment:
```text
fastjsonschema==2.15.3
fonttools==4.33.3
geojson==2.5.0
identify==2.4.11
idna==3.3
importlib-metadata==4.11.3
importlib-resources==5.7.1
ipykernel==5.5.6
ipython==8.2.0
ipython-genutils==0.2.0
jedi==0.18.1
Jinja2==3.1.1
json5==0.9.6
jsonschema==4.4.0
jupyter-client==7.3.0
jupyter-core==4.10.0
jupyter-server==1.17.0
jupyterlab==3.3.4
jupyterlab-pygments==0.2.2
jupyterlab-server==2.13.0
kiwisolver==1.4.2
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
mistune==0.8.4
nbclassic==0.3.7
nbclient==0.6.0
nbconvert==6.5.0
nbformat==5.3.0
nest-asyncio==1.5.5
notebook==6.4.11
notebook-shim==0.1.0
numpy==1.22.3
packaging==21.3
pandas==1.4.2
pandocfilters==1.5.0
papermill==2.3.4
```
## Create Task
- Click `Project Management-Project Name-Workflow Definition`, and click the `Create Workflow` button to enter the DAG editing page.

49
docs/docs/zh/guide/task/jupyter.md

@ -45,6 +45,55 @@
> `Jupyter任务插件`使用`source`命令激活您打包的conda环境。
> 若您对使用`source`命令有安全性上的担忧,请使用其他方法管理您的python依赖。
### 由依赖需求文本文件临时构建
1. 在`资源中心`创建或上传`.txt`格式的python依赖需求文本文件。
2. 将`jupyter任务`中的`condaEnvName`参数设置成您的python依赖需求文本文件,如`requirements.txt`。
3. 在您`jupyter任务`的`资源`中选取您的python依赖需求文本文件,如`requirements.txt`。
如下是一个依赖需求文本文件的样例,通过该文件,`jupyter任务插件`会自动构建您的python依赖,并执行您的python代码,
执行完成后会自动释放临时构建的环境。
```text
fastjsonschema==2.15.3
fonttools==4.33.3
geojson==2.5.0
identify==2.4.11
idna==3.3
importlib-metadata==4.11.3
importlib-resources==5.7.1
ipykernel==5.5.6
ipython==8.2.0
ipython-genutils==0.2.0
jedi==0.18.1
Jinja2==3.1.1
json5==0.9.6
jsonschema==4.4.0
jupyter-client==7.3.0
jupyter-core==4.10.0
jupyter-server==1.17.0
jupyterlab==3.3.4
jupyterlab-pygments==0.2.2
jupyterlab-server==2.13.0
kiwisolver==1.4.2
MarkupSafe==2.1.1
matplotlib==3.5.2
matplotlib-inline==0.1.3
mistune==0.8.4
nbclassic==0.3.7
nbclient==0.6.0
nbconvert==6.5.0
nbformat==5.3.0
nest-asyncio==1.5.5
notebook==6.4.11
notebook-shim==0.1.0
numpy==1.22.3
packaging==21.3
pandas==1.4.2
pandocfilters==1.5.0
papermill==2.3.4
```
## 创建任务
- 点击项目管理-项目名称-工作流定义,点击"创建工作流"按钮,进入DAG编辑页面。

8
dolphinscheduler-spi/src/main/java/org/apache/dolphinscheduler/spi/utils/DateUtils.java

@ -429,4 +429,12 @@ public class DateUtils {
}
return TimeZone.getTimeZone(timezoneId);
}
/**
* get timestamp in String
* PowerMock 2.0.9 fails to mock System.currentTimeMillis(), this method helps in UT
*/
public static String getTimestampString() {
return String.valueOf(System.currentTimeMillis());
}
}

27
dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterConstants.java

@ -23,6 +23,16 @@ public class JupyterConstants {
throw new IllegalStateException("Utility class");
}
/**
* execution flag, ignore errors and keep executing till the end
*/
public static final String EXECUTION_FLAG = "set +e";
/**
* new line symbol
*/
public static final String NEW_LINE_SYMBOL = "\n";
/**
* conda init
*/
@ -40,11 +50,28 @@ public class JupyterConstants {
"tar -xzf %s -C jupyter_env && " +
"source jupyter_env/bin/activate";
/**
* create and activate tmp conda env from txt
*/
public static final String CREATE_ENV_FROM_TXT = "conda create -n jupyter-tmp-env-%s -y && " +
"conda activate jupyter-tmp-env-%s && " +
"pip install -r %s";
/**
* remove tmp conda env
*/
public static final String REMOVE_ENV = "conda deactivate && conda remove --name jupyter-tmp-env-%s --all -y";
/**
* file suffix tar.gz
*/
public static final String TAR_SUFFIX = ".tar.gz";
/**
* file suffix .txt
*/
public static final String TXT_SUFFIX = ".txt";
/**
* jointer to combine two command
*/

20
dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/main/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTask.java

@ -28,6 +28,7 @@ import org.apache.dolphinscheduler.plugin.task.api.parameters.AbstractParameters
import org.apache.dolphinscheduler.plugin.task.api.parser.ParamUtils;
import org.apache.dolphinscheduler.plugin.task.api.parser.ParameterUtils;
import org.apache.dolphinscheduler.plugin.task.api.utils.MapUtils;
import org.apache.dolphinscheduler.spi.utils.DateUtils;
import org.apache.dolphinscheduler.spi.utils.JSONUtils;
import org.apache.dolphinscheduler.spi.utils.PropertyUtils;
import org.apache.dolphinscheduler.spi.utils.StringUtils;
@ -104,12 +105,20 @@ public class JupyterTask extends AbstractTaskExecutor {
*/
List<String> args = new ArrayList<>();
final String condaPath = PropertyUtils.getString(TaskConstants.CONDA_PATH);
final String timestamp = DateUtils.getTimestampString();
String condaEnvName = jupyterParameters.getCondaEnvName();
if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) {
args.add(JupyterConstants.EXECUTION_FLAG);
args.add(JupyterConstants.NEW_LINE_SYMBOL);
}
args.add(JupyterConstants.CONDA_INIT);
args.add(condaPath);
args.add(JupyterConstants.JOINTER);
String condaEnvName = jupyterParameters.getCondaEnvName();
if (condaEnvName.endsWith(JupyterConstants.TAR_SUFFIX)) {
args.add(String.format(JupyterConstants.CREATE_ENV_FROM_TAR, condaEnvName));
} else if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) {
args.add(String.format(JupyterConstants.CREATE_ENV_FROM_TXT, timestamp, timestamp, condaEnvName));
} else {
args.add(JupyterConstants.CONDA_ACTIVATE);
args.add(jupyterParameters.getCondaEnvName());
@ -126,10 +135,17 @@ public class JupyterTask extends AbstractTaskExecutor {
// populate jupyter options
args.addAll(populateJupyterOptions());
// remove tmp conda env, if created from requirements.txt
if (condaEnvName.endsWith(JupyterConstants.TXT_SUFFIX)) {
args.add(JupyterConstants.NEW_LINE_SYMBOL);
args.add(String.format(JupyterConstants.REMOVE_ENV, timestamp));
}
// replace placeholder, and combining local and global parameters
Map<String, Property> paramsMap = taskExecutionContext.getPrepareParamsMap();
String command = ParameterUtils.convertParameterPlaceholders(String.join(" ", args), ParamUtils.convert(paramsMap));
String command = ParameterUtils
.convertParameterPlaceholders(String.join(" ", args), ParamUtils.convert(paramsMap));
logger.info("jupyter task command: {}", command);

56
dolphinscheduler-task-plugin/dolphinscheduler-task-jupyter/src/test/java/org/apache/dolphinscheduler/plugin/task/jupyter/JupyterTaskTest.java

@ -19,8 +19,8 @@ package org.apache.dolphinscheduler.plugin.task.jupyter;
import org.apache.dolphinscheduler.plugin.task.api.TaskExecutionContext;
import org.apache.dolphinscheduler.spi.utils.DateUtils;
import org.apache.dolphinscheduler.spi.utils.JSONUtils;
import org.apache.dolphinscheduler.spi.utils.PropertyUtils;
import org.junit.Assert;
import org.junit.Test;
@ -30,16 +30,15 @@ import org.powermock.core.classloader.annotations.PowerMockIgnore;
import org.powermock.core.classloader.annotations.PrepareForTest;
import org.powermock.core.classloader.annotations.SuppressStaticInitializationFor;
import org.powermock.modules.junit4.PowerMockRunner;
import org.apache.dolphinscheduler.plugin.task.api.TaskConstants;
import static org.mockito.ArgumentMatchers.any;
import static org.powermock.api.mockito.PowerMockito.spy;
import static org.powermock.api.mockito.PowerMockito.when;
@RunWith(PowerMockRunner.class)
@PrepareForTest({
JSONUtils.class,
PropertyUtils.class,
JSONUtils.class,
PropertyUtils.class,
DateUtils.class
})
@PowerMockIgnore({"javax.*"})
@SuppressStaticInitializationFor("org.apache.dolphinscheduler.spi.utils.PropertyUtils")
@ -99,6 +98,39 @@ public class JupyterTaskTest {
"--progress-bar");
}
@Test
public void testBuildJupyterCommandWithRequirements() throws Exception {
String parameters = buildJupyterCommandWithRequirements();
TaskExecutionContext taskExecutionContext = PowerMockito.mock(TaskExecutionContext.class);
when(taskExecutionContext.getTaskParams()).thenReturn(parameters);
PowerMockito.mockStatic(PropertyUtils.class);
when(PropertyUtils.getString(any())).thenReturn("/opt/anaconda3/etc/profile.d/conda.sh");
PowerMockito.mockStatic(DateUtils.class);
when(DateUtils.getTimestampString()).thenReturn("123456789");
JupyterTask jupyterTask = spy(new JupyterTask(taskExecutionContext));
jupyterTask.init();
Assert.assertEquals(jupyterTask.buildCommand(),
"set +e \n " +
"source /opt/anaconda3/etc/profile.d/conda.sh && " +
"conda create -n jupyter-tmp-env-123456789 -y && " +
"conda activate jupyter-tmp-env-123456789 && " +
"pip install -r requirements.txt && " +
"papermill " +
"/test/input_note.ipynb " +
"/test/output_note.ipynb " +
"--parameters city Shanghai " +
"--parameters factor 0.01 " +
"--kernel python3 " +
"--engine default_engine " +
"--execution-timeout 10 " +
"--start-timeout 3 " +
"--version " +
"--inject-paths " +
"--progress-bar \n " +
"conda deactivate && conda remove --name jupyter-tmp-env-123456789 --all -y"
);
}
private String buildJupyterCommandWithLocalEnv() {
JupyterParameters jupyterParameters = new JupyterParameters();
jupyterParameters.setCondaEnvName("jupyter-lab");
@ -127,4 +159,18 @@ public class JupyterTaskTest {
return JSONUtils.toJsonString(jupyterParameters);
}
private String buildJupyterCommandWithRequirements() {
JupyterParameters jupyterParameters = new JupyterParameters();
jupyterParameters.setCondaEnvName("requirements.txt");
jupyterParameters.setInputNotePath("/test/input_note.ipynb");
jupyterParameters.setOutputNotePath("/test/output_note.ipynb");
jupyterParameters.setParameters("{\"city\": \"Shanghai\", \"factor\": \"0.01\"}");
jupyterParameters.setKernel("python3");
jupyterParameters.setEngine("default_engine");
jupyterParameters.setExecutionTimeout("10");
jupyterParameters.setStartTimeout("3");
jupyterParameters.setOthers("--version");
return JSONUtils.toJsonString(jupyterParameters);
}
}

Loading…
Cancel
Save