From 59a026d897d5fa3599dd58e932d164370a63b5f4 Mon Sep 17 00:00:00 2001 From: Jiajie Zhong Date: Fri, 15 Apr 2022 15:46:44 +0800 Subject: [PATCH] [python] Support read config in env variable (#9517) Add a new method to get config from environment variables and for now, we have three ways to get config and the priority is `env-var > custom-config-file > built-in-config-file`. Environment config setting does not work in CLI, because it will confuse users when they get config value is `var-env` but value in the configuration file is `var-in-file`, they may not find the way how to change it * Add documentation * Add it to UPDATING.md close: #8344 --- .../pydolphinscheduler/UPDATING.md | 5 + .../pydolphinscheduler/docs/source/config.rst | 131 +++++++++++++++++- .../src/pydolphinscheduler/cli/commands.py | 2 +- .../pydolphinscheduler/core/configuration.py | 69 +++++++-- .../pydolphinscheduler/utils/yaml_parser.py | 14 -- .../tests/cli/test_config.py | 2 + .../tests/core/test_configuration.py | 88 ++++++++++++ .../tests/testing/constants.py | 2 +- .../tests/utils/test_yaml_parser.py | 26 ---- 9 files changed, 275 insertions(+), 64 deletions(-) diff --git a/dolphinscheduler-python/pydolphinscheduler/UPDATING.md b/dolphinscheduler-python/pydolphinscheduler/UPDATING.md index 430e4b626f..d772c6f8f6 100644 --- a/dolphinscheduler-python/pydolphinscheduler/UPDATING.md +++ b/dolphinscheduler-python/pydolphinscheduler/UPDATING.md @@ -24,6 +24,11 @@ It started after version 2.0.5 released ## dev +* Change variable about where to keep pydolphinscheduler configuration from ``PYDOLPHINSCHEDULER_HOME`` to + ``PYDS_HOME`` which is same as other environment variable name. + +## 3.0.0a0 + * Integrate Python gateway server into Dolphinscheduler API server, and you could start Python gateway service by command `./bin/dolphinscheduler-daemon.sh start api-server` instead of independent command `./bin/dolphinscheduler-daemon.sh start python-gateway-server`. 
diff --git a/dolphinscheduler-python/pydolphinscheduler/docs/source/config.rst b/dolphinscheduler-python/pydolphinscheduler/docs/source/config.rst index bfba2ecfde..66c7f08525 100644 --- a/dolphinscheduler-python/pydolphinscheduler/docs/source/config.rst +++ b/dolphinscheduler-python/pydolphinscheduler/docs/source/config.rst @@ -18,8 +18,114 @@ Configuration ============= +pydolphinscheduler has a built-in module setting necessary configuration to start and run your workflow code. +You could directly use them if you only want to run a quick start or for a simple job like POC. But if you +want to use pydolphinscheduler in depth, or even use it in production, you will probably need to modify and +change the built-in configuration. + +We have two ways to modify the configuration: + +- `Using Environment Variables`_: The more lightweight way to modify the configuration. It is useful in + containerization scenarios, like docker and k8s, or when you like to temporarily override configs in the + configuration file. +- `Using Configuration File`_: The more general way to modify the configuration. It is useful when you want + to persist and manage configuration files in one single file. + +Using Environment Variables +--------------------------- + +You could change the configuration by adding or modifying the operating system's environment variables. Any +way of doing so works, as long as you can successfully modify the environment variables. We use two common +ways, `Bash `_ and `Python OS Module `_, as examples: + +By Bash +^^^^^^^ + +Setting environment variables via `Bash` is the most straightforward and easiest way. We give some examples about +how to change them by Bash. + +..
code-block:: bash + + # Modify Java Gateway Address + $ export PYDS_JAVA_GATEWAY_ADDRESS="192.168.1.1" + + # Modify Workflow Default User + $ export PYDS_WORKFLOW_USER="custom-user" + +After executing the commands above, both ``PYDS_JAVA_GATEWAY_ADDRESS`` and ``PYDS_WORKFLOW_USER`` will be changed. +The next time you execute and submit your workflow, it will submit to host `192.168.1.1`, and with workflow's user +named `custom-user`. + +By Python OS Module +^^^^^^^^^^^^^^^^^^^ + +pydolphinscheduler is a Python API for Apache DolphinScheduler, and you could modify or add system environment +variables via Python ``os`` module. In this example, we change variables as the same value as we change in +`Bash `_. It will take effect the next time you run your workflow, and call workflow ``run`` or ``submit`` +method next to ``os.environ`` statement. + +.. code-block:: python + + import os + # Modify Java Gateway Address + os.environ["PYDS_JAVA_GATEWAY_ADDRESS"] = "192.168.1.1" + + # Modify Workflow Default User + os.environ["PYDS_WORKFLOW_USER"] = "custom-user" + +All Configurations in Environment Variables +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +All environment variables as below, and you could modify their value via `Bash `_ or `Python OS Module `_ + ++------------------+------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| Variable Section | Variable Name | description | ++==================+====================================+==================================================================================================================+ +| | ``PYDS_JAVA_GATEWAY_ADDRESS`` | Default Java gateway address, will use its value when it is set. 
| ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| Java Gateway | ``PYDS_JAVA_GATEWAY_PORT`` | Default Java gateway port, will use its value when it is set. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_JAVA_GATEWAY_AUTO_CONVERT`` | Default boolean Java gateway auto convert, will use its value when it is set. | ++------------------+------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_USER_NAME`` | Default user name, used when the user's ``name`` is not specified. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_USER_PASSWORD`` | Default user password, used when the user's ``password`` is not specified. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| Default User | ``PYDS_USER_EMAIL`` | Default user email, used when the user's ``email`` is not specified. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_USER_PHONE`` | Default user phone, used when the user's ``phone`` is not specified. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_USER_STATE`` | Default user state, used when the user's ``state`` is not specified.
| ++------------------+------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_WORKFLOW_PROJECT`` | Default workflow project name, will use its value when workflow does not specify the attribute ``project``. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_WORKFLOW_TENANT`` | Default workflow tenant, will use its value when workflow does not specify the attribute ``tenant``. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| Default Workflow | ``PYDS_WORKFLOW_USER`` | Default workflow user, will use its value when workflow does not specify the attribute ``user``. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_WORKFLOW_QUEUE`` | Default workflow queue, will use its value when workflow does not specify the attribute ``queue``. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_WORKFLOW_WORKER_GROUP`` | Default workflow worker group, will use its value when workflow does not specify the attribute ``worker_group``. | ++ +------------------------------------+------------------------------------------------------------------------------------------------------------------+ +| | ``PYDS_WORKFLOW_TIME_ZONE`` | Default workflow time zone, will use its value when workflow does not specify the attribute ``timezone``. | ++------------------+------------------------------------+------------------------------------------------------------------------------------------------------------------+ +..
note:: + + The scope of setting configuration via environment variable is in the workflow, and it will not change the + value of the configuration file. The :doc:`CLI ` command ``config --get`` and ``config --set`` operate on + the value of the configuration file, so the command ``config --get`` may return a different value from what + you set in the environment variable, and command ``config --set`` will never change your environment variable. + +Using Configuration File +------------------------ + +If you want to persist and manage configuration in a file instead of environment variables, or maybe you +want to save your configuration file to a version control system, like Git or SVN, then changing the +configuration by file is the best choice. + Export Configuration File -------------------------- +^^^^^^^^^^^^^^^^^^^^^^^^^ pydolphinscheduler allows you to change the built-in configurations via CLI or editor you like. pydolphinscheduler integrated built-in configurations in its package, but you could also export it locally by CLI @@ -29,24 +135,24 @@ integrated built-in configurations in its package, but you could also export it $ pydolphinscheduler config --init And it will create a new YAML file in the path `~/pydolphinscheduler/config.yaml` by default. If you want to export -it to another path, you should set `PYDOLPHINSCHEDULER_HOME` before you run command :code:`pydolphinscheduler config --init`. +it to another path, you should set `PYDS_HOME` before you run command :code:`pydolphinscheduler config --init`. .. code-block:: bash - $ export PYDOLPHINSCHEDULER_HOME= + $ export PYDS_HOME= $ pydolphinscheduler config --init After that, your configuration file will export into `/config.yaml` instead of the default path. Change Configuration --------------------- +^^^^^^^^^^^^^^^^^^^^ In section `export configuration file`_ you export the configuration file locally, and as a local file, you could edit it with any editor you like.
After you save your change in your editor, the latest configuration will work when you run your workflow code. You could also query or change the configuration via CLI :code:`config --get ` or :code:`config --get `. -Both `--get` and `--set` could be call one or more times in single command, and you could only set the leaf +Both `--get` and `--set` could be called one or more times in single command, and you could only set the leaf node of the configuration but could get the parent configuration, there are simple examples below: .. code-block:: bash @@ -84,8 +190,8 @@ node of the configuration but could get the parent configuration, there are simp For more information about our CLI, you could see document :doc:`cli`. -All Configurations ------------------- +All Configurations in File +^^^^^^^^^^^^^^^^^^^^^^^^^^ Here are all our configurations for pydolphinscheduler. @@ -93,4 +199,15 @@ Here are all our configurations for pydolphinscheduler. :language: yaml :lines: 18- +Priority +-------- + +We have two ways to modify the configuration and there is a built-in config in pydolphinscheduler too. It is +very important to understand the priority of the configuration when you use them. The overview of configuration +priority is. + +``Environment Variables > Configurations File > Built-in Configurations`` +This means that your setting in environment variables or configurations file will overwrite the built-in one. +And you could temporarily modify configurations by setting environment variables without modifying the global +config in the configuration file. 
diff --git a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/cli/commands.py b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/cli/commands.py index 5628799bcf..e2ca86b573 100644 --- a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/cli/commands.py +++ b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/cli/commands.py @@ -58,7 +58,7 @@ def version(part: str) -> None: "--init", "-i", is_flag=True, - help="Initialize and create configuration file to `PYDOLPHINSCHEDULER_HOME`.", + help="Initialize and create configuration file to `PYDS_HOME`.", ) @click.option( "--set", diff --git a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/core/configuration.py b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/core/configuration.py index 8b5665d304..14f2fe9fce 100644 --- a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/core/configuration.py +++ b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/core/configuration.py @@ -29,7 +29,7 @@ BUILD_IN_CONFIG_PATH = Path(__file__).resolve().parent.joinpath("default_config. 
def config_path() -> Path: """Get the path of pydolphinscheduler configuration file.""" - pyds_home = os.environ.get("PYDOLPHINSCHEDULER_HOME", "~/pydolphinscheduler") + pyds_home = os.environ.get("PYDS_HOME", "~/pydolphinscheduler") config_file_path = Path(pyds_home).joinpath("config.yaml").expanduser() return config_file_path @@ -118,6 +118,21 @@ def set_single_config(key: str, value: Any) -> None: file.write(content=str(config), to_path=str(config_path()), overwrite=True) +def get_int(val: Any) -> int: + """Covert value to int.""" + return int(val) + + +def get_bool(val: Any) -> bool: + """Covert value to boolean.""" + if isinstance(val, str): + return val.lower() in {"true", "t"} + elif isinstance(val, int): + return val == 1 + else: + return bool(val) + + # Start Common Configuration Settings # Add configs as module variables to avoid read configuration multiple times when @@ -126,23 +141,47 @@ def set_single_config(key: str, value: Any) -> None: configs: YamlParser = get_configs() # Java Gateway Settings -JAVA_GATEWAY_ADDRESS = configs.get("java_gateway.address") -JAVA_GATEWAY_PORT = configs.get_int("java_gateway.port") -JAVA_GATEWAY_AUTO_CONVERT = configs.get_bool("java_gateway.auto_convert") +JAVA_GATEWAY_ADDRESS = os.environ.get( + "PYDS_JAVA_GATEWAY_ADDRESS", configs.get("java_gateway.address") +) +JAVA_GATEWAY_PORT = get_int( + os.environ.get("PYDS_JAVA_GATEWAY_PORT", configs.get("java_gateway.port")) +) +JAVA_GATEWAY_AUTO_CONVERT = get_bool( + os.environ.get( + "PYDS_JAVA_GATEWAY_AUTO_CONVERT", configs.get("java_gateway.auto_convert") + ) +) # User Settings -USER_NAME = configs.get("default.user.name") -USER_PASSWORD = configs.get("default.user.password") -USER_EMAIL = configs.get("default.user.email") -USER_PHONE = str(configs.get("default.user.phone")) -USER_STATE = configs.get_int("default.user.state") +USER_NAME = os.environ.get("PYDS_USER_NAME", configs.get("default.user.name")) +USER_PASSWORD = os.environ.get( + "PYDS_USER_PASSWORD", 
configs.get("default.user.password") +) +USER_EMAIL = os.environ.get("PYDS_USER_EMAIL", configs.get("default.user.email")) +USER_PHONE = str(os.environ.get("PYDS_USER_PHONE", configs.get("default.user.phone"))) +USER_STATE = get_int( + os.environ.get("PYDS_USER_STATE", configs.get("default.user.state")) +) # Workflow Settings -WORKFLOW_PROJECT = configs.get("default.workflow.project") -WORKFLOW_TENANT = configs.get("default.workflow.tenant") -WORKFLOW_USER = configs.get("default.workflow.user") -WORKFLOW_QUEUE = configs.get("default.workflow.queue") -WORKFLOW_WORKER_GROUP = configs.get("default.workflow.worker_group") -WORKFLOW_TIME_ZONE = configs.get("default.workflow.time_zone") +WORKFLOW_PROJECT = os.environ.get( + "PYDS_WORKFLOW_PROJECT", configs.get("default.workflow.project") +) +WORKFLOW_TENANT = os.environ.get( + "PYDS_WORKFLOW_TENANT", configs.get("default.workflow.tenant") +) +WORKFLOW_USER = os.environ.get( + "PYDS_WORKFLOW_USER", configs.get("default.workflow.user") +) +WORKFLOW_QUEUE = os.environ.get( + "PYDS_WORKFLOW_QUEUE", configs.get("default.workflow.queue") +) +WORKFLOW_WORKER_GROUP = os.environ.get( + "PYDS_WORKFLOW_WORKER_GROUP", configs.get("default.workflow.worker_group") +) +WORKFLOW_TIME_ZONE = os.environ.get( + "PYDS_WORKFLOW_TIME_ZONE", configs.get("default.workflow.time_zone") +) # End Common Configuration Setting diff --git a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/utils/yaml_parser.py b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/utils/yaml_parser.py index 5cea0190d8..46ee08cec8 100644 --- a/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/utils/yaml_parser.py +++ b/dolphinscheduler-python/pydolphinscheduler/src/pydolphinscheduler/utils/yaml_parser.py @@ -146,20 +146,6 @@ class YamlParser: """Get value by key, is call ``__getitem__``.""" return self[key] - def get_int(self, key: str) -> int: - """Get value and covert it to int.""" - return int(self.get(key)) - - def 
get_bool(self, key: str) -> bool: - """Get value and covert it to boolean.""" - val = self.get(key) - if isinstance(val, str): - return val.lower() in {"true", "t"} - elif isinstance(val, int): - return val != 0 - else: - return val - def __str__(self) -> str: """Transfer :class:`YamlParser` to string object. diff --git a/dolphinscheduler-python/pydolphinscheduler/tests/cli/test_config.py b/dolphinscheduler-python/pydolphinscheduler/tests/cli/test_config.py index f7c489a968..d913277b2e 100644 --- a/dolphinscheduler-python/pydolphinscheduler/tests/cli/test_config.py +++ b/dolphinscheduler-python/pydolphinscheduler/tests/cli/test_config.py @@ -38,7 +38,9 @@ def teardown_file_env(): config_file_path = config_path() if config_file_path.exists(): config_file_path.unlink() + # pop environment variable to keep test cases dependent os.environ.pop(ENV_PYDS_HOME, None) + assert ENV_PYDS_HOME not in os.environ @pytest.mark.parametrize( diff --git a/dolphinscheduler-python/pydolphinscheduler/tests/core/test_configuration.py b/dolphinscheduler-python/pydolphinscheduler/tests/core/test_configuration.py index 45d4477bcd..394fd33d92 100644 --- a/dolphinscheduler-python/pydolphinscheduler/tests/core/test_configuration.py +++ b/dolphinscheduler-python/pydolphinscheduler/tests/core/test_configuration.py @@ -17,6 +17,7 @@ """Test class :mod:`pydolphinscheduler.core.configuration`' method.""" +import importlib import os from pathlib import Path from typing import Any @@ -46,6 +47,56 @@ def teardown_file_env(): os.environ.pop(ENV_PYDS_HOME, None) +@pytest.mark.parametrize( + "val, expect", + [ + ("1", 1), + ("123", 123), + ("4567", 4567), + (b"1234", 1234), + ], +) +def test_get_int(val: Any, expect: int): + """Test function :func:`configuration.get_int`.""" + assert configuration.get_int(val) == expect + + +@pytest.mark.parametrize( + "val", + [ + "a", + "1a", + "1d2", + "1723-", + ], +) +def test_get_int_error(val: Any): + """Test function :func:`configuration.get_int`.""" + with 
pytest.raises(ValueError): + configuration.get_int(val) + + +@pytest.mark.parametrize( + "val, expect", + [ + ("t", True), + ("true", True), + (1, True), + (True, True), + ("f", False), + ("false", False), + (0, False), + (123, False), + ("abc", False), + ("abc1", False), + (False, False), + ], +) +def test_get_bool(val: Any, expect: bool): + """Test function :func:`configuration.get_bool`.""" + assert configuration.get_bool(val) == expect + + @pytest.mark.parametrize( "home, expect", [ @@ -176,3 +227,40 @@ def test_single_config_get_set_not_exists_key(): def test_get_configuration(config_name: str, expect: Any): """Test get exists attribute in :mod:`configuration`.""" assert expect == getattr(configuration, config_name) + + +@pytest.mark.parametrize( + "config_name, src, dest", + [ + ("JAVA_GATEWAY_ADDRESS", "127.0.0.1", "192.168.1.1"), + ("JAVA_GATEWAY_PORT", 25333, 25334), + ("JAVA_GATEWAY_AUTO_CONVERT", True, False), + ("USER_NAME", "userPythonGateway", "envUserPythonGateway"), + ("USER_PASSWORD", "userPythonGateway", "envUserPythonGateway"), + ( + "USER_EMAIL", + "userPythonGateway@dolphinscheduler.com", + "userPythonGateway@dolphinscheduler.com", + ), + ("USER_PHONE", "11111111111", "22222222222"), + ("USER_STATE", 1, 0), + ("WORKFLOW_PROJECT", "project-pydolphin", "env-project-pydolphin"), + ("WORKFLOW_TENANT", "tenant_pydolphin", "env-tenant_pydolphin"), + ("WORKFLOW_USER", "userPythonGateway", "envUserPythonGateway"), + ("WORKFLOW_QUEUE", "queuePythonGateway", "envQueuePythonGateway"), + ("WORKFLOW_WORKER_GROUP", "default", "custom"), + ("WORKFLOW_TIME_ZONE", "Asia/Shanghai", "America/Los_Angeles"), + ], +) +def test_get_configuration_env(config_name: str, src: Any, dest: Any): + """Test get exists attribute from environment variable in :mod:`configuration`.""" + assert getattr(configuration, config_name) == src + + env_name = f"PYDS_{config_name}" + os.environ[env_name] = str(dest) + # reload module configuration to re-get config from environment. 
+ importlib.reload(configuration) + assert getattr(configuration, config_name) == dest + + os.environ.pop(env_name, None) + assert env_name not in os.environ diff --git a/dolphinscheduler-python/pydolphinscheduler/tests/testing/constants.py b/dolphinscheduler-python/pydolphinscheduler/tests/testing/constants.py index 63f0fdd822..ed2ee37de7 100644 --- a/dolphinscheduler-python/pydolphinscheduler/tests/testing/constants.py +++ b/dolphinscheduler-python/pydolphinscheduler/tests/testing/constants.py @@ -39,7 +39,7 @@ ignore_exec_examples = { } # pydolphinscheduler environment home -ENV_PYDS_HOME = "PYDOLPHINSCHEDULER_HOME" +ENV_PYDS_HOME = "PYDS_HOME" # whether in dev mode, if true we will add or remove some tests. Or make be and more detail infos when # test failed. diff --git a/dolphinscheduler-python/pydolphinscheduler/tests/utils/test_yaml_parser.py b/dolphinscheduler-python/pydolphinscheduler/tests/utils/test_yaml_parser.py index ae49f2b768..aa6d9eee6c 100644 --- a/dolphinscheduler-python/pydolphinscheduler/tests/utils/test_yaml_parser.py +++ b/dolphinscheduler-python/pydolphinscheduler/tests/utils/test_yaml_parser.py @@ -251,29 +251,3 @@ def test_yaml_parser_str_repr(src: str, setter: Dict, expect: str): # Equal after changed assert expect == str(yaml_parser) assert f"YamlParser({expect})" == repr(yaml_parser) - - -@pytest.mark.parametrize( - "src, key, expect", - [ - (param[1], "java_gateway.port", 25333), - (param[1], "default.user.phone", 11111111111), - (param[1], "default.user.state", 1), - ], -) -def test_yaml_parser_get_int(src: str, key: str, expect: int): - """Test function :func:`YamlParser.get_int`.""" - yaml_parser = YamlParser(src) - assert expect == yaml_parser.get_int(key) - - -@pytest.mark.parametrize( - "src, key, expect", - [ - (param[1], "java_gateway.auto_convert", True), - ], -) -def test_yaml_parser_get_bool(src: str, key: str, expect: bool): - """Test function :func:`YamlParser.get_bool`.""" - yaml_parser = YamlParser(src) - assert expect 
== yaml_parser.get_bool(key)