From 2f7281c2d2d3ceb60bae166932baa04d7703bf26 Mon Sep 17 00:00:00 2001 From: Eric Gao Date: Tue, 12 Jul 2022 11:44:34 +0800 Subject: [PATCH] [Feature][Metrics] Add resource download related metrics for workers (#10749) * [Feature][Metrics] Add resource download related metrics for workers (#9324) * [Feature][Metrics] Fix bugs and add grafana demos for worker resource download metrics (#9324) * [Feature][Metrics] Add docs to resource related metrics (#9324) * [Feature][Metrics] Use tags to indicate status in metrics (#9324) * [Feature][Metrics] Fix demos, docs and remove redundant code (#9324) * [Feature][Metrics] Remove .pnpm-debug.log (#9324) * [Feature][Metrics] Fix style check (#9324) * [Feature][Metrics] Replace KB with bytes for the unit of resource file size in metrics (#9324) * [Feature][Metrics] Make code neat (#9324) --- docs/docs/en/guide/metrics/metrics.md | 5 +- docs/docs/zh/guide/metrics/metrics.md | 3 + .../grafana/DolphinSchedulerWorker.json | 408 +++++++++++++++++- .../worker/metrics/WorkerServerMetrics.java | 49 ++- .../worker/runner/TaskExecuteThread.java | 9 + 5 files changed, 460 insertions(+), 14 deletions(-) diff --git a/docs/docs/en/guide/metrics/metrics.md b/docs/docs/en/guide/metrics/metrics.md index 2ba4f639d6..9ecdc3619f 100644 --- a/docs/docs/en/guide/metrics/metrics.md +++ b/docs/docs/en/guide/metrics/metrics.md @@ -74,7 +74,7 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua - ds.task.execution.count.by.type: (counter) the number of task executions grouped by tag `task_type` - ds.task.running: (gauge) the number of running tasks - ds.task.prepared: (gauge) the number of tasks prepared for task queue -- ds.task.execution.count: (histogram) the number of executed tasks +- ds.task.execution.count: (counter) the number of executed tasks - ds.task.execution.duration: (histogram) duration of task executions @@ -103,6 +103,9 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua - ds.worker.overload.count: (counter) the number of times the worker overloaded - ds.worker.full.submit.queue.count: (counter) the number of times the worker's submit queue being full +- ds.worker.resource.download.count: (counter) the number of downloaded resource files on workers, sliced by tag `status` +- ds.worker.resource.download.duration: (histogram) the time cost of resource download on workers +- ds.worker.resource.download.size: (histogram) the sizes of downloaded resource files on workers (bytes) ### Api Server Metrics diff --git a/docs/docs/zh/guide/metrics/metrics.md b/docs/docs/zh/guide/metrics/metrics.md index 9b3805c613..3116f5445b 100644 --- a/docs/docs/zh/guide/metrics/metrics.md +++ b/docs/docs/zh/guide/metrics/metrics.md @@ -104,6 +104,9 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: ` - ds.worker.overload.count: (counter) worker过载次数 - ds.worker.full.submit.queue.count: (counter) worker提交队列全满次数 +- ds.worker.resource.download.count: (counter) worker下载资源文件的次数,可由`status`标签切分 +- ds.worker.resource.download.duration: (histogram) worker下载资源文件时花费的时间分布 +- ds.worker.resource.download.size: (histogram) worker下载资源文件大小的分布(bytes) ### Api Server指标 diff --git a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json index 01cd09baad..c13e030259 100644 --- a/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json +++ b/dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json @@ -764,13 +764,398 @@ "align": false } }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 47, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "sum(increase(ds_worker_resource_download_count_total{}[5m]))", + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "increase(ds_worker_resource_download_count_total{status=\"success\"}[5m])", + "hide": false, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "increase(ds_worker_resource_download_count_total{status=\"fail\"}[5m])", + "hide": false, + "refId": "C" + } + ], + "title": "Worker Resource Download Count/5m", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 41 + }, + "id": 44, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "increase(ds_worker_resource_download_duration_seconds{}[5m])", + "refId": "A" + } + ], + "title": "Worker Resource Download Time/5m", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 41 + }, + "id": 45, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom" + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "increase(ds_worker_resource_download_size_bytes[5m])", + "refId": "A" + } + ], + "title": "Worker Resource Download Size/5m", + "type": "timeseries" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 49 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 46, + "legend": { + "show": false + }, + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "histogram_quantile(0.75, sum(increase(ds_worker_resource_download_size_bytes_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "Worker Resource Download Size Distribution/5m", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "short", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, + { + "cards": {}, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateOranges", + "exponent": 0.5, + "mode": "spectrum" + }, + "dataFormat": "timeseries", + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 49 + }, + "heatmap": {}, + "hideZeroBuckets": false, + "highlightCards": true, + "id": 48, + "legend": { + "show": false + }, + "pluginVersion": "8.5.3", + "reverseYBuckets": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "PBFA97CFB590B2093" + }, + "expr": "histogram_quantile(0.95, sum(rate(ds_worker_resource_download_duration_seconds_bucket[5m])) by (le))", + "refId": "A" + } + ], + "title": "Worker Resource Download Time Distribution/5m", + "tooltip": { + "show": true, + "showHistogram": false + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "yAxis": { + "format": "short", + "logBase": 1, + "show": true + }, + "yBucketBound": "auto" + }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 57 }, "id": 26, "panels": [], @@ -836,7 +1221,7 @@ "h": 8, "w": 8, "x": 0, - "y": 34 + "y": 58 }, "id": 14, "options": { @@ -925,7 +1310,7 @@ "h": 8, "w": 9, "x": 8, - "y": 34 + "y": 58 }, "id": 20, "options": { @@ -1014,7 +1399,7 @@ "h": 8, "w": 7, "x": 17, - "y": 34 + "y": 58 }, "id": 22, "options": { @@ -1103,7 +1488,7 @@ "h": 8, "w": 8, "x": 0, - "y": 42 + "y": 66 }, "id": 32, "options": { @@ -1192,7 +1577,7 @@ "h": 8, "w": 9, "x": 8, - "y": 42 + "y": 66 }, "id": 34, "options": { @@ -1281,7 +1666,7 @@ "h": 8, "w": 7, "x": 17, - "y": 42 + "y": 66 }, "id": 36, "options": { @@ -1370,7 +1755,7 @@ "h": 8, "w": 8, "x": 0, - "y": 50 + "y": 74 }, "id": 38, "options": { @@ -1459,7 +1844,7 @@ "h": 8, "w": 9, "x": 8, - "y": 50 + "y": 74 }, "id": 40, "options": { @@ -1548,7 +1933,7 @@ "h": 8, "w": 7, "x": 17, - "y": 50 + "y": 74 }, "id": 42, "options": { @@ -1579,6 +1964,7 @@ "type": "timeseries" } ], + "refresh": "30s", "schemaVersion": 36, "style": "dark", "tags": [], @@ -1593,6 +1979,6 @@ "timezone": "", "title": "Worker", "uid": "6wXtd3r7k", - "version": 2, + "version": 11, "weekStart": "" } \ No newline at end of file diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java index 6c68cc00ec..c0ae07269b 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java @@ -17,11 +17,14 @@ package org.apache.dolphinscheduler.server.worker.metrics; +import java.util.concurrent.TimeUnit; import java.util.function.Supplier; import io.micrometer.core.instrument.Counter; +import io.micrometer.core.instrument.DistributionSummary; import io.micrometer.core.instrument.Gauge; import io.micrometer.core.instrument.Metrics; +import io.micrometer.core.instrument.Timer; import lombok.experimental.UtilityClass; @UtilityClass @@ -37,6 +40,33 @@ public class WorkerServerMetrics { .description("full worker submit queues count") .register(Metrics.globalRegistry); + private static final Counter WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER = + Counter.builder("ds.worker.resource.download.count") + .tag("status", "success") + .description("worker resource download success count") + .register(Metrics.globalRegistry); + + private static final Counter WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER = + Counter.builder("ds.worker.resource.download.count") + .tag("status", "fail") + .description("worker resource download failure count") + .register(Metrics.globalRegistry); + + private static final Timer WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER = + Timer.builder("ds.worker.resource.download.duration") + .publishPercentiles(0.5, 0.75, 0.95, 0.99) + .publishPercentileHistogram() + .description("time cost of resource download on workers") + .register(Metrics.globalRegistry); + + private static final DistributionSummary WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION = + DistributionSummary.builder("ds.worker.resource.download.size") + .baseUnit("bytes") + .publishPercentiles(0.5, 0.75, 0.95, 0.99) + .publishPercentileHistogram() + .description("size of downloaded resource files on worker") + .register(Metrics.globalRegistry); + public static void incWorkerOverloadCount() { WORKER_OVERLOAD_COUNTER.increment(); } @@ -45,11 +75,26 @@ public class WorkerServerMetrics { WORKER_SUBMIT_QUEUE_IS_FULL_COUNTER.increment(); } - public static void registerWorkerRunningTaskGauge(Supplier supplier) { + public static void incWorkerResourceDownloadSuccessCount() { + WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER.increment(); + } + + public static void incWorkerResourceDownloadFailureCount() { + WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER.increment(); + } + + public static void recordWorkerResourceDownloadTime(final long milliseconds) { + WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER.record(milliseconds, TimeUnit.MILLISECONDS); + } + + public static void recordWorkerResourceDownloadSize(final long size) { + WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION.record(size); + } + + public static void registerWorkerRunningTaskGauge(final Supplier supplier) { Gauge.builder("ds.task.running", supplier) .description("number of running tasks on workers") .register(Metrics.globalRegistry); - } } diff --git a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java index 2aa34a21fd..7e03f83844 100644 --- a/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java +++ b/dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java @@ -36,6 +36,7 @@ import org.apache.dolphinscheduler.plugin.task.api.enums.ExecutionStatus; import org.apache.dolphinscheduler.plugin.task.api.model.Property; import org.apache.dolphinscheduler.plugin.task.api.model.TaskAlertInfo; import org.apache.dolphinscheduler.server.utils.ProcessUtils; +import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics; import org.apache.dolphinscheduler.server.worker.processor.TaskCallbackService; import org.apache.dolphinscheduler.service.alert.AlertClientService; import org.apache.dolphinscheduler.service.exceptions.ServiceException; @@ -46,7 +47,9 @@ import org.apache.commons.lang3.tuple.Pair; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.NoSuchFileException; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.Collections; import java.util.Date; @@ -278,8 +281,14 @@ public class TaskExecuteThread implements Runnable, Delayed { String tenantCode = fileDownload.getRight(); String resHdfsPath = storageOperate.getResourceFileName(tenantCode, fullName); logger.info("get resource file from hdfs :{}", resHdfsPath); + long resourceDownloadStartTime = System.currentTimeMillis(); storageOperate.download(tenantCode, resHdfsPath, execLocalPath + File.separator + fullName, false, true); + WorkerServerMetrics.recordWorkerResourceDownloadTime(System.currentTimeMillis() - resourceDownloadStartTime); + WorkerServerMetrics.recordWorkerResourceDownloadSize( + Files.size(Paths.get(execLocalPath, fullName))); + WorkerServerMetrics.incWorkerResourceDownloadSuccessCount(); } catch (Exception e) { + WorkerServerMetrics.incWorkerResourceDownloadFailureCount(); logger.error(e.getMessage(), e); throw new ServiceException(e.getMessage()); }