Browse Source

[Feature][Metrics] Add resource download related metrics for workers (#10749)

* [Feature][Metrics] Add resource download related metrics for workers (#9324)

* [Feature][Metrics] Fix bugs and add grafana demos for worker resource download metrics (#9324)

* [Feature][Metrics] Add docs to resource related metrics (#9324)

* [Feature][Metrics] Use tags to indicate status in metrics (#9324)

* [Feature][Metrics] Fix demos, docs and remove redundant code (#9324)

* [Feature][Metrics] Remove .pnpm-debug.log (#9324)

* [Feature][Metrics] Fix style check (#9324)

* [Feature][Metrics] Replace KB with bytes for the unit of resource file size in metrics (#9324)

* [Feature][Metrics] Make code neat (#9324)
3.1.0-release
Eric Gao 2 years ago committed by GitHub
parent
commit
2f7281c2d2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
  1. 5
      docs/docs/en/guide/metrics/metrics.md
  2. 3
      docs/docs/zh/guide/metrics/metrics.md
  3. 408
      dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json
  4. 49
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java
  5. 9
      dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java

5
docs/docs/en/guide/metrics/metrics.md

@ -74,7 +74,7 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
- ds.task.execution.count.by.type: (counter) the number of task executions grouped by tag `task_type`
- ds.task.running: (gauge) the number of running tasks
- ds.task.prepared: (gauge) the number of tasks prepared for task queue
- ds.task.execution.count: (histogram) the number of executed tasks
- ds.task.execution.count: (counter) the number of executed tasks
- ds.task.execution.duration: (histogram) duration of task executions
@ -103,6 +103,9 @@ For example, you can get the master metrics by `curl http://localhost:5679/actua
- ds.worker.overload.count: (counter) the number of times the worker overloaded
- ds.worker.full.submit.queue.count: (counter) the number of times the worker's submit queue being full
- ds.worker.resource.download.count: (counter) the number of downloaded resource files on workers, sliced by tag `status`
- ds.worker.resource.download.duration: (histogram) the time cost of resource download on workers
- ds.worker.resource.download.size: (histogram) the sizes of downloaded resource files on workers (bytes)
### Api Server Metrics

3
docs/docs/zh/guide/metrics/metrics.md

@ -104,6 +104,9 @@ metrics exporter端口`server.port`是在application.yaml里定义的: master: `
- ds.worker.overload.count: (counter) worker过载次数
- ds.worker.full.submit.queue.count: (counter) worker提交队列全满次数
- ds.worker.resource.download.count: (counter) worker下载资源文件的次数,可由`status`标签切分
- ds.worker.resource.download.duration: (histogram) worker下载资源文件时花费的时间分布
- ds.worker.resource.download.size: (histogram) worker下载资源文件大小的分布(bytes)
### Api Server指标

408
dolphinscheduler-meter/src/main/resources/grafana/DolphinSchedulerWorker.json

@ -764,13 +764,398 @@
"align": false
}
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 33
},
"id": 47,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "sum(increase(ds_worker_resource_download_count_total{}[5m]))",
"refId": "A"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "increase(ds_worker_resource_download_count_total{status=\"success\"}[5m])",
"hide": false,
"refId": "B"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "increase(ds_worker_resource_download_count_total{status=\"fail\"}[5m])",
"hide": false,
"refId": "C"
}
],
"title": "Worker Resource Download Count/5m",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 41
},
"id": 44,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "increase(ds_worker_resource_download_duration_seconds{}[5m])",
"refId": "A"
}
],
"title": "Worker Resource Download Time/5m",
"type": "timeseries"
},
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"fieldConfig": {
"defaults": {
"color": {
"mode": "palette-classic"
},
"custom": {
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 0,
"gradientMode": "none",
"hideFrom": {
"legend": false,
"tooltip": false,
"viz": false
},
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": {
"type": "linear"
},
"showPoints": "auto",
"spanNulls": false,
"stacking": {
"group": "A",
"mode": "none"
},
"thresholdsStyle": {
"mode": "off"
}
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 41
},
"id": 45,
"options": {
"legend": {
"calcs": [],
"displayMode": "list",
"placement": "bottom"
},
"tooltip": {
"mode": "single",
"sort": "none"
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "increase(ds_worker_resource_download_size_bytes[5m])",
"refId": "A"
}
],
"title": "Worker Resource Download Size/5m",
"type": "timeseries"
},
{
"cards": {},
"color": {
"cardColor": "#b4ff00",
"colorScale": "sqrt",
"colorScheme": "interpolateOranges",
"exponent": 0.5,
"mode": "spectrum"
},
"dataFormat": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 49
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 46,
"legend": {
"show": false
},
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "histogram_quantile(0.75, sum(increase(ds_worker_resource_download_size_bytes_bucket[5m])) by (le))",
"refId": "A"
}
],
"title": "Worker Resource Download Size Distribution/5m",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"format": "short",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"cards": {},
"color": {
"cardColor": "#b4ff00",
"colorScale": "sqrt",
"colorScheme": "interpolateOranges",
"exponent": 0.5,
"mode": "spectrum"
},
"dataFormat": "timeseries",
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"gridPos": {
"h": 8,
"w": 12,
"x": 12,
"y": 49
},
"heatmap": {},
"hideZeroBuckets": false,
"highlightCards": true,
"id": 48,
"legend": {
"show": false
},
"pluginVersion": "8.5.3",
"reverseYBuckets": false,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "PBFA97CFB590B2093"
},
"expr": "histogram_quantile(0.95, sum(rate(ds_worker_resource_download_duration_seconds_bucket[5m])) by (le))",
"refId": "A"
}
],
"title": "Worker Resource Download Time Distribution/5m",
"tooltip": {
"show": true,
"showHistogram": false
},
"type": "heatmap",
"xAxis": {
"show": true
},
"yAxis": {
"format": "short",
"logBase": 1,
"show": true
},
"yBucketBound": "auto"
},
{
"collapsed": false,
"gridPos": {
"h": 1,
"w": 24,
"x": 0,
"y": 33
"y": 57
},
"id": 26,
"panels": [],
@ -836,7 +1221,7 @@
"h": 8,
"w": 8,
"x": 0,
"y": 34
"y": 58
},
"id": 14,
"options": {
@ -925,7 +1310,7 @@
"h": 8,
"w": 9,
"x": 8,
"y": 34
"y": 58
},
"id": 20,
"options": {
@ -1014,7 +1399,7 @@
"h": 8,
"w": 7,
"x": 17,
"y": 34
"y": 58
},
"id": 22,
"options": {
@ -1103,7 +1488,7 @@
"h": 8,
"w": 8,
"x": 0,
"y": 42
"y": 66
},
"id": 32,
"options": {
@ -1192,7 +1577,7 @@
"h": 8,
"w": 9,
"x": 8,
"y": 42
"y": 66
},
"id": 34,
"options": {
@ -1281,7 +1666,7 @@
"h": 8,
"w": 7,
"x": 17,
"y": 42
"y": 66
},
"id": 36,
"options": {
@ -1370,7 +1755,7 @@
"h": 8,
"w": 8,
"x": 0,
"y": 50
"y": 74
},
"id": 38,
"options": {
@ -1459,7 +1844,7 @@
"h": 8,
"w": 9,
"x": 8,
"y": 50
"y": 74
},
"id": 40,
"options": {
@ -1548,7 +1933,7 @@
"h": 8,
"w": 7,
"x": 17,
"y": 50
"y": 74
},
"id": 42,
"options": {
@ -1579,6 +1964,7 @@
"type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": [],
@ -1593,6 +1979,6 @@
"timezone": "",
"title": "Worker",
"uid": "6wXtd3r7k",
"version": 2,
"version": 11,
"weekStart": ""
}

49
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/metrics/WorkerServerMetrics.java

@ -17,11 +17,14 @@
package org.apache.dolphinscheduler.server.worker.metrics;
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;
import io.micrometer.core.instrument.Counter;
import io.micrometer.core.instrument.DistributionSummary;
import io.micrometer.core.instrument.Gauge;
import io.micrometer.core.instrument.Metrics;
import io.micrometer.core.instrument.Timer;
import lombok.experimental.UtilityClass;
@UtilityClass
@ -37,6 +40,33 @@ public class WorkerServerMetrics {
.description("full worker submit queues count")
.register(Metrics.globalRegistry);
private static final Counter WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER =
Counter.builder("ds.worker.resource.download.count")
.tag("status", "success")
.description("worker resource download success count")
.register(Metrics.globalRegistry);
private static final Counter WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER =
Counter.builder("ds.worker.resource.download.count")
.tag("status", "fail")
.description("worker resource download failure count")
.register(Metrics.globalRegistry);
private static final Timer WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER =
Timer.builder("ds.worker.resource.download.duration")
.publishPercentiles(0.5, 0.75, 0.95, 0.99)
.publishPercentileHistogram()
.description("time cost of resource download on workers")
.register(Metrics.globalRegistry);
private static final DistributionSummary WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION =
DistributionSummary.builder("ds.worker.resource.download.size")
.baseUnit("bytes")
.publishPercentiles(0.5, 0.75, 0.95, 0.99)
.publishPercentileHistogram()
.description("size of downloaded resource files on worker")
.register(Metrics.globalRegistry);
public static void incWorkerOverloadCount() {
WORKER_OVERLOAD_COUNTER.increment();
}
@ -45,11 +75,26 @@ public class WorkerServerMetrics {
WORKER_SUBMIT_QUEUE_IS_FULL_COUNTER.increment();
}
public static void registerWorkerRunningTaskGauge(Supplier<Number> supplier) {
public static void incWorkerResourceDownloadSuccessCount() {
WORKER_RESOURCE_DOWNLOAD_SUCCESS_COUNTER.increment();
}
public static void incWorkerResourceDownloadFailureCount() {
WORKER_RESOURCE_DOWNLOAD_FAILURE_COUNTER.increment();
}
public static void recordWorkerResourceDownloadTime(final long milliseconds) {
WORKER_RESOURCE_DOWNLOAD_DURATION_TIMER.record(milliseconds, TimeUnit.MILLISECONDS);
}
public static void recordWorkerResourceDownloadSize(final long size) {
WORKER_RESOURCE_DOWNLOAD_SIZE_DISTRIBUTION.record(size);
}
public static void registerWorkerRunningTaskGauge(final Supplier<Number> supplier) {
Gauge.builder("ds.task.running", supplier)
.description("number of running tasks on workers")
.register(Metrics.globalRegistry);
}
}

9
dolphinscheduler-worker/src/main/java/org/apache/dolphinscheduler/server/worker/runner/TaskExecuteThread.java

@ -36,6 +36,7 @@ import org.apache.dolphinscheduler.plugin.task.api.enums.ExecutionStatus;
import org.apache.dolphinscheduler.plugin.task.api.model.Property;
import org.apache.dolphinscheduler.plugin.task.api.model.TaskAlertInfo;
import org.apache.dolphinscheduler.server.utils.ProcessUtils;
import org.apache.dolphinscheduler.server.worker.metrics.WorkerServerMetrics;
import org.apache.dolphinscheduler.server.worker.processor.TaskCallbackService;
import org.apache.dolphinscheduler.service.alert.AlertClientService;
import org.apache.dolphinscheduler.service.exceptions.ServiceException;
@ -46,7 +47,9 @@ import org.apache.commons.lang3.tuple.Pair;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.NoSuchFileException;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
@ -278,8 +281,14 @@ public class TaskExecuteThread implements Runnable, Delayed {
String tenantCode = fileDownload.getRight();
String resHdfsPath = storageOperate.getResourceFileName(tenantCode, fullName);
logger.info("get resource file from hdfs :{}", resHdfsPath);
long resourceDownloadStartTime = System.currentTimeMillis();
storageOperate.download(tenantCode, resHdfsPath, execLocalPath + File.separator + fullName, false, true);
WorkerServerMetrics.recordWorkerResourceDownloadTime(System.currentTimeMillis() - resourceDownloadStartTime);
WorkerServerMetrics.recordWorkerResourceDownloadSize(
Files.size(Paths.get(execLocalPath, fullName)));
WorkerServerMetrics.incWorkerResourceDownloadSuccessCount();
} catch (Exception e) {
WorkerServerMetrics.incWorkerResourceDownloadFailureCount();
logger.error(e.getMessage(), e);
throw new ServiceException(e.getMessage());
}

Loading…
Cancel
Save