This is an automated email from the ASF dual-hosted git repository. manishswaminathan pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new e05ef7b068 Adds metrics for task states (#14785) e05ef7b068 is described below commit e05ef7b0683241490cb0f15343e93579e8cfcf44 Author: NOOB <43700604+noob-se...@users.noreply.github.com> AuthorDate: Wed Jan 22 21:56:27 2025 +0530 Adds metrics for task states (#14785) * Adds metrics for task states * Adds tests --- .../configs/controller.yml | 2 +- .../pinot/common/metrics/ControllerGauge.java | 4 +++ .../ControllerPrometheusMetricsTest.java | 3 ++ .../core/minion/PinotHelixTaskResourceManager.java | 30 +++++++++++++++++- .../helix/core/minion/TaskMetricsEmitter.java | 18 +++++++++++ .../helix/core/minion/TaskMetricsEmitterTest.java | 37 ++++++++++++++++++++-- 6 files changed, 89 insertions(+), 5 deletions(-) diff --git a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml index 2281a9ea41..a1b945ab4a 100644 --- a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml +++ b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml @@ -17,7 +17,7 @@ rules: tableType: "$6" partition: "$7" # Gauges that accept the controller taskType -- pattern: "\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\", name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)" +- pattern: "\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\", name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|numMinionSubtasksDropped|numMinionSubtasksTimedOut|numMinionSubtasksAborted|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)" name: "pinot_controller_$1_$3" cache: true labels: diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java index a978219343..99bd892066 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java @@ -64,11 +64,15 @@ public enum ControllerGauge implements AbstractMetrics.Gauge { TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE("TimeMsSinceLastMinionTaskMetadataUpdate", false), TIME_MS_SINCE_LAST_SUCCESSFUL_MINION_TASK_GENERATION("TimeMsSinceLastSuccessfulMinionTaskGeneration", false), LAST_MINION_TASK_GENERATION_ENCOUNTERS_ERROR("LastMinionTaskGenerationEncountersError", false), + // TODO: Unify below subtask metrics into a single metric with status label NUM_MINION_TASKS_IN_PROGRESS("NumMinionTasksInProgress", true), NUM_MINION_SUBTASKS_WAITING("NumMinionSubtasksWaiting", true), NUM_MINION_SUBTASKS_RUNNING("NumMinionSubtasksRunning", true), NUM_MINION_SUBTASKS_ERROR("NumMinionSubtasksError", true), NUM_MINION_SUBTASKS_UNKNOWN("NumMinionSubtasksUnknown", true), + NUM_MINION_SUBTASKS_DROPPED("NumMinionSubtasksDropped", true), + NUM_MINION_SUBTASKS_TIMED_OUT("NumMinionSubtasksTimedOut", true), + NUM_MINION_SUBTASKS_ABORTED("NumMinionSubtasksAborted", true), PERCENT_MINION_SUBTASKS_IN_QUEUE("PercentMinionSubtasksInQueue", true), PERCENT_MINION_SUBTASKS_IN_ERROR("PercentMinionSubtasksInError", true), TIER_BACKEND_TABLE_COUNT("TierBackendTableCount", true), diff --git a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java index 1f458a4448..1061a0af54 100644 --- a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java +++ b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java @@ -41,6 +41,9 @@ public abstract class ControllerPrometheusMetricsTest extends PinotPrometheusMet List.of(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS, ControllerGauge.NUM_MINION_SUBTASKS_RUNNING, ControllerGauge.NUM_MINION_SUBTASKS_WAITING, ControllerGauge.NUM_MINION_SUBTASKS_ERROR, ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN, + ControllerGauge.NUM_MINION_SUBTASKS_DROPPED, + ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT, + ControllerGauge.NUM_MINION_SUBTASKS_ABORTED, ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR); //local gauges that accept partition diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java index f8b258265a..87580c30b0 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java @@ -1126,7 +1126,7 @@ public class PinotHelixTaskResourceManager { } } - @JsonPropertyOrder({"total", "completed", "running", "waiting", "error", "unknown"}) + @JsonPropertyOrder({"total", "completed", "running", "waiting", "error", "unknown", "dropped", "timedOut", "aborted"}) public static class TaskCount { private int _waiting; // Number of tasks waiting to be scheduled on minions private int _error; // Number of tasks in error @@ -1134,6 +1134,10 @@ public class PinotHelixTaskResourceManager { private int _completed; // Number of tasks completed normally private int _unknown; // Number of tasks with all other states private int _total; // Total number of tasks in the batch + private int _dropped; // Total number of tasks dropped + // (Task can be dropped due to no available assigned instance, etc.) + private int _timedOut; // Total number of tasks timed out + private int _aborted; // Total number of tasks aborted public TaskCount() { } @@ -1157,6 +1161,15 @@ public class PinotHelixTaskResourceManager { case COMPLETED: _completed++; break; + case DROPPED: + _dropped++; + break; + case TIMED_OUT: + _timedOut++; + break; + case TASK_ABORTED: + _aborted++; + break; default: _unknown++; break; @@ -1188,6 +1201,18 @@ public class PinotHelixTaskResourceManager { return _unknown; } + public int getDropped() { + return _dropped; + } + + public int getTimedOut() { + return _timedOut; + } + + public int getAborted() { + return _aborted; + } + public void accumulate(TaskCount other) { _waiting += other.getWaiting(); _running += other.getRunning(); @@ -1195,6 +1220,9 @@ public class PinotHelixTaskResourceManager { _completed += other.getCompleted(); _unknown += other.getUnknown(); _total += other.getTotal(); + _dropped += other.getDropped(); + _timedOut += other.getTimedOut(); + _aborted += other.getAborted(); } } } diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java index ace3694485..9eb6921053 100644 --- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java +++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java @@ -116,6 +116,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask { taskTypeAccumulatedCount.getError()); _controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN, taskType, taskTypeAccumulatedCount.getUnknown()); + _controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_DROPPED, taskType, + taskTypeAccumulatedCount.getDropped()); + _controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT, taskType, + taskTypeAccumulatedCount.getTimedOut()); + _controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ABORTED, taskType, + taskTypeAccumulatedCount.getAborted()); int total = taskTypeAccumulatedCount.getTotal(); int percent = total != 0 ? (taskTypeAccumulatedCount.getWaiting() + taskTypeAccumulatedCount.getRunning()) * 100 / total : 0; @@ -133,6 +139,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask { ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError()); _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN, taskCount.getUnknown()); + _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType, + ControllerGauge.NUM_MINION_SUBTASKS_DROPPED, taskCount.getDropped()); + _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType, + ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT, taskCount.getTimedOut()); + _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType, + ControllerGauge.NUM_MINION_SUBTASKS_ABORTED, taskCount.getAborted()); int tableTotal = taskCount.getTotal(); int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() + taskCount.getRunning()) * 100 / tableTotal : 0; _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType, @@ -168,6 +180,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask { _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_WAITING); _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_ERROR); _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN); + _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_DROPPED); + _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT); + _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.NUM_MINION_SUBTASKS_ABORTED); _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE); _controllerMetrics.removeGlobalGauge(taskType, ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR); // remove table task type level gauges @@ -198,6 +213,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask { _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_WAITING); _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_ERROR); _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN); + _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_DROPPED); + _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT); + _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.NUM_MINION_SUBTASKS_ABORTED); _controllerMetrics.removeTableGauge(tableNameWithType, taskType, ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE); _controllerMetrics.removeTableGauge(tableNameWithType, taskType, diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java index bd88f2731c..6ed301a069 100644 --- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java +++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java @@ -84,7 +84,7 @@ public class TaskMetricsEmitterTest { Mockito.when(_pinotHelixTaskResourceManager.getTasksInProgress(taskType)).thenReturn(ImmutableSet.of()); _taskMetricsEmitter.runTask(null); - Assert.assertEquals(metricsRegistry.allMetrics().size(), 8); + Assert.assertEquals(metricsRegistry.allMetrics().size(), 11); Assert.assertTrue(metricsRegistry.allMetrics().containsKey( new YammerMetricName(ControllerMetrics.class, "pinot.controller.onlineMinionInstances"))); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( @@ -99,6 +99,18 @@ public class TaskMetricsEmitterTest { Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksError.taskType1")) .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksUnknown.taskType1")) + .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksDropped.taskType1")) + .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksTimedOut.taskType1")) + .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksAborted.taskType1")) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.percentMinionSubtasksInQueue.taskType1")) .getMetric()).value(), 0L); @@ -144,7 +156,7 @@ public class TaskMetricsEmitterTest { private void runAndAssertForTaskType1WithTwoTables() { PinotMetricsRegistry metricsRegistry = _controllerMetrics.getMetricsRegistry(); _taskMetricsEmitter.runTask(null); - Assert.assertEquals(metricsRegistry.allMetrics().size(), 20); + Assert.assertEquals(metricsRegistry.allMetrics().size(), 29); Assert.assertTrue(metricsRegistry.allMetrics().containsKey( new YammerMetricName(ControllerMetrics.class, "pinot.controller.onlineMinionInstances"))); @@ -160,6 +172,9 @@ public class TaskMetricsEmitterTest { Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksError.taskType1")) .getMetric()).value(), 1L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksDropped.taskType1")) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.percentMinionSubtasksInQueue.taskType1")) .getMetric()).value(), 50L); @@ -179,6 +194,10 @@ public class TaskMetricsEmitterTest { new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksError.table1_OFFLINE.taskType1")) .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, + "pinot.controller.numMinionSubtasksDropped.table1_OFFLINE.taskType1")) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.percentMinionSubtasksInQueue.table1_OFFLINE.taskType1")) @@ -200,6 +219,10 @@ public class TaskMetricsEmitterTest { new YammerMetricName(ControllerMetrics.class, "pinot.controller.numMinionSubtasksError.table2_OFFLINE.taskType1")) .getMetric()).value(), 1L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, + "pinot.controller.numMinionSubtasksDropped.table2_OFFLINE.taskType1")) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, "pinot.controller.percentMinionSubtasksInQueue.table2_OFFLINE.taskType1")) @@ -231,7 +254,7 @@ public class TaskMetricsEmitterTest { PinotMetricsRegistry metricsRegistry = _controllerMetrics.getMetricsRegistry(); _taskMetricsEmitter.runTask(null); - Assert.assertEquals(metricsRegistry.allMetrics().size(), 14); + Assert.assertEquals(metricsRegistry.allMetrics().size(), 20); Assert.assertTrue(metricsRegistry.allMetrics().containsKey( new YammerMetricName(ControllerMetrics.class, "pinot.controller.onlineMinionInstances"))); @@ -251,6 +274,10 @@ public class TaskMetricsEmitterTest { new YammerMetricName(ControllerMetrics.class, String.format("pinot.controller.numMinionSubtasksError.%s", taskType))) .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, + String.format("pinot.controller.numMinionSubtasksDropped.%s", taskType))) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, String.format("pinot.controller.percentMinionSubtasksInQueue.%s", taskType))) @@ -272,6 +299,10 @@ public class TaskMetricsEmitterTest { new YammerMetricName(ControllerMetrics.class, String.format("pinot.controller.numMinionSubtasksError.%s.%s", tableName, taskType))) .getMetric()).value(), 0L); + Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( + new YammerMetricName(ControllerMetrics.class, + String.format("pinot.controller.numMinionSubtasksDropped.%s.%s", tableName, taskType))) + .getMetric()).value(), 0L); Assert.assertEquals(((YammerSettableGauge<?>) metricsRegistry.allMetrics().get( new YammerMetricName(ControllerMetrics.class, String.format("pinot.controller.percentMinionSubtasksInQueue.%s.%s", tableName, taskType))) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org