This is an automated email from the ASF dual-hosted git repository.

manishswaminathan pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new e05ef7b068 Adds metrics for task states (#14785)
e05ef7b068 is described below

commit e05ef7b0683241490cb0f15343e93579e8cfcf44
Author: NOOB <43700604+noob-se...@users.noreply.github.com>
AuthorDate: Wed Jan 22 21:56:27 2025 +0530

    Adds metrics for task states (#14785)
    
    * Adds metrics for task states
    
    * Adds tests
---
 .../configs/controller.yml                         |  2 +-
 .../pinot/common/metrics/ControllerGauge.java      |  4 +++
 .../ControllerPrometheusMetricsTest.java           |  3 ++
 .../core/minion/PinotHelixTaskResourceManager.java | 30 +++++++++++++++++-
 .../helix/core/minion/TaskMetricsEmitter.java      | 18 +++++++++++
 .../helix/core/minion/TaskMetricsEmitterTest.java  | 37 ++++++++++++++++++++--
 6 files changed, 89 insertions(+), 5 deletions(-)

diff --git 
a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml 
b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
index 2281a9ea41..a1b945ab4a 100644
--- a/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
+++ b/docker/images/pinot/etc/jmx_prometheus_javaagent/configs/controller.yml
@@ -17,7 +17,7 @@ rules:
     tableType: "$6"
     partition: "$7"
 # Gauges that accept the controller taskType
-- pattern: 
"\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\", 
name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)"
+- pattern: 
"\"org\\.apache\\.pinot\\.common\\.metrics\"<type=\"ControllerMetrics\", 
name=\"pinot\\.controller\\.(numMinionTasksInProgress|numMinionSubtasksRunning|numMinionSubtasksWaiting|numMinionSubtasksError|numMinionSubtasksUnknown|numMinionSubtasksDropped|numMinionSubtasksTimedOut|numMinionSubtasksAborted|percentMinionSubtasksInQueue|percentMinionSubtasksInError)\\.(\\w+)\"><>(\\w+)"
   name: "pinot_controller_$1_$3"
   cache: true
   labels:
diff --git 
a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
 
b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
index a978219343..99bd892066 100644
--- 
a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
+++ 
b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
@@ -64,11 +64,15 @@ public enum ControllerGauge implements 
AbstractMetrics.Gauge {
   
TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE("TimeMsSinceLastMinionTaskMetadataUpdate",
 false),
   
TIME_MS_SINCE_LAST_SUCCESSFUL_MINION_TASK_GENERATION("TimeMsSinceLastSuccessfulMinionTaskGeneration",
 false),
   
LAST_MINION_TASK_GENERATION_ENCOUNTERS_ERROR("LastMinionTaskGenerationEncountersError",
 false),
+  // TODO: Unify below subtask metrics into a single metric with status label
   NUM_MINION_TASKS_IN_PROGRESS("NumMinionTasksInProgress", true),
   NUM_MINION_SUBTASKS_WAITING("NumMinionSubtasksWaiting", true),
   NUM_MINION_SUBTASKS_RUNNING("NumMinionSubtasksRunning", true),
   NUM_MINION_SUBTASKS_ERROR("NumMinionSubtasksError", true),
   NUM_MINION_SUBTASKS_UNKNOWN("NumMinionSubtasksUnknown", true),
+  NUM_MINION_SUBTASKS_DROPPED("NumMinionSubtasksDropped", true),
+  NUM_MINION_SUBTASKS_TIMED_OUT("NumMinionSubtasksTimedOut", true),
+  NUM_MINION_SUBTASKS_ABORTED("NumMinionSubtasksAborted", true),
   PERCENT_MINION_SUBTASKS_IN_QUEUE("PercentMinionSubtasksInQueue", true),
   PERCENT_MINION_SUBTASKS_IN_ERROR("PercentMinionSubtasksInError", true),
   TIER_BACKEND_TABLE_COUNT("TierBackendTableCount", true),
diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
 
b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
index 1f458a4448..1061a0af54 100644
--- 
a/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
+++ 
b/pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/ControllerPrometheusMetricsTest.java
@@ -41,6 +41,9 @@ public abstract class ControllerPrometheusMetricsTest extends 
PinotPrometheusMet
       List.of(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS, 
ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
           ControllerGauge.NUM_MINION_SUBTASKS_WAITING, 
ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
           ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN,
+          ControllerGauge.NUM_MINION_SUBTASKS_DROPPED,
+          ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT,
+          ControllerGauge.NUM_MINION_SUBTASKS_ABORTED,
           ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, 
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR);
 
   //local gauges that accept partition
diff --git 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
index f8b258265a..87580c30b0 100644
--- 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
+++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java
@@ -1126,7 +1126,7 @@ public class PinotHelixTaskResourceManager {
     }
   }
 
-  @JsonPropertyOrder({"total", "completed", "running", "waiting", "error", 
"unknown"})
+  @JsonPropertyOrder({"total", "completed", "running", "waiting", "error", 
"unknown", "dropped", "timedOut", "aborted"})
   public static class TaskCount {
     private int _waiting;   // Number of tasks waiting to be scheduled on 
minions
     private int _error;     // Number of tasks in error
@@ -1134,6 +1134,10 @@ public class PinotHelixTaskResourceManager {
     private int _completed; // Number of tasks completed normally
     private int _unknown;   // Number of tasks with all other states
     private int _total;     // Total number of tasks in the batch
+    private int _dropped;   // Total number of tasks dropped
+    // (Task can be dropped due to no available assigned instance, etc.)
+    private int _timedOut;  // Total number of tasks timed out
+    private int _aborted;   // Total number of tasks aborted
 
     public TaskCount() {
     }
@@ -1157,6 +1161,15 @@ public class PinotHelixTaskResourceManager {
           case COMPLETED:
             _completed++;
             break;
+          case DROPPED:
+            _dropped++;
+            break;
+          case TIMED_OUT:
+            _timedOut++;
+            break;
+          case TASK_ABORTED:
+            _aborted++;
+            break;
           default:
             _unknown++;
             break;
@@ -1188,6 +1201,18 @@ public class PinotHelixTaskResourceManager {
       return _unknown;
     }
 
+    public int getDropped() {
+      return _dropped;
+    }
+
+    public int getTimedOut() {
+      return _timedOut;
+    }
+
+    public int getAborted() {
+      return _aborted;
+    }
+
     public void accumulate(TaskCount other) {
       _waiting += other.getWaiting();
       _running += other.getRunning();
@@ -1195,6 +1220,9 @@ public class PinotHelixTaskResourceManager {
       _completed += other.getCompleted();
       _unknown += other.getUnknown();
       _total += other.getTotal();
+      _dropped += other.getDropped();
+      _timedOut += other.getTimedOut();
+      _aborted += other.getAborted();
     }
   }
 }
diff --git 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
index ace3694485..9eb6921053 100644
--- 
a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
+++ 
b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java
@@ -116,6 +116,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
             taskTypeAccumulatedCount.getError());
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN,
 taskType,
             taskTypeAccumulatedCount.getUnknown());
+        
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_DROPPED,
 taskType,
+            taskTypeAccumulatedCount.getDropped());
+        
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT,
 taskType,
+            taskTypeAccumulatedCount.getTimedOut());
+        
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ABORTED,
 taskType,
+            taskTypeAccumulatedCount.getAborted());
         int total = taskTypeAccumulatedCount.getTotal();
         int percent = total != 0
             ? (taskTypeAccumulatedCount.getWaiting() + 
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
@@ -133,6 +139,12 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
               ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
           _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
               ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN, 
taskCount.getUnknown());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_DROPPED, 
taskCount.getDropped());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT, 
taskCount.getTimedOut());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_ABORTED, 
taskCount.getAborted());
           int tableTotal = taskCount.getTotal();
           int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() + 
taskCount.getRunning()) * 100 / tableTotal : 0;
           _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
@@ -168,6 +180,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
       _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_WAITING);
       _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_ERROR);
       _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN);
+      _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_DROPPED);
+      _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT);
+      _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_ABORTED);
       _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE);
       _controllerMetrics.removeGlobalGauge(taskType, 
ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR);
       // remove table task type level gauges
@@ -198,6 +213,9 @@ public class TaskMetricsEmitter extends BasePeriodicTask {
       _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_WAITING);
       _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_ERROR);
       _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_UNKNOWN);
+      _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_DROPPED);
+      _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_TIMED_OUT);
+      _controllerMetrics.removeTableGauge(tableNameWithType, taskType, 
ControllerGauge.NUM_MINION_SUBTASKS_ABORTED);
       _controllerMetrics.removeTableGauge(tableNameWithType, taskType,
           ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE);
       _controllerMetrics.removeTableGauge(tableNameWithType, taskType,
diff --git 
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
 
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
index bd88f2731c..6ed301a069 100644
--- 
a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
+++ 
b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitterTest.java
@@ -84,7 +84,7 @@ public class TaskMetricsEmitterTest {
     
Mockito.when(_pinotHelixTaskResourceManager.getTasksInProgress(taskType)).thenReturn(ImmutableSet.of());
     _taskMetricsEmitter.runTask(null);
 
-    Assert.assertEquals(metricsRegistry.allMetrics().size(), 8);
+    Assert.assertEquals(metricsRegistry.allMetrics().size(), 11);
     Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
         new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.onlineMinionInstances")));
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
@@ -99,6 +99,18 @@ public class TaskMetricsEmitterTest {
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksError.taskType1"))
         .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksUnknown.taskType1"))
+        .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksDropped.taskType1"))
+        .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksTimedOut.taskType1"))
+        .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksAborted.taskType1"))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.percentMinionSubtasksInQueue.taskType1"))
         .getMetric()).value(), 0L);
@@ -144,7 +156,7 @@ public class TaskMetricsEmitterTest {
   private void runAndAssertForTaskType1WithTwoTables() {
     PinotMetricsRegistry metricsRegistry = 
_controllerMetrics.getMetricsRegistry();
     _taskMetricsEmitter.runTask(null);
-    Assert.assertEquals(metricsRegistry.allMetrics().size(), 20);
+    Assert.assertEquals(metricsRegistry.allMetrics().size(), 29);
 
     Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
         new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.onlineMinionInstances")));
@@ -160,6 +172,9 @@ public class TaskMetricsEmitterTest {
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksError.taskType1"))
         .getMetric()).value(), 1L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.numMinionSubtasksDropped.taskType1"))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.percentMinionSubtasksInQueue.taskType1"))
         .getMetric()).value(), 50L);
@@ -179,6 +194,10 @@ public class TaskMetricsEmitterTest {
             new YammerMetricName(ControllerMetrics.class,
                 
"pinot.controller.numMinionSubtasksError.table1_OFFLINE.taskType1"))
         .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class,
+                
"pinot.controller.numMinionSubtasksDropped.table1_OFFLINE.taskType1"))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class,
                 
"pinot.controller.percentMinionSubtasksInQueue.table1_OFFLINE.taskType1"))
@@ -200,6 +219,10 @@ public class TaskMetricsEmitterTest {
             new YammerMetricName(ControllerMetrics.class,
                 
"pinot.controller.numMinionSubtasksError.table2_OFFLINE.taskType1"))
         .getMetric()).value(), 1L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class,
+                
"pinot.controller.numMinionSubtasksDropped.table2_OFFLINE.taskType1"))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class,
                 
"pinot.controller.percentMinionSubtasksInQueue.table2_OFFLINE.taskType1"))
@@ -231,7 +254,7 @@ public class TaskMetricsEmitterTest {
 
     PinotMetricsRegistry metricsRegistry = 
_controllerMetrics.getMetricsRegistry();
     _taskMetricsEmitter.runTask(null);
-    Assert.assertEquals(metricsRegistry.allMetrics().size(), 14);
+    Assert.assertEquals(metricsRegistry.allMetrics().size(), 20);
 
     Assert.assertTrue(metricsRegistry.allMetrics().containsKey(
         new YammerMetricName(ControllerMetrics.class, 
"pinot.controller.onlineMinionInstances")));
@@ -251,6 +274,10 @@ public class TaskMetricsEmitterTest {
             new YammerMetricName(ControllerMetrics.class,
                 String.format("pinot.controller.numMinionSubtasksError.%s", 
taskType)))
         .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class,
+                String.format("pinot.controller.numMinionSubtasksDropped.%s", 
taskType)))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class,
                 
String.format("pinot.controller.percentMinionSubtasksInQueue.%s", taskType)))
@@ -272,6 +299,10 @@ public class TaskMetricsEmitterTest {
             new YammerMetricName(ControllerMetrics.class,
                 String.format("pinot.controller.numMinionSubtasksError.%s.%s", 
tableName, taskType)))
         .getMetric()).value(), 0L);
+    Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
+            new YammerMetricName(ControllerMetrics.class,
+                
String.format("pinot.controller.numMinionSubtasksDropped.%s.%s", tableName, 
taskType)))
+        .getMetric()).value(), 0L);
     Assert.assertEquals(((YammerSettableGauge<?>) 
metricsRegistry.allMetrics().get(
             new YammerMetricName(ControllerMetrics.class,
                 
String.format("pinot.controller.percentMinionSubtasksInQueue.%s.%s", tableName, 
taskType)))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to