snleee commented on code in PR #10083:
URL: https://github.com/apache/pinot/pull/10083#discussion_r1065359235


##########
pinot-common/src/main/java/org/apache/pinot/common/metrics/AbstractMetrics.java:
##########
@@ -333,9 +333,7 @@ public PinotMeter getMeteredTableValue(final String 
tableName, final M meter) {
    * @param unitCount The number of units to add to the gauge
    */
   public void addValueToTableGauge(final String tableName, final G gauge, 
final long unitCount) {
-    final String fullGaugeName;

Review Comment:
   +1 on this change 



##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java:
##########
@@ -71,42 +78,105 @@ protected final void runTask(Properties 
periodicTaskProperties) {
         _helixTaskResourceManager.getTaskMetadataLastUpdateTimeMs();
     taskMetadataLastUpdateTime.forEach((tableNameWithType, 
taskTypeLastUpdateTime) ->
         taskTypeLastUpdateTime.forEach((taskType, lastUpdateTimeMs) ->
-            _controllerMetrics.addOrUpdateGauge(
-                
ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE.getGaugeName() + 
"."
-                    + tableNameWithType + "." + taskType, () -> 
System.currentTimeMillis() - lastUpdateTimeMs)));
+            _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, 
taskType,
+                ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE,
+                () -> System.currentTimeMillis() - lastUpdateTimeMs)));
 
     // The call to get task types can take time if there are a lot of tasks.
     // Potential optimization is to call it every (say) 30m if we detect a 
barrage of
     // zk requests.
     Set<String> taskTypes = _helixTaskResourceManager.getTaskTypes();
     for (String taskType : taskTypes) {
-      TaskCount accumulated = new TaskCount();
+      TaskCount taskTypeAccumulatedCount = new TaskCount();
+      Map<String, TaskCount> tableAccumulatedCount = new HashMap<>();
       try {
         Set<String> tasksInProgress = 
_helixTaskResourceManager.getTasksInProgress(taskType);
         final int numRunningTasks = tasksInProgress.size();
         for (String task : tasksInProgress) {
-          TaskCount taskCount = _helixTaskResourceManager.getTaskCount(task);
-          accumulated.accumulate(taskCount);
+          Map<String, TaskCount> tableTaskCount = 
_helixTaskResourceManager.getTableTaskCount(task);
+          tableTaskCount.forEach((tableNameWithType, taskCount) -> {
+            taskTypeAccumulatedCount.accumulate(taskCount);
+            tableAccumulatedCount.compute(tableNameWithType, (name, count) -> {
+              if (count == null) {
+                count = new TaskCount();
+              }
+              count.accumulate(taskCount);
+              return count;
+            });
+          });
         }
         // Emit metrics for taskType.
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS,
 taskType,
             numRunningTasks);
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
 taskType,
-            accumulated.getRunning());
+            taskTypeAccumulatedCount.getRunning());
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
 taskType,
-            accumulated.getWaiting());
+            taskTypeAccumulatedCount.getWaiting());
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
 taskType,
-            accumulated.getError());
-        int total = accumulated.getTotal();
-        int percent = total != 0 ? (accumulated.getWaiting() + 
accumulated.getRunning()) * 100 / total : 0;
+            taskTypeAccumulatedCount.getError());
+        int total = taskTypeAccumulatedCount.getTotal();
+        int percent = total != 0
+            ? (taskTypeAccumulatedCount.getWaiting() + 
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE,
 taskType, percent);
-        percent = total != 0 ? accumulated.getError() * 100 / total : 0;
+        percent = total != 0 ? taskTypeAccumulatedCount.getError() * 100 / 
total : 0;
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR,
 taskType, percent);
+
+        // Emit metrics for table taskType
+        tableAccumulatedCount.forEach((tableNameWithType, taskCount) -> {
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_RUNNING, () -> (long) 
taskCount.getRunning());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_WAITING, 
taskCount.getWaiting());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
+          int tableTotal = taskCount.getTotal();
+          int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() + 
taskCount.getRunning()) * 100 / tableTotal : 0;
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, tablePercent);
+          tablePercent = tableTotal != 0 ? taskCount.getError() * 100 / 
tableTotal : 0;
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR, tablePercent);
+        });
+
+
+        if (_preReportedTables.containsKey(taskType)) {
+          Set<String> tableNameWithTypeSet = _preReportedTables.get(taskType);
+          tableNameWithTypeSet.removeAll(tableAccumulatedCount.keySet());
+          removeTableTaskTypeMetrics(tableNameWithTypeSet, taskType);
+        }
+        if (!tableAccumulatedCount.isEmpty()) {
+          // need to make a copy of the set because we may want to change the 
set later
+          Set<String> tableNameWithTypeSet = new 
HashSet<>(tableAccumulatedCount.keySet());
+          _preReportedTables.put(taskType, tableNameWithTypeSet);
+        } else {
+          _preReportedTables.remove(taskType);
+        }
       } catch (Exception e) {
         LOGGER.error("Caught exception while getting metrics for task type 
{}", taskType, e);
       }
     }
 
+    // clean up metrics for task types that have already been removed
+    _preReportedTaskTypes.removeAll(taskTypes);
+    for (String taskType : _preReportedTaskTypes) {

Review Comment:
   @zhtaoxiang Did we previously not clean up these metrics when a table was deleted?



##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/TaskMetricsEmitter.java:
##########
@@ -71,42 +78,105 @@ protected final void runTask(Properties 
periodicTaskProperties) {
         _helixTaskResourceManager.getTaskMetadataLastUpdateTimeMs();
     taskMetadataLastUpdateTime.forEach((tableNameWithType, 
taskTypeLastUpdateTime) ->
         taskTypeLastUpdateTime.forEach((taskType, lastUpdateTimeMs) ->
-            _controllerMetrics.addOrUpdateGauge(
-                
ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE.getGaugeName() + 
"."
-                    + tableNameWithType + "." + taskType, () -> 
System.currentTimeMillis() - lastUpdateTimeMs)));
+            _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, 
taskType,
+                ControllerGauge.TIME_MS_SINCE_LAST_MINION_TASK_METADATA_UPDATE,
+                () -> System.currentTimeMillis() - lastUpdateTimeMs)));
 
     // The call to get task types can take time if there are a lot of tasks.
     // Potential optimization is to call it every (say) 30m if we detect a 
barrage of
     // zk requests.
     Set<String> taskTypes = _helixTaskResourceManager.getTaskTypes();
     for (String taskType : taskTypes) {
-      TaskCount accumulated = new TaskCount();
+      TaskCount taskTypeAccumulatedCount = new TaskCount();
+      Map<String, TaskCount> tableAccumulatedCount = new HashMap<>();
       try {
         Set<String> tasksInProgress = 
_helixTaskResourceManager.getTasksInProgress(taskType);
         final int numRunningTasks = tasksInProgress.size();
         for (String task : tasksInProgress) {
-          TaskCount taskCount = _helixTaskResourceManager.getTaskCount(task);
-          accumulated.accumulate(taskCount);
+          Map<String, TaskCount> tableTaskCount = 
_helixTaskResourceManager.getTableTaskCount(task);
+          tableTaskCount.forEach((tableNameWithType, taskCount) -> {
+            taskTypeAccumulatedCount.accumulate(taskCount);
+            tableAccumulatedCount.compute(tableNameWithType, (name, count) -> {
+              if (count == null) {
+                count = new TaskCount();
+              }
+              count.accumulate(taskCount);
+              return count;
+            });
+          });
         }
         // Emit metrics for taskType.
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_TASKS_IN_PROGRESS,
 taskType,
             numRunningTasks);
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_RUNNING,
 taskType,
-            accumulated.getRunning());
+            taskTypeAccumulatedCount.getRunning());
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_WAITING,
 taskType,
-            accumulated.getWaiting());
+            taskTypeAccumulatedCount.getWaiting());
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.NUM_MINION_SUBTASKS_ERROR,
 taskType,
-            accumulated.getError());
-        int total = accumulated.getTotal();
-        int percent = total != 0 ? (accumulated.getWaiting() + 
accumulated.getRunning()) * 100 / total : 0;
+            taskTypeAccumulatedCount.getError());
+        int total = taskTypeAccumulatedCount.getTotal();
+        int percent = total != 0
+            ? (taskTypeAccumulatedCount.getWaiting() + 
taskTypeAccumulatedCount.getRunning()) * 100 / total : 0;
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE,
 taskType, percent);
-        percent = total != 0 ? accumulated.getError() * 100 / total : 0;
+        percent = total != 0 ? taskTypeAccumulatedCount.getError() * 100 / 
total : 0;
         
_controllerMetrics.setValueOfGlobalGauge(ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR,
 taskType, percent);
+
+        // Emit metrics for table taskType
+        tableAccumulatedCount.forEach((tableNameWithType, taskCount) -> {
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_RUNNING, () -> (long) 
taskCount.getRunning());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_WAITING, 
taskCount.getWaiting());
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.NUM_MINION_SUBTASKS_ERROR, taskCount.getError());
+          int tableTotal = taskCount.getTotal();
+          int tablePercent = tableTotal != 0 ? (taskCount.getWaiting() + 
taskCount.getRunning()) * 100 / tableTotal : 0;
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.PERCENT_MINION_SUBTASKS_IN_QUEUE, tablePercent);
+          tablePercent = tableTotal != 0 ? taskCount.getError() * 100 / 
tableTotal : 0;
+          _controllerMetrics.setOrUpdateTableGauge(tableNameWithType, taskType,
+              ControllerGauge.PERCENT_MINION_SUBTASKS_IN_ERROR, tablePercent);
+        });
+

Review Comment:
   Please remove this extra blank line.



##########
pinot-common/src/main/java/org/apache/pinot/common/metrics/AbstractMetrics.java:
##########
@@ -529,11 +517,9 @@ public void initializeGlobalMeters() {
     }
   }
 
+  @Deprecated

Review Comment:
   Let's add a comment indicating which function to use in place of each 
deprecated function. (This applies to all `@Deprecated` annotations.)



##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java:
##########
@@ -364,7 +366,46 @@ public synchronized TaskCount getTaskCount(String 
parentTaskName) {
   }
 
   /**
-   * Returns a set of Task names (in the form "Task_TestTask_1624403781879") 
that are in progress or not started yet.
+   * This method returns a map of table name to count of sub-tasks in various 
states, given the top-level task name.
+   * @param parentTaskName in the form "Task_<taskType>_<uuid>_<timestamp>"
+   * @return a map of table name to {@link TaskCount}
+   */
+  public synchronized Map<String, TaskCount> getTableTaskCount(String 
parentTaskName) {
+    Map<String, TaskPartitionState> subtaskStates = 
getSubtaskStates(parentTaskName);
+    if (subtaskStates.isEmpty()) {
+      return Collections.emptyMap();
+    }
+
+    JobConfig jobConfig = 
_taskDriver.getJobConfig(getHelixJobName(parentTaskName));
+    // in theory, this should not happen because we have already checked 
JobContext
+    if (jobConfig == null) {
+      return Collections.emptyMap();
+    }
+
+    Map<String, TaskCount> tableTaskCountMap = new HashMap<>();
+    subtaskStates.forEach((taskId, taskState) -> {
+      TaskConfig taskConfig = jobConfig.getTaskConfig(taskId);
+      String tableNameWithType;
+      // in theory, this should not happen because jobContext has this taskId
+      if (taskConfig == null) {
+        tableNameWithType = UNKNOWN_TABLE_NAME;

Review Comment:
   Should we at least log this case, given that it is something that should not 
happen?



##########
pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/minion/PinotHelixTaskResourceManager.java:
##########
@@ -364,7 +366,46 @@ public synchronized TaskCount getTaskCount(String 
parentTaskName) {
   }
 
   /**
-   * Returns a set of Task names (in the form "Task_TestTask_1624403781879") 
that are in progress or not started yet.
+   * This method returns a map of table name to count of sub-tasks in various 
states, given the top-level task name.
+   * @param parentTaskName in the form "Task_<taskType>_<uuid>_<timestamp>"

Review Comment:
   I guess that we are already adding `uuid` and this is just an improvement in 
terms of documentation?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to