somandal commented on code in PR #15650: URL: https://github.com/apache/pinot/pull/15650#discussion_r2070711724
########## pinot-common/src/test/java/org/apache/pinot/common/metrics/prometheus/PinotPrometheusMetricsTest.java: ########## @@ -348,6 +348,7 @@ public static class ExportedLabelKeys { public static final String PERIODIC_TASK = "periodicTask"; public static final String STATUS = "status"; public static final String DATABASE = "database"; + public static final String JOBID = "jobId"; Review Comment: lets remove this since it is not used ########## pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/ZkBasedTableRebalanceObserver.java: ########## @@ -234,6 +238,37 @@ public int getNumUpdatesToZk() { return _numUpdatesToZk; } + /** + * Emits the rebalance progress in percent to the metrics. Uses the percentage of remaining segments to be added as + * the indicator of the overall progress. + * Notice that for some jobs, the metrics may not be exactly accurate and would not be 100% when the job is done. + * (e.g. when `lowDiskMode=false`, the job finishes without waiting for `totalRemainingSegmentsToBeDeleted` become + * 0, or when `bestEffort=true` the job finishes without waiting for both `totalRemainingSegmentsToBeAdded`, + * `totalRemainingSegmentsToBeDeleted`, and `totalRemainingSegmentsToConverge` become 0) + * Therefore `emitProgressMetricDone()` should be called to emit the final progress as the time job exits. + * @param overallProgress the latest overall progress + */ + private void emitProgressMetric(TableRebalanceProgressStats.RebalanceProgressStats overallProgress) { + // Round this up so the metric is 100 only when no segment remains + long progressPercent = 100 - (long) Math.ceil(TableRebalanceProgressStats.calculatePercentageChange( + overallProgress._totalSegmentsToBeAdded + overallProgress._totalSegmentsToBeDeleted, + overallProgress._totalRemainingSegmentsToBeAdded + overallProgress._totalRemainingSegmentsToBeDeleted + + overallProgress._totalRemainingSegmentsToConverge + overallProgress._totalCarryOverSegmentsToBeAdded + + overallProgress._totalCarryOverSegmentsToBeDeleted)); + // Using the original job ID to group rebalance retries together with the same label Review Comment: nit: remove comment? no longer using jobId ########## pinot-controller/src/main/java/org/apache/pinot/controller/helix/core/rebalance/ZkBasedTableRebalanceObserver.java: ########## @@ -234,6 +238,37 @@ public int getNumUpdatesToZk() { return _numUpdatesToZk; } + /** + * Emits the rebalance progress in percent to the metrics. Uses the percentage of remaining segments to be added as Review Comment: nit: mention that we also need to account for carryover ########## pinot-controller/src/test/java/org/apache/pinot/controller/helix/core/rebalance/TestZkBasedTableRebalanceObserver.java: ########## @@ -67,23 +68,42 @@ void testZkObserverTracking() { segmentSet, segmentSet); observer.onTrigger(TableRebalanceObserver.Trigger.START_TRIGGER, source, target, rebalanceContext); assertEquals(observer.getNumUpdatesToZk(), 1); + checkProgressPercentMetrics(controllerMetrics, observer); observer.onTrigger(TableRebalanceObserver.Trigger.IDEAL_STATE_CHANGE_TRIGGER, source, source, rebalanceContext); + checkProgressPercentMetrics(controllerMetrics, observer); observer.onTrigger(TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER, source, source, rebalanceContext); + checkProgressPercentMetrics(controllerMetrics, observer); // START_TRIGGER will set up the ZK progress stats to have the diff between source and target. When calling the // triggers for IS and EV-IS, since source and source are compared, the diff will change for the IS trigger // but not for the EV-IS trigger, so ZK must be updated 1 extra time assertEquals(observer.getNumUpdatesToZk(), 2); observer.onTrigger(TableRebalanceObserver.Trigger.IDEAL_STATE_CHANGE_TRIGGER, source, target, rebalanceContext); + checkProgressPercentMetrics(controllerMetrics, observer); observer.onTrigger(TableRebalanceObserver.Trigger.EXTERNAL_VIEW_TO_IDEAL_STATE_CONVERGENCE_TRIGGER, source, target, rebalanceContext); + checkProgressPercentMetrics(controllerMetrics, observer); // Both of the changes above will update ZK for progress stats assertEquals(observer.getNumUpdatesToZk(), 4); // Try a rollback and this should trigger a ZK update as well observer.onRollback(); assertEquals(observer.getNumUpdatesToZk(), 5); } + private void checkProgressPercentMetrics(ControllerMetrics controllerMetrics, + ZkBasedTableRebalanceObserver observer) { + Long progressGaugeValue = + controllerMetrics.getGaugeValue(ControllerGauge.TABLE_REBALANCE_JOB_PROGRESS_PERCENT.getGaugeName() + ".dummy"); + assertNotNull(progressGaugeValue); + TableRebalanceProgressStats.RebalanceProgressStats overallProgress = + observer.getTableRebalanceProgressStats().getRebalanceProgressStatsOverall(); + long progressRemained = (long) Math.ceil(TableRebalanceProgressStats.calculatePercentageChange( Review Comment: shouldn't this include carry over? Is it possible to add a test where the value with carry over is different than without? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org