This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch fix-replicator-total-jobs-stats
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit db460c0e16f8cb49c1e89e0037ee08cfc975b499
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Wed Nov 26 16:48:08 2025 -0500

    Fix replicator scheduler total jobs metric
    
    Previously, we didn't always remember to update the total job stats gauge,
    so it was possible for it to become stale. The periodic scheduler refresh
    updated all the other gauges but didn't update the total.
    
    To fix it, make sure to update the stat in more places (on job removes and
    adds) and, most importantly, add it to the periodic stat refresh function,
    so even if we still miss an update it should eventually catch up after a
    rescheduling cycle.
---
 .../src/couch_replicator_scheduler.erl                | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/couch_replicator/src/couch_replicator_scheduler.erl b/src/couch_replicator/src/couch_replicator_scheduler.erl
index aabd7febd..14cca4a21 100644
--- a/src/couch_replicator/src/couch_replicator_scheduler.erl
+++ b/src/couch_replicator/src/couch_replicator_scheduler.erl
@@ -258,8 +258,7 @@ handle_call({add_job, Job}, _From, State) ->
     true = add_job_int(Job),
     ok = maybe_start_newly_added_job(Job, State),
     couch_stats:increment_counter([couch_replicator, jobs, adds]),
-    TotalJobs = ets:info(?MODULE, size),
-    couch_stats:update_gauge([couch_replicator, jobs, total], TotalJobs),
+    update_total_jobs_stats(),
     {reply, ok, State};
 handle_call({remove_job, Id}, _From, State) ->
     ok = maybe_remove_job_int(Id, State),
@@ -464,6 +463,7 @@ handle_crashed_job(Job, Reason, State) ->
             update_running_jobs_stats(State#state.stats_pid),
             ok;
         false ->
+            update_total_jobs_stats(),
             ok
     end.
 
@@ -480,6 +480,7 @@ maybe_start_newly_added_job(Job, State) ->
             update_running_jobs_stats(State#state.stats_pid),
             ok;
         false ->
+            update_total_jobs_stats(),
             ok
     end.
 
@@ -655,16 +656,13 @@ maybe_remove_job_int(JobId, State) ->
             ok = stop_job_int(Job, State),
             true = remove_job_int(Job),
             couch_stats:increment_counter([couch_replicator, jobs, removes]),
-            TotalJobs = ets:info(?MODULE, size),
-            couch_stats:update_gauge(
-                [couch_replicator, jobs, total],
-                TotalJobs
-            ),
             update_running_jobs_stats(State#state.stats_pid),
             ok;
         {error, not_found} ->
             ok
-    end.
+    end,
+    update_total_jobs_stats(),
+    ok.
 
 start_job_int(#job{pid = Pid}, _State) when Pid /= undefined ->
     ok;
@@ -964,6 +962,7 @@ stats_updater_refresh() ->
     couch_stats:update_gauge([couch_replicator, jobs, pending], PendingN),
     couch_stats:update_gauge([couch_replicator, jobs, running], RunningN),
     couch_stats:update_gauge([couch_replicator, jobs, crashed], CrashedN),
+    update_total_jobs_stats(),
     ok.
 
 -spec stats_fold(#job{}, #stats_acc{}) -> #stats_acc{}.
@@ -976,6 +975,10 @@ stats_fold(#job{pid = undefined, history = [{{crashed, _}, _} | _]}, Acc) ->
 stats_fold(#job{pid = P, history = [{started, _} | _]}, Acc) when is_pid(P) ->
     Acc#stats_acc{running_n = Acc#stats_acc.running_n + 1}.
 
+update_total_jobs_stats() ->
+    TotalJobs = ets:info(?MODULE, size),
+    couch_stats:update_gauge([couch_replicator, jobs, total], TotalJobs).
+
 -spec existing_replication(#rep{}) -> boolean().
 existing_replication(#rep{} = NewRep) ->
     case job_by_id(NewRep#rep.id) of

Reply via email to