This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git


The following commit(s) were added to refs/heads/main by this push:
     new 9dee10d7b Optimize purge. ~30% for large batches.
9dee10d7b is described below

commit 9dee10d7b4e982394d393191857e54cfc24e5a49
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Sat Oct 25 00:39:07 2025 -0400

    Optimize purge. ~30% for large batches.
    
    Use a map to store and lookup FDI records in `couch_db_updater`.
    
    The map helps in two ways:
       * In `apply_purge_requests` we avoid removing then re-adding the active 
FDI record
         to the list. Looking up the record is a faster operation, especially 
with
         large batch and a simple update vs remove + replace helps with 
generating
         less garbage.
    
       * In `purge_docs` avoid nested `lists:keyfind/3` lookups when building 
the
       FDI pairs list. For instance with 1000 docs, we replace 1000 calls to
       keyfind, an O(n) operation, with 1000 map lookups, O(log n) operations.
    
    Benchmark:
    
    ```
    ./conflicts.py -a adm:pass -q 1 -n 100000 -x 1.0 -z -c 0
    
    docs: 100k,
    purge batch size: 1000
    q:1
    all deleted
    no conflicts
    ```
    
    Results (7 calls with main and 10 calls with the PR)
    
    Unoptimized (main)
    
    *** purging 100000 docs 15 sec, rate = 6466/sec
    *** purging 100000 docs 15 sec, rate = 6880/sec
    *** purging 100000 docs 14 sec, rate = 7019/sec
    *** purging 100000 docs 17 sec, rate = 6056/sec
    *** purging 100000 docs 14 sec, rate = 7239/sec
    *** purging 100000 docs 14 sec, rate = 7124/sec
    *** purging 100000 docs 14 sec, rate = 7121/sec
    
    Averge: 6844
    
    Optimized (pr)
    
    *** purging 100000 docs 11 sec, rate = 9003/sec
    *** purging 100000 docs 12 sec, rate = 8591/sec
    *** purging 100000 docs 10 sec, rate = 9692/sec
    *** purging 100000 docs 11 sec, rate = 9442/sec
    *** purging 100000 docs 12 sec, rate = 8364/sec
    *** purging 100000 docs 11 sec, rate = 8784/sec
    *** purging 100000 docs 11 sec, rate = 9103/sec
    
    Average: 8926
    
    Speedup: 30%
---
 src/couch/src/couch_db_updater.erl | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/src/couch/src/couch_db_updater.erl 
b/src/couch/src/couch_db_updater.erl
index 1d0e177d5..8394cd5a3 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -811,22 +811,22 @@ purge_docs(Db, PurgeReqs) ->
     FDIs = couch_db_engine:open_docs(Db, Ids),
     USeq = couch_db_engine:get_update_seq(Db),
 
-    IdFDIs = lists:zip(Ids, FDIs),
+    IdFDIs = maps:from_list(lists:zip(Ids, FDIs)),
     {NewIdFDIs, Replies} = apply_purge_reqs(PurgeReqs, IdFDIs, USeq, []),
 
-    Pairs = lists:flatmap(
-        fun({DocId, OldFDI}) ->
-            {DocId, NewFDI} = lists:keyfind(DocId, 1, NewIdFDIs),
-            case {OldFDI, NewFDI} of
-                {not_found, not_found} ->
-                    [];
-                {#full_doc_info{} = A, #full_doc_info{} = A} ->
-                    [];
-                {#full_doc_info{}, _} ->
-                    [{OldFDI, NewFDI}]
-            end
-        end,
-        IdFDIs
+    Pairs = lists:sort(
+        maps:fold(
+            fun(DocId, OldFDI, Acc) ->
+                #{DocId := NewFDI} = NewIdFDIs,
+                case {OldFDI, NewFDI} of
+                    {not_found, not_found} -> Acc;
+                    {#full_doc_info{} = A, #full_doc_info{} = A} -> Acc;
+                    {#full_doc_info{}, _} -> [{OldFDI, NewFDI} | Acc]
+                end
+            end,
+            [],
+            IdFDIs
+        )
     ),
 
     PSeq = couch_db_engine:get_purge_seq(Db),
@@ -850,7 +850,7 @@ apply_purge_reqs([], IdFDIs, _USeq, Replies) ->
     {IdFDIs, lists:reverse(Replies)};
 apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) ->
     {_UUID, DocId, Revs} = Req,
-    {value, {_, FDI0}, RestIdFDIs} = lists:keytake(DocId, 1, IdFDIs),
+    #{DocId := FDI0} = IdFDIs,
     {NewFDI, RemovedRevs, NewUSeq} =
         case FDI0 of
             #full_doc_info{rev_tree = Tree} ->
@@ -888,9 +888,8 @@ apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) ->
                 % Not found means nothing to change
                 {not_found, [], USeq}
         end,
-    NewIdFDIs = [{DocId, NewFDI} | RestIdFDIs],
     NewReplies = [{ok, RemovedRevs} | Replies],
-    apply_purge_reqs(RestReqs, NewIdFDIs, NewUSeq, NewReplies).
+    apply_purge_reqs(RestReqs, IdFDIs#{DocId := NewFDI}, NewUSeq, NewReplies).
 
 update_time_seq(#db{time_seq = TSeq} = Db, Seq) when is_integer(Seq) ->
     Timestamp = couch_time_seq:timestamp(),

Reply via email to