This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch optimize-purge-again in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 0f2776027dd67651227f4507dde3e13c0faa5115 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Sat Oct 25 00:39:07 2025 -0400 Optimize purge. ~30% for large batches. Use a map to store and lookup FDI records instead of a plain list in couch_db_updater. The map helps in two places: * In `apply_purge_requests` we avoid removing then re-adding the active FDI record to the list. Looking up the record is a faster operation, especially with large batch and a simple update vs remove + replace helps with generating less garbage. * In `purge_docs` avoid nested `lists:keyfind/3` lookups when building the FDI pairs list. For instance with 1000 docs, we replace 1000 calls to keyfind, an O(n) operation, with 1000 map lookups, O(log n) operations. Benchmark: ``` ./conflicts.py -a adm:pass -q 1 -n 100000 -x 1.0 -z -c 0 docs: 100k, purge batch size: 1000 q:1 all deleted no conflicts ``` Results (10 calls with main and 10 calls with the PR) Unoptimized (main) *** purging 100000 docs 15 sec, rate = 6466/sec *** purging 100000 docs 15 sec, rate = 6880/sec *** purging 100000 docs 14 sec, rate = 7019/sec *** purging 100000 docs 17 sec, rate = 6056/sec *** purging 100000 docs 17 sec, rate = 5813/sec *** purging 100000 docs 17 sec, rate = 5997/sec *** purging 100000 docs 18 sec, rate = 5553/sec *** purging 100000 docs 14 sec, rate = 7239/sec *** purging 100000 docs 14 sec, rate = 7124/sec *** purging 100000 docs 14 sec, rate = 7121/sec Averge: 6526 Optimized (pr) *** purging 100000 docs 11 sec, rate = 9003/sec *** purging 100000 docs 12 sec, rate = 8591/sec *** purging 100000 docs 10 sec, rate = 9692/sec *** purging 100000 docs 11 sec, rate = 9442/sec *** purging 100000 docs 12 sec, rate = 8599/sec *** purging 100000 docs 11 sec, rate = 8897/sec *** purging 100000 docs 11 sec, rate = 8793/sec *** purging 100000 docs 12 sec, rate = 8364/sec *** purging 100000 docs 11 sec, rate = 8784/sec *** purging 100000 docs 11 sec, rate = 9103/sec Average: 8926 Speedup: 37% --- src/couch/src/couch_db_updater.erl | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/src/couch/src/couch_db_updater.erl b/src/couch/src/couch_db_updater.erl index 1d0e177d5..8394cd5a3 100644 --- a/src/couch/src/couch_db_updater.erl +++ b/src/couch/src/couch_db_updater.erl @@ -811,22 +811,22 @@ purge_docs(Db, PurgeReqs) -> FDIs = couch_db_engine:open_docs(Db, Ids), USeq = couch_db_engine:get_update_seq(Db), - IdFDIs = lists:zip(Ids, FDIs), + IdFDIs = maps:from_list(lists:zip(Ids, FDIs)), {NewIdFDIs, Replies} = apply_purge_reqs(PurgeReqs, IdFDIs, USeq, []), - Pairs = lists:flatmap( - fun({DocId, OldFDI}) -> - {DocId, NewFDI} = lists:keyfind(DocId, 1, NewIdFDIs), - case {OldFDI, NewFDI} of - {not_found, not_found} -> - []; - {#full_doc_info{} = A, #full_doc_info{} = A} -> - []; - {#full_doc_info{}, _} -> - [{OldFDI, NewFDI}] - end - end, - IdFDIs + Pairs = lists:sort( + maps:fold( + fun(DocId, OldFDI, Acc) -> + #{DocId := NewFDI} = NewIdFDIs, + case {OldFDI, NewFDI} of + {not_found, not_found} -> Acc; + {#full_doc_info{} = A, #full_doc_info{} = A} -> Acc; + {#full_doc_info{}, _} -> [{OldFDI, NewFDI} | Acc] + end + end, + [], + IdFDIs + ) ), PSeq = couch_db_engine:get_purge_seq(Db), @@ -850,7 +850,7 @@ apply_purge_reqs([], IdFDIs, _USeq, Replies) -> {IdFDIs, lists:reverse(Replies)}; apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) -> {_UUID, DocId, Revs} = Req, - {value, {_, FDI0}, RestIdFDIs} = lists:keytake(DocId, 1, IdFDIs), + #{DocId := FDI0} = IdFDIs, {NewFDI, RemovedRevs, NewUSeq} = case FDI0 of #full_doc_info{rev_tree = Tree} -> @@ -888,9 +888,8 @@ apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) -> % Not found means nothing to change {not_found, [], USeq} end, - NewIdFDIs = [{DocId, NewFDI} | RestIdFDIs], NewReplies = [{ok, RemovedRevs} | Replies], - apply_purge_reqs(RestReqs, NewIdFDIs, NewUSeq, NewReplies). + apply_purge_reqs(RestReqs, IdFDIs#{DocId := NewFDI}, NewUSeq, NewReplies). update_time_seq(#db{time_seq = TSeq} = Db, Seq) when is_integer(Seq) -> Timestamp = couch_time_seq:timestamp(),
