This is an automated email from the ASF dual-hosted git repository.
vatamane pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/couchdb.git
The following commit(s) were added to refs/heads/main by this push:
new 9dee10d7b Optimize purge. ~30% for large batches.
9dee10d7b is described below
commit 9dee10d7b4e982394d393191857e54cfc24e5a49
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Sat Oct 25 00:39:07 2025 -0400
Optimize purge. ~30% for large batches.
Use a map to store and lookup FDI records in `couch_db_updater`.
The map helps in two ways:
* In `apply_purge_requests` we avoid removing then re-adding the active
FDI record
to the list. Looking up the record is a faster operation, especially
with
large batch and a simple update vs remove + replace helps with
generating
less garbage.
* In `purge_docs` avoid nested `lists:keyfind/3` lookups when building
the
FDI pairs list. For instance with 1000 docs, we replace 1000 calls to
keyfind, an O(n) operation, with 1000 map lookups, O(log n) operations.
Benchmark:
```
./conflicts.py -a adm:pass -q 1 -n 100000 -x 1.0 -z -c 0
docs: 100k,
purge batch size: 1000
q:1
all deleted
no conflicts
```
Results (7 calls with main and 10 calls with the PR)
Unoptimized (main)
*** purging 100000 docs 15 sec, rate = 6466/sec
*** purging 100000 docs 15 sec, rate = 6880/sec
*** purging 100000 docs 14 sec, rate = 7019/sec
*** purging 100000 docs 17 sec, rate = 6056/sec
*** purging 100000 docs 14 sec, rate = 7239/sec
*** purging 100000 docs 14 sec, rate = 7124/sec
*** purging 100000 docs 14 sec, rate = 7121/sec
Averge: 6844
Optimized (pr)
*** purging 100000 docs 11 sec, rate = 9003/sec
*** purging 100000 docs 12 sec, rate = 8591/sec
*** purging 100000 docs 10 sec, rate = 9692/sec
*** purging 100000 docs 11 sec, rate = 9442/sec
*** purging 100000 docs 12 sec, rate = 8364/sec
*** purging 100000 docs 11 sec, rate = 8784/sec
*** purging 100000 docs 11 sec, rate = 9103/sec
Average: 8926
Speedup: 30%
---
src/couch/src/couch_db_updater.erl | 33 ++++++++++++++++-----------------
1 file changed, 16 insertions(+), 17 deletions(-)
diff --git a/src/couch/src/couch_db_updater.erl
b/src/couch/src/couch_db_updater.erl
index 1d0e177d5..8394cd5a3 100644
--- a/src/couch/src/couch_db_updater.erl
+++ b/src/couch/src/couch_db_updater.erl
@@ -811,22 +811,22 @@ purge_docs(Db, PurgeReqs) ->
FDIs = couch_db_engine:open_docs(Db, Ids),
USeq = couch_db_engine:get_update_seq(Db),
- IdFDIs = lists:zip(Ids, FDIs),
+ IdFDIs = maps:from_list(lists:zip(Ids, FDIs)),
{NewIdFDIs, Replies} = apply_purge_reqs(PurgeReqs, IdFDIs, USeq, []),
- Pairs = lists:flatmap(
- fun({DocId, OldFDI}) ->
- {DocId, NewFDI} = lists:keyfind(DocId, 1, NewIdFDIs),
- case {OldFDI, NewFDI} of
- {not_found, not_found} ->
- [];
- {#full_doc_info{} = A, #full_doc_info{} = A} ->
- [];
- {#full_doc_info{}, _} ->
- [{OldFDI, NewFDI}]
- end
- end,
- IdFDIs
+ Pairs = lists:sort(
+ maps:fold(
+ fun(DocId, OldFDI, Acc) ->
+ #{DocId := NewFDI} = NewIdFDIs,
+ case {OldFDI, NewFDI} of
+ {not_found, not_found} -> Acc;
+ {#full_doc_info{} = A, #full_doc_info{} = A} -> Acc;
+ {#full_doc_info{}, _} -> [{OldFDI, NewFDI} | Acc]
+ end
+ end,
+ [],
+ IdFDIs
+ )
),
PSeq = couch_db_engine:get_purge_seq(Db),
@@ -850,7 +850,7 @@ apply_purge_reqs([], IdFDIs, _USeq, Replies) ->
{IdFDIs, lists:reverse(Replies)};
apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) ->
{_UUID, DocId, Revs} = Req,
- {value, {_, FDI0}, RestIdFDIs} = lists:keytake(DocId, 1, IdFDIs),
+ #{DocId := FDI0} = IdFDIs,
{NewFDI, RemovedRevs, NewUSeq} =
case FDI0 of
#full_doc_info{rev_tree = Tree} ->
@@ -888,9 +888,8 @@ apply_purge_reqs([Req | RestReqs], IdFDIs, USeq, Replies) ->
% Not found means nothing to change
{not_found, [], USeq}
end,
- NewIdFDIs = [{DocId, NewFDI} | RestIdFDIs],
NewReplies = [{ok, RemovedRevs} | Replies],
- apply_purge_reqs(RestReqs, NewIdFDIs, NewUSeq, NewReplies).
+ apply_purge_reqs(RestReqs, IdFDIs#{DocId := NewFDI}, NewUSeq, NewReplies).
update_time_seq(#db{time_seq = TSeq} = Db, Seq) when is_integer(Seq) ->
Timestamp = couch_time_seq:timestamp(),