This is an automated email from the ASF dual-hosted git repository. vatamane pushed a commit to branch scanner-scan-by-sequence-including-deleted in repository https://gitbox.apache.org/repos/asf/couchdb.git
commit 804eadf2e37541f2654b3b408b33c7d9ebd20d41 Author: Nick Vatamaniuc <[email protected]> AuthorDate: Sat Aug 2 02:24:36 2025 -0400 In the scanner, traverse docs by sequence instead of by ID. Previously, the scanner traversed docs by id, skipping deleted documents. However, some plugins, like the conflict checker or the finder, may want to inspect deleted docs. To fix this, switch to using by-seq order during scanning. This will also let us add more precise checkpoints in the future; we could, for example, checkpoint during the db traversal, not just per-db. To let plugins customize the traversal, modify the db_opened/2 callback so it can return the changes start sequence and the changes folding options. Use this new feature in the finder since there it makes sense to scan backwards to find the most recently added data first (i.e. someone just added something they shouldn't have to the db and we'd like to find it). Since we now consider deleted documents, adjust the QuickJS scanner to discard deleted FDIs before even opening the doc bodies. For the conflict checker, add a test to ensure we do catch deleted conflicts when all of them are deleted. As a minor tweak, improve the scanner tests by using `#doc{}` records instead of plain tuples. --- .../src/couch_quickjs_scanner_plugin.erl | 9 ++++++++- src/couch_scanner/src/couch_scanner_plugin.erl | 22 +++++++++++++++------- .../src/couch_scanner_plugin_find.erl | 4 +++- .../test/eunit/couch_scanner_test.erl | 17 +++++++++++++---- 4 files changed, 39 insertions(+), 13 deletions(-) diff --git a/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl b/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl index 0d7b233de..91385010c 100644 --- a/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl +++ b/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl @@ -23,6 +23,7 @@ shards/2, db_opened/2, doc_id/3, + doc_fdi/3, doc/3, db_closing/2 ]). 
@@ -149,7 +150,7 @@ db_opened(#st{} = St, Db) -> #st{max_docs = MaxDocs, max_step = MaxStep} = St, {ok, DocTotal} = couch_db:get_doc_count(Db), Step = min(MaxStep, max(1, DocTotal div MaxDocs)), - {ok, St#st{doc_cnt = 0, doc_step = Step, docs = []}}. + {0, [], St#st{doc_cnt = 0, doc_step = Step, docs = []}}. doc_id(#st{} = St, <<?DESIGN_DOC_PREFIX, _/binary>>, _Db) -> {skip, St}; @@ -162,6 +163,12 @@ doc_id(#st{doc_cnt = C, doc_step = S} = St, _DocId, _Db) when C rem S /= 0 -> doc_id(#st{doc_cnt = C} = St, _DocId, _Db) -> {ok, St#st{doc_cnt = C + 1}}. +doc_fdi(#st{} = St, #full_doc_info{deleted = true}, _Db) -> + % Skip deleted; don't even open the doc body + {stop, St}; +doc_fdi(#st{} = St, #full_doc_info{}, _Db) -> + {ok, St}. + doc(#st{} = St, Db, #doc{id = DocId} = Doc) -> #st{sid = SId} = St, JsonDoc = couch_query_servers:json_doc(Doc), diff --git a/src/couch_scanner/src/couch_scanner_plugin.erl b/src/couch_scanner/src/couch_scanner_plugin.erl index 9ff3a2505..2d63cb9ce 100644 --- a/src/couch_scanner/src/couch_scanner_plugin.erl +++ b/src/couch_scanner/src/couch_scanner_plugin.erl @@ -115,9 +115,14 @@ -callback shards(St :: term(), [#shard{}]) -> {[#shard{}], St1 :: term()}. -% Optional +% Optional. Called right after a shard file is opened so it gets a Db handle. +% Should return the change feed start sequence and a list of options along with the +% (possibly updated) private plugin state. The change feed start sequence should normally be 0 and the list +% of options can be []. The list of options will be passed directly to couch_db:fold_changes, +% so any {dir, Dir}, {end_key, EndSeq} could work there. +% -callback db_opened(St :: term(), Db :: term()) -> - {ok, St :: term()}. + {ChangesSeq :: non_neg_integer(), ChangesOpts :: [term()], St1 :: term()}. % Optional. If doc and doc_fdi are not defined, then doc_id default % action is {skip, St}. If it is defined, the default action is {ok, St}. 
@@ -178,6 +183,8 @@ cursor, shards_db, db, + changes_seq = 0, + changes_opts = [], checkpoint_sec = 0, start_sec = 0, skip_dbs, @@ -370,7 +377,8 @@ scan_docs(#st{} = St, #shard{name = ShardDbName}) -> try St2 = St1#st{db = Db}, St3 = db_opened_callback(St2), - {ok, St4} = couch_db:fold_docs(Db, fun scan_docs_fold/2, St3, []), + #st{changes_seq = Seq, changes_opts = Opts} = St3, + {ok, St4} = couch_db:fold_changes(Db, Seq, fun scan_docs_fold/2, St3, Opts), St5 = db_closing_callback(St4), erlang:garbage_collect(), St5#st{db = undefined} @@ -521,13 +529,13 @@ resume_callback(#{} = Cbks, SId, #{} = EJsonPSt) when is_binary(SId) -> db_opened_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) -> #{db_opened := DbOpenedCbk} = Cbks, - {ok, PSt1} = DbOpenedCbk(PSt, Db), - St#st{pst = PSt1}. + {Seq, Opts, PSt1} = DbOpenedCbk(PSt, Db), + St#st{pst = PSt1, changes_seq = Seq, changes_opts = Opts}. db_closing_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) -> #{db_closing := DbClosingCbk} = Cbks, {ok, PSt1} = DbClosingCbk(PSt, Db), - St#st{pst = PSt1}. + St#st{pst = PSt1, changes_seq = 0, changes_opts = []}. shards_callback(#st{pst = PSt, callbacks = Cbks} = St, Shards) -> #{shards := ShardsCbk} = Cbks, @@ -601,7 +609,7 @@ default_shards(Mod, _F, _A) when is_atom(Mod) -> end. default_db_opened(Mod, _F, _A) when is_atom(Mod) -> - fun(St, _Db) -> {ok, St} end. + fun(St, _Db) -> {0, [], St} end. default_doc_id(Mod, _F, _A) when is_atom(Mod) -> case is_exported(Mod, doc, 3) orelse is_exported(Mod, doc_fdi, 3) of diff --git a/src/couch_scanner/src/couch_scanner_plugin_find.erl b/src/couch_scanner/src/couch_scanner_plugin_find.erl index 9b3a162d9..12b1e22b5 100644 --- a/src/couch_scanner/src/couch_scanner_plugin_find.erl +++ b/src/couch_scanner/src/couch_scanner_plugin_find.erl @@ -88,7 +88,9 @@ db_opened(#st{sid = SId} = St, Db) -> true -> ?DEBUG("", [], #{sid => SId, db => Db}); false -> ok end, - {ok, St}. 
+ % Search backwards with the idea that we may be looking for some recent + % changes we just made to the database. + {couch_db:get_update_seq(Db), [{dir, rev}], St}. doc_id(#st{} = St, DocId, Db) -> #st{sid = SId, compiled_regexes = Pats} = St, diff --git a/src/couch_scanner/test/eunit/couch_scanner_test.erl b/src/couch_scanner/test/eunit/couch_scanner_test.erl index b609bd69c..a7edb6b67 100644 --- a/src/couch_scanner/test/eunit/couch_scanner_test.erl +++ b/src/couch_scanner/test/eunit/couch_scanner_test.erl @@ -85,8 +85,8 @@ setup() -> ok = add_doc(DbName2, ?DOC3, #{foo3 => bax}), ok = add_doc(DbName2, ?DOC4, #{foo4 => baw, <<>> => this_is_ok_apparently}), add_docs(DbName3, [ - {doc, ?DOC5, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []}, - {doc, ?DOC5, {2, [<<"y">>, <<"z">>]}, {[]}, [], false, []} + #doc{id = ?DOC5, revs = {2, [<<"x">>, <<"z">>]}, deleted = false}, + #doc{id = ?DOC5, revs = {2, [<<"y">>, <<"z">>]}, deleted = false} ]), couch_scanner:reset_checkpoints(), {Ctx, {DbName1, DbName2, DbName3}}. @@ -204,11 +204,20 @@ t_conflict_finder_works({_, {_, _, DbName3}}) -> % Add a deleted conflicting doc to the third database. % 3 reports are expected: 2 doc reports and 1 db report. 
add_docs(DbName3, [ - {doc, ?DOC6, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []}, - {doc, ?DOC6, {2, [<<"d">>, <<"z">>]}, {[]}, [], true, []} + #doc{id = ?DOC6, revs = {2, [<<"x">>, <<"z">>]}, deleted = false}, + #doc{id = ?DOC6, revs = {2, [<<"d">>, <<"z">>]}, deleted = true} ]), resume_couch_scanner(Plugin), ?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)), + % Should work even if all revs are deleted (the whole FDI is deleted) + add_docs(DbName3, [ + #doc{id = ?DOC6, revs = {3, [<<"a">>, <<"x">>, <<"z">>]}, deleted = true} + ]), + % Confirm it's deleted (we did the revs paths manipulations correctly) + ?assertEqual({not_found, deleted}, fabric:open_doc(DbName3, ?DOC6, [])), + % But we can still find the conflicts + resume_couch_scanner(Plugin), + ?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)), % Set doc_report to false to only have 1 db report. config:set(Plugin, "doc_report", "false", false), resume_couch_scanner(Plugin),
