This is an automated email from the ASF dual-hosted git repository.

vatamane pushed a commit to branch scanner-scan-by-sequence-including-deleted
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 804eadf2e37541f2654b3b408b33c7d9ebd20d41
Author: Nick Vatamaniuc <[email protected]>
AuthorDate: Sat Aug 2 02:24:36 2025 -0400

    In the scanner, traverse docs by sequence instead of by ID.
    
    Previously, the scanner traversed docs by id, skipping deleted documents.
    However, some plugins like the conflict checker, or the finder, may want to
    inspect deleted docs. To fix this, switch to using by-seq order during
    scanning. This will also let us add more precise checkpoints in the future; we
    could, for example, checkpoint during the db traversal not just per-db.
    
    To let plugins customize the traversal, modify the db_opened/2 callback so it
    can return a start changes sequence and changes folding options. Use this new
    feature in the finder since there it makes sense to scan backwards to
    find the most recently added data first (i.e. someone just added something they
    shouldn't have to the db and we'd like to find it).
    
    Since we now consider deleted documents, adjust the QuickJS scanner to discard
    deleted FDIs before even opening the doc bodies.
    
    For the conflict checker, add a test to ensure we do catch deleted conflicts
    when all of them are deleted.
    
    As a minor tweak, improved the scanner tests by using `#doc{}` records instead of
    plain tuples.
---
 .../src/couch_quickjs_scanner_plugin.erl           |  9 ++++++++-
 src/couch_scanner/src/couch_scanner_plugin.erl     | 22 +++++++++++++++-------
 .../src/couch_scanner_plugin_find.erl              |  4 +++-
 .../test/eunit/couch_scanner_test.erl              | 17 +++++++++++++----
 4 files changed, 39 insertions(+), 13 deletions(-)

diff --git a/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl b/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl
index 0d7b233de..91385010c 100644
--- a/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl
+++ b/src/couch_quickjs/src/couch_quickjs_scanner_plugin.erl
@@ -23,6 +23,7 @@
     shards/2,
     db_opened/2,
     doc_id/3,
+    doc_fdi/3,
     doc/3,
     db_closing/2
 ]).
@@ -149,7 +150,7 @@ db_opened(#st{} = St, Db) ->
     #st{max_docs = MaxDocs, max_step = MaxStep} = St,
     {ok, DocTotal} = couch_db:get_doc_count(Db),
     Step = min(MaxStep, max(1, DocTotal div MaxDocs)),
-    {ok, St#st{doc_cnt = 0, doc_step = Step, docs = []}}.
+    {0, [], St#st{doc_cnt = 0, doc_step = Step, docs = []}}.
 
 doc_id(#st{} = St, <<?DESIGN_DOC_PREFIX, _/binary>>, _Db) ->
     {skip, St};
@@ -162,6 +163,12 @@ doc_id(#st{doc_cnt = C, doc_step = S} = St, _DocId, _Db) 
when C rem S /= 0 ->
 doc_id(#st{doc_cnt = C} = St, _DocId, _Db) ->
     {ok, St#st{doc_cnt = C + 1}}.
 
+doc_fdi(#st{} = St, #full_doc_info{deleted = true}, _Db) ->
+    % Skip deleted; don't even open the doc body
+    {stop, St};
+doc_fdi(#st{} = St, #full_doc_info{}, _Db) ->
+    {ok, St}.
+
 doc(#st{} = St, Db, #doc{id = DocId} = Doc) ->
     #st{sid = SId} = St,
     JsonDoc = couch_query_servers:json_doc(Doc),
diff --git a/src/couch_scanner/src/couch_scanner_plugin.erl b/src/couch_scanner/src/couch_scanner_plugin.erl
index 9ff3a2505..2d63cb9ce 100644
--- a/src/couch_scanner/src/couch_scanner_plugin.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin.erl
@@ -115,9 +115,14 @@
 -callback shards(St :: term(), [#shard{}]) ->
     {[#shard{}], St1 :: term()}.
 
-% Optional
+% Optional. Called right after a shard file is opened so it gets a Db handle.
+% Should return the change feed start sequence and a list of options along with any changes
+% in a private context. The change feed start sequence should normally be 0 and the list
+% of options can be []. The list of options will be passed directly to couch_db:fold_changes,
+% so any {dir, Dir}, {end_key, EndSeq} could work there.
+%
 -callback db_opened(St :: term(), Db :: term()) ->
-    {ok, St :: term()}.
+    {ChangesSeq :: non_neg_integer(), ChangesOpts :: [term()], St1 :: term()}.
 
 % Optional. If doc and doc_fdi are not defined, then doc_id default
 % action is {skip, St}. If it is defined, the default action is {ok, St}.
@@ -178,6 +183,8 @@
     cursor,
     shards_db,
     db,
+    changes_seq = 0,
+    changes_opts = [],
     checkpoint_sec = 0,
     start_sec = 0,
     skip_dbs,
@@ -370,7 +377,8 @@ scan_docs(#st{} = St, #shard{name = ShardDbName}) ->
             try
                 St2 = St1#st{db = Db},
                 St3 = db_opened_callback(St2),
-                {ok, St4} = couch_db:fold_docs(Db, fun scan_docs_fold/2, St3, 
[]),
+                #st{changes_seq = Seq, changes_opts = Opts} = St3,
+                {ok, St4} = couch_db:fold_changes(Db, Seq, fun 
scan_docs_fold/2, St3, Opts),
                 St5 = db_closing_callback(St4),
                 erlang:garbage_collect(),
                 St5#st{db = undefined}
@@ -521,13 +529,13 @@ resume_callback(#{} = Cbks, SId, #{} = EJsonPSt) when 
is_binary(SId) ->
 
 db_opened_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) ->
     #{db_opened := DbOpenedCbk} = Cbks,
-    {ok, PSt1} = DbOpenedCbk(PSt, Db),
-    St#st{pst = PSt1}.
+    {Seq, Opts, PSt1} = DbOpenedCbk(PSt, Db),
+    St#st{pst = PSt1, changes_seq = Seq, changes_opts = Opts}.
 
 db_closing_callback(#st{pst = PSt, callbacks = Cbks, db = Db} = St) ->
     #{db_closing := DbClosingCbk} = Cbks,
     {ok, PSt1} = DbClosingCbk(PSt, Db),
-    St#st{pst = PSt1}.
+    St#st{pst = PSt1, changes_seq = 0, changes_opts = []}.
 
 shards_callback(#st{pst = PSt, callbacks = Cbks} = St, Shards) ->
     #{shards := ShardsCbk} = Cbks,
@@ -601,7 +609,7 @@ default_shards(Mod, _F, _A) when is_atom(Mod) ->
     end.
 
 default_db_opened(Mod, _F, _A) when is_atom(Mod) ->
-    fun(St, _Db) -> {ok, St} end.
+    fun(St, _Db) -> {0, [], St} end.
 
 default_doc_id(Mod, _F, _A) when is_atom(Mod) ->
     case is_exported(Mod, doc, 3) orelse is_exported(Mod, doc_fdi, 3) of
diff --git a/src/couch_scanner/src/couch_scanner_plugin_find.erl b/src/couch_scanner/src/couch_scanner_plugin_find.erl
index 9b3a162d9..12b1e22b5 100644
--- a/src/couch_scanner/src/couch_scanner_plugin_find.erl
+++ b/src/couch_scanner/src/couch_scanner_plugin_find.erl
@@ -88,7 +88,9 @@ db_opened(#st{sid = SId} = St, Db) ->
         true -> ?DEBUG("", [], #{sid => SId, db => Db});
         false -> ok
     end,
-    {ok, St}.
+    % Search backwards with the idea that we may be looking for some recent
+    % changes we just made to the database.
+    {couch_db:get_update_seq(Db), [{dir, rev}], St}.
 
 doc_id(#st{} = St, DocId, Db) ->
     #st{sid = SId, compiled_regexes = Pats} = St,
diff --git a/src/couch_scanner/test/eunit/couch_scanner_test.erl b/src/couch_scanner/test/eunit/couch_scanner_test.erl
index b609bd69c..a7edb6b67 100644
--- a/src/couch_scanner/test/eunit/couch_scanner_test.erl
+++ b/src/couch_scanner/test/eunit/couch_scanner_test.erl
@@ -85,8 +85,8 @@ setup() ->
     ok = add_doc(DbName2, ?DOC3, #{foo3 => bax}),
     ok = add_doc(DbName2, ?DOC4, #{foo4 => baw, <<>> => 
this_is_ok_apparently}),
     add_docs(DbName3, [
-        {doc, ?DOC5, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []},
-        {doc, ?DOC5, {2, [<<"y">>, <<"z">>]}, {[]}, [], false, []}
+        #doc{id = ?DOC5, revs = {2, [<<"x">>, <<"z">>]}, deleted = false},
+        #doc{id = ?DOC5, revs = {2, [<<"y">>, <<"z">>]}, deleted = false}
     ]),
     couch_scanner:reset_checkpoints(),
     {Ctx, {DbName1, DbName2, DbName3}}.
@@ -204,11 +204,20 @@ t_conflict_finder_works({_, {_, _, DbName3}}) ->
     % Add a deleted conflicting doc to the third database.
     % 3 reports are expected: 2 doc reports and 1 db report.
     add_docs(DbName3, [
-        {doc, ?DOC6, {2, [<<"x">>, <<"z">>]}, {[]}, [], false, []},
-        {doc, ?DOC6, {2, [<<"d">>, <<"z">>]}, {[]}, [], true, []}
+        #doc{id = ?DOC6, revs = {2, [<<"x">>, <<"z">>]}, deleted = false},
+        #doc{id = ?DOC6, revs = {2, [<<"d">>, <<"z">>]}, deleted = true}
     ]),
     resume_couch_scanner(Plugin),
     ?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)),
+    % Should work even if all revs are deleted (the whole FDI is deleted)
+    add_docs(DbName3, [
+        #doc{id = ?DOC6, revs = {3, [<<"a">>, <<"x">>, <<"z">>]}, deleted = 
true}
+    ]),
+    % Confirm it's deleted (we did the revs paths manipulations correctly)
+    ?assertEqual({not_found, deleted}, fabric:open_doc(DbName3, ?DOC6, [])),
+    % But we can still find the conflicts
+    resume_couch_scanner(Plugin),
+    ?assertEqual(3, meck:num_calls(couch_scanner_util, log, LogArgs)),
     % Set doc_report to false to only have 1 db report.
     config:set(Plugin, "doc_report", "false", false),
     resume_couch_scanner(Plugin),

Reply via email to