From db0f4800d3bae875b5b1b262249c12738a243bf7 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 12 Sep 2024 15:23:20 +0100
Subject: [PATCH v1 1/2] Fix stuck parallel btree scans

Previously, a backend that called _bt_parallel_seize() was not
guaranteed to make progress when the shared scan state indicated
that more work was expected from parallel backends, and it handled
NEED_PRIMSCAN as a semi-ADVANCING state. This caused scans to get
stuck when the leader process was waiting for the state to advance
while concurrent backends were waiting for the leader to consume
the tuples they still had buffered after updating the state to
NEED_PRIMSCAN.

This is fixed by treating _bt_parallel_seize()'s return status as
the status of the currently active primitive scan.  If
_bt_parallel_seize() is called from outside _bt_first() while the
scan state is NEED_PRIMSCAN, we now end the current primitive scan
and set the scan up for a new primitive scan, which eventually
reaches _bt_first()'s call to _bt_parallel_seize().
---
 src/backend/access/nbtree/nbtree.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 6d090f8739..2b553d1161 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -584,7 +584,8 @@ btparallelrescan(IndexScanDesc scan)
  *		or _bt_parallel_done().
  *
  * The return value is true if we successfully seized the scan and false
- * if we did not.  The latter case occurs if no pages remain.
+ * if we did not.  The latter case occurs if no pages remain in this primitive
+ * index scan.
  *
  * If the return value is true, *pageno returns the next or current page
  * of the scan (depending on the scan direction).  An invalid block number
@@ -653,8 +654,10 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 			Assert(so->numArrayKeys);
 
 			/*
-			 * If we can start another primitive scan right away, do so.
-			 * Otherwise just wait.
+			 * If we're called from _bt_first and thus are set up to start a
+			 * primitive scan, do so.  If not, we stop this current primitive
+			 * scan by returning false, which sets us up for the call to
+			 * _bt_first which can then try to seize this scan again.
 			 */
 			if (first)
 			{
@@ -672,6 +675,13 @@ _bt_parallel_seize(IndexScanDesc scan, BlockNumber *pageno, bool first)
 				*pageno = InvalidBlockNumber;
 				exit_loop = true;
 			}
+			else
+			{
+				so->needPrimScan = true;
+				so->scanBehind = false;
+				*pageno = InvalidBlockNumber;
+				status = false;
+			}
 		}
 		else if (btscan->btps_pageStatus != BTPARALLEL_ADVANCING)
 		{
-- 
2.46.0

