From 7330fd56024acdc4fe8b10a18fedcc633b41ddec Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Mon, 22 Dec 2025 17:36:14 +0100
Subject: [PATCH v13 7/8] nbtree: Reduce Index-Only Scan pin duration

Previously, we would keep a pin on every leaf page while we were returning
tuples to the scan.  With this patch, we utilize the newly introduced
table_index_vischeck_tuples API to pre-check visibility of all TIDs, and
thus unpin the page well before we have returned and processed all index
tuple results.  This reduces the cases where VACUUM
may have to wait for a pin from a stalled index scan, and can increase
performance by reducing the amount of VM page accesses.
---
 src/backend/access/nbtree/nbtreadpage.c |   5 ++
 src/backend/access/nbtree/nbtree.c      |  27 +++++-
 src/backend/access/nbtree/nbtsearch.c   | 115 ++++++++++++++++++++++--
 src/include/access/nbtree.h             |   8 ++
 4 files changed, 145 insertions(+), 10 deletions(-)

diff --git a/src/backend/access/nbtree/nbtreadpage.c b/src/backend/access/nbtree/nbtreadpage.c
index b3b8b553411..b9c93ad29e6 100644
--- a/src/backend/access/nbtree/nbtreadpage.c
+++ b/src/backend/access/nbtree/nbtreadpage.c
@@ -1038,6 +1038,8 @@ _bt_saveitem(BTScanOpaque so, int itemIndex,
 
 	currItem->heapTid = itup->t_tid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		Size		itupsz = IndexTupleSize(itup);
@@ -1068,6 +1070,8 @@ _bt_setuppostingitems(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
+
 	if (so->currTuples)
 	{
 		/* Save base IndexTuple (truncate posting list) */
@@ -1104,6 +1108,7 @@ _bt_savepostingitem(BTScanOpaque so, int itemIndex, OffsetNumber offnum,
 
 	currItem->heapTid = *heapTid;
 	currItem->indexOffset = offnum;
+	currItem->visrecheck = TMVC_Unchecked;
 
 	/*
 	 * Have index-only scans return the same base IndexTuple for every TID
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b4425231935..04056647805 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -364,6 +364,10 @@ btbeginscan(Relation rel, int nkeys, int norderbys)
 	so->killedItems = NULL;		/* until needed */
 	so->numKilled = 0;
 
+	so->vmbuf = InvalidBuffer;
+	so->vischeckcap = 0;
+	so->vischecksbuf = NULL;
+
 	/*
 	 * We don't know yet whether the scan will be index-only, so we do not
 	 * allocate the tuple workspace arrays until btrescan.  However, we set up
@@ -420,10 +424,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	 *
 	 * Note: so->dropPin should never change across rescans.
 	 */
-	so->dropPin = (!scan->xs_want_itup &&
-				   IsMVCCSnapshot(scan->xs_snapshot) &&
+	so->dropPin = (IsMVCCSnapshot(scan->xs_snapshot) &&
 				   RelationNeedsWAL(scan->indexRelation) &&
 				   scan->heapRelation != NULL);
+	so->vischeck = scan->xs_want_itup;
+
+	Assert(!scan->xs_want_itup || so->vischeck || !so->dropPin);
 
 	so->markItemIndex = -1;
 	so->needPrimScan = false;
@@ -432,6 +438,12 @@ btrescan(IndexScanDesc scan, ScanKey scankey, int nscankeys,
 	BTScanPosUnpinIfPinned(so->markPos);
 	BTScanPosInvalidate(so->markPos);
 
+	if (BufferIsValid(so->vmbuf))
+	{
+		ReleaseBuffer(so->vmbuf);
+		so->vmbuf = InvalidBuffer;
+	}
+
 	/*
 	 * Allocate tuple workspace arrays, if needed for an index-only scan and
 	 * not already done in a previous rescan call.  To save on palloc
@@ -473,6 +485,17 @@ btendscan(IndexScanDesc scan)
 	so->markItemIndex = -1;
 	BTScanPosUnpinIfPinned(so->markPos);
 
+	if (so->vischecksbuf)
+		pfree(so->vischecksbuf);
+	so->vischecksbuf = NULL;
+	so->vischeckcap = 0;
+
+	if (BufferIsValid(so->vmbuf))
+	{
+		ReleaseBuffer(so->vmbuf);
+		so->vmbuf = InvalidBuffer;
+	}
+
 	/* No need to invalidate positions, the RAM is about to be freed. */
 
 	/* Release storage */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index 7a416d60cea..bdcc2974c92 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -25,7 +25,8 @@
 #include "utils/rel.h"
 
 
-static inline void _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so);
+static inline void _bt_drop_lock_and_maybe_pin(Relation rel, Relation heaprel,
+											   BTScanOpaque so);
 static Buffer _bt_moveright(Relation rel, Relation heaprel, BTScanInsert key,
 							Buffer buf, bool forupdate, BTStack stack,
 							int access);
@@ -51,12 +52,95 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * Dropping the pin prevents VACUUM from blocking on acquiring a cleanup lock.
  */
 static inline void
-_bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
+_bt_drop_lock_and_maybe_pin(Relation rel, Relation heaprel, BTScanOpaque so)
 {
+	if (so->dropPin)
+		so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
+
+	_bt_unlockbuf(rel, so->currPos.buf);
+
+	/*
+	 * Do some visibility checks if this is an index-only scan, allowing us to
+	 * drop the pin on this page before we have returned all tuples from this
+	 * IOS to the executor.
+	 */
+	if (so->vischeck)
+	{
+		TM_IndexVisibilityCheckOp visCheck;
+		BTScanPos sp = &so->currPos;
+		BTScanPosItem *item;
+		int		initOffset = sp->firstItem;
+
+		visCheck.checkntids = 1 + sp->lastItem - initOffset;
+
+		Assert(visCheck.checkntids > 0);
+
+		/* populate the visibility checking buffer */
+		if (so->vischeckcap == 0)
+		{
+			Assert(so->vischecksbuf == NULL);
+			so->vischecksbuf = palloc_array(TM_VisCheck,
+											visCheck.checkntids);
+			so->vischeckcap = visCheck.checkntids;
+		}
+		else if (so->vischeckcap < visCheck.checkntids)
+		{
+			so->vischecksbuf = repalloc_array(so->vischecksbuf,
+											  TM_VisCheck,
+											  visCheck.checkntids);
+			so->vischeckcap = visCheck.checkntids;
+		}
+
+		/* populate the visibility check data */
+		visCheck.checktids = so->vischecksbuf;
+		visCheck.vmbuf = &so->vmbuf;
+
+		item = &so->currPos.items[initOffset];
+
+		for (int i = 0; i < visCheck.checkntids; i++)
+		{
+			TM_VisCheck *check = &visCheck.checktids[i];
+			Assert(item->visrecheck == TMVC_Unchecked);
+			Assert(ItemPointerIsValid(&item->heapTid));
+
+			PopulateTMVischeck(check, &item->heapTid, initOffset);
+
+			item++;
+			initOffset++;
+		}
+
+		/* do the visibility check */
+		table_index_vischeck_tuples(heaprel, &visCheck);
+
+		/* ... and put the results into the BTScanPosItems */
+		for (int i = 0; i < visCheck.checkntids; i++)
+		{
+			TM_VisCheck *check = &visCheck.checktids[i];
+			TMVC_Result result = check->vischeckresult;
+			/* We must have a valid visibility check result */
+			Assert(result != TMVC_Unchecked && result <= TMVC_MAX);
+
+			/* The idxoffnum should be in the expected range */
+			Assert(check->idxoffnum >= so->currPos.firstItem &&
+				   check->idxoffnum <= so->currPos.lastItem);
+
+			item = &so->currPos.items[check->idxoffnum];
+
+			/* Ensure we don't visit the same item twice */
+			Assert(item->visrecheck == TMVC_Unchecked);
+
+			/* The offset number should still indicate the right item */
+			Assert(check->tidblkno == ItemPointerGetBlockNumberNoCheck(&item->heapTid));
+			Assert(check->tidoffset == ItemPointerGetOffsetNumberNoCheck(&item->heapTid));
+
+			/* Store the visibility check result */
+			item->visrecheck = result;
+		}
+	}
+
 	if (!so->dropPin)
 	{
-		/* Just drop the lock (not the pin) */
-		_bt_unlockbuf(rel, so->currPos.buf);
+		/* Lock was already dropped above; keep the pin */
 		return;
 	}
 
@@ -67,8 +151,7 @@ _bt_drop_lock_and_maybe_pin(Relation rel, BTScanOpaque so)
 	 * when concurrent heap TID recycling by VACUUM might have taken place.
 	 */
 	Assert(RelationNeedsWAL(rel));
-	so->currPos.lsn = BufferGetLSNAtomic(so->currPos.buf);
-	_bt_relbuf(rel, so->currPos.buf);
+	ReleaseBuffer(so->currPos.buf);
 	so->currPos.buf = InvalidBuffer;
 }
 
@@ -1626,9 +1709,25 @@ _bt_returnitem(IndexScanDesc scan, BTScanOpaque so)
 	Assert(BTScanPosIsValid(so->currPos));
 	Assert(so->currPos.itemIndex >= so->currPos.firstItem);
 	Assert(so->currPos.itemIndex <= so->currPos.lastItem);
+	Assert(!scan->xs_want_itup || so->vischeck || !so->dropPin);
 
 	/* Return next item, per amgettuple contract */
 	scan->xs_heaptid = currItem->heapTid;
+
+	if (scan->xs_want_itup)
+	{
+		/*
+		 * If we've already dropped the buffer, we better have already
+		 * checked the visibility state of the tuple:  Without the
+		 * buffer pinned, vacuum may have already cleaned up the tuple
+		 * and marked the page as ALL_VISIBLE.
+		 */
+		Assert(BufferIsValid(so->currPos.buf) ||
+			   currItem->visrecheck != TMVC_Unchecked);
+
+		scan->xs_visrecheck = currItem->visrecheck;
+	}
+
 	if (so->currTuples)
 		scan->xs_itup = (IndexTuple) (so->currTuples + currItem->tupleOffset);
 }
@@ -1785,7 +1884,7 @@ _bt_readfirstpage(IndexScanDesc scan, OffsetNumber offnum, ScanDirection dir)
 		 * so->currPos.buf in preparation for btgettuple returning tuples.
 		 */
 		Assert(BTScanPosIsPinned(so->currPos));
-		_bt_drop_lock_and_maybe_pin(rel, so);
+		_bt_drop_lock_and_maybe_pin(rel, scan->heapRelation, so);
 		return true;
 	}
 
@@ -1945,7 +2044,7 @@ _bt_readnextpage(IndexScanDesc scan, BlockNumber blkno,
 	 */
 	Assert(so->currPos.currPage == blkno);
 	Assert(BTScanPosIsPinned(so->currPos));
-	_bt_drop_lock_and_maybe_pin(rel, so);
+	_bt_drop_lock_and_maybe_pin(rel, scan->heapRelation, so);
 
 	return true;
 }
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 7a3efd209bc..b6ff85c9e61 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -17,6 +17,7 @@
 #include "access/amapi.h"
 #include "access/itup.h"
 #include "access/sdir.h"
+#include "access/tableam.h"
 #include "catalog/pg_am_d.h"
 #include "catalog/pg_class.h"
 #include "catalog/pg_index.h"
@@ -957,6 +958,7 @@ typedef struct BTScanPosItem	/* what we remember about each match */
 	ItemPointerData heapTid;	/* TID of referenced heap item */
 	OffsetNumber indexOffset;	/* index item's location within page */
 	LocationIndex tupleOffset;	/* IndexTuple's offset in workspace, if any */
+	uint8		visrecheck;		/* visibility recheck status, if any */
 } BTScanPosItem;
 
 typedef struct BTScanPosData
@@ -1072,6 +1074,12 @@ typedef struct BTScanOpaqueData
 	int			numKilled;		/* number of currently stored items */
 	bool		dropPin;		/* drop leaf pin before btgettuple returns? */
 
+	/* used for index-only scan visibility prechecks */
+	bool		vischeck;		/* check visibility of scanned tuples */
+	Buffer		vmbuf;			/* vm buffer */
+	int			vischeckcap;	/* capacity of vischecksbuf */
+	TM_VisCheck *vischecksbuf;	/* single allocation to save on alloc overhead */
+
 	/*
 	 * If we are doing an index-only scan, these are the tuple storage
 	 * workspaces for the currPos and markPos respectively.  Each is of size
-- 
2.50.1 (Apple Git-155)

