From 300a59a92e30305101103bd816d4f5c2d50cdf1d Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Thu, 25 Feb 2021 15:17:22 -0800
Subject: [PATCH v9] Recycle pages deleted during same VACUUM.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Masahiko Sawada <sawada.mshk@gmail.com>
Discussion: https://postgr.es/m/CAH2-Wzk76_P=67iUscb1UN44-gyZL-KgpsXbSxq_bdcMa7Q+wQ@mail.gmail.com
---
 src/include/access/nbtree.h         | 22 ++++++-
 src/backend/access/nbtree/README    | 31 +++++++++
 src/backend/access/nbtree/nbtpage.c | 40 ++++++++++++
 src/backend/access/nbtree/nbtree.c  | 97 +++++++++++++++++++++++++++++
 src/backend/access/nbtree/nbtxlog.c | 22 +++++++
 5 files changed, 211 insertions(+), 1 deletion(-)

diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 5c66d1f366..8517b6026c 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -279,7 +279,8 @@ BTPageGetDeleteXid(Page page)
  * Is an existing page recyclable?
  *
  * This exists to centralize the policy on which deleted pages are now safe to
- * re-use.
+ * re-use.  The _bt_newly_deleted_pages_recycle() optimization behaves more
+ * aggressively, though that has certain known limitations.
  *
  * Note: PageIsNew() pages are always safe to recycle, but we can't deal with
  * them here (caller is responsible for that case themselves).  Caller might
@@ -316,14 +317,33 @@ BTPageIsRecyclable(Page page)
  * BTVacState is private nbtree.c state used during VACUUM.  It is exported
  * for use by page deletion related code in nbtpage.c.
  */
+typedef struct BTPendingRecycle
+{
+	BlockNumber blkno;
+	FullTransactionId safexid;
+} BTPendingRecycle;
+
 typedef struct BTVacState
 {
+	/*
+	 * VACUUM operation state
+	 */
 	IndexVacuumInfo *info;
 	IndexBulkDeleteResult *stats;
 	IndexBulkDeleteCallback callback;
 	void	   *callback_state;
 	BTCycleId	cycleid;
+
+	/*
+	 * Page deletion state for VACUUM
+	 */
 	MemoryContext pagedelcontext;
+	BTPendingRecycle *deleted;
+	bool		grow;
+	bool		full;
+	uint32		ndeletedspace;
+	uint64		maxndeletedspace;
+	uint32		ndeleted;
 } BTVacState;
 
 /*
diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index 46d49bf025..265814ea46 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -430,6 +430,37 @@ whenever it is subsequently taken from the FSM for reuse.  The deleted
 page's contents will be overwritten by the split operation (it will become
 the new right sibling page).
 
+Prior to PostgreSQL 14, VACUUM was only able to recycle pages that were
+deleted by a previous VACUUM operation (VACUUM typically placed all pages
+deleted by the last VACUUM into the FSM, though there were and are no
+certainties here).  This had the obvious disadvantage of creating
+uncertainty about when and how pages get recycled, especially with bursty
+workloads.  It was naive, even within the constraints of the design, since
+there is no reason to think that it will take long for a deleted page to
+become recyclable.  It's convenient to use XIDs to implement the drain
+technique, but that is totally unrelated to any of the other things that
+VACUUM needs to do with XIDs.
+
+VACUUM operations now consider if it's possible to recycle any pages that
+the same operation deleted after the physical scan of the index, which is
+the last convenient point to check.  This changes nothing about
+the basic design, and so it might still not be possible to recycle any
+pages at that time (e.g., there might not even be one single new
+transaction after an index page deletion, but before VACUUM ends).  But
+we have little to lose and plenty to gain by trying.  We only need to keep
+around a little information about recently deleted pages in local memory.
+We don't even have to access the deleted pages a second time.
+
+Currently VACUUM delays considering the possibility of recycling its own
+recently deleted page until the end of its btbulkdelete scan (or until the
+end of btvacuumcleanup in cases where there were no tuples to delete in
+the index).  It would be slightly more effective if btbulkdelete page
+deletions were deferred until btvacuumcleanup, simply because more time
+will have passed.  Our current approach works well enough in practice,
+especially in cases where it really matters: cases where we're vacuuming a
+large index, where recycling pages sooner rather than later is
+particularly likely to matter.
+
 Fastpath For Index Insertion
 ----------------------------
 
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 4a0578dff4..a6c43b940e 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -2677,6 +2677,46 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	if (target <= scanblkno)
 		stats->pages_deleted++;
 
+	/*
+	 * Maintain array of pages that were deleted during current btvacuumscan()
+	 * call.  We may well be able to recycle them in a separate pass at the
+	 * end of the current btvacuumscan().
+	 *
+	 * Need to respect work_mem/maxndeletedspace limitation on size of deleted
+	 * array.  Our strategy when the array can no longer grow within the
+	 * bounds of work_mem is simple: keep earlier entries (which are likelier
+	 * to be recyclable in the end), but stop saving new entries.
+	 */
+	if (vstate->full)
+		return true;
+
+	if (vstate->ndeleted >= vstate->ndeletedspace)
+	{
+		uint64 newndeletedspace;
+
+		if (!vstate->grow)
+		{
+			vstate->full = true;
+			return true;
+		}
+
+		newndeletedspace = vstate->ndeletedspace * 2;
+		if (newndeletedspace > vstate->maxndeletedspace)
+		{
+			newndeletedspace = vstate->maxndeletedspace;
+			vstate->grow = false;
+		}
+		vstate->ndeletedspace = newndeletedspace;
+
+		vstate->deleted =
+			repalloc(vstate->deleted,
+					 sizeof(BTPendingRecycle) * vstate->ndeletedspace);
+	}
+
+	vstate->deleted[vstate->ndeleted].blkno = target;
+	vstate->deleted[vstate->ndeleted].safexid = safexid;
+	vstate->ndeleted++;
+
 	return true;
 }
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index c70647d6f3..bf50c7c265 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -21,7 +21,9 @@
 #include "access/nbtree.h"
 #include "access/nbtxlog.h"
 #include "access/relscan.h"
+#include "access/table.h"
 #include "access/xlog.h"
+#include "catalog/index.h"
 #include "commands/progress.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
@@ -32,6 +34,7 @@
 #include "storage/indexfsm.h"
 #include "storage/ipc.h"
 #include "storage/lmgr.h"
+#include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/index_selfuncs.h"
@@ -833,6 +836,71 @@ _bt_vacuum_needs_cleanup(IndexVacuumInfo *info)
 	return false;
 }
 
+/*
+ * _bt_newly_deleted_pages_recycle() -- Are _bt_pagedel pages recyclable now?
+ *
+ * Note that we assume that the array is ordered by safexid.  No further
+ * entries can be safe to recycle once we encounter the first non-recyclable
+ * entry in the deleted array.
+ */
+static inline void
+_bt_newly_deleted_pages_recycle(Relation rel, BTVacState *vstate)
+{
+	IndexBulkDeleteResult *stats = vstate->stats;
+	Relation	heapRel;
+
+	Assert(vstate->ndeleted > 0);
+	Assert(stats->pages_newly_deleted >= vstate->ndeleted);
+
+	/*
+	 * Recompute VACUUM XID boundaries.
+	 *
+	 * We don't actually care about the oldest non-removable XID.  Computing
+	 * the oldest such XID has a useful side-effect: It updates the procarray
+	 * state that tracks XID horizon.  This is not just an optimization; it's
+	 * essential.  It allows the GlobalVisCheckRemovableFullXid() calls we
+	 * make here to notice if and when safexid values from pages this same
+	 * VACUUM operation deleted are sufficiently old to allow recycling to
+	 * take place safely.
+	 */
+	GetOldestNonRemovableTransactionId(NULL);
+
+	/*
+	 * Use the heap relation for GlobalVisCheckRemovableFullXid() calls (don't
+	 * pass NULL rel argument).
+	 *
+	 * This is an optimization; it allows us to be much more aggressive in
+	 * cases involving logical decoding (unless this happens to be a system
+	 * catalog).  We don't simply use BTPageIsRecyclable().
+	 *
+	 * XXX: The BTPageIsRecyclable() criteria creates problems for this
+	 * optimization.  Its safexid test is applied in a redundant manner within
+	 * _bt_getbuf() (via its BTPageIsRecyclable() call).  Consequently,
+	 * _bt_getbuf() may believe that it is still unsafe to recycle a page that
+	 * we know to be recycle safe -- in which case it is unnecessarily
+	 * discarded.
+	 *
+	 * We should get around to fixing this _bt_getbuf() issue some day.  For
+	 * now we can still proceed in the hopes that BTPageIsRecyclable() will
+	 * catch up with us before _bt_getbuf() ever reaches the page.
+	 */
+	heapRel = table_open(IndexGetRelation(RelationGetRelid(rel), false),
+						 AccessShareLock);
+	for (int i = 0; i < vstate->ndeleted; i++)
+	{
+		BlockNumber blkno = vstate->deleted[i].blkno;
+		FullTransactionId safexid = vstate->deleted[i].safexid;
+
+		if (!GlobalVisCheckRemovableFullXid(heapRel, safexid))
+			break;
+
+		RecordFreeIndexPage(rel, blkno);
+		stats->pages_free++;
+	}
+
+	table_close(heapRel, AccessShareLock);
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples.
  * The set of target tuples is specified via a callback routine that tells
@@ -927,6 +995,14 @@ btvacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats)
 	 * FSM during the next VACUUM operation.  _bt_vacuum_needs_cleanup() will
 	 * force the next VACUUM to consider this before allowing btvacuumscan()
 	 * to be skipped entirely.
+	 *
+	 * Note: Prior to PostgreSQL 14, we were completely reliant on the next
+	 * VACUUM operation taking care of recycling whatever pages the current
+	 * VACUUM operation found to be empty and then deleted.  It is now usually
+	 * possible for _bt_newly_deleted_pages_recycle() to recycle all of the
+	 * pages that any given VACUUM operation deletes, as part of the same
+	 * VACUUM operation.  As a result, it is rare for num_delpages to actually
+	 * exceed 0, including with indexes where page deletions are frequent.
 	 */
 	Assert(stats->pages_deleted >= stats->pages_free);
 	num_delpages = stats->pages_deleted - stats->pages_free;
@@ -1002,6 +1078,16 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 												  "_bt_pagedel",
 												  ALLOCSET_DEFAULT_SIZES);
 
+	/* Allocate _bt_newly_deleted_pages_recycle related information */
+	vstate.ndeletedspace = 512;
+	vstate.grow = true;
+	vstate.full = false;
+	vstate.maxndeletedspace = ((work_mem * 1024L) / sizeof(BTPendingRecycle));
+	vstate.maxndeletedspace = Min(vstate.maxndeletedspace, MaxBlockNumber);
+	vstate.maxndeletedspace = Max(vstate.maxndeletedspace, vstate.ndeletedspace);
+	vstate.ndeleted = 0;
+	vstate.deleted = palloc(sizeof(BTPendingRecycle) * vstate.ndeletedspace);
+
 	/*
 	 * The outer loop iterates over all index pages except the metapage, in
 	 * physical order (we hope the kernel will cooperate in providing
@@ -1070,7 +1156,18 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	 *
 	 * Note that if no recyclable pages exist, we don't bother vacuuming the
 	 * FSM at all.
+	 *
+	 * Before vacuuming the FSM, try to make the most of the pages we
+	 * ourselves deleted: see if they can be recycled already (try to avoid
+	 * waiting until the next VACUUM operation to recycle).  Our approach is
+	 * to check the local array of pages that were newly deleted during this
+	 * VACUUM.
 	 */
+	if (vstate.ndeleted > 0)
+		_bt_newly_deleted_pages_recycle(rel, &vstate);
+
+	pfree(vstate.deleted);
+
 	if (stats->pages_free > 0)
 		IndexFreeSpaceMapVacuum(rel);
 }
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 1779b6ba47..192c6e03ce 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -999,6 +999,28 @@ btree_xlog_newroot(XLogReaderState *record)
  * the PGPROC->xmin > limitXmin test inside GetConflictingVirtualXIDs().
  * Consequently, one XID value achieves the same exclusion effect on primary
  * and standby.
+ *
+ * XXX It would make a great deal more sense if each nbtree index's FSM (or
+ * some equivalent structure) was completely crash-safe.  Importantly, this
+ * would enable page recycling's REDO side to work in a way that naturally
+ * matches original execution.
+ *
+ * Page deletion has to be crash safe already, plus xl_btree_reuse_page
+ * records are logged any time a backend has to recycle -- full crash safety
+ * is unlikely to add much overhead, and has clear efficiency benefits.  It
+ * would also simplify things by more explicitly decoupling page deletion and
+ * page recycling.  The benefits for REDO all follow from that.
+ *
+ * Under this scheme, the whole question of recycle safety could be moved from
+ * VACUUM to the consumer side.  That is, VACUUM would no longer have to defer
+ * placing a page that it deletes in the FSM until BTPageIsRecyclable() starts
+ * to return true -- _bt_getbuf() would handle all details of safely deferring
+ * recycling instead.  _bt_getbuf() would use the improved/crash-safe FSM to
+ * explicitly find a free page whose safexid is sufficiently old for recycling
+ * to be safe from the point of view of backends that run during original
+ * execution.  That just leaves the REDO side.  Instead of xl_btree_reuse_page
+ * records, we'd have FSM "consume/recycle page from the FSM" records that are
+ * associated with FSM page buffers/blocks.
  */
 static void
 btree_xlog_reuse_page(XLogReaderState *record)

base-commit: 5f8727f5a679452f7bbdd6966a1586934dcaa84f
-- 
2.27.0

