From 836b57525c3c221ef4f2826abed9b32d7f48ad8e Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Fri, 19 May 2023 16:20:55 -0700
Subject: [PATCH v2 1/2] nbtree VACUUM: cope with right sibling link
 corruption.

Avoid "right sibling's left-link doesn't match" errors when vacuuming a
corrupt nbtree index: just LOG the issue and press on with vacuuming.
This ERROR is seen in the field from time to time (it's not just a
theoretical possibility), even though the relevant check takes place
after various similar checks ran without detecting any problems.

It's worth avoiding such errors, even in the event of corruption.
Anything that makes VACUUM unable to make forward progress against one
table/index ultimately risks making the system enter xidStopLimit mode.
There is no good reason to take any chances here.

This is similar to the work from commit 5b861baa (later backpatched as
commit 43e409ce), which taught nbtree to press on with vacuuming an
index when page deletion fails to "re-find" a downlink in the target
page's parent (or in some page to the right of the parent).  Both cases
involve throwing errors in nbtree page deletion that correspond to
similar errors at a corresponding point during nbtree page splits.  And
in both cases the solution is to demote the ERROR to a LOG message, and
press on (without changing anything about the page split errors).

Note that _bt_unlink_halfdead_page() has long been able to bail on page
deletion when the target page's left sibling was in an inconsistent
state (that goes all the way back to the initial introduction of page
deletion by commit 88dc31e3f2).  _bt_unlink_halfdead_page() can now bail
on page deletion in much the same way if the sibling link corruption
happens to be in the right sibling page's link.  While there was already
significant overlap between each of the two sibling checks (elevel
notwithstanding), there is no reason to take any chances here.

Author: Peter Geoghegan <pg@bowt.ie>
Reviewed-By: Heikki Linnakangas <hlinnaka@iki.fi>
Discussion: https://postgr.es/m/CAH2-Wzko2q2kP1+UvgJyP9g0mF4hopK0NtQZcxwvMv9_ytGhkQ@mail.gmail.com
Backpatch: 11- (all supported versions).
---
 src/backend/access/nbtree/nbtpage.c | 62 ++++++++++++++++++++---------
 src/backend/access/nbtree/nbtree.c  |  3 +-
 2 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 6be891522..486d1563d 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -2404,24 +2404,22 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 
 			if (!leftsibvalid)
 			{
-				if (target != leafblkno)
-				{
-					/* we have only a pin on target, but pin+lock on leafbuf */
-					ReleaseBuffer(buf);
-					_bt_relbuf(rel, leafbuf);
-				}
-				else
-				{
-					/* we have only a pin on leafbuf */
-					ReleaseBuffer(leafbuf);
-				}
-
+				/*
+				 * This is known to fail in the field; sibling link corruption
+				 * is relatively common.  Press on with vacuuming rather than
+				 * just throwing an ERROR.
+				 */
 				ereport(LOG,
 						(errcode(ERRCODE_INDEX_CORRUPTED),
 						 errmsg_internal("valid left sibling for deletion target could not be located: "
-										 "left sibling %u of target %u with leafblkno %u and scanblkno %u in index \"%s\"",
+										 "left sibling %u of target %u with leafblkno %u and scanblkno %u on level %u of index \"%s\"",
 										 leftsib, target, leafblkno, scanblkno,
-										 RelationGetRelationName(rel))));
+										 targetlevel, RelationGetRelationName(rel))));
+
+				/* Must release all pins and locks on failure exit */
+				ReleaseBuffer(buf);
+				if (target != leafblkno)
+					_bt_relbuf(rel, leafbuf);
 
 				return false;
 			}
@@ -2496,13 +2494,40 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	rbuf = _bt_getbuf(rel, vstate->info->heaprel, rightsib, BT_WRITE);
 	page = BufferGetPage(rbuf);
 	opaque = BTPageGetOpaque(page);
+
+	/*
+	 * Validate target's right sibling page.  Its left link must point back to
+	 * the target page.
+	 */
 	if (opaque->btpo_prev != target)
-		ereport(ERROR,
+	{
+		/*
+		 * This is known to fail in the field; sibling link corruption is
+		 * relatively common.  Press on with vacuuming rather than just
+		 * throwing an ERROR (same approach used for left-sibling's-right-link
+		 * validation check a moment ago).
+		 */
+		ereport(LOG,
 				(errcode(ERRCODE_INDEX_CORRUPTED),
 				 errmsg_internal("right sibling's left-link doesn't match: "
-								 "block %u links to %u instead of expected %u in index \"%s\"",
-								 rightsib, opaque->btpo_prev, target,
-								 RelationGetRelationName(rel))));
+								 "right sibling %u of target %u with leafblkno %u "
+								 "and scanblkno %u spuriously links to non-target %u "
+								 "on level %u of index \"%s\"",
+								 rightsib, target, leafblkno,
+								 scanblkno, opaque->btpo_prev,
+								 targetlevel, RelationGetRelationName(rel))));
+
+		/* Must release all pins and locks on failure exit */
+		if (BufferIsValid(lbuf))
+			_bt_relbuf(rel, lbuf);
+		_bt_relbuf(rel, rbuf);
+		_bt_relbuf(rel, buf);
+		if (target != leafblkno)
+			_bt_relbuf(rel, leafbuf);
+
+		return false;
+	}
+
 	rightsib_is_rightmost = P_RIGHTMOST(opaque);
 	*rightsib_empty = (P_FIRSTDATAKEY(opaque) > PageGetMaxOffsetNumber(page));
 
@@ -2727,6 +2752,7 @@ _bt_unlink_halfdead_page(Relation rel, Buffer leafbuf, BlockNumber scanblkno,
 	 */
 	_bt_pendingfsm_add(vstate, target, safexid);
 
+	/* Success - hold on to lock on leafbuf (might also have been target) */
 	return true;
 }
 
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 2df884985..1ce5b1519 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -1093,8 +1093,7 @@ backtrack:
 		 * can't be half-dead because only an interrupted VACUUM process can
 		 * leave pages in that state, so we'd definitely have dealt with it
 		 * back when the page was the scanblkno page (half-dead pages are
-		 * always marked fully deleted by _bt_pagedel()).  This assumes that
-		 * there can be only one vacuum process running at a time.
+		 * always marked fully deleted by _bt_pagedel(), barring corruption).
 		 */
 		if (!opaque || !P_ISLEAF(opaque) || P_ISHALFDEAD(opaque))
 		{
-- 
2.40.1

