From b3e2d15a0add4a55907195a05f38f1ce22857ade Mon Sep 17 00:00:00 2001
From: Alexander Korotkov <akorotkov@postgresql.org>
Date: Mon, 6 Nov 2023 11:07:35 +0200
Subject: [PATCH] Fix false reports in pg_visibility

---
 contrib/pg_visibility/pg_visibility.c | 23 +++++++---
 src/backend/storage/ipc/procarray.c   | 66 +++++++++++++++++++++++++++
 src/include/storage/procarray.h       |  1 +
 3 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 2a4acfd1eee..78dac323e51 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -19,6 +19,7 @@
 #include "funcapi.h"
 #include "miscadmin.h"
 #include "storage/bufmgr.h"
+#include "storage/proc.h"
 #include "storage/procarray.h"
 #include "storage/smgr.h"
 #include "utils/rel.h"
@@ -562,8 +563,14 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	/* Only some relkinds have a visibility map */
 	check_relation_relkind(rel);
 
+	/* Mark ourselves as PROC_IN_VACUUM to exclude our influence on xmin's */
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	MyProc->statusFlags |= PROC_IN_VACUUM;
+	ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+	LWLockRelease(ProcArrayLock);
+
 	if (all_visible)
-		OldestXmin = GetOldestNonRemovableTransactionId(rel);
+		OldestXmin = GetStrictOldestNonRemovableTransactionId();
 
 	nblocks = RelationGetNumberOfBlocks(rel);
 
@@ -670,12 +677,11 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 				 * From a concurrency point of view, it sort of sucks to
 				 * retake ProcArrayLock here while we're holding the buffer
 				 * exclusively locked, but it should be safe against
-				 * deadlocks, because surely
-				 * GetOldestNonRemovableTransactionId() should never take a
-				 * buffer lock. And this shouldn't happen often, so it's worth
-				 * being careful so as to avoid false positives.
+				 * deadlocks, because surely get_strict_xid_horizon() should
+				 * never take a buffer lock. And this shouldn't happen often,
+				 * so it's worth being careful so as to avoid false positives.
 				 */
-				RecomputedOldestXmin = GetOldestNonRemovableTransactionId(rel);
+				RecomputedOldestXmin = GetStrictOldestNonRemovableTransactionId();
 
 				if (!TransactionIdPrecedes(OldestXmin, RecomputedOldestXmin))
 					record_corrupt_item(items, &tuple.t_self);
@@ -715,6 +721,11 @@ collect_corrupt_items(Oid relid, bool all_visible, bool all_frozen)
 	items->count = items->next;
 	items->next = 0;
 
+	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	MyProc->statusFlags &= ~PROC_IN_VACUUM;
+	ProcGlobal->statusFlags[MyProc->pgxactoff] = MyProc->statusFlags;
+	LWLockRelease(ProcArrayLock);
+
 	return items;
 }
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 80ab026bf56..e36a463cc66 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -237,6 +237,30 @@ typedef struct ComputeXidHorizonsResult
 	 */
 	TransactionId data_oldest_nonremovable;
 
+	/*
+	 * The "strict" version of GetOldestNonRemovableTransactionId().  The
+	* pg_visiblity check can tolerale false positives (don't report some of the
+	* errors), but can't toletate false negatives (report false errors).
+	* This makes us fundamentally unhappy with ComputeXidHorizons().  Normally,
+	* horozons moves forwards, but there are cases when it could move backwards
+	* (see comment for ComputeXidHorizons()).
+	*
+	* This is why we have to implement our own function for xid horizon, which
+	* would be guaranteed to be newer or equal to any xid horizon computed before.
+	* We have to do the following to achieve this.
+	*
+	* 1. Ignore processes xmin's, because they take into account connection to
+	*    other databases which were ignored before.
+	* 2. Ignore KnownAssignedXids, because they are not database-aware.  While
+	*    primary could compute its horizons database-aware.
+	* 3. Ignore walsender xmin, because it could go backward if some replication
+	*    connections don't use replication slots.
+	*
+	* Surely these would significantly sacrifice accuracy.  But we have to do so
+	* in order to avoid reporting false errors.
+	*/
+	TransactionId data_strict_oldest_nonremovable;
+
 	/*
 	 * Oldest xid for which deleted tuples need to be retained in this
 	 * session's temporary tables.
@@ -252,6 +276,7 @@ typedef enum GlobalVisHorizonKind
 	VISHORIZON_SHARED,
 	VISHORIZON_CATALOG,
 	VISHORIZON_DATA,
+	VISHORIZON_DATA_STRICT,
 	VISHORIZON_TEMP,
 } GlobalVisHorizonKind;
 
@@ -1743,6 +1768,7 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		h->oldest_considered_running = initial;
 		h->shared_oldest_nonremovable = initial;
 		h->data_oldest_nonremovable = initial;
+		h->data_strict_oldest_nonremovable = initial;
 
 		/*
 		 * Only modifications made by this backend affect the horizon for
@@ -1842,6 +1868,10 @@ ComputeXidHorizons(ComputeXidHorizonsResult *h)
 		{
 			h->data_oldest_nonremovable =
 				TransactionIdOlder(h->data_oldest_nonremovable, xmin);
+
+			if (TransactionIdIsValid(xid))
+				h->data_strict_oldest_nonremovable =
+					TransactionIdOlder(h->data_strict_oldest_nonremovable, xid);
 		}
 	}
 
@@ -1997,6 +2027,8 @@ GetOldestNonRemovableTransactionId(Relation rel)
 			return horizons.catalog_oldest_nonremovable;
 		case VISHORIZON_DATA:
 			return horizons.data_oldest_nonremovable;
+		case VISHORIZON_DATA_STRICT:
+			return horizons.data_strict_oldest_nonremovable;
 		case VISHORIZON_TEMP:
 			return horizons.temp_oldest_nonremovable;
 	}
@@ -2005,6 +2037,38 @@ GetOldestNonRemovableTransactionId(Relation rel)
 	return InvalidTransactionId;
 }
 
+/*
+ * The "strict" version of GetOldestNonRemovableTransactionId().  The
+ * pg_visiblity check can tolerale false positives (don't report some of the
+ * errors), but can't toletate false negatives (report false errors).
+ * This makes us fundamentally unhappy with ComputeXidHorizons().  Normally,
+ * horozons moves forwards, but there are cases when it could move backwards
+ * (see comment for ComputeXidHorizons()).
+ *
+ * This is why we have to implement our own function for xid horizon, which
+ * would be guaranteed to be newer or equal to any xid horizon computed before.
+ * We have to do the following to achieve this.
+ *
+ * 1. Ignore processes xmin's, because they take into account connection to
+ *    other databases which were ignored before.
+ * 2. Ignore KnownAssignedXids, because they are not database-aware.  While
+ *    primary could compute its horizons database-aware.
+ * 3. Ignore walsender xmin, because it could go backward if some replication
+ *    connections don't use replication slots.
+ *
+ * Surely these would significantly sacrifice accuracy.  But we have to do so
+ * in order to avoid reporting false errors.
+ */
+TransactionId
+GetStrictOldestNonRemovableTransactionId(void)
+{
+	ComputeXidHorizonsResult horizons;
+
+	ComputeXidHorizons(&horizons);
+
+	return horizons.data_strict_oldest_nonremovable;
+}
+
 /*
  * Return the oldest transaction id any currently running backend might still
  * consider running. This should not be used for visibility / pruning
@@ -4028,6 +4092,8 @@ GlobalVisTestFor(Relation rel)
 		case VISHORIZON_TEMP:
 			state = &GlobalVisTempRels;
 			break;
+		default:
+			break;
 	}
 
 	Assert(FullTransactionIdIsValid(state->definitely_needed) &&
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index d8cae3ce1c5..d44c2a98b2d 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -55,6 +55,7 @@ extern RunningTransactions GetRunningTransactionData(void);
 extern bool TransactionIdIsInProgress(TransactionId xid);
 extern bool TransactionIdIsActive(TransactionId xid);
 extern TransactionId GetOldestNonRemovableTransactionId(Relation rel);
+extern TransactionId GetStrictOldestNonRemovableTransactionId(void);
 extern TransactionId GetOldestTransactionIdConsideredRunning(void);
 extern TransactionId GetOldestActiveTransactionId(void);
 extern TransactionId GetOldestSafeDecodingTransactionId(bool catalogOnly);
-- 
2.39.3 (Apple Git-145)

