From e06e44ce9930793f5a0383580db8ebb3e9b6a6b4 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Tue, 31 Jan 2023 10:25:57 +0900
Subject: [PATCH v3] Fix a race condition of updating
 procArray->replication_slot_xmin.

Previously, if already_locked is false,
ReplicationSlotsComputeRequiredXmin() computed the oldest xmin across
all slots while not holding the ProcArrayLock and acquires the
ProcArrayLock just before updating the
replication_slot_xmin. Therefore, it was possible that by the time a
process computes the oldest xmin and before updating the
replication_slot_xmin, another process computes and updates it. As a
result, the replication_slot_xmin could be overwritten with an old
value and retreated.

In the reported failure, after a walsender who was creating a
replication slot updated the replication_slot_xmin via
CreateInitDecodingContext(), another walsender overwrote it with
InvalidTransactionId. Then the walsender creating the replication slot
ended up computing the oldest safe decoding transaction id without
considering the replication_slot_xmin. That led to an error "cannot
build an initial slot snapshot as oldest safe xid %u follows
snapshot's xmin %u", which was an assertion failure prior to
240e0dbacd3.

This commit changes ReplicationSlotsComputeRequiredXmin() so that it
computes the oldest xmin while holding the ProcArrayLock in exclusive
mode. We keep already_locked parameter in
ProcArraySetReplicationSlotXmin() on backbranches to not break ABI
compatibility.

Discussion: https://postgr.es/m/CAA4eK1L8wYcyTPxNzPGkhuO52WBGoOZbT0A73Le=ZUWYAYmdfw@mail.gmail.com
Backpatch-through: 11
---
 src/backend/replication/slot.c      | 15 ++++++++++++++-
 src/backend/storage/ipc/procarray.c | 13 +++----------
 src/include/storage/procarray.h     |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/src/backend/replication/slot.c b/src/backend/replication/slot.c
index f286918f69..063f6aa95c 100644
--- a/src/backend/replication/slot.c
+++ b/src/backend/replication/slot.c
@@ -840,6 +840,16 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked)
 
 	Assert(ReplicationSlotCtl != NULL);
 
+	/*
+	 * It is possible that by the time we compute the agg_xmin here and before
+	 * updating replication_slot_xmin, the CreateInitDecodingContext() will
+	 * compute and update replication_slot_xmin. So, we need to acquire
+	 * ProcArrayLock here to avoid retreating the value of
+	 * replication_slot_xmin.
+	 */
+	if (!already_locked)
+		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+
 	LWLockAcquire(ReplicationSlotControlLock, LW_SHARED);
 
 	for (i = 0; i < max_replication_slots; i++)
@@ -878,7 +888,10 @@ ReplicationSlotsComputeRequiredXmin(bool already_locked)
 
 	LWLockRelease(ReplicationSlotControlLock);
 
-	ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin, already_locked);
+	ProcArraySetReplicationSlotXmin(agg_xmin, agg_catalog_xmin);
+
+	if (!already_locked)
+		LWLockRelease(ProcArrayLock);
 }
 
 /*
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 4340bf9641..a9e4f59440 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -3896,23 +3896,16 @@ TerminateOtherDBBackends(Oid databaseId)
  *
  * Install limits to future computations of the xmin horizon to prevent vacuum
  * and HOT pruning from removing affected rows still needed by clients with
- * replication slots.
+ * replication slots. The caller must hold ProcArrayLock in exclusive mode.
  */
 void
-ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin,
-								bool already_locked)
+ProcArraySetReplicationSlotXmin(TransactionId xmin, TransactionId catalog_xmin)
 {
-	Assert(!already_locked || LWLockHeldByMe(ProcArrayLock));
-
-	if (!already_locked)
-		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+	Assert(LWLockHeldByMeInMode(ProcArrayLock, LW_EXCLUSIVE));
 
 	procArray->replication_slot_xmin = xmin;
 	procArray->replication_slot_catalog_xmin = catalog_xmin;
 
-	if (!already_locked)
-		LWLockRelease(ProcArrayLock);
-
 	elog(DEBUG1, "xmin required by slots: data %u, catalog %u",
 		 xmin, catalog_xmin);
 }
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index d8cae3ce1c..b7554f1b53 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -91,7 +91,7 @@ extern void XidCacheRemoveRunningXids(TransactionId xid,
 									  TransactionId latestXid);
 
 extern void ProcArraySetReplicationSlotXmin(TransactionId xmin,
-											TransactionId catalog_xmin, bool already_locked);
+											TransactionId catalog_xmin);
 
 extern void ProcArrayGetReplicationSlotXmin(TransactionId *xmin,
 											TransactionId *catalog_xmin);
-- 
2.31.1

