From 9a4b9da2a6d2b52470f2f53267a1577af6c0b5ad Mon Sep 17 00:00:00 2001
From: Zhijie Hou <houzj.fnst@fujitsu.com>
Date: Mon, 9 Feb 2026 13:49:41 +0800
Subject: [PATCH v6 1/2] Improve the retry logic in pg_sync_replication_slots()

The SQL function pg_sync_replication_slots() currently retries cyclically when a
slot cannot be persisted when its initial synchronization is skipped. However,
It is also beneficial to retry if a slot is persisted but unable to sync to the
latest value, for instance, due to physical replication lag causing delays in
flushing WALs up to the confirmed_flush_lsn of the failover slot on the standby.

This commit improves the function to retry for both slots that fail to persist
and those persistent slots that have skipped subsequent synchronizations.
---
 doc/src/sgml/func/func-admin.sgml          |  4 +-
 src/backend/replication/logical/slotsync.c | 81 ++++++++++++----------
 2 files changed, 47 insertions(+), 38 deletions(-)

diff --git a/doc/src/sgml/func/func-admin.sgml b/doc/src/sgml/func/func-admin.sgml
index 3ac81905d1f..7f20faf99e8 100644
--- a/doc/src/sgml/func/func-admin.sgml
+++ b/doc/src/sgml/func/func-admin.sgml
@@ -1495,7 +1495,9 @@ postgres=# SELECT '0/0'::pg_lsn + pd.segment_number * ps.setting::int + :offset
         Synchronize the logical failover replication slots from the primary
         server to the standby server. This function can only be executed on the
         standby server. Temporary synced slots, if any, cannot be used for
-        logical decoding and must be dropped after promotion. See
+        logical decoding and must be dropped after promotion. This function
+        retries cyclically until all the failover slots that existed on
+        primary at the start of the function call are synchronized. See
         <xref linkend="logicaldecoding-replication-slots-synchronization"/> for details.
         Note that this function cannot be executed if
         <link linkend="guc-sync-replication-slots"><varname>
diff --git a/src/backend/replication/logical/slotsync.c b/src/backend/replication/logical/slotsync.c
index 062a08ccb88..a6658dec5c1 100644
--- a/src/backend/replication/logical/slotsync.c
+++ b/src/backend/replication/logical/slotsync.c
@@ -40,10 +40,11 @@
  * more details.
  *
  * If the SQL function pg_sync_replication_slots() is used to sync the slots,
- * and if the slots are not ready to be synced and are marked as RS_TEMPORARY
- * because of any of the reasons mentioned above, then the SQL function also
- * waits and retries until the slots are marked as RS_PERSISTENT (which means
- * sync-ready). Refer to the comments in SyncReplicationSlots() for more
+ * and if the slots are not ready to be synced because of any of the reasons
+ * mentioned above, or if the local slot is invalidated while the remote slot
+ * remains valid, then the SQL function also waits and retries until all the
+ * failover slots that existed on primary at the start of the function call are
+ * synchronized. Refer to the comments in SyncReplicationSlots() for more
  * details.
  *
  * Any standby synchronized slots will be dropped if they no longer need
@@ -218,7 +219,7 @@ update_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
 		 * Can get here only if GUC 'synchronized_standby_slots' on the
 		 * primary server was not configured correctly.
 		 */
-		ereport(AmLogicalSlotSyncWorkerProcess() ? LOG : ERROR,
+		ereport(LOG,
 				errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 				errmsg("skipping slot synchronization because the received slot sync"
 					   " LSN %X/%08X for slot \"%s\" is ahead of the standby position %X/%08X",
@@ -623,15 +624,11 @@ reserve_wal_for_local_slot(XLogRecPtr restart_lsn)
  * local ones, then update the LSNs and persist the local synced slot for
  * future synchronization; otherwise, do nothing.
  *
- * *slot_persistence_pending is set to true if any of the slots fail to
- * persist.
- *
  * Return true if the slot is marked as RS_PERSISTENT (sync-ready), otherwise
  * false.
  */
 static bool
-update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
-									 bool *slot_persistence_pending)
+update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid)
 {
 	ReplicationSlot *slot = MyReplicationSlot;
 
@@ -655,13 +652,7 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		 * the next cycle. It may take more time to create such a slot or
 		 * reach the consistent point. Therefore, we keep this slot and
 		 * attempt the synchronization in the next cycle.
-		 *
-		 * We also update the slot_persistence_pending parameter, so the SQL
-		 * function can retry.
 		 */
-		if (slot_persistence_pending)
-			*slot_persistence_pending = true;
-
 		return false;
 	}
 
@@ -685,14 +676,14 @@ update_and_persist_local_synced_slot(RemoteSlot *remote_slot, Oid remote_dbid,
  * updated. The slot is then persisted and is considered as sync-ready for
  * periodic syncs.
  *
- * *slot_persistence_pending is set to true if any of the slots fail to
- * persist.
+ * *slotsync_pending is set to true if the slot's synchronization is skipped and
+ * requires re-sync.
  *
  * Returns TRUE if the local slot is updated.
  */
 static bool
 synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid,
-					 bool *slot_persistence_pending)
+					 bool *slotsync_pending)
 {
 	ReplicationSlot *slot;
 	bool		slot_updated = false;
@@ -757,6 +748,14 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		{
 			update_slotsync_skip_stats(SS_SKIP_INVALID);
 
+			/*
+			 * If only the local slot is invalidated, attempt to re-sync. The
+			 * slot will be dropped and recreated in the next cycle. See
+			 * drop_local_obsolete_slots().
+			 */
+			if (slotsync_pending && remote_slot->invalidated == RS_INVAL_NONE)
+				*slotsync_pending = true;
+
 			ReplicationSlotRelease();
 			return slot_updated;
 		}
@@ -765,8 +764,7 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		if (slot->data.persistency == RS_TEMPORARY)
 		{
 			slot_updated = update_and_persist_local_synced_slot(remote_slot,
-																remote_dbid,
-																slot_persistence_pending);
+																remote_dbid);
 		}
 
 		/* Slot ready for sync, so sync it. */
@@ -836,12 +834,17 @@ synchronize_one_slot(RemoteSlot *remote_slot, Oid remote_dbid,
 		LWLockRelease(ProcArrayLock);
 		LWLockRelease(ReplicationSlotControlLock);
 
-		update_and_persist_local_synced_slot(remote_slot, remote_dbid,
-											 slot_persistence_pending);
+		update_and_persist_local_synced_slot(remote_slot, remote_dbid);
 
 		slot_updated = true;
 	}
 
+	Assert(slot->slotsync_skip_reason != SS_SKIP_INVALID);
+
+	/* See the comments atop the file for cases requiring re-sync */
+	if (slotsync_pending && slot->slotsync_skip_reason != SS_SKIP_NONE)
+		*slotsync_pending = true;
+
 	ReplicationSlotRelease();
 
 	return slot_updated;
@@ -992,15 +995,15 @@ fetch_remote_slots(WalReceiverConn *wrconn, List *slot_names)
  * This function takes a list of remote slots and synchronizes them locally. It
  * creates the slots if not present on the standby and updates existing ones.
  *
- * If slot_persistence_pending is not NULL, it will be set to true if one or
- * more slots could not be persisted. This allows callers such as
- * SyncReplicationSlots() to retry those slots.
+ * If slotsync_pending is not NULL, it will be set to true if one or more slots
+ * skip to be synced. This allows callers such as SyncReplicationSlots() to
+ * retry those slots.
  *
  * Returns TRUE if any of the slots gets updated in this sync-cycle.
  */
 static bool
 synchronize_slots(WalReceiverConn *wrconn, List *remote_slot_list,
-				  bool *slot_persistence_pending)
+				  bool *slotsync_pending)
 {
 	bool		some_slot_updated = false;
 
@@ -1020,7 +1023,7 @@ synchronize_slots(WalReceiverConn *wrconn, List *remote_slot_list,
 		LockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
 
 		some_slot_updated |= synchronize_one_slot(remote_slot, remote_dbid,
-												  slot_persistence_pending);
+												  slotsync_pending);
 
 		UnlockSharedObject(DatabaseRelationId, remote_dbid, 0, AccessShareLock);
 	}
@@ -1910,8 +1913,9 @@ extract_slot_names(List *remote_slots)
  * Synchronize the failover enabled replication slots using the specified
  * primary server connection.
  *
- * Repeatedly fetches and updates replication slot information from the
- * primary until all slots are at least "sync ready".
+ * Repeatedly fetches and updates replication slot information from the primary
+ * until all slots that existed on primary at the start of the function call are
+ * synchronized.
  *
  * Exits early if promotion is triggered or certain critical
  * configuration parameters have changed.
@@ -1934,7 +1938,7 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
 		/* Retry until all the slots are sync-ready */
 		for (;;)
 		{
-			bool		slot_persistence_pending = false;
+			bool		slotsync_pending = false;
 			bool		some_slot_updated = false;
 
 			/* Check for interrupts and config changes */
@@ -1957,20 +1961,23 @@ SyncReplicationSlots(WalReceiverConn *wrconn)
 
 			/* Attempt to synchronize slots */
 			some_slot_updated = synchronize_slots(wrconn, remote_slots,
-												  &slot_persistence_pending);
+												  &slotsync_pending);
 
 			/*
-			 * If slot_persistence_pending is true, extract slot names for
-			 * future iterations (only needed if we haven't done it yet)
+			 * If slotsync_pending is true, extract slot names for future
+			 * iterations (only needed if we haven't done it yet)
 			 */
-			if (slot_names == NIL && slot_persistence_pending)
+			if (slot_names == NIL && slotsync_pending)
 				slot_names = extract_slot_names(remote_slots);
 
 			/* Free the current remote_slots list */
 			list_free_deep(remote_slots);
 
-			/* Done if all slots are persisted i.e are sync-ready */
-			if (!slot_persistence_pending)
+			/*
+			 * Done if all slots have been synchronized to the latest
+			 * information.
+			 */
+			if (!slotsync_pending)
 				break;
 
 			/* wait before retrying again */
-- 
2.51.1.windows.1

