From 144fc2a434e4c912df6adc1d4a8245497fd44261 Mon Sep 17 00:00:00 2001
From: Thomas Munro <thomas.munro@gmail.com>
Date: Fri, 7 Jun 2024 17:49:19 +1200
Subject: [PATCH v2] Fix RBM_ZERO_AND_LOCK.

Commit 210622c6 accidentally zeroed out pages even if they were found in
the buffer pool.  It should always lock the page, but it should only
zero pages that were not already found as an optimization to avoid I/O.
Otherwise, concurrent readers that hold only a pin might see corrupted
page contents changing under their feet.  This is now done with the
standard BM_IO_IN_PROGRESS infrastructure to test for BM_VALID and wake
concurrent waiters without races (as it was in earlier releases).

Reported-by: Noah Misch <noah@leadboat.com>
Reported-by: Alexander Lakhin <exclusion@gmail.com>
Reviewed-by: Alvaro Herrera <alvherre@alvh.no-ip.org>
Reviewed-by: Robert Haas <robertmhaas@gmail.com>
Discussion: https://postgr.es/m/20240512171658.7e.nmisch@google.com
Discussion: https://postgr.es/m/7ed10231-ce47-03d5-d3f9-4aea0dc7d5a4%40gmail.com
---
 src/backend/storage/buffer/bufmgr.c | 54 +++++++++++++++++++----------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 49637284f9..9b1ef3fde5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1010,43 +1010,59 @@ ExtendBufferedRelTo(BufferManagerRelation bmr,
 }
 
 /*
- * Zero a buffer and lock it, as part of the implementation of
+ * Lock and optionally zero a buffer, as part of the implementation of
  * RBM_ZERO_AND_LOCK or RBM_ZERO_AND_CLEANUP_LOCK.  The buffer must be already
- * pinned.  It does not have to be valid, but it is valid and locked on
- * return.
+ * pinned.  If the buffer is not already valid, it is zeroed and made valid.
  */
 static void
-ZeroBuffer(Buffer buffer, ReadBufferMode mode)
+ZeroAndLockBuffer(Buffer buffer, ReadBufferMode mode, bool already_valid)
 {
 	BufferDesc *bufHdr;
-	uint32		buf_state;
+	bool		need_to_zero;
 
 	Assert(mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK);
 
-	if (BufferIsLocal(buffer))
+	if (already_valid)
+	{
+		/*
+		 * If the caller already knew the buffer was valid, we can skip some
+		 * header interaction.  The caller just wants to lock the buffer.
+		 */
+		need_to_zero = false;
+	}
+	else if (BufferIsLocal(buffer))
+	{
+		/* Simple case for non-shared buffers. */
 		bufHdr = GetLocalBufferDescriptor(-buffer - 1);
+		need_to_zero = (pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0;
+	}
 	else
 	{
+		/* Take BM_IO_IN_PROGRESS, or discover that BM_VALID has been set concurrently. */
 		bufHdr = GetBufferDescriptor(buffer - 1);
+		need_to_zero = StartBufferIO(bufHdr, true, false);
+	}
+
+	if (need_to_zero)
+		memset(BufferGetPage(buffer), 0, BLCKSZ);
+
+	/* Acquire the requested lock before setting BM_VALID. */
+	if (!BufferIsLocal(buffer))
+	{
 		if (mode == RBM_ZERO_AND_LOCK)
 			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
 		else
 			LockBufferForCleanup(buffer);
 	}
 
-	memset(BufferGetPage(buffer), 0, BLCKSZ);
-
-	if (BufferIsLocal(buffer))
+	/* Set BM_VALID, and wake anyone waiting for BM_IO_IN_PROGRESS to be cleared. */
+	if (need_to_zero)
 	{
-		buf_state = pg_atomic_read_u32(&bufHdr->state);
-		buf_state |= BM_VALID;
-		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
-	}
-	else
-	{
-		buf_state = LockBufHdr(bufHdr);
-		buf_state |= BM_VALID;
-		UnlockBufHdr(bufHdr, buf_state);
+		if (BufferIsLocal(buffer))
+			pg_atomic_write_u32(&bufHdr->state,
+								pg_atomic_read_u32(&bufHdr->state) | BM_VALID);
+		else
+			TerminateBufferIO(bufHdr, false, BM_VALID, true);
 	}
 }
 
@@ -1185,7 +1201,7 @@ ReadBuffer_common(Relation rel, SMgrRelation smgr, char smgr_persistence,
 
 		buffer = PinBufferForBlock(rel, smgr, smgr_persistence,
 								   forkNum, blockNum, strategy, &found);
-		ZeroBuffer(buffer, mode);
+		ZeroAndLockBuffer(buffer, mode, found);
 		return buffer;
 	}
 
-- 
2.39.3 (Apple Git-146)

