From c744aa6046d4eef3660cab3c8d362686987b4ea6 Mon Sep 17 00:00:00 2001
From: Teja Mupparti <temuppar@microsoft.com>
Date: Sat, 21 Mar 2020 16:18:59 -0700
Subject: [PATCH] Wal replay corruption
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A bug in Postgres can cause redo recovery to fail (forever). There is a
very tiny timing window in which an autovacuum completion, a checkpoint
completion, and an abnormal shutdown occur such that the checkpoint fails to
write a dirty page. Log replay requires the most up-to-date version of the
page to be able to replay the log record.
 
The abnormal shutdown can be an unrelated crash like that due to Out Of Memory
or a forced restart without a full shutdown checkpoint.
 
Repro details

	1) Page on disk has empty LP 1, Insert into page LP 1
	2) checkpoint START (Recovery REDO eventually starts here)
	3) Delete all rows on the page (page is empty now)
	4) Autovacuum kicks in and truncates the pages
		DropRelFileNodeBuffers - Dirty page NOT written, LP 1 on disk
		still empty
	5) Checkpoint completes
	6) Crash
	7) smgrtruncate - Not reached (this is where we do the physical truncate)

Now the crash-recovery starts

	Delete-log-replay (above step-3) reads page with empty LP 1 and the
	delete fails with PANIC

Fix
===
1) Mark all the buffers as about-to-be-dropped
2) CacheInvalidateSmgr()
3) Truncate on filesystem level
	if that fails, remove the about-to-be-dropped flags, in a PG_CATCH block
	if that succeeds, fully remove the buffers to be dropped

Hackers discussion
===================
https://www.postgresql.org/message-id/822113470.250068.1573246011818%40connect.xfinity.com
https://www.postgresql.org/message-id/20191206230640.2dvdjpcgn46q3ks2%40alap3.anarazel.de
https://www.postgresql.org/message-id/20191207001232.klidxnm756wqxvwx%40alap3.anarazel.de
https://www.postgresql.org/message-id/1880.1281020817@sss.pgh.pa.us
---
 contrib/pg_visibility/pg_visibility.c |   1 +
 src/backend/catalog/storage.c         | 105 +++++++++++-----
 src/backend/storage/buffer/bufmgr.c   | 165 ++++++++++++++++++++++++++
 src/backend/storage/smgr/smgr.c       |  13 +-
 src/include/storage/bufmgr.h          |   4 +
 5 files changed, 252 insertions(+), 36 deletions(-)

diff --git a/contrib/pg_visibility/pg_visibility.c b/contrib/pg_visibility/pg_visibility.c
index 0cd1160ceb..229ca8ca3f 100644
--- a/contrib/pg_visibility/pg_visibility.c
+++ b/contrib/pg_visibility/pg_visibility.c
@@ -398,6 +398,7 @@ pg_truncate_visibility_map(PG_FUNCTION_ARGS)
 	if (BlockNumberIsValid(block))
 	{
 		fork = VISIBILITYMAP_FORKNUM;
+		MarkTruncateBuffers(rel->rd_smgr->smgr_rnode, &fork, 1, &block);
 		smgrtruncate(rel->rd_smgr, &fork, 1, &block);
 	}
 
diff --git a/src/backend/catalog/storage.c b/src/backend/catalog/storage.c
index fddfbf1d8c..b071cb5666 100644
--- a/src/backend/catalog/storage.c
+++ b/src/backend/catalog/storage.c
@@ -276,45 +276,87 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 	}
 
 	/*
-	 * We WAL-log the truncation before actually truncating, which means
-	 * trouble if the truncation fails. If we then crash, the WAL replay
-	 * likely isn't going to succeed in the truncation either, and cause a
-	 * PANIC. It's tempting to put a critical section here, but that cure
-	 * would be worse than the disease. It would turn a usually harmless
-	 * failure to truncate, that might spell trouble at WAL replay, into a
-	 * certain PANIC.
+	 * 1) WAL-log the truncation of the file.
+	 * 2) Discard buffers of the pages to be removed.
+	 * 3) Truncate the pages from the file.
+	 *
+	 * We have seen corruption when a crash occurs in the midst of these
+	 * operations; treat all of them as critical and raise PANIC to avoid
+	 * discrepancies between the WAL log, the buffer copy and the on-disk image.
 	 */
-	if (RelationNeedsWAL(rel))
+
+	/*
+	 * It seems logical and apt to put a critical section here, but there are
+	 * two issues. The minor one is that palloc() operations further down are
+	 * restricted; the major one is (as the comment below suggests) that for
+	 * a repeatable truncate error, WAL replay would never end and the server
+	 * would be hosed. The former issue can be resolved by either reworking the
+	 * palloc()s or using a TRY/CATCH block instead. The counter-argument to
+	 * the latter issue is that the lack of a critical section is exactly what
+	 * pushes us into the scenario we want to avoid by omitting it: the missing
+	 * critical section causes WAL replay to hit PANIC persistently with
+	 * invalid-offset-on-the-disk-page ERRORs.
+	 */
+	PG_TRY();
 	{
+
 		/*
-		 * Make an XLOG entry reporting the file truncation.
+		 * We are going to truncate (shrink) the file, the contents of the
+		 * corresponding buffers are useless and need to be discarded, but
+		 * a background task might flush them to the disk right after we
+		 * truncate and before we discard. Let's prevent it by marking
+		 * buffers as BM_IO_IN_PROGRESS (though we don't do any real IO).
 		 */
-		XLogRecPtr	lsn;
-		xl_smgr_truncate xlrec;
-
-		xlrec.blkno = nblocks;
-		xlrec.rnode = rel->rd_node;
-		xlrec.flags = SMGR_TRUNCATE_ALL;
-
-		XLogBeginInsert();
-		XLogRegisterData((char *) &xlrec, sizeof(xlrec));
-
-		lsn = XLogInsert(RM_SMGR_ID,
-						 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+		MarkTruncateBuffers(rel->rd_smgr->smgr_rnode, forks, nforks, blocks);
 
 		/*
-		 * Flush, because otherwise the truncation of the main relation might
-		 * hit the disk before the WAL record, and the truncation of the FSM
-		 * or visibility map. If we crashed during that window, we'd be left
-		 * with a truncated heap, but the FSM or visibility map would still
-		 * contain entries for the non-existent heap pages.
+		 * We WAL-log the truncation before actually truncating, which means
+		 * trouble if the truncation fails. If we then crash, the WAL replay
+		 * likely isn't going to succeed in the truncation either, and cause a
+		 * PANIC. It's tempting to put a critical section here, but that cure
+		 * would be worse than the disease. It would turn a usually harmless
+		 * failure to truncate, that might spell trouble at WAL replay, into a
+		 * certain PANIC.
 		 */
-		if (fsm || vm)
+		if (RelationNeedsWAL(rel))
+		{
+			/*
+			 * Make an XLOG entry reporting the file truncation.
+			 */
+			XLogRecPtr	lsn;
+			xl_smgr_truncate xlrec;
+
+			xlrec.blkno = nblocks;
+			xlrec.rnode = rel->rd_node;
+			xlrec.flags = SMGR_TRUNCATE_ALL;
+
+			XLogBeginInsert();
+			XLogRegisterData((char *) &xlrec, sizeof(xlrec));
+
+			lsn = XLogInsert(RM_SMGR_ID,
+							 XLOG_SMGR_TRUNCATE | XLR_SPECIAL_REL_UPDATE);
+
+			/*
+			 * Flush, because otherwise the truncation of the main relation might
+			 * hit the disk before the WAL record, and the truncation of the FSM
+			 * or visibility map. If we crashed during that window, we'd be left
+			 * with a truncated heap, but the FSM or visibility map would still
+			 * contain entries for the non-existent heap pages.
+			 */
 			XLogFlush(lsn);
-	}
+		}
 
-	/* Do the real work to truncate relation forks */
-	smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
+		MemoryContextAllowInCriticalSection(CurrentMemoryContext, true);
+		/* Do the real work to truncate relation forks */
+		smgrtruncate(rel->rd_smgr, forks, nforks, blocks);
+		MemoryContextAllowInCriticalSection(CurrentMemoryContext, false);
+	}
+	PG_CATCH();
+	{
+		ereport(PANIC, (errcode(ERRCODE_INTERNAL_ERROR),
+			errmsg("failed to truncate the relation")));
+	}
+	PG_END_TRY();
 
 	/*
 	 * Update upper-level FSM pages to account for the truncation.
@@ -689,7 +731,10 @@ smgr_redo(XLogReaderState *record)
 
 		/* Do the real work to truncate relation forks */
 		if (nforks > 0)
+		{
+			MarkTruncateBuffers(reln->smgr_rnode, forks, nforks, blocks);
 			smgrtruncate(reln, forks, nforks, blocks);
+		}
 
 		/*
 		 * Update upper-level FSM pages to account for the truncation.
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index e05e2b3456..ad81f873c1 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -141,6 +141,9 @@ static bool IsForInput;
 /* local state for LockBufferForCleanup */
 static BufferDesc *PinCountWaitBuf = NULL;
 
+static BufferDesc **truncatedBufs = NULL;
+static int numTruncatedBufs = 0;
+
 /*
  * Backend-Private refcount management:
  *
@@ -3986,6 +3989,9 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 void
 AbortBufferIO(void)
 {
+	int	i;
+	uint32	buf_state;
+
 	BufferDesc *buf = InProgressBuf;
 
 	if (buf)
@@ -4031,6 +4037,22 @@ AbortBufferIO(void)
 		}
 		TerminateBufferIO(buf, false, BM_IO_ERROR);
 	}
+
+	if (numTruncatedBufs)
+	{
+		Assert(truncatedBufs);
+		for (i = 0; i < numTruncatedBufs; i++)
+		{
+			buf_state = LockBufHdr(truncatedBufs[i]);
+			Assert(buf_state & BM_IO_IN_PROGRESS);
+			buf_state &= ~BM_IO_IN_PROGRESS;
+			UnlockBufHdr(truncatedBufs[i], buf_state);
+		}
+
+		pfree(truncatedBufs);
+		numTruncatedBufs = 0;
+		truncatedBufs = NULL;
+	}
 }
 
 /*
@@ -4370,3 +4392,146 @@ TestForOldSnapshot_impl(Snapshot snapshot, Relation relation)
 				(errcode(ERRCODE_SNAPSHOT_TOO_OLD),
 				 errmsg("snapshot too old")));
 }
+
+/*
+ *		DropTruncatedBuffers
+ *
+ *		This function removes from the buffer pool all the pages from the
+ *		saved list of buffers that were marked as BM_IO_IN_PROGRESS just
+ * 		before the truncation. Dirty pages are simply dropped, without
+ *		bothering to write them out first.
+ *
+ *		Currently, this is called only from smgr.c smgrtruncate() where the
+ *		underlying file was just truncated. It is the responsibility of
+ *		MarkTruncateBuffers() to ensure that the list of truncated buffers is
+ *		sane and points to the right set of buffers. It is also the responsibility
+ *		of higher-level code to ensure that no other process could be trying
+ *		to load more pages of the relation into buffers.
+ *
+ */
+void
+DropTruncatedBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+					   int nforks, BlockNumber *firstDelBlock)
+{
+	int			i;
+	int			j;
+
+	/* If it's a local relation, it's localbuf.c's problem. */
+	if (RelFileNodeBackendIsTemp(rnode))
+	{
+		if (rnode.backend == MyBackendId)
+		{
+			for (j = 0; j < nforks; j++)
+				DropRelFileNodeLocalBuffers(rnode.node, forkNum[j],
+											firstDelBlock[j]);
+		}
+		return;
+	}
+
+	if (!numTruncatedBufs)
+		return; /* Nothing to clean up */
+
+	for (i = 0; i < numTruncatedBufs; i++)
+	{
+		uint32		buf_state;
+
+		buf_state = LockBufHdr(truncatedBufs[i]);
+		Assert(buf_state & BM_IO_IN_PROGRESS);
+		InvalidateBuffer(truncatedBufs[i]);	/* releases spinlock */
+	}
+
+	pfree(truncatedBufs);
+	numTruncatedBufs = 0;
+	truncatedBufs = NULL;
+}
+
+/*
+ * Finds the buffers (pages) of an about-to-be-truncated file
+ * and prevents them from being written to disk by marking them
+ * as BM_IO_IN_PROGRESS. Though the buffers are marked for IO, they
+ * will never be written to disk; the flag merely prevents other
+ * processes from doing IO on them.
+ *
+ * Note: Buffers might stay marked BM_IO_IN_PROGRESS for longer than
+ * usual because of the extra code path exercised before we
+ * drop them, but there shouldn't be any regular backends waiting
+ * on these pages (as they are empty and in the process of
+ * being discarded).
+ */
+void
+MarkTruncateBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+					   int nforks, BlockNumber *firstDelBlock)
+{
+	int	i;
+	int	j;
+
+	/* Do we care about local buffers i.e. Temp relation ? */
+
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		/*
+		 * Since we take AccessExclusiveLock on the relation during truncate,
+		 * it's safe to check without lock and will save lot of lock
+		 * acquisitions in typical cases.
+		 */
+		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+		for (j = 0; j < nforks; j++)
+		{
+			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+				bufHdr->tag.forkNum == forkNum[j] &&
+				bufHdr->tag.blockNum >= firstDelBlock[j])
+			{
+retry:
+				if (buf_state & BM_IO_IN_PROGRESS)
+				{
+					UnlockBufHdr(bufHdr, buf_state);
+
+					WaitIO(bufHdr);
+
+					/* OK, now the flag is cleared, recheck */
+					buf_state = LockBufHdr(bufHdr);
+					goto retry;
+				}
+				else
+				{
+					/*
+					 * Easy path is to allocate NBuffers, but that
+					 * might to lead to wastage, start with 100 and
+					 * increase in increments of 100.
+					 */
+					if (!truncatedBufs)
+					{
+						truncatedBufs = (BufferDesc **)
+							palloc0(100 * sizeof(BufferDesc *));
+					}
+					else if ((numTruncatedBufs % 100) == 0)
+					{
+						truncatedBufs = (BufferDesc **)
+							repalloc(truncatedBufs,
+							(numTruncatedBufs + 100) * sizeof(BufferDesc *));
+					}
+
+					/*
+					 * Add to the list, this will be used either to
+					 * clean up (in an exception) or invalidate the
+					 * buffers after the truncate.
+					 */
+					truncatedBufs[numTruncatedBufs] =  bufHdr;
+					numTruncatedBufs++;
+					buf_state |= BM_IO_IN_PROGRESS;
+					UnlockBufHdr(bufHdr, buf_state);
+				}
+			}
+		}
+
+		if (j >= nforks)
+			UnlockBufHdr(bufHdr, buf_state);
+	}
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 360b5bf5bf..d1f066a220 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -572,12 +572,6 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 {
 	int		i;
 
-	/*
-	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
-	 * just drop them without bothering to write the contents.
-	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
-
 	/*
 	 * Send a shared-inval message to force other backends to close any smgr
 	 * references they may have for this rel.  This is useful because they
@@ -608,6 +602,13 @@ smgrtruncate(SMgrRelation reln, ForkNumber *forknum, int nforks, BlockNumber *nb
 		if (forknum[i] == VISIBILITYMAP_FORKNUM)
 			reln->smgr_vm_nblocks = nblocks[i];
 	}
+
+	/*
+	 * Now that we have successfully truncated the table (shrunk the file),
+	 * get rid of any buffers for the just-deleted blocks. Bufmgr will
+	 * just drop them without bothering to write the contents.
+	 */
+	DropTruncatedBuffers(reln->smgr_rnode, forknum, nforks, nblocks);
 }
 
 /*
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index d2a5b52f6e..9889193bd9 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -189,6 +189,10 @@ extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
 extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
 								   int nforks, BlockNumber *firstDelBlock);
+extern void DropTruncatedBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+								   int nforks, BlockNumber *firstDelBlock);
+extern void MarkTruncateBuffers(RelFileNodeBackend rnode, ForkNumber *forkNum,
+								   int nforks, BlockNumber *firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
-- 
2.17.1

