commit aa7a7db17aa794fa583dc4bee6e0eacf009fa076
Author: Alexander Korotkov <akorotkov@postgresql.org>
Date:   Fri Apr 19 12:18:14 2019 +0300

    Improve relation truncation algorithm
    
    Make relation truncation use ExclusiveLock instead of AccessExclusiveLock.
    In order to implement that, make dropping of relation buffers two-phase.  First
    phase happens before actual file truncation and prevents any dirty buffers past
    truncation point from being written.  After file truncation second phase wipes
    out past truncation point buffers.
    
    TODO:
     * Tolerate past-truncation point reads, which might happen concurrently to
       truncation.

diff --git a/src/backend/access/heap/vacuumlazy.c b/src/backend/access/heap/vacuumlazy.c
index 8dc76fa8583..cb1907da578 100644
--- a/src/backend/access/heap/vacuumlazy.c
+++ b/src/backend/access/heap/vacuumlazy.c
@@ -1920,7 +1920,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 		lock_retry = 0;
 		while (true)
 		{
-			if (ConditionalLockRelation(onerel, AccessExclusiveLock))
+			if (ConditionalLockRelation(onerel, ExclusiveLock))
 				break;
 
 			/*
@@ -1961,7 +1961,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 			 * numbers alone amounts to assuming that the new pages have the
 			 * same tuple density as existing ones, which is less unlikely.
 			 */
-			UnlockRelation(onerel, AccessExclusiveLock);
+			UnlockRelation(onerel, ExclusiveLock);
 			return;
 		}
 
@@ -1976,7 +1976,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 		if (new_rel_pages >= old_rel_pages)
 		{
 			/* can't do anything after all */
-			UnlockRelation(onerel, AccessExclusiveLock);
+			UnlockRelation(onerel, ExclusiveLock);
 			return;
 		}
 
@@ -1992,7 +1992,7 @@ lazy_truncate_heap(Relation onerel, LVRelStats *vacrelstats)
 		 * that should happen as part of standard invalidation processing once
 		 * they acquire lock on the relation.
 		 */
-		UnlockRelation(onerel, AccessExclusiveLock);
+		UnlockRelation(onerel, ExclusiveLock);
 
 		/*
 		 * Update statistics.  Here, it *is* correct to adjust rel_pages
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 887023fc8a5..7b19a84d562 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1108,7 +1108,8 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 			 * happens to be trying to split the page the first one got from
 			 * StrategyGetBuffer.)
 			 */
-			if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
+			if (!(oldFlags & BM_DIRTY_BARRIER) &&
+				LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
 										 LW_SHARED))
 			{
 				/*
@@ -1305,6 +1306,7 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
 	 * just like permanent relations.
 	 */
 	buf->tag = newTag;
+	Assert((buf_state & BM_DIRTY_BARRIER) == 0);
 	buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
 				   BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
 				   BUF_USAGECOUNT_MASK);
@@ -1483,6 +1485,7 @@ MarkBufferDirty(Buffer buffer)
 		buf_state = old_buf_state;
 
 		Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
+		Assert((buf_state & BM_DIRTY_BARRIER) == 0);
 		buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
 
 		if (pg_atomic_compare_exchange_u32(&bufHdr->state, &old_buf_state,
@@ -2371,6 +2374,29 @@ SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context)
 	 */
 	buf_state = LockBufHdr(bufHdr);
 
+	if (buf_state & BM_DIRTY_BARRIER)
+	{
+		if (skip_recently_used)
+		{
+			UnlockBufHdr(bufHdr, buf_state);
+			return result;
+		}
+		else
+		{
+			/*
+			 * Can't sync a buffer while its BM_DIRTY_BARRIER flag is set, so
+			 * wait till the flag is cleared.
+			 */
+			while (buf_state & BM_DIRTY_BARRIER)
+			{
+				UnlockBufHdr(bufHdr, buf_state);
+				pg_usleep(10000L);
+				buf_state = LockBufHdr(bufHdr);
+			}
+		}
+	}
+
+
 	if (BUF_STATE_GET_REFCOUNT(buf_state) == 0 &&
 		BUF_STATE_GET_USAGECOUNT(buf_state) == 0)
 	{
@@ -2859,6 +2885,204 @@ BufferGetLSNAtomic(Buffer buffer)
 	return lsn;
 }
 
+/*
+ * Set BM_DIRTY_BARRIER flag to the buffer.
+ */
+static bool
+SetBufferDirtyBarrier(BufferDesc *buf, uint32 buf_state)
+{
+	bool	result = false;
+
+	while (buf_state & BM_IO_IN_PROGRESS)
+	{
+		UnlockBufHdr(buf, buf_state);
+		WaitIO(buf);
+		buf_state = LockBufHdr(buf);
+	}
+
+	if (buf_state & BM_DIRTY)
+	{
+		buf_state |= BM_DIRTY_BARRIER;
+		result = true;
+	}
+
+	UnlockBufHdr(buf, buf_state);
+
+	return result;
+}
+
+/*
+ * Clear BM_DIRTY_BARRIER flag and also optionally BM_DIRTY and
+ * BM_JUST_DIRTIED flags.
+ */
+static void
+UnsetBufferDirtyBarrier(BufferDesc *buf, bool clean_dirty)
+{
+	uint32		buf_state;
+
+	buf_state = LockBufHdr(buf);
+	buf_state &= ~BM_DIRTY_BARRIER;
+	if (clean_dirty)
+		buf_state &= ~(BM_DIRTY | BM_JUST_DIRTIED);
+	UnlockBufHdr(buf, buf_state);
+}
+
+typedef struct
+{
+	int				   *buf_ids;
+	int					count;
+	int					nallocated;
+	RelFileNodeBackend	rnode;
+	ForkNumber			forkNum;
+	BlockNumber			firstDelBlock;
+} BuffersTruncateState;
+
+
+volatile BuffersTruncateState truncate_state = {NULL, 0, 0};
+
+static void
+init_truncate_state(void)
+{
+	truncate_state.count = 0;
+	truncate_state.nallocated = 16;
+	truncate_state.buf_ids = (int *) palloc(sizeof(int) *
+											truncate_state.nallocated);
+}
+
+static void
+extend_truncate_state_if_needed(void)
+{
+	int *tmp;
+
+	if (truncate_state.count + 1 <= truncate_state.nallocated)
+		return;
+
+	truncate_state.nallocated *= 2;
+	tmp = (int *) palloc(sizeof(int) *
+						 truncate_state.nallocated);
+	memcpy(tmp, truncate_state.buf_ids, truncate_state.count * sizeof(int));
+	truncate_state.buf_ids = tmp;
+}
+
+static void
+free_truncate_state(void)
+{
+	if (truncate_state.buf_ids)
+		pfree(truncate_state.buf_ids);
+	truncate_state.buf_ids = NULL;
+	truncate_state.count = 0;
+	truncate_state.nallocated = 0;
+}
+
+/*
+ * Prepare for truncation of relfilenode buffers.  Sets BM_DIRTY_BARRIER to
+ * every dirty buffer to be truncated.  Requires ExclusiveLock on relation,
+ * so no more buffers should be dirtied.  Therefore, after execution of this
+ * function no more past truncation point buffers will be written out.
+ */
+void
+RelFileNodeBuffersTruncatePrepare(RelFileNodeBackend rnode,
+								  ForkNumber forkNum,
+								  BlockNumber firstDelBlock)
+{
+	int					i;
+	WritebackContext	wb_context;
+	bool				barriers_overflow = false;
+
+	/* If it's a local relation, it's localbuf.c's problem. */
+	if (RelFileNodeBackendIsTemp(rnode))
+	{
+		if (rnode.backend == MyBackendId)
+			DropRelFileNodeLocalBuffers(rnode.node, forkNum, firstDelBlock);
+		return;
+	}
+
+	init_truncate_state();
+	truncate_state.rnode = rnode;
+	truncate_state.forkNum = forkNum;
+	truncate_state.firstDelBlock = firstDelBlock;
+	WritebackContextInit(&wb_context, &checkpoint_flush_after);
+
+	/* Look for past truncation point buffers */
+	for (i = 0; i < NBuffers; i++)
+	{
+		BufferDesc *bufHdr = GetBufferDescriptor(i);
+		uint32		buf_state;
+
+		if (!RelFileNodeEquals(bufHdr->tag.rnode, rnode.node))
+			continue;
+
+		buf_state = LockBufHdr(bufHdr);
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode.node) &&
+			bufHdr->tag.forkNum == forkNum &&
+			bufHdr->tag.blockNum >= firstDelBlock)
+		{
+			extend_truncate_state_if_needed();
+
+			/*
+			 * Mark dirty buffers as BM_DIRTY_BARRIER.  IncDirtyBarriers()
+			 * ensures that there is not more than NBuffers/2 of barriered
+			 * buffers.  If we exceed the limit, then just write out dirty
+			 * buffers (unlikely to happen).
+			 */
+			if (!barriers_overflow)
+			{
+				START_CRIT_SECTION();
+				if (SetBufferDirtyBarrier(bufHdr, buf_state))
+				{
+					if (IncDirtyBarriers())
+					{
+						truncate_state.buf_ids[truncate_state.count++] = i;
+					}
+					else
+					{
+						UnsetBufferDirtyBarrier(bufHdr, false);
+						barriers_overflow = true;
+					}
+				}
+				END_CRIT_SECTION();
+			}
+
+			if (barriers_overflow)
+			{
+				UnlockBufHdr(bufHdr, buf_state);
+				SyncOneBuffer(i, false, &wb_context);
+			}
+		}
+		else
+		{
+			UnlockBufHdr(bufHdr, buf_state);
+		}
+	}
+
+	IssuePendingWritebacks(&wb_context);
+}
+
+/*
+ * Finish truncation of node buffers.  When commit == true, all the buffers
+ * past truncation point are removed.  When commit == false, just removes
+ * BM_DIRTY_BARRIER flag.
+ */
+void
+RelFileNodeBuffersTruncateFinish(bool commit)
+{
+	int		i;
+
+	START_CRIT_SECTION();
+	for (i = 0; i < truncate_state.count; i++)
+	{
+		BufferDesc *buf = GetBufferDescriptor(truncate_state.buf_ids[i]);
+		UnsetBufferDirtyBarrier(buf, commit);
+	}
+	SubDirtyBarriers(truncate_state.count);
+	if (commit)
+		DropRelFileNodeBuffers(truncate_state.rnode,
+							   truncate_state.forkNum,
+							   truncate_state.firstDelBlock);
+	free_truncate_state();
+	END_CRIT_SECTION();
+}
+
 /* ---------------------------------------------------------------------
  *		DropRelFileNodeBuffers
  *
@@ -3173,6 +3397,8 @@ FlushRelationBuffers(Relation rel)
 				ErrorContextCallback errcallback;
 				Page		localpage;
 
+				Assert((buf_state & BM_DIRTY_BARRIER) == 0);
+
 				localpage = (char *) LocalBufHdrGetBlock(bufHdr);
 
 				/* Setup error traceback support for ereport() */
@@ -3273,6 +3499,7 @@ FlushDatabaseBuffers(Oid dbid)
 		ReservePrivateRefCountEntry();
 
 		buf_state = LockBufHdr(bufHdr);
+		Assert((buf_state & BM_DIRTY_BARRIER) == 0);
 		if (bufHdr->tag.rnode.dbNode == dbid &&
 			(buf_state & (BM_VALID | BM_DIRTY)) == (BM_VALID | BM_DIRTY))
 		{
@@ -3894,6 +4121,8 @@ StartBufferIO(BufferDesc *buf, bool forInput)
 
 		buf_state = LockBufHdr(buf);
 
+		Assert((buf_state & BM_DIRTY_BARRIER) == 0);
+
 		if (!(buf_state & BM_IO_IN_PROGRESS))
 			break;
 
@@ -3953,6 +4182,7 @@ TerminateBufferIO(BufferDesc *buf, bool clear_dirty, uint32 set_flag_bits)
 
 	buf_state = LockBufHdr(buf);
 
+	Assert((buf_state & BM_DIRTY_BARRIER) == 0);
 	Assert(buf_state & BM_IO_IN_PROGRESS);
 
 	buf_state &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
@@ -3994,6 +4224,7 @@ AbortBufferIO(void)
 		LWLockAcquire(BufferDescriptorGetIOLock(buf), LW_EXCLUSIVE);
 
 		buf_state = LockBufHdr(buf);
+		Assert((buf_state & BM_DIRTY_BARRIER) == 0);
 		Assert(buf_state & BM_IO_IN_PROGRESS);
 		if (IsForInput)
 		{
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index 03caceaf7b0..bece4878506 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -58,6 +58,8 @@ typedef struct
 	 * StrategyNotifyBgWriter.
 	 */
 	int			bgwprocno;
+
+	pg_atomic_uint32 numDirtyBarriers;
 } BufferStrategyControl;
 
 /* Pointers to shared state */
@@ -300,7 +302,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 			 */
 			local_buf_state = LockBufHdr(buf);
 			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
-				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
+				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0
+				&& (local_buf_state & BM_DIRTY_BARRIER) == 0)
 			{
 				if (strategy != NULL)
 					AddBufferToRing(strategy, buf);
@@ -324,7 +327,8 @@ StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
 		 */
 		local_buf_state = LockBufHdr(buf);
 
-		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
+		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0 &&
+			(local_buf_state & BM_DIRTY_BARRIER) == 0)
 		{
 			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
 			{
@@ -519,6 +523,8 @@ StrategyInitialize(bool init)
 		StrategyControl->completePasses = 0;
 		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);
 
+		pg_atomic_init_u32(&StrategyControl->numDirtyBarriers, 0);
+
 		/* No pending notification */
 		StrategyControl->bgwprocno = -1;
 	}
@@ -643,7 +649,8 @@ GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
 	buf = GetBufferDescriptor(bufnum - 1);
 	local_buf_state = LockBufHdr(buf);
 	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
-		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
+		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1
+		&& (local_buf_state & BM_DIRTY_BARRIER) == 0)
 	{
 		strategy->current_was_in_ring = true;
 		*buf_state = local_buf_state;
@@ -702,3 +709,30 @@ StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
 
 	return true;
 }
+
+bool
+IncDirtyBarriers(void)
+{
+	uint32 num;
+
+	num = pg_atomic_read_u32(&StrategyControl->numDirtyBarriers);
+
+	do
+	{
+		if (num >= NBuffers / 2)
+			return false;
+
+		if (pg_atomic_compare_exchange_u32(&StrategyControl->numDirtyBarriers,
+										   &num,
+										   num + 1))
+			return true;
+
+	}
+	while (true);
+}
+
+void
+SubDirtyBarriers(uint32 sub)
+{
+	(void) pg_atomic_fetch_sub_u32(&StrategyControl->numDirtyBarriers, sub);
+}
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 8191118b619..ea9739ca13c 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -645,28 +645,38 @@ smgrnblocks(SMgrRelation reln, ForkNumber forknum)
 void
 smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
-	/*
-	 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
-	 * just drop them without bothering to write the contents.
-	 */
-	DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks);
-
-	/*
-	 * Send a shared-inval message to force other backends to close any smgr
-	 * references they may have for this rel.  This is useful because they
-	 * might have open file pointers to segments that got removed, and/or
-	 * smgr_targblock variables pointing past the new rel end.  (The inval
-	 * message will come back to our backend, too, causing a
-	 * probably-unnecessary local smgr flush.  But we don't expect that this
-	 * is a performance-critical path.)  As in the unlink code, we want to be
-	 * sure the message is sent before we start changing things on-disk.
-	 */
-	CacheInvalidateSmgr(reln->smgr_rnode);
-
-	/*
-	 * Do the truncation.
-	 */
-	smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks);
+	PG_TRY();
+	{
+		/*
+		 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
+		 * just drop them without bothering to write the contents.
+		 */
+		RelFileNodeBuffersTruncatePrepare(reln->smgr_rnode, forknum, nblocks);
+
+		/*
+		 * Send a shared-inval message to force other backends to close any smgr
+		 * references they may have for this rel.  This is useful because they
+		 * might have open file pointers to segments that got removed, and/or
+		 * smgr_targblock variables pointing past the new rel end.  (The inval
+		 * message will come back to our backend, too, causing a
+		 * probably-unnecessary local smgr flush.  But we don't expect that this
+		 * is a performance-critical path.)  As in the unlink code, we want to be
+		 * sure the message is sent before we start changing things on-disk.
+		 */
+		CacheInvalidateSmgr(reln->smgr_rnode);
+
+		/*
+		 * Do the truncation.
+		 */
+		smgrsw[reln->smgr_which].smgr_truncate(reln, forknum, nblocks);
+	}
+	PG_CATCH();
+	{
+		RelFileNodeBuffersTruncateFinish(false);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+	RelFileNodeBuffersTruncateFinish(true);
 }
 
 /*
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index ba1b5463fc3..fb86b3fbb82 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -29,9 +29,9 @@
 /*
  * Buffer state is a single 32-bit variable where following data is combined.
  *
- * - 18 bits refcount
+ * - 17 bits refcount
  * - 4 bits usage count
- * - 10 bits of flags
+ * - 11 bits of flags
  *
  * Combining these values allows to perform some operations without locking
  * the buffer header, by modifying them together with a CAS loop.
@@ -39,11 +39,11 @@
  * The definition of buffer state components is below.
  */
 #define BUF_REFCOUNT_ONE 1
-#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
-#define BUF_USAGECOUNT_MASK 0x003C0000U
-#define BUF_USAGECOUNT_ONE (1U << 18)
-#define BUF_USAGECOUNT_SHIFT 18
-#define BUF_FLAG_MASK 0xFFC00000U
+#define BUF_REFCOUNT_MASK ((1U << 17) - 1)
+#define BUF_USAGECOUNT_MASK 0x001E0000U
+#define BUF_USAGECOUNT_ONE (1U << 17)
+#define BUF_USAGECOUNT_SHIFT 17
+#define BUF_FLAG_MASK 0xFFE00000U
 
 /* Get refcount and usagecount from buffer state */
 #define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
@@ -55,8 +55,10 @@
  * Note: TAG_VALID essentially means that there is a buffer hashtable
  * entry associated with the buffer's tag.
  */
-#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
-#define BM_DIRTY				(1U << 23)	/* data needs writing */
+#define BM_LOCKED				(1U << 21)	/* buffer header is locked */
+#define BM_DIRTY				(1U << 22)	/* data needs writing */
+#define BM_DIRTY_BARRIER		(1U << 23)	/* data writing is temporarily
+											 * forbidden */
 #define BM_VALID				(1U << 24)	/* data is valid */
 #define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
 #define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index c5826f691de..08f018804e6 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -190,8 +190,13 @@ extern BlockNumber RelationGetNumberOfBlocksInFork(Relation relation,
 extern void FlushOneBuffer(Buffer buffer);
 extern void FlushRelationBuffers(Relation rel);
 extern void FlushDatabaseBuffers(Oid dbid);
-extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode,
-					   ForkNumber forkNum, BlockNumber firstDelBlock);
+
+extern void RelFileNodeBuffersTruncatePrepare(RelFileNodeBackend rnode,
+											  ForkNumber forkNum,
+											  BlockNumber firstDelBlock);
+extern void RelFileNodeBuffersTruncateFinish(bool commit);
+extern void DropRelFileNodeBuffers(RelFileNodeBackend rnode, ForkNumber forkNum,
+					   BlockNumber firstDelBlock);
 extern void DropRelFileNodesAllBuffers(RelFileNodeBackend *rnodes, int nnodes);
 extern void DropDatabaseBuffers(Oid dbid);
 
@@ -230,6 +235,8 @@ extern void TestForOldSnapshot_impl(Snapshot snapshot, Relation relation);
 /* in freelist.c */
 extern BufferAccessStrategy GetAccessStrategy(BufferAccessStrategyType btype);
 extern void FreeAccessStrategy(BufferAccessStrategy strategy);
+extern bool IncDirtyBarriers(void);
+extern void SubDirtyBarriers(uint32 sub);
 
 
 /* inline functions */
