From 4a2ca3ec30b1743a669b21da80fbf589f6a23be8 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Thu, 19 Jul 2018 14:28:25 +0400
Subject: [PATCH 2/2] Delete pages during GiST VACUUM v11

---
 src/backend/access/gist/README         |  35 ++++++++
 src/backend/access/gist/gist.c         |  18 ++++
 src/backend/access/gist/gistbuild.c    |   5 --
 src/backend/access/gist/gistutil.c     |   3 +-
 src/backend/access/gist/gistvacuum.c   | 154 +++++++++++++++++++++++++++++++++
 src/backend/access/gist/gistxlog.c     |  60 +++++++++++++
 src/backend/access/rmgrdesc/gistdesc.c |   3 +
 src/include/access/gist.h              |   3 +
 src/include/access/gist_private.h      |  24 +++--
 src/include/access/gistxlog.h          |  17 +++-
 src/test/regress/expected/gist.out     |   4 +-
 src/test/regress/sql/gist.sql          |   4 +-
 12 files changed, 310 insertions(+), 20 deletions(-)

diff --git a/src/backend/access/gist/README b/src/backend/access/gist/README
index 02228662b8..9548872be8 100644
--- a/src/backend/access/gist/README
+++ b/src/backend/access/gist/README
@@ -413,6 +413,41 @@ emptied yet; tuples never move upwards in the tree. The final emptying loops
 through buffers at a given level until all buffers at that level have been
 emptied, and then moves down to the next level.
 
+Bulk delete algorithm (VACUUM)
+------------------------------
+
+Function gistbulkdelete() is responsible for marking empty leaf pages as free
+so that they can be used it allocate newly split pages. To find this pages
+function can choose between two strategies: logical scan or physical scan.
+
+Physical scan reads the entire index from the first page to last. This scan
+maintains graph structure in palloc'ed array to collect block numbers of
+internal pages that need cleansing from references to empty leafs. Also, the
+array contains offsets on the internal page to potentially free leaf page. This
+scan method is chosen when maintenance work memory is sufficient to hold
+necessary graph structure.
+
+The logical scan is chosen when there is not enough maintenance memory to
+execute the physical scan. Logical scan traverses GiST index in DFS, looking up
+into incomplete split branches. The logical scan can be slower on hard disk
+drives.
+
+The result of both scans are the same: the stack of block numbers of internal
+pages with the list of offsets potentially referencing empty leaf pages. After
+the scan, for each internal pages under exclusive lock, each potentially free
+leaf page is examined. gistbulkdelete() never delete last one reference from
+internal page to preserve balanced tree properties.
+
+The physical scan can return empty leaf pages offsets unordered. Thus, before
+executing PageIndexMultiDelete offsets (already locked and checked) are sorted.
+This step is not necessary for the logical scan.
+
+Both scans hold only one lock at a time. Physical scan grabs exclusive lock
+instantly, while logical scan takes shared lock and then swaps it to exclusive.
+This is done because amount of work on internal page done by physical scan is
+lower and amount of internal pages is relatively low compared to the amount of
+leaf pages.
+
 
 Authors:
 	Teodor Sigaev	<teodor@sigaev.ru>
diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 8a42effdf7..3a6b5c7ed3 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -700,6 +700,11 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 			GISTInsertStack *item;
 			OffsetNumber downlinkoffnum;
 
+			/*
+			 * Currently internal pages are not deleted during vacuum,
+			 * so we do not need to check if page is deleted
+			 */
+
 			downlinkoffnum = gistchoose(state.r, stack->page, itup, giststate);
 			iid = PageGetItemId(stack->page, downlinkoffnum);
 			idxtuple = (IndexTuple) PageGetItem(stack->page, iid);
@@ -834,6 +839,19 @@ gistdoinsert(Relation r, IndexTuple itup, Size freespace, GISTSTATE *giststate)
 				}
 			}
 
+			/*
+			 * Leaf pages can be left deleted but still referenced
+			 * until it's space is reused. Downlink to this page may be already
+			 * removed from the internal page, but this scan can posess it.
+			 */
+			if(GistPageIsDeleted(stack->page))
+			{
+				UnlockReleaseBuffer(stack->buffer);
+				xlocked = false;
+				state.stack = stack = stack->parent;
+				continue;
+			}
+
 			/* now state.stack->(page, buffer and blkno) points to leaf page */
 
 			gistinserttuple(&state, stack, giststate, itup,
diff --git a/src/backend/access/gist/gistbuild.c b/src/backend/access/gist/gistbuild.c
index 434f15f014..f26f139a9e 100644
--- a/src/backend/access/gist/gistbuild.c
+++ b/src/backend/access/gist/gistbuild.c
@@ -1126,11 +1126,6 @@ gistGetMaxLevel(Relation index)
  * but will be added there the first time we visit them.
  */
 
-typedef struct
-{
-	BlockNumber childblkno;		/* hash key */
-	BlockNumber parentblkno;
-} ParentMapEntry;
 
 static void
 gistInitParentMap(GISTBuildState *buildstate)
diff --git a/src/backend/access/gist/gistutil.c b/src/backend/access/gist/gistutil.c
index 12804c321c..41978bb5e5 100644
--- a/src/backend/access/gist/gistutil.c
+++ b/src/backend/access/gist/gistutil.c
@@ -23,6 +23,7 @@
 #include "storage/lmgr.h"
 #include "utils/builtins.h"
 #include "utils/syscache.h"
+#include "utils/snapmgr.h"
 
 
 /*
@@ -806,7 +807,7 @@ gistNewBuffer(Relation r)
 
 			gistcheckpage(r, buffer);
 
-			if (GistPageIsDeleted(page))
+			if (GistPageIsDeleted(page) && TransactionIdPrecedes(GistPageGetDeleteXid(page), RecentGlobalDataXmin))
 				return buffer;	/* OK to use */
 
 			LockBuffer(buffer, GIST_UNLOCK);
diff --git a/src/backend/access/gist/gistvacuum.c b/src/backend/access/gist/gistvacuum.c
index ff86f4491f..26ac4dd30e 100644
--- a/src/backend/access/gist/gistvacuum.c
+++ b/src/backend/access/gist/gistvacuum.c
@@ -16,6 +16,7 @@
 
 #include "access/genam.h"
 #include "access/gist_private.h"
+#include "access/transam.h"
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "storage/indexfsm.h"
@@ -104,6 +105,27 @@ typedef struct GistBDItem
 	struct GistBDItem *next;
 } GistBDItem;
 
+static void gistmapset(char *map, BlockNumber blkno)
+{
+	map[blkno / 8] |= 1 << (blkno % 8);
+}
+static bool gistmapget(char *map, BlockNumber blkno)
+{
+	return (map[blkno / 8] & 1 << (blkno % 8)) != 0;
+}
+
+/*
+ * This function is used to sort offsets
+ * When employing physical scan rescan offsets are not ordered.
+ */
+static int
+compare_offsetnumber(const void *x, const void *y)
+{
+	OffsetNumber a = *((OffsetNumber *)x);
+	OffsetNumber b = *((OffsetNumber *)y);
+	return a - b;
+}
+
 /*
  * Bulk deletion of all index entries pointing to a set of heap tuples and
  * check invalid tuples left after upgrade.
@@ -121,6 +143,8 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	bool			 needLock;
 	BlockNumber      blkno;
 	GistNSN			 startNSN = GetInsertRecPtr();
+	void			*internals;
+	void			*emptyLeafs;
 
 	/* first time through? */
 	if (stats == NULL)
@@ -140,6 +164,9 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	npages = RelationGetNumberOfBlocks(rel);
 	if (needLock)
 		UnlockRelationForExtension(rel, ExclusiveLock);
+	
+	internals = palloc0(npages / 8 + 1);
+	emptyLeafs = palloc0(npages / 8 + 1);
 
 	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
 	{
@@ -248,7 +275,14 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 
 					END_CRIT_SECTION();
 				}
+				/* The page is completely empty */
+				if (ntodelete == maxoff)
+				{
+					gistmapset(emptyLeafs, nextBlock);
+				}
 			}
+			else
+				gistmapset(internals, nextBlock);
 
 			/* We should not unlock buffer if we are going to jump left */
 			if (needScan)
@@ -269,5 +303,125 @@ gistbulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 		}
 	}
 
+	for (blkno = GIST_ROOT_BLKNO; blkno < npages; blkno++)
+	{
+		Buffer		 buffer;
+		Page		 page;
+		OffsetNumber i,
+					 maxoff;
+		IndexTuple   idxtuple;
+		ItemId	     iid;
+		OffsetNumber 	 todelete[MaxOffsetNumber];
+		Buffer			 buftodelete[MaxOffsetNumber];
+		int				 ntodelete = 0;
+
+		if (!gistmapget(internals, blkno))
+			continue; /* second scan is for internal pages */
+
+
+		buffer = ReadBufferExtended(rel, MAIN_FORKNUM, blkno, RBM_NORMAL,
+									info->strategy);
+
+		LockBuffer(buffer, GIST_EXCLUSIVE);
+		page = (Page) BufferGetPage(buffer);
+		/* Currently block of internal page cannot become leaf */
+		Assert(!GistPageIsLeaf(page));
+
+		if (PageIsNew(page) || GistPageIsDeleted(page))
+		{
+			UnlockReleaseBuffer(buffer);
+			/* TODO: Should not we record free page here? */
+			continue;
+		}
+
+		maxoff = PageGetMaxOffsetNumber(page);
+		/* Check that leafs are still empty and decide what to delete */
+		for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
+		{
+			Buffer		leafBuffer;
+			Page		leafPage;
+			/* if this page was not empty in previous scan - we do not consider it */
+			if(!gistmapget(emptyLeafs, i))
+			{
+				continue;
+			}
+
+			iid = PageGetItemId(page, i);
+			idxtuple = (IndexTuple) PageGetItem(page, iid);
+
+			leafBuffer = ReadBufferExtended(rel, MAIN_FORKNUM, ItemPointerGetBlockNumber(&(idxtuple->t_tid)),
+								RBM_NORMAL, info->strategy);
+			LockBuffer(leafBuffer, GIST_EXCLUSIVE);
+			gistcheckpage(rel, leafBuffer);
+			leafPage = (Page) BufferGetPage(leafBuffer);
+			Assert(GistPageIsLeaf(leafPage));
+
+			if (PageGetMaxOffsetNumber(leafPage) == InvalidOffsetNumber /* Nothing left to split */
+				&& !(GistFollowRight(leafPage) || GistPageGetNSN(page) < GistPageGetNSN(leafPage)) /* No follow-right */
+				&& ntodelete < maxoff-1) /* We must keep at least one leaf page per each */
+			{
+				buftodelete[ntodelete] = leafBuffer;
+				todelete[ntodelete++] = i;
+			}
+			else
+				UnlockReleaseBuffer(leafBuffer);
+		}
+
+
+		if (ntodelete)
+		{
+			/* Prepare possibly onurdered offsets */
+			qsort(todelete, ntodelete, sizeof(OffsetNumber), compare_offsetnumber);
+
+			/* 
+			 * Like in _bt_unlink_halfdead_page we need a upper bound on xid
+			 * that could hold downlinks to this page. We use
+			 * ReadNewTransactionId() to instead of GetCurrentTransactionId
+			 * since we are in a VACUUM.
+			 */
+			TransactionId txid = ReadNewTransactionId();
+
+			START_CRIT_SECTION();
+
+			/* Mark pages as deleted dropping references from internal pages */
+			for (i = 0; i < ntodelete; i++)
+			{
+				Page		leafPage = (Page)BufferGetPage(buftodelete[i]);
+
+				GistPageSetDeleteXid(leafPage,txid);
+
+				GistPageSetDeleted(leafPage);
+				MarkBufferDirty(buftodelete[i]);
+				stats->pages_deleted++;
+
+				MarkBufferDirty(buffer);
+				/* Offsets are changed as long as we delete tuples from internal page */
+				PageIndexTupleDelete(page, todelete[i] - i);
+
+				if (RelationNeedsWAL(rel))
+				{
+					XLogRecPtr recptr 	=
+						gistXLogSetDeleted(rel->rd_node, buftodelete[i],
+											txid, buffer, todelete[i] - i);
+					PageSetLSN(page, recptr);
+					PageSetLSN(leafPage, recptr);
+				}
+				else
+				{
+					PageSetLSN(page, gistGetFakeLSN(rel));
+					PageSetLSN(leafPage, gistGetFakeLSN(rel));
+				}
+
+				UnlockReleaseBuffer(buftodelete[i]);
+			}
+			END_CRIT_SECTION();
+		}
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	pfree(internals);
+	pfree(emptyLeafs);
+
 	return stats;
 }
diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c
index 1e09126978..80108f6bfb 100644
--- a/src/backend/access/gist/gistxlog.c
+++ b/src/backend/access/gist/gistxlog.c
@@ -60,6 +60,39 @@ gistRedoClearFollowRight(XLogReaderState *record, uint8 block_id)
 		UnlockReleaseBuffer(buffer);
 }
 
+static void
+gistRedoPageSetDeleted(XLogReaderState *record)
+{
+	XLogRecPtr	lsn = record->EndRecPtr;
+	gistxlogPageDelete *xldata = (gistxlogPageDelete *) XLogRecGetData(record);
+	Buffer		buffer;
+	Page		page;
+
+	if (XLogReadBufferForRedo(record, 0, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		GistPageSetDeleteXid(page, xldata->deleteXid);
+		GistPageSetDeleted(page);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+
+	if (XLogReadBufferForRedo(record, 1, &buffer) == BLK_NEEDS_REDO)
+	{
+		page = (Page) BufferGetPage(buffer);
+
+		PageIndexTupleDelete(page, xldata->downlinkOffset);
+
+		PageSetLSN(page, lsn);
+		MarkBufferDirty(buffer);
+	}
+	if (BufferIsValid(buffer))
+		UnlockReleaseBuffer(buffer);
+}
 /*
  * redo any page update (except page split)
  */
@@ -112,6 +145,7 @@ gistRedoPageUpdateRecord(XLogReaderState *record)
 			data += sizeof(OffsetNumber) * xldata->ntodelete;
 
 			PageIndexMultiDelete(page, todelete, xldata->ntodelete);
+
 			if (GistPageIsLeaf(page))
 				GistMarkTuplesDeleted(page);
 		}
@@ -324,6 +358,9 @@ gist_redo(XLogReaderState *record)
 		case XLOG_GIST_CREATE_INDEX:
 			gistRedoCreateIndex(record);
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			gistRedoPageSetDeleted(record);
+			break;
 		default:
 			elog(PANIC, "gist_redo: unknown op code %u", info);
 	}
@@ -442,6 +479,29 @@ gistXLogSplit(bool page_is_leaf,
 	return recptr;
 }
 
+/*
+ * Write XLOG record describing a page delete. This also includes removal of
+ * downlink from internal page.
+ */
+XLogRecPtr
+gistXLogSetDeleted(RelFileNode node, Buffer buffer, TransactionId xid,
+					Buffer internalPageBuffer, OffsetNumber internalPageOffset) {
+	gistxlogPageDelete xlrec;
+	XLogRecPtr	recptr;
+
+	xlrec.deleteXid = xid;
+	xlrec.downlinkOffset = internalPageOffset;
+
+	XLogBeginInsert();
+	XLogRegisterData((char *) &xlrec, sizeof(gistxlogPageDelete));
+
+	XLogRegisterBuffer(0, buffer, REGBUF_STANDARD);
+	XLogRegisterBuffer(1, internalPageBuffer, REGBUF_STANDARD);
+	/* new tuples */
+	recptr = XLogInsert(RM_GIST_ID, XLOG_GIST_PAGE_DELETE);
+	return recptr;
+}
+
 /*
  * Write XLOG record describing a page update. The update can include any
  * number of deletions and/or insertions of tuples on a single index page.
diff --git a/src/backend/access/rmgrdesc/gistdesc.c b/src/backend/access/rmgrdesc/gistdesc.c
index e5e925e0c5..f494db63f6 100644
--- a/src/backend/access/rmgrdesc/gistdesc.c
+++ b/src/backend/access/rmgrdesc/gistdesc.c
@@ -65,6 +65,9 @@ gist_identify(uint8 info)
 		case XLOG_GIST_CREATE_INDEX:
 			id = "CREATE_INDEX";
 			break;
+		case XLOG_GIST_PAGE_DELETE:
+			id = "PAGE_DELETE";
+			break;
 	}
 
 	return id;
diff --git a/src/include/access/gist.h b/src/include/access/gist.h
index 827566dc6e..0dd2bf47c8 100644
--- a/src/include/access/gist.h
+++ b/src/include/access/gist.h
@@ -151,6 +151,9 @@ typedef struct GISTENTRY
 #define GistPageGetNSN(page) ( PageXLogRecPtrGet(GistPageGetOpaque(page)->nsn))
 #define GistPageSetNSN(page, val) ( PageXLogRecPtrSet(GistPageGetOpaque(page)->nsn, val))
 
+#define GistPageGetDeleteXid(page) ( ((PageHeader) (page))->pd_prune_xid )
+#define GistPageSetDeleteXid(page, val) ( ((PageHeader) (page))->pd_prune_xid = val)
+
 /*
  * Vector of GISTENTRY structs; user-defined methods union and picksplit
  * take it as one of their arguments
diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h
index 36ed7244ba..1f82695b1d 100644
--- a/src/include/access/gist_private.h
+++ b/src/include/access/gist_private.h
@@ -16,6 +16,7 @@
 
 #include "access/amapi.h"
 #include "access/gist.h"
+#include "access/gistxlog.h"
 #include "access/itup.h"
 #include "fmgr.h"
 #include "lib/pairingheap.h"
@@ -51,6 +52,11 @@ typedef struct
 	char		tupledata[FLEXIBLE_ARRAY_MEMBER];
 } GISTNodeBufferPage;
 
+typedef struct
+{
+	BlockNumber childblkno;		/* hash key */
+	BlockNumber parentblkno;
+} ParentMapEntry;
 #define BUFFER_PAGE_DATA_OFFSET MAXALIGN(offsetof(GISTNodeBufferPage, tupledata))
 /* Returns free space in node buffer page */
 #define PAGE_FREE_SPACE(nbp) (nbp->freespace)
@@ -176,13 +182,6 @@ typedef struct GISTScanOpaqueData
 
 typedef GISTScanOpaqueData *GISTScanOpaque;
 
-/* despite the name, gistxlogPage is not part of any xlog record */
-typedef struct gistxlogPage
-{
-	BlockNumber blkno;
-	int			num;			/* number of index tuples following */
-} gistxlogPage;
-
 /* SplitedPageLayout - gistSplit function result */
 typedef struct SplitedPageLayout
 {
@@ -409,6 +408,17 @@ extern bool gistplacetopage(Relation rel, Size freespace, GISTSTATE *giststate,
 extern SplitedPageLayout *gistSplit(Relation r, Page page, IndexTuple *itup,
 		  int len, GISTSTATE *giststate);
 
+/* gistxlog.c */
+extern void gist_redo(XLogReaderState *record);
+extern void gist_desc(StringInfo buf, XLogReaderState *record);
+extern const char *gist_identify(uint8 info);
+extern void gist_xlog_startup(void);
+extern void gist_xlog_cleanup(void);
+
+extern XLogRecPtr gistXLogSetDeleted(RelFileNode node, Buffer buffer,
+					TransactionId xid, Buffer internalPageBuffer,
+					OffsetNumber internalPageOffset);
+
 extern XLogRecPtr gistXLogUpdate(Buffer buffer,
 			   OffsetNumber *todelete, int ntodelete,
 			   IndexTuple *itup, int ntup,
diff --git a/src/include/access/gistxlog.h b/src/include/access/gistxlog.h
index 1a2b9496d0..ad0b742dbb 100644
--- a/src/include/access/gistxlog.h
+++ b/src/include/access/gistxlog.h
@@ -17,12 +17,14 @@
 #include "access/xlogreader.h"
 #include "lib/stringinfo.h"
 
+/* XLog stuff */
+
 #define XLOG_GIST_PAGE_UPDATE		0x00
  /* #define XLOG_GIST_NEW_ROOT			 0x20 */	/* not used anymore */
 #define XLOG_GIST_PAGE_SPLIT		0x30
  /* #define XLOG_GIST_INSERT_COMPLETE	 0x40 */	/* not used anymore */
 #define XLOG_GIST_CREATE_INDEX		0x50
- /* #define XLOG_GIST_PAGE_DELETE		 0x60 */	/* not used anymore */
+#define XLOG_GIST_PAGE_DELETE		 0x60
 
 /*
  * Backup Blk 0: updated page.
@@ -59,6 +61,19 @@ typedef struct gistxlogPageSplit
 	 */
 } gistxlogPageSplit;
 
+typedef struct gistxlogPageDelete
+{
+   TransactionId deleteXid; /* last Xid which could see page in scan */
+   OffsetNumber downlinkOffset; /* Offset of the downlink referencing this page */
+} gistxlogPageDelete;
+
+/* despite the name, gistxlogPage is not part of any xlog record */
+typedef struct gistxlogPage
+{
+   BlockNumber blkno;
+   int			num;			/* number of index tuples following */
+} gistxlogPage;
+
 extern void gist_redo(XLogReaderState *record);
 extern void gist_desc(StringInfo buf, XLogReaderState *record);
 extern const char *gist_identify(uint8 info);
diff --git a/src/test/regress/expected/gist.out b/src/test/regress/expected/gist.out
index f5a2993aaf..5b92f08c74 100644
--- a/src/test/regress/expected/gist.out
+++ b/src/test/regress/expected/gist.out
@@ -27,9 +27,7 @@ insert into gist_point_tbl (id, p)
 select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 vacuum analyze gist_point_tbl;
 -- rebuild the index with a different fillfactor
diff --git a/src/test/regress/sql/gist.sql b/src/test/regress/sql/gist.sql
index bae722fe13..e66396e851 100644
--- a/src/test/regress/sql/gist.sql
+++ b/src/test/regress/sql/gist.sql
@@ -28,9 +28,7 @@ select g+100000, point(g*10+1, g*10+1) from generate_series(1, 10000) g;
 -- To test vacuum, delete some entries from all over the index.
 delete from gist_point_tbl where id % 2 = 1;
 
--- And also delete some concentration of values. (GiST doesn't currently
--- attempt to delete pages even when they become empty, but if it did, this
--- would exercise it)
+-- And also delete some concentration of values.
 delete from gist_point_tbl where id < 10000;
 
 vacuum analyze gist_point_tbl;
-- 
2.15.2 (Apple Git-101.1)

