From 42607e0e6827b0146e423a74e3c115adaf5bf63b Mon Sep 17 00:00:00 2001
From: Peter Geoghegan <pg@bowt.ie>
Date: Tue, 2 Jul 2019 10:17:29 -0700
Subject: [PATCH 2/2] Fix pathological page split issue.

---
 src/backend/access/nbtree/nbtsplitloc.c | 109 ++++++++++++++++++------
 1 file changed, 81 insertions(+), 28 deletions(-)

diff --git a/src/backend/access/nbtree/nbtsplitloc.c b/src/backend/access/nbtree/nbtsplitloc.c
index f43ec6774b..16a3c79721 100644
--- a/src/backend/access/nbtree/nbtsplitloc.c
+++ b/src/backend/access/nbtree/nbtsplitloc.c
@@ -74,7 +74,7 @@ static bool _bt_afternewitemoff(FindSplitData *state, OffsetNumber maxoff,
 								int leaffillfactor, bool *usemult);
 static bool _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid);
 static OffsetNumber _bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
-									 bool *newitemonleft);
+									 bool *newitemonleft, FindSplitStrat strategy);
 static int	_bt_strategy(FindSplitData *state, SplitPoint *leftpage,
 						 SplitPoint *rightpage, FindSplitStrat *strategy);
 static void _bt_interval_edges(FindSplitData *state,
@@ -361,10 +361,10 @@ _bt_findsplitloc(Relation rel,
 	 * fillfactormult, in order to deal with pages with a large number of
 	 * duplicates gracefully.
 	 *
-	 * Pass low and high splits for the entire page (including even newitem).
-	 * These are used when the initial split interval encloses split points
-	 * that are full of duplicates, and we need to consider if it's even
-	 * possible to avoid appending a heap TID.
+	 * Pass low and high splits for the entire page (or a version of the page
+	 * that includes newitem).  These are used when the initial split interval
+	 * encloses split points that are full of duplicates, and we need to
+	 * consider if it's even possible to avoid appending a heap TID.
 	 */
 	perfectpenalty = _bt_strategy(&state, &leftpage, &rightpage, &strategy);
 
@@ -379,14 +379,18 @@ _bt_findsplitloc(Relation rel,
 	/*
 	 * Many duplicates strategy is used when a heap TID would otherwise be
 	 * appended, but the page isn't completely full of logical duplicates.
+	 * This increases locality of access for index scans.
 	 *
 	 * The split interval is widened to include all legal candidate split
-	 * points.  There may be a few as two distinct values in the whole-page
-	 * split interval.  Many duplicates strategy has no hard requirements for
-	 * space utilization, though it still keeps the use of space balanced as a
-	 * non-binding secondary goal (perfect penalty is set so that the
-	 * first/lowest delta split points that avoids appending a heap TID is
-	 * used).
+	 * points on the page.  There might be as few as two distinct values (or
+	 * two distinct sets of values) in the whole-page split interval, though
+	 * it's also possible that most of the values on the page are unique.  The
+	 * final split point will either be to the immediate left or to the
+	 * immediate right of the group of duplicate tuples that surround the
+	 * first/delta-optimal split point (perfect penalty was set so that the
+	 * lowest delta split point that avoids appending a heap TID will be
+	 * chosen).  Maximizing the number of attributes that can be truncated
+	 * away is only a goal for the default strategy.
 	 *
 	 * Single value strategy is used when it is impossible to avoid appending
 	 * a heap TID.  It arranges to leave the left page very full.  This
@@ -399,6 +403,9 @@ _bt_findsplitloc(Relation rel,
 	else if (strategy == SPLIT_MANY_DUPLICATES)
 	{
 		Assert(state.is_leaf);
+		/* Shouldn't try to truncate away extra user attributes */
+		Assert(perfectpenalty ==
+			   IndexRelationGetNumberOfKeyAttributes(state.rel));
 		/* No need to resort splits -- no change in fillfactormult/deltas */
 		state.interval = state.nsplits;
 	}
@@ -419,7 +426,8 @@ _bt_findsplitloc(Relation rel,
 	 * the entry that has the lowest penalty, and is therefore expected to
 	 * maximize fan-out.  Sets *newitemonleft for us.
 	 */
-	foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft);
+	foundfirstright = _bt_bestsplitloc(&state, perfectpenalty, newitemonleft,
+									   strategy);
 	pfree(state.splits);
 
 	return foundfirstright;
@@ -753,11 +761,13 @@ _bt_adjacenthtid(ItemPointer lowhtid, ItemPointer highhtid)
  * page, plus a boolean indicating if new item is on left of split point.
  */
 static OffsetNumber
-_bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft)
+_bt_bestsplitloc(FindSplitData *state, int perfectpenalty,
+				 bool *newitemonleft, FindSplitStrat strategy)
 {
 	int			bestpenalty,
 				lowsplit;
 	int			highsplit = Min(state->interval, state->nsplits);
+	SplitPoint *final;
 
 	bestpenalty = INT_MAX;
 	lowsplit = 0;
@@ -781,8 +791,52 @@ _bt_bestsplitloc(FindSplitData *state, int perfectpenalty, bool *newitemonleft)
 		}
 	}
 
-	*newitemonleft = state->splits[lowsplit].newitemonleft;
-	return state->splits[lowsplit].firstoldonright;
+	final = &state->splits[lowsplit];
+
+	/*
+	 * If "many duplicates" strategy ends up splitting far from the center of
+	 * the page then the page as a whole will typically have few distinct
+	 * values.  There may occasionally be quite a few distinct tuples on
+	 * either or both sides of the central group of duplicates, but that won't
+	 * matter provided that space is consistently made available for future
+	 * insertions of non-duplicates.  Space will almost certainly be available
+	 * for future insertions of tuples of the same duplicate value, but tuples
+	 * with unique values that have not been inserted yet but are covered by
+	 * the same range in the key space can be tricky.
+	 *
+	 * Random or monotonically increasing unique tuple insertions should not
+	 * lead to repeated unbalanced page splits.  If they're on the left hand
+	 * side of the large group of duplicates then the large group of
+	 * duplicates will be isolated, since the new high key for the page will
+	 * have the same duplicate value (with truncated/-inf heap TID value).  If
+	 * random/increasing tuples appear on the right hand side of the dups then
+	 * the new high key will be a low value among the unique values, which
+	 * will also have acceptable space utilization.
+	 *
+	 * A clear problem case is monotonically _decreasing_ insertions where
+	 * there happens to be a large group of duplicates to the left.  There is
+	 * a danger that the "many duplicates" strategy will choose a split point
+	 * that derives the new high key from the lowest current unique value,
+	 * which is still higher than all future unique key insertions.  The same
+	 * page will be split again and again, resulting in a succession of right
+	 * halves that each contain very few tuples.
+	 *
+	 * Avoid problems with monotonically decreasing insertions by reverting to
+	 * a delta-optimal split when the "many duplicates" split point is just to
+	 * the left of new item's position.  This suggests that values at and just
+	 * after the would-be firstright tuple are not one large grouping.
+	 */
+	if (strategy == SPLIT_MANY_DUPLICATES && !state->is_rightmost &&
+		!final->newitemonleft &&
+		final->firstoldonright >= state->newitemoff &&
+		final->firstoldonright < state->newitemoff + 10)
+	{
+		/* Do a 50:50 page split instead */
+		final = &state->splits[0];
+	}
+
+	*newitemonleft = final->newitemonleft;
+	return final->firstoldonright;
 }
 
 /*
@@ -859,17 +913,16 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
 		*strategy = SPLIT_MANY_DUPLICATES;
 
 		/*
-		 * Caller should choose the lowest delta split that avoids appending a
-		 * heap TID.  Maximizing the number of attributes that can be
-		 * truncated away (returning perfectpenalty when it happens to be less
-		 * than the number of key attributes in index) can result in continual
-		 * unbalanced page splits.
+		 * Many duplicates strategy should split at either side of the group of
+		 * duplicates that surround the delta-optimal split point.  Return
+		 * indnkeyatts rather than the true perfect penalty to make that
+		 * happen.  (If perfectpenalty was returned here then low cardinality
+		 * composite indexes could have continual unbalanced splits.)
 		 *
-		 * Just avoiding appending a heap TID can still make splits very
-		 * unbalanced, but this is self-limiting.  When final split has a very
-		 * high delta, one side of the split will likely consist of a single
-		 * value.  If that page is split once again, then that split will
-		 * likely use the single value strategy.
+		 * Note that caller won't go through with a many duplicates split in
+		 * rare cases where it looks like there are ever-decreasing insertions
+		 * to the immediate right of the split point.  This must happen just
+		 * before a final decision is made, within _bt_bestsplitloc().
 		 */
 		return indnkeyatts;
 	}
@@ -879,9 +932,9 @@ _bt_strategy(FindSplitData *state, SplitPoint *leftpage,
 	 * TIDs; otherwise, original default strategy split should proceed to
 	 * avoid pathological performance.  Use page high key to infer if this is
 	 * the rightmost page among pages that store the same duplicate value.
-	 * This should not prevent insertions of heap TIDs that are slightly out
-	 * of order from using single value strategy, since that's expected with
-	 * concurrent inserters of the same duplicate value.
+	 * This won't prevent insertions of heap TIDs that are slightly out of
+	 * order from using single value strategy, which is expected when there
+	 * are concurrent inserters of the same duplicate value.
 	 */
 	else if (state->is_rightmost)
 		*strategy = SPLIT_SINGLE_VALUE;
-- 
2.17.1

