From 8bc1e434068ff7bdbd5ac9ff49c17912e535b6da Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Thu, 6 Jul 2023 21:57:53 +0200
Subject: [PATCH v2] Add heap reloption local_update_limit

Without this, there is no way to efficiently make a table more compact when a
hole or area with few tuples was created. With local_update_limit configured,
local updates are only available for the first local_update_limit MB of the
table; all other updates will go through the free space map to find a
destination page for the new tuple version (and still fall back to a local
update if no earlier page fits the new tuple version).

This is intended as a debug/maintenance option so that tables will tend
toward less fragmentation if a large part of the table was updated at once and left
many pages nearly empty; such as when someone updated a table with
fillfactor=100 using an unqualified UPDATE statement.
---
 doc/src/sgml/ref/create_table.sgml        | 32 +++++++++++++++++++
 src/backend/access/common/reloptions.c    | 13 +++++++-
 src/backend/access/heap/heapam.c          | 38 +++++++++++++++++++++--
 src/bin/psql/tab-complete.c               |  1 +
 src/include/utils/rel.h                   | 19 ++++++++++++
 src/test/regress/expected/alter_table.out | 18 +++++++++++
 src/test/regress/expected/update.out      | 38 +++++++++++++++++++++++
 src/test/regress/sql/alter_table.sql      |  8 +++++
 src/test/regress/sql/update.sql           | 23 ++++++++++++++
 9 files changed, 187 insertions(+), 3 deletions(-)

diff --git a/doc/src/sgml/ref/create_table.sgml b/doc/src/sgml/ref/create_table.sgml
index 10ef699fab..32af7e7182 100644
--- a/doc/src/sgml/ref/create_table.sgml
+++ b/doc/src/sgml/ref/create_table.sgml
@@ -1459,6 +1459,38 @@ WITH ( MODULUS <replaceable class="parameter">numeric_literal</replaceable>, REM
     </listitem>
    </varlistentry>
 
+    <varlistentry id="reloption-local_update_limit" xreflabel="local_update_limit">
+    <term><literal>local_update_limit</literal> (<type>integer</type>)
+    <indexterm>
+     <primary><varname>local_update_limit</varname> storage parameter</primary>
+    </indexterm>
+    </term>
+    <listitem>
+     <para>
+      The local update limit for a table configures in which part of the table
+      tuples will prefer updates on the same page. -1 (unlimited) is the
+      default. When the limit is configured, tuples that are located outside
+      the first <literal>local_update_limit</literal> megabytes of the heap
+      relation will always use the new tuple insertion mechanism to find a page
+      to insert the new tuple version into, instead of first trying to use free
+      space on the local page, making this quite useful in helping move tuples
+      from the end of a table into free space at the front of the table without
+      the long locks and processing downtime associated with
+      <xref linkend="sql-cluster"/>. Do note, though, that
+      <link linkend="storage-hot">heap-only tuple updates</link> only happen
+      when both the old and the new version of a tuple are on the same page.
+      Because <literal>local_update_limit</literal> moves the tuple away from
+      the page, this is likely to reduce the effectiveness of HOT while the
+      tuples are being moved to the front of the table.
+      Another consideration is that even though this setting won't excessively
+      increase the size of your table, it will increase the effort we need to
+      expend while updating tuples in the area after <literal>local_update_limit</literal>.
+      That means those updates will always have a higher latency, and benefit
+      less from a tuned <link linkend="reloption-fillfactor"><literal>fillfactor</literal></link>.
+     </para>
+    </listitem>
+   </varlistentry>
+
    <varlistentry id="reloption-toast-tuple-target" xreflabel="toast_tuple_target">
     <term><literal>toast_tuple_target</literal> (<type>integer</type>)
     <indexterm>
diff --git a/src/backend/access/common/reloptions.c b/src/backend/access/common/reloptions.c
index 469de9bb49..225ea9f019 100644
--- a/src/backend/access/common/reloptions.c
+++ b/src/backend/access/common/reloptions.c
@@ -382,6 +382,15 @@ static relopt_int intRelOpts[] =
 		},
 		-1, 0, 1024
 	},
+	{
+		{
+			"local_update_limit",
			"Updates in the table that are not located in the first local_update_limit MB of the table will always try to insert the new tuple on a different page.",
+			RELOPT_KIND_HEAP,
+			ShareUpdateExclusiveLock
+		},
+		-1, -1, (MaxBlockNumber / (1024 * 1024 / BLCKSZ))
+	},
 
 	/* list terminator */
 	{{NULL}}
@@ -1882,7 +1891,9 @@ default_reloptions(Datum reloptions, bool validate, relopt_kind kind)
 		{"vacuum_index_cleanup", RELOPT_TYPE_ENUM,
 		offsetof(StdRdOptions, vacuum_index_cleanup)},
 		{"vacuum_truncate", RELOPT_TYPE_BOOL,
-		offsetof(StdRdOptions, vacuum_truncate)}
+		offsetof(StdRdOptions, vacuum_truncate)},
+		{"local_update_limit", RELOPT_TYPE_INT,
+		offsetof(StdRdOptions, local_update_limit)}
 	};
 
 	return (bytea *) build_reloptions(reloptions, validate, kind,
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 6a66214a58..2823773c9e 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -3496,7 +3496,8 @@ l2:
 
 	newtupsize = MAXALIGN(newtup->t_len);
 
-	if (need_toast || newtupsize > pagefree)
+	if (need_toast || newtupsize > pagefree ||
+		!RelationUpdateTupleOnPageLocally(relation, -1, block))
 	{
 		TransactionId xmax_lock_old_tuple;
 		uint16		infomask_lock_old_tuple,
@@ -3508,7 +3509,7 @@ l2:
 		 * temporarily mark it locked, while we release the page-level lock.
 		 *
 		 * To satisfy the rule that any xid potentially appearing in a buffer
-		 * written out to disk, we unfortunately have to WAL log this
+		 * must be written out to disk, we unfortunately have to WAL log this
 		 * temporary modification.  We can reuse xl_heap_lock for this
 		 * purpose.  If we crash/error before following through with the
 		 * actual update, xmax will be of an aborted transaction, allowing
@@ -3633,6 +3634,39 @@ l2:
 				/* We're all done. */
 				break;
 			}
+
+			if (!RelationUpdateTupleOnPageLocally(relation, -1, block))
+			{
+				/*
+				 * We are trying to store updates in an earlier part of the
+				 * table.
+				 */
+				newbuf = RelationGetBufferForTuple(relation, heaptup->t_len,
+												   buffer, 0, NULL,
+												   &vmbuffer_new, &vmbuffer,
+												   0);
+
+				/*
+				 * We'd prefer to move the tuple to an earlier page. However,
+				 * if the page for the new tuple is located after our old
+				 * tuple's page, and the old page still had space, we insert
+				 * the tuple into the old page.
+				 */
+				if (BufferGetBlockNumber(newbuf) > block &&
+					PageGetHeapFreeSpace(page) >= newtupsize)
+				{
+					UnlockReleaseBuffer(newbuf);
+					newbuf = buffer;
+					if (vmbuffer_new != vmbuffer && vmbuffer_new != InvalidBuffer)
+					{
+						UnlockReleaseBuffer(vmbuffer_new);
+						vmbuffer_new = InvalidBuffer;
+					}
+				}
+				/* We're all done. */
+				break;
+			}
+
 			/* Acquire VM page pin if needed and we don't have it. */
 			if (vmbuffer == InvalidBuffer && PageIsAllVisible(page))
 				visibilitymap_pin(relation, block, &vmbuffer);
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 779fdc90cb..48dcacd352 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -1304,6 +1304,7 @@ static const char *const table_storage_parameters[] = {
 	"autovacuum_vacuum_scale_factor",
 	"autovacuum_vacuum_threshold",
 	"fillfactor",
+	"local_update_limit",
 	"log_autovacuum_min_duration",
 	"parallel_workers",
 	"toast.autovacuum_enabled",
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index 1426a353cd..22411fd7c8 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -342,6 +342,7 @@ typedef struct StdRdOptions
 	int			parallel_workers;	/* max number of parallel workers */
 	StdRdOptIndexCleanup vacuum_index_cleanup;	/* controls index vacuuming */
 	bool		vacuum_truncate;	/* enables vacuum to truncate a relation */
+	int			local_update_limit;	/* size in MB of the area preferring local updates; -1 = unlimited */
 } StdRdOptions;
 
 #define HEAP_MIN_FILLFACTOR			10
@@ -377,6 +378,24 @@ typedef struct StdRdOptions
 #define RelationGetTargetPageFreeSpace(relation, defaultff) \
 	(BLCKSZ * (100 - RelationGetFillFactor(relation, defaultff)) / 100)
 
+/*
+ * RelationGetLocalUpdateLimit
+ *		Returns the size of the relation's local update section (MB).
+ */
+#define RelationGetLocalUpdateLimit(relation, defaultlul) \
+	((relation)->rd_options ? \
+	 ((StdRdOptions *) (relation)->rd_options)->local_update_limit : (defaultlul))
+
+/*
+ * RelationUpdateTupleOnPageLocally
+ *		Is an update on blockno allowed to put the new tuple on the current
+ *		page, or should we instead try to find a different page?
+ */
+#define RelationUpdateTupleOnPageLocally(relation, defaultmlu, blockno) \
+	((RelationGetLocalUpdateLimit((relation), (defaultmlu)) == -1) || \
+	  ((blockno) < (BlockNumber) RelationGetLocalUpdateLimit((relation), (defaultmlu)) * \
+				   ((BlockNumber) 1024 * 1024 / BLCKSZ)))
+
 /*
  * RelationIsUsedAsCatalogTable
  *		Returns whether the relation should be treated as a catalog table
diff --git a/src/test/regress/expected/alter_table.out b/src/test/regress/expected/alter_table.out
index cd814ff321..d8076f2e61 100644
--- a/src/test/regress/expected/alter_table.out
+++ b/src/test/regress/expected/alter_table.out
@@ -2758,6 +2758,24 @@ select * from my_locks order by 1;
  pg_toast  | ShareUpdateExclusiveLock
 (2 rows)
 
+commit;
+begin; alter table alterlock set (local_update_limit = 8);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
+commit;
+begin; alter table alterlock reset (local_update_limit);
+select * from my_locks order by 1;
+  relname  |       max_lockmode       
+-----------+--------------------------
+ alterlock | ShareUpdateExclusiveLock
+ pg_toast  | ShareUpdateExclusiveLock
+(2 rows)
+
 commit;
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
diff --git a/src/test/regress/expected/update.out b/src/test/regress/expected/update.out
index c809f88f54..4116929760 100644
--- a/src/test/regress/expected/update.out
+++ b/src/test/regress/expected/update.out
@@ -1026,3 +1026,41 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- Test local update limit still works if there is no space available earlier in the table (after accounting for fillfactor)
+alter table block_local_updates set (local_update_limit=0);
+update block_local_updates set b = 1;
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    22
+    1 |    22
+    2 |    22
+    3 |    22
+(4 rows)
+
+-- FF 10=>100 -> all blocks now have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear "FULL" hint bits on all pages, and clear dead LPs
+vacuum (disable_page_skipping true) block_local_updates;
+-- ~90% space left on each page, all updates would be page-local if not for local_update_limit
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+ ctid | count 
+------+-------
+    0 |    88
+(1 row)
+
+-- cleanup
+drop table block_local_updates;
diff --git a/src/test/regress/sql/alter_table.sql b/src/test/regress/sql/alter_table.sql
index ff8c498419..eb8c8ded0c 100644
--- a/src/test/regress/sql/alter_table.sql
+++ b/src/test/regress/sql/alter_table.sql
@@ -1770,6 +1770,14 @@ begin; alter table alterlock reset (fillfactor);
 select * from my_locks order by 1;
 commit;
 
+begin; alter table alterlock set (local_update_limit = 8);
+select * from my_locks order by 1;
+commit;
+
+begin; alter table alterlock reset (local_update_limit);
+select * from my_locks order by 1;
+commit;
+
 begin; alter table alterlock set (toast.autovacuum_enabled = off);
 select * from my_locks order by 1;
 commit;
diff --git a/src/test/regress/sql/update.sql b/src/test/regress/sql/update.sql
index 7a7bee77b9..0ef4897c95 100644
--- a/src/test/regress/sql/update.sql
+++ b/src/test/regress/sql/update.sql
@@ -667,3 +667,26 @@ update hash_parted set b = b + 8 where b = 1;
 drop table hash_parted;
 drop operator class custom_opclass using hash;
 drop function dummy_hashint4(a int4, seed int8);
+
+create table block_local_updates(id int, b int) with (fillfactor = 10, autovacuum_enabled = false);
+insert into block_local_updates select generate_series(1, 88), 0;
+
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- Test local update limit still works if there is no space available earlier in the table (after accounting for fillfactor)
+alter table block_local_updates set (local_update_limit=0);
+update block_local_updates set b = 1;
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- FF 10=>100 -> all blocks now have ~ 90% space left
+alter table block_local_updates set (fillfactor = 100);
+-- vacuum to clear "FULL" hint bits on all pages, and clear dead LPs
+vacuum (disable_page_skipping true) block_local_updates;
+
+-- ~90% space left on each page, all updates would be page-local if not for local_update_limit
+update block_local_updates set b = 2;
+-- all tuples moved to first page, 88 total
+select (ctid::text::point)[0], count(*) from block_local_updates group by 1 order by 1;
+
+-- cleanup
+drop table block_local_updates;
-- 
2.40.1

