From 023db6aaeee8cfdbe1d89bfd6ae7c13dd3a60465 Mon Sep 17 00:00:00 2001
From: Andrey Borodin <amborodin@acm.org>
Date: Mon, 1 Jan 2018 20:55:14 +0500
Subject: [PATCH] SLRU checksums patch

---
 src/backend/access/transam/clog.c      |   2 +-
 src/backend/access/transam/commit_ts.c |   2 +-
 src/backend/access/transam/multixact.c |   4 +-
 src/backend/access/transam/slru.c      |  26 +++++
 src/backend/access/transam/subtrans.c  |   2 +-
 src/backend/commands/async.c           |   2 +-
 src/backend/storage/lmgr/predicate.c   |   2 +-
 src/bin/pg_upgrade/pg_upgrade.c        | 188 +++++++++++++++++++++++++++++++--
 src/bin/pg_upgrade/pg_upgrade.h        |   4 +
 src/include/access/slru.h              |   1 +
 src/include/catalog/catversion.h       |   2 +-
 src/include/storage/checksum.h         |  11 +-
 src/include/storage/checksum_impl.h    |  46 ++++++++
 13 files changed, 273 insertions(+), 19 deletions(-)

diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index bbf9ce1a3a..3d9dba0414 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -58,7 +58,7 @@
 /* We need two bits per xact, so four xacts fit in a byte */
 #define CLOG_BITS_PER_XACT	2
 #define CLOG_XACTS_PER_BYTE 4
-#define CLOG_XACTS_PER_PAGE (BLCKSZ * CLOG_XACTS_PER_BYTE)
+#define CLOG_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) * CLOG_XACTS_PER_BYTE)
 #define CLOG_XACT_BITMASK	((1 << CLOG_BITS_PER_XACT) - 1)
 
 #define TransactionIdToPage(xid)	((xid) / (TransactionId) CLOG_XACTS_PER_PAGE)
diff --git a/src/backend/access/transam/commit_ts.c b/src/backend/access/transam/commit_ts.c
index 7b7bf2b2bf..621f6cf482 100644
--- a/src/backend/access/transam/commit_ts.c
+++ b/src/backend/access/transam/commit_ts.c
@@ -64,7 +64,7 @@ typedef struct CommitTimestampEntry
 									sizeof(RepOriginId))
 
 #define COMMIT_TS_XACTS_PER_PAGE \
-	(BLCKSZ / SizeOfCommitTimestampEntry)
+	((BLCKSZ - CHKSUMSZ) / SizeOfCommitTimestampEntry)
 
 #define TransactionIdToCTsPage(xid) \
 	((xid) / (TransactionId) COMMIT_TS_XACTS_PER_PAGE)
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index ba01e94328..f04c23c649 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -106,7 +106,7 @@
  */
 
 /* We need four bytes per offset */
-#define MULTIXACT_OFFSETS_PER_PAGE (BLCKSZ / sizeof(MultiXactOffset))
+#define MULTIXACT_OFFSETS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(MultiXactOffset))
 
 #define MultiXactIdToOffsetPage(xid) \
 	((xid) / (MultiXactOffset) MULTIXACT_OFFSETS_PER_PAGE)
@@ -138,7 +138,7 @@
 /* size in bytes of a complete group */
 #define MULTIXACT_MEMBERGROUP_SIZE \
 	(sizeof(TransactionId) * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP)
-#define MULTIXACT_MEMBERGROUPS_PER_PAGE (BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE)
+#define MULTIXACT_MEMBERGROUPS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / MULTIXACT_MEMBERGROUP_SIZE)
 #define MULTIXACT_MEMBERS_PER_PAGE	\
 	(MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP)
 
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 94b6e6612a..651afd51e9 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -55,9 +55,15 @@
 #include "access/transam.h"
 #include "access/xlog.h"
 #include "pgstat.h"
+#include "storage/checksum.h"
 #include "storage/fd.h"
 #include "storage/shmem.h"
 #include "miscadmin.h"
+#include "utils/guc.h"
+#include "utils/memutils.h"
+
+/* GUC variable */
+extern bool ignore_checksum_failure;
 
 
 #define SlruFileName(ctl, path, seg) \
@@ -376,6 +382,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 				  TransactionId xid)
 {
 	SlruShared	shared = ctl->shared;
+	uint16		checksum;		/* unsigned: compared against uint16 results */
 
 	/* Outer loop handles restart if we must wait for someone else's I/O */
 	for (;;)
@@ -426,6 +433,19 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, bool write_ok,
 		/* Do the read */
 		ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
+		if (DataChecksumsEnabled() && ok)
+		{			
+			checksum = pg_getchecksum_slru_page(shared->page_buffer[slotno]);
+			if (checksum != pg_checksum_slru_page(shared->page_buffer[slotno]))
+			{
+				elog(LOG, "CHECKSUM: Page Is not Verified.");
+				if (!ignore_checksum_failure)
+				{
+					elog(ERROR, "CHECKSUM: ERROR ignore_checksum_failure turned off.");
+				}
+			}
+		}
+
 		/* Set the LSNs for this newly read-in page to zero */
 		SimpleLruZeroLSNs(ctl, slotno);
 
@@ -539,6 +559,12 @@ SlruInternalWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 	/* Release control lock while doing I/O */
 	LWLockRelease(shared->ControlLock);
 
+	/*
+	 * Update the checksum in place, without copying the page.  NOTE(review):
+	 * ControlLock was released above; confirm the page cannot change meanwhile.
+	 */
+	pg_setchecksum_slru_page(shared->page_buffer[slotno]);
+
 	/* Do the write */
 	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
 
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index f640661130..80420cb7a4 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -49,7 +49,7 @@
  */
 
 /* We need four bytes per xact */
-#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+#define SUBTRANS_XACTS_PER_PAGE ((BLCKSZ - CHKSUMSZ) / sizeof(TransactionId))
 
 #define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE)
 #define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE)
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index f7de742a56..8f672635e3 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -150,7 +150,7 @@
  * than that, so changes in that data structure won't affect user-visible
  * restrictions.
  */
-#define NOTIFY_PAYLOAD_MAX_LENGTH	(BLCKSZ - NAMEDATALEN - 128)
+#define NOTIFY_PAYLOAD_MAX_LENGTH	(BLCKSZ - NAMEDATALEN - 128 - CHKSUMSZ)
 
 /*
  * Struct representing an entry in the global notify queue
diff --git a/src/backend/storage/lmgr/predicate.c b/src/backend/storage/lmgr/predicate.c
index 251a359bff..de8b74820e 100644
--- a/src/backend/storage/lmgr/predicate.c
+++ b/src/backend/storage/lmgr/predicate.c
@@ -315,7 +315,7 @@ static SlruCtlData OldSerXidSlruCtlData;
 
 #define OLDSERXID_PAGESIZE			BLCKSZ
 #define OLDSERXID_ENTRYSIZE			sizeof(SerCommitSeqNo)
-#define OLDSERXID_ENTRIESPERPAGE	(OLDSERXID_PAGESIZE / OLDSERXID_ENTRYSIZE)
+#define OLDSERXID_ENTRIESPERPAGE	((OLDSERXID_PAGESIZE - CHKSUMSZ) / OLDSERXID_ENTRYSIZE)
 
 /*
  * Set maximum pages based on the lesser of the number needed to track all
diff --git a/src/bin/pg_upgrade/pg_upgrade.c b/src/bin/pg_upgrade/pg_upgrade.c
index c10103f0bf..5751285b3f 100644
--- a/src/bin/pg_upgrade/pg_upgrade.c
+++ b/src/bin/pg_upgrade/pg_upgrade.c
@@ -404,17 +404,183 @@ copy_subdir_files(const char *old_subdir, const char *new_subdir)
 	check_ok();
 }
 
+#include "storage/checksum.h"
+
+#include <dirent.h>
+#include <math.h>
+#include <fcntl.h>
+
+#define SLRU_PAGES_PER_SEGMENT	32
+#define SLRU_SEGMENT_SIZE (BLCKSZ * SLRU_PAGES_PER_SEGMENT)
+
+#define CLOG_BYTES_PER_PAGE_NEW	(BLCKSZ - CHKSUMSZ)	/* page payload, checksum excluded */
+#define CLOG_BYTES_PER_SEGMENT_NEW	(CLOG_BYTES_PER_PAGE_NEW * SLRU_PAGES_PER_SEGMENT)
+
+/*
+ * Store "length" bytes of "data" at payload offset "local_start" of new-format
+ * segment "file_name", recomputing the checksum of every page touched.  No
+ * O_EXCL on open: consecutive old segments spill into the same new file, so
+ * a later call must be able to reopen what an earlier one created.
+ */
+static void
+write_xact_data_to_file(char *file_name, uint32 local_start, char *data, uint32 length)
+{
+	int			dest_fd;
+	uint32		local_end = local_start + length;
+	char	   *buffer = pg_malloc(BLCKSZ);
+
+	Assert(length <= CLOG_BYTES_PER_SEGMENT_NEW);
+
+	if ((dest_fd = open(file_name, O_RDWR | O_CREAT | PG_BINARY,
+						S_IRUSR | S_IWUSR)) < 0)
+		pg_fatal("could not open file \"%s\": %s\n", file_name, strerror(errno));
+
+	/* Size the file to a whole segment so every page can be read back in full */
+	if (ftruncate(dest_fd, SLRU_SEGMENT_SIZE) < 0)
+		pg_fatal("could not set size of file \"%s\": %s\n", file_name, strerror(errno));
+
+	while (local_start < local_end)
+	{
+		uint32		page = local_start / CLOG_BYTES_PER_PAGE_NEW;
+		uint32		page_start = local_start % CLOG_BYTES_PER_PAGE_NEW;
+		uint32		delta = Min(CLOG_BYTES_PER_PAGE_NEW - page_start, local_end - local_start);
+
+		/* Read-modify-write, so data an earlier call put on this page survives */
+		if (lseek(dest_fd, page * BLCKSZ, SEEK_SET) < 0)
+			pg_fatal("could not set position in file \"%s\": %s\n", file_name, strerror(errno));
+		if (read(dest_fd, buffer, BLCKSZ) != BLCKSZ)
+			pg_fatal("could not read file \"%s\": %s\n", file_name, strerror(errno));
+
+		memcpy(buffer + page_start, data, delta);
+		pg_setchecksum_slru_page(buffer);
+		if (lseek(dest_fd, page * BLCKSZ, SEEK_SET) < 0)
+			pg_fatal("could not set position in file \"%s\": %s\n", file_name, strerror(errno));
+		if (write(dest_fd, buffer, BLCKSZ) != BLCKSZ)
+			pg_fatal("could not write file \"%s\": %s\n", file_name, strerror(errno));
+
+		local_start += delta;
+		data += delta;
+	}
+
+	pg_free(buffer);
+	close(dest_fd);
+}
+
+/*
+ * Split one old segment image across the new segment files it overlaps.
+ */
+static void
+distribute_xact_data(char *buffer, int nbytes, int oldsegno, const char *new_subdir)
+{
+	uint64		pos = oldsegno * ((uint64) SLRU_SEGMENT_SIZE);
+	uint64		stop = pos + nbytes;
+
+	while (pos < stop)
+	{
+		int			target_seg = pos / (CLOG_BYTES_PER_SEGMENT_NEW);
+		uint64		seg_offset = pos - target_seg * CLOG_BYTES_PER_SEGMENT_NEW;
+		uint64		chunk_end = Min(stop, ((uint64) target_seg + 1) * CLOG_BYTES_PER_SEGMENT_NEW);
+		int64		chunk_len = chunk_end - pos;
+		char		target_path[MAXPGPATH];
+
+		Assert(chunk_len > 0);
+		Assert(chunk_len == (uint32) chunk_len);
+		Assert(seg_offset == (uint32) seg_offset);
+		snprintf(target_path, sizeof(target_path), "%s/%s/%04X", new_cluster.pgdata, new_subdir, target_seg);
+		write_xact_data_to_file(target_path, (uint32) seg_offset, buffer, (uint32) chunk_len);
+		pos += chunk_len;
+		buffer += chunk_len;
+	}
+}
+
+/* Read old segment file "old_file" and re-emit its bytes in the new layout. */
+static void
+upgrade_one_xact_file(const char *old_file, int segno, const char *new_subdir)
+{
+	int			old_fd = open(old_file, O_RDONLY | PG_BINARY, 0);
+	char	   *segment;
+	ssize_t		got;
+
+	if (old_fd < 0)
+		pg_fatal("could not open file \"%s\": %s\n", old_file, strerror(errno));
+
+	segment = pg_malloc(SLRU_SEGMENT_SIZE);
+	got = read(old_fd, segment, SLRU_SEGMENT_SIZE);
+	if (got < 0)
+		pg_fatal("could not read file \"%s\": %s\n", old_file, strerror(errno));
+
+	distribute_xact_data(segment, got, segno, new_subdir);
+	pg_free(segment);
+	close(old_fd);
+}
+
+/*
+ * Convert every segment file in the old cluster's "old_subdir" to the
+ * checksummed page layout, writing the result under the new cluster's
+ * "new_subdir".  Only names of 4 to 6 upper-case hex digits are treated as
+ * segment files; anything else in the directory is ignored.
+ */
+static void
+upgrade_xact_files(const char *old_subdir, const char *new_subdir)
+{
+	char		old_path[MAXPGPATH];
+	char		old_file[MAXPGPATH];
+	DIR		   *cldir;
+	struct dirent *clde;
+
+	remove_new_subdir(new_subdir, false);
+	snprintf(old_path, sizeof(old_path), "%s/%s", old_cluster.pgdata, old_subdir);
+
+	prep_status("Upgrading old %s to new cluster", old_subdir);
+
+	if ((cldir = opendir(old_path)) == NULL)
+		pg_fatal("could not open dir \"%s\": %s\n", old_path, strerror(errno));
+
+	/* errno is cleared before each readdir() so end-of-directory and error differ */
+	while (errno = 0, (clde = readdir(cldir)) != NULL)
+	{
+		size_t		len = strlen(clde->d_name);
+
+		if ((len == 4 || len == 5 || len == 6) &&
+			strspn(clde->d_name, "0123456789ABCDEF") == len)
+		{
+			int			segno = (int) strtol(clde->d_name, NULL, 16);
+
+			snprintf(old_file, sizeof(old_file), "%s/%s", old_path, clde->d_name);
+			upgrade_one_xact_file(old_file, segno, new_subdir);
+		}
+	}
+
+	if (errno)
+		pg_fatal("could not read dir \"%s\": %s\n", old_path, strerror(errno));
+
+	/* Close only after the errno check above, since closedir() may change errno */
+	closedir(cldir);
+	check_ok();
+}
+
 static void
 copy_xact_xlog_xid(void)
 {
-	/*
-	 * Copy old commit logs to new data dir. pg_clog has been renamed to
-	 * pg_xact in post-10 clusters.
-	 */
-	copy_subdir_files(GET_MAJOR_VERSION(old_cluster.major_version) < 1000 ?
-					  "pg_clog" : "pg_xact",
-					  GET_MAJOR_VERSION(new_cluster.major_version) < 1000 ?
-					  "pg_clog" : "pg_xact");
+	bool slru_changed = (new_cluster.controldata.cat_ver >= SLRU_FORMAT_CHANGE_CAT_VER &&
+		old_cluster.controldata.cat_ver < SLRU_FORMAT_CHANGE_CAT_VER);
+	char *xact_old_subdir = GET_MAJOR_VERSION(old_cluster.major_version) < 1000 ?
+					  "pg_clog" : "pg_xact";
+	char *xact_new_subdir = GET_MAJOR_VERSION(new_cluster.major_version) < 1000 ?
+					  "pg_clog" : "pg_xact";
+
+	if (slru_changed)
+	{
+		upgrade_xact_files(xact_old_subdir, xact_new_subdir);
+	}
+	else
+	{
+		/*
+		* Copy old commit logs to new data dir. pg_clog has been renamed to
+		* pg_xact in post-10 clusters.
+		*/
+		copy_subdir_files(xact_old_subdir, xact_new_subdir);
+	}
 
 	/* set the next transaction id and epoch of the new cluster */
 	prep_status("Setting next transaction ID and epoch for new cluster");
@@ -442,7 +608,8 @@ copy_xact_xlog_xid(void)
 	 * server doesn't attempt to read multis older than the cutoff value.
 	 */
 	if (old_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER &&
-		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+		new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER && 
+		!slru_changed)
 	{
 		copy_subdir_files("pg_multixact/offsets", "pg_multixact/offsets");
 		copy_subdir_files("pg_multixact/members", "pg_multixact/members");
@@ -462,7 +629,8 @@ copy_xact_xlog_xid(void)
 				  new_cluster.pgdata);
 		check_ok();
 	}
-	else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER)
+	else if (new_cluster.controldata.cat_ver >= MULTIXACT_FORMATCHANGE_CAT_VER ||
+				slru_changed)
 	{
 		/*
 		 * Remove offsets/0000 file created by initdb that no longer matches
diff --git a/src/bin/pg_upgrade/pg_upgrade.h b/src/bin/pg_upgrade/pg_upgrade.h
index a21dd48c42..2c9350d0fb 100644
--- a/src/bin/pg_upgrade/pg_upgrade.h
+++ b/src/bin/pg_upgrade/pg_upgrade.h
@@ -131,6 +131,10 @@ extern char *output_files[];
  */
 #define JSONB_FORMAT_CHANGE_CAT_VER 201409291
 
+/*
+ * change in SLRU format to add checksums
+ */
+#define SLRU_FORMAT_CHANGE_CAT_VER 201803181
 
 /*
  * Each relation is represented by a relinfo structure.
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 20114c4d44..e7b9662764 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -14,6 +14,7 @@
 #define SLRU_H
 
 #include "access/xlogdefs.h"
+#include "storage/checksum.h"
 #include "storage/lwlock.h"
 
 
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 3934582efc..d6b15761d8 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	201712251
+#define CATALOG_VERSION_NO	201803181
 
 #endif
diff --git a/src/include/storage/checksum.h b/src/include/storage/checksum.h
index b85f714712..f4bd19b242 100644
--- a/src/include/storage/checksum.h
+++ b/src/include/storage/checksum.h
@@ -1,7 +1,7 @@
 /*-------------------------------------------------------------------------
  *
  * checksum.h
- *	  Checksum implementation for data pages.
+ *	  Checksum implementation for data pages and SLRU pages.
  *
  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
@@ -21,4 +21,13 @@
  */
 extern uint16 pg_checksum_page(char *page, BlockNumber blkno);
 
+extern uint16 pg_checksum_slru_page(char *page);
+
+extern uint16 pg_getchecksum_slru_page(char *page);
+
+extern void pg_setchecksum_slru_page(char *page);
+
+/* Size of an SLRU page checksum in bytes: 2, i.e. sizeof(uint16) */
+#define CHKSUMSZ 2
+
 #endif							/* CHECKSUM_H */
diff --git a/src/include/storage/checksum_impl.h b/src/include/storage/checksum_impl.h
index bffd061de8..edc9d7e1dc 100644
--- a/src/include/storage/checksum_impl.h
+++ b/src/include/storage/checksum_impl.h
@@ -101,6 +101,7 @@
  */
 
 #include "storage/bufpage.h"
+#include "storage/checksum.h"
 
 /* number of checksums to calculate in parallel */
 #define N_SUMS 32
@@ -205,3 +206,48 @@ pg_checksum_page(char *page, BlockNumber blkno)
 	 */
 	return (checksum % 65535) + 1;
 }
+
+
+#define SLRU_CHECKSUM_UINT16_OFFSET (BLCKSZ / sizeof(uint16) - 1)
+/*
+ * Compute the checksum of a Postgres SLRU page.  The page must be aligned on
+ * a 4-byte boundary.
+ *
+ * The checksum itself is stored in the last CHKSUMSZ (2) bytes of the page;
+ * that slot is treated as zero while the checksum is being calculated.
+ */
+uint16
+pg_checksum_slru_page(char *page)
+{
+	uint16	   *slot = ((uint16 *) page) + SLRU_CHECKSUM_UINT16_OFFSET;
+	uint16		stored = *slot;
+	uint32		result;
+
+	/* Blank the stored checksum so it does not feed into the calculation */
+	*slot = 0;
+	result = (pg_checksum_block(page, BLCKSZ) % 65535) + 1;
+	/* Put the stored value back so the caller's page ends up unchanged */
+	*slot = stored;
+
+	return result;
+}
+
+/*
+ * Fetch the checksum currently stored on a Postgres SLRU page, i.e. the
+ * uint16 at index SLRU_CHECKSUM_UINT16_OFFSET.  Nothing is verified here.
+ */
+uint16
+pg_getchecksum_slru_page(char *page)
+{
+	return ((uint16 *) page)[SLRU_CHECKSUM_UINT16_OFFSET];
+}
+
+/*
+ * Compute the checksum of a Postgres SLRU page and store it in the page's
+ * checksum slot (the uint16 at index SLRU_CHECKSUM_UINT16_OFFSET).
+ */
+void
+pg_setchecksum_slru_page(char *page)
+{
+	((uint16 *) page)[SLRU_CHECKSUM_UINT16_OFFSET] = pg_checksum_slru_page(page);
+}
-- 
2.14.3 (Apple Git-98)

