From 34b5786cec6e8081db005d1c3eb5190f5b6e207b Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Sun, 7 Aug 2022 01:08:15 +0000
Subject: [PATCH v1] Use pg_pwritev_with_retry() instead of write() in
 walmethods.c

Use pg_pwritev_with_retry() while prepadding a WAL segment
instead of write() in walmethods.c dir_open_for_write() to avoid
partial writes. As the pg_pwritev_with_retry() function uses
pwritev, we can avoid explicit lseek() to seek to the beginning
of the WAL segment.

These changes are inline with how core postgres initializes the
WAL segment in XLogFileInitInternal().
---
 src/backend/access/transam/xlog.c  |  35 +++-------
 src/backend/storage/file/fd.c      |  65 ------------------
 src/bin/pg_basebackup/walmethods.c |  22 ++----
 src/common/file_utils.c            | 104 +++++++++++++++++++++++++++++
 src/include/common/file_utils.h    |  11 +++
 src/include/storage/fd.h           |   6 --
 6 files changed, 129 insertions(+), 114 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 34f0150d1e..90473a7af4 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2916,7 +2916,6 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 					 bool *added, char *path)
 {
 	char		tmppath[MAXPGPATH];
-	PGAlignedXLogBlock zbuffer;
 	XLogSegNo	installed_segno;
 	XLogSegNo	max_segno;
 	int			fd;
@@ -2960,14 +2959,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 				(errcode_for_file_access(),
 				 errmsg("could not create file \"%s\": %m", tmppath)));
 
-	memset(zbuffer.data, 0, XLOG_BLCKSZ);
-
 	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
 	save_errno = 0;
 	if (wal_init_zero)
 	{
-		struct iovec iov[PG_IOV_MAX];
-		int			blocks;
+		ssize_t rc;
 
 		/*
 		 * Zero-fill the file.  With this setting, we do this the hard way to
@@ -2978,32 +2974,17 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
 		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
 		 * O_DSYNC will be sufficient to sync future writes to the log file.
 		 */
+		rc = pg_pwritev_with_retry_and_init(fd, wal_segment_size, XLOG_BLCKSZ);
 
-		/* Prepare to write out a lot of copies of our zero buffer at once. */
-		for (int i = 0; i < lengthof(iov); ++i)
-		{
-			iov[i].iov_base = zbuffer.data;
-			iov[i].iov_len = XLOG_BLCKSZ;
-		}
-
-		/* Loop, writing as many blocks as we can for each system call. */
-		blocks = wal_segment_size / XLOG_BLCKSZ;
-		for (int i = 0; i < blocks;)
-		{
-			int			iovcnt = Min(blocks - i, lengthof(iov));
-			off_t		offset = i * XLOG_BLCKSZ;
-
-			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
-			{
-				save_errno = errno;
-				break;
-			}
-
-			i += iovcnt;
-		}
+		if (rc < 0)
+			save_errno = errno;
 	}
 	else
 	{
+		PGAlignedXLogBlock zbuffer;
+
+		memset(zbuffer.data, 0, XLOG_BLCKSZ);
+
 		/*
 		 * Otherwise, seeking to the end and writing a solitary byte is
 		 * enough.
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index efb34d4dcb..ac611b836f 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -95,7 +95,6 @@
 #include "common/pg_prng.h"
 #include "miscadmin.h"
 #include "pgstat.h"
-#include "port/pg_iovec.h"
 #include "portability/mem.h"
 #include "postmaster/startup.h"
 #include "storage/fd.h"
@@ -3747,67 +3746,3 @@ data_sync_elevel(int elevel)
 {
 	return data_sync_retry ? elevel : PANIC;
 }
-
-/*
- * A convenience wrapper for pwritev() that retries on partial write.  If an
- * error is returned, it is unspecified how much has been written.
- */
-ssize_t
-pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
-{
-	struct iovec iov_copy[PG_IOV_MAX];
-	ssize_t		sum = 0;
-	ssize_t		part;
-
-	/* We'd better have space to make a copy, in case we need to retry. */
-	if (iovcnt > PG_IOV_MAX)
-	{
-		errno = EINVAL;
-		return -1;
-	}
-
-	for (;;)
-	{
-		/* Write as much as we can. */
-		part = pwritev(fd, iov, iovcnt, offset);
-		if (part < 0)
-			return -1;
-
-#ifdef SIMULATE_SHORT_WRITE
-		part = Min(part, 4096);
-#endif
-
-		/* Count our progress. */
-		sum += part;
-		offset += part;
-
-		/* Step over iovecs that are done. */
-		while (iovcnt > 0 && iov->iov_len <= part)
-		{
-			part -= iov->iov_len;
-			++iov;
-			--iovcnt;
-		}
-
-		/* Are they all done? */
-		if (iovcnt == 0)
-		{
-			/* We don't expect the kernel to write more than requested. */
-			Assert(part == 0);
-			break;
-		}
-
-		/*
-		 * Move whatever's left to the front of our mutable copy and adjust
-		 * the leading iovec.
-		 */
-		Assert(iovcnt > 0);
-		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
-		Assert(iov->iov_len > part);
-		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
-		iov_copy[0].iov_len -= part;
-		iov = iov_copy;
-	}
-
-	return sum;
-}
diff --git a/src/bin/pg_basebackup/walmethods.c b/src/bin/pg_basebackup/walmethods.c
index e90aa0ba37..8e70fc2d38 100644
--- a/src/bin/pg_basebackup/walmethods.c
+++ b/src/bin/pg_basebackup/walmethods.c
@@ -209,25 +209,15 @@ dir_open_for_write(const char *pathname, const char *temp_suffix, size_t pad_to_
 	/* Do pre-padding on non-compressed files */
 	if (pad_to_size && dir_data->compression_algorithm == PG_COMPRESSION_NONE)
 	{
-		PGAlignedXLogBlock zerobuf;
-		int			bytes;
+		ssize_t rc;
 
-		memset(zerobuf.data, 0, XLOG_BLCKSZ);
-		for (bytes = 0; bytes < pad_to_size; bytes += XLOG_BLCKSZ)
-		{
-			errno = 0;
-			if (write(fd, zerobuf.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
-			{
-				/* If write didn't set errno, assume problem is no disk space */
-				dir_data->lasterrno = errno ? errno : ENOSPC;
-				close(fd);
-				return NULL;
-			}
-		}
+		errno = 0;
+		rc = pg_pwritev_with_retry_and_init(fd, pad_to_size, XLOG_BLCKSZ);
 
-		if (lseek(fd, 0, SEEK_SET) != 0)
+		if (rc < 0)
 		{
-			dir_data->lasterrno = errno;
+			/* If errno isn't set, assume problem is no disk space */
+			dir_data->lasterrno = errno ? errno : ENOSPC;
 			close(fd);
 			return NULL;
 		}
diff --git a/src/common/file_utils.c b/src/common/file_utils.c
index df4d6d240c..bea8810e26 100644
--- a/src/common/file_utils.c
+++ b/src/common/file_utils.c
@@ -28,6 +28,7 @@
 #ifdef FRONTEND
 #include "common/logging.h"
 #endif
+#include "port/pg_iovec.h"
 
 #ifdef FRONTEND
 
@@ -460,3 +461,106 @@ get_dirent_type(const char *path,
 
 	return result;
 }
+
+/*
+ * A convenience wrapper for pwritev() that retries on partial write.  If an
+ * error is returned, it is unspecified how much has been written.
+ */
+ssize_t
+pg_pwritev_with_retry(int fd, const struct iovec *iov, int iovcnt, off_t offset)
+{
+	struct iovec iov_copy[PG_IOV_MAX];
+	ssize_t		sum = 0;
+	ssize_t		part;
+
+	/* We'd better have space to make a copy, in case we need to retry. */
+	if (iovcnt > PG_IOV_MAX)
+	{
+		errno = EINVAL;
+		return -1;
+	}
+
+	for (;;)
+	{
+		/* Write as much as we can. */
+		part = pwritev(fd, iov, iovcnt, offset);
+		if (part < 0)
+			return -1;
+
+#ifdef SIMULATE_SHORT_WRITE
+		part = Min(part, 4096);
+#endif
+
+		/* Count our progress. */
+		sum += part;
+		offset += part;
+
+		/* Step over iovecs that are done. */
+		while (iovcnt > 0 && iov->iov_len <= part)
+		{
+			part -= iov->iov_len;
+			++iov;
+			--iovcnt;
+		}
+
+		/* Are they all done? */
+		if (iovcnt == 0)
+		{
+			/* We don't expect the kernel to write more than requested. */
+			Assert(part == 0);
+			break;
+		}
+
+		/*
+		 * Move whatever's left to the front of our mutable copy and adjust
+		 * the leading iovec.
+		 */
+		Assert(iovcnt > 0);
+		memmove(iov_copy, iov, sizeof(*iov) * iovcnt);
+		Assert(iov->iov_len > part);
+		iov_copy[0].iov_base = (char *) iov_copy[0].iov_base + part;
+		iov_copy[0].iov_len -= part;
+		iov = iov_copy;
+	}
+
+	return sum;
+}
+
+/*
+ * A convenience wrapper for pg_pwritev_with_retry() that zero-fills the given
+ * file of size total_sz in batches of size block_sz.
+ */
+ssize_t
+pg_pwritev_with_retry_and_init(int fd, int total_sz, int block_sz)
+{
+	PGAlignedXLogBlock zbuffer;
+	struct iovec iov[PG_IOV_MAX];
+	int	blocks;
+	ssize_t rc;
+
+	memset(zbuffer.data, 0, block_sz);
+
+	/* Prepare to write out a lot of copies of our zero buffer at once. */
+	for (int i = 0; i < lengthof(iov); ++i)
+	{
+		iov[i].iov_base = zbuffer.data;
+		iov[i].iov_len = block_sz;
+	}
+
+	/* Loop, writing as many blocks as we can for each system call. */
+	blocks = total_sz / block_sz;
+	for (int i = 0; i < blocks;)
+	{
+		int			iovcnt = Min(blocks - i, lengthof(iov));
+		off_t		offset = i * block_sz;
+
+		rc = pg_pwritev_with_retry(fd, iov, iovcnt, offset);
+
+		if ( rc < 0)
+			break;
+
+		i += iovcnt;
+	}
+
+	return rc;
+}
diff --git a/src/include/common/file_utils.h b/src/include/common/file_utils.h
index 2811744c12..3c139afb95 100644
--- a/src/include/common/file_utils.h
+++ b/src/include/common/file_utils.h
@@ -24,6 +24,8 @@ typedef enum PGFileType
 	PGFILETYPE_LNK
 } PGFileType;
 
+struct iovec;					/* avoid including port/pg_iovec.h here */
+
 #ifdef FRONTEND
 extern int	fsync_fname(const char *fname, bool isdir);
 extern void fsync_pgdata(const char *pg_data, int serverVersion);
@@ -37,4 +39,13 @@ extern PGFileType get_dirent_type(const char *path,
 								  bool look_through_symlinks,
 								  int elevel);
 
+extern ssize_t pg_pwritev_with_retry(int fd,
+									 const struct iovec *iov,
+									 int iovcnt,
+									 off_t offset);
+
+extern ssize_t pg_pwritev_with_retry_and_init(int fd,
+											  int total_sz,
+											  int block_sz);
+
 #endif							/* FILE_UTILS_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 2b4a8e0ffe..224bdfb9a6 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -51,8 +51,6 @@ typedef enum RecoveryInitSyncMethod
 	RECOVERY_INIT_SYNC_METHOD_SYNCFS
 }			RecoveryInitSyncMethod;
 
-struct iovec;					/* avoid including port/pg_iovec.h here */
-
 typedef int File;
 
 
@@ -178,10 +176,6 @@ extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
 extern void pg_flush_data(int fd, off_t offset, off_t amount);
-extern ssize_t pg_pwritev_with_retry(int fd,
-									 const struct iovec *iov,
-									 int iovcnt,
-									 off_t offset);
 extern int	pg_truncate(const char *path, off_t length);
 extern void fsync_fname(const char *fname, bool isdir);
 extern int	fsync_fname_ext(const char *fname, bool isdir, bool ignore_perm, int elevel);
-- 
2.34.1

