From 4801ff2c3b1e7bc7076205b676d4e3bc4a4ed308 Mon Sep 17 00:00:00 2001
From: Bharath Rupireddy <bharath.rupireddyforpostgres@gmail.com>
Date: Thu, 10 Feb 2022 15:58:58 +0000
Subject: [PATCH v8] Replace ReadDir with ReadDirExtended

Replace ReadDir with ReadDirExtended (in CheckPointSnapBuild) and
get rid of lstat entirely. We still use ReadDir in CheckPointLogicalRewriteHeap
because unable to read directory would result a NULL from
ReadDirExtended and we may miss to fsync the remaining map files,
so here let's error out with ReadDir.

With this change, the checkpoint will only care about the snapshot
and mapping files and not fail if it finds other files in the
directories.

Removing lstat enables us to make things faster as we avoid extra
system calls.

Also, convert "could not parse filename" and "could not remove file"
errors to LOG messages in  CheckPointLogicalRewriteHeap. This will
enable checkpoint not to waste the amount of work that it had done.
---
 src/backend/access/heap/rewriteheap.c       | 24 +++++++++++++++------
 src/backend/replication/logical/snapbuild.c | 12 ++---------
 2 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/src/backend/access/heap/rewriteheap.c b/src/backend/access/heap/rewriteheap.c
index 2a53826736..035cd6db70 100644
--- a/src/backend/access/heap/rewriteheap.c
+++ b/src/backend/access/heap/rewriteheap.c
@@ -1213,7 +1213,6 @@ CheckPointLogicalRewriteHeap(void)
 	mappings_dir = AllocateDir("pg_logical/mappings");
 	while ((mapping_de = ReadDir(mappings_dir, "pg_logical/mappings")) != NULL)
 	{
-		struct stat statbuf;
 		Oid			dboid;
 		Oid			relid;
 		XLogRecPtr	lsn;
@@ -1227,26 +1226,39 @@ CheckPointLogicalRewriteHeap(void)
 			continue;
 
 		snprintf(path, sizeof(path), "pg_logical/mappings/%s", mapping_de->d_name);
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
-			continue;
 
 		/* Skip over files that cannot be ours. */
 		if (strncmp(mapping_de->d_name, "map-", 4) != 0)
 			continue;
 
+		/* We just log a message if a file doesn't fit the pattern. */
 		if (sscanf(mapping_de->d_name, LOGICAL_REWRITE_FORMAT,
 				   &dboid, &relid, &hi, &lo, &rewrite_xid, &create_xid) != 6)
-			elog(ERROR, "could not parse filename \"%s\"", mapping_de->d_name);
+		{
+			ereport(LOG,
+					(errmsg("could not parse file name \"%s\"", path)));
+			continue;
+		}
 
 		lsn = ((uint64) hi) << 32 | lo;
 
 		if (lsn < cutoff || cutoff == InvalidXLogRecPtr)
 		{
 			elog(DEBUG1, "removing logical rewrite file \"%s\"", path);
+
+			/*
+			 * It's not particularly harmful, though strange, if we can't
+			 * remove the file here. Don't prevent the checkpoint from
+			 * completing, that'd be a cure worse than the disease.
+			 */
 			if (unlink(path) < 0)
-				ereport(ERROR,
+			{
+				ereport(LOG,
 						(errcode_for_file_access(),
-						 errmsg("could not remove file \"%s\": %m", path)));
+						 errmsg("could not remove file \"%s\": %m",
+								path)));
+				continue;
+			}
 		}
 		else
 		{
diff --git a/src/backend/replication/logical/snapbuild.c b/src/backend/replication/logical/snapbuild.c
index 83fca8a77d..e2cdf17bee 100644
--- a/src/backend/replication/logical/snapbuild.c
+++ b/src/backend/replication/logical/snapbuild.c
@@ -1942,12 +1942,11 @@ CheckPointSnapBuild(void)
 		cutoff = redo;
 
 	snap_dir = AllocateDir("pg_logical/snapshots");
-	while ((snap_de = ReadDir(snap_dir, "pg_logical/snapshots")) != NULL)
+	while ((snap_de = ReadDirExtended(snap_dir, "pg_logical/snapshots", LOG)) != NULL)
 	{
 		uint32		hi;
 		uint32		lo;
 		XLogRecPtr	lsn;
-		struct stat statbuf;
 
 		if (strcmp(snap_de->d_name, ".") == 0 ||
 			strcmp(snap_de->d_name, "..") == 0)
@@ -1955,20 +1954,13 @@ CheckPointSnapBuild(void)
 
 		snprintf(path, sizeof(path), "pg_logical/snapshots/%s", snap_de->d_name);
 
-		if (lstat(path, &statbuf) == 0 && !S_ISREG(statbuf.st_mode))
-		{
-			elog(DEBUG1, "only regular files expected: %s", path);
-			continue;
-		}
-
 		/*
 		 * temporary filenames from SnapBuildSerialize() include the LSN and
 		 * everything but are postfixed by .$pid.tmp. We can just remove them
 		 * the same as other files because there can be none that are
 		 * currently being written that are older than cutoff.
 		 *
-		 * We just log a message if a file doesn't fit the pattern, it's
-		 * probably some editors lock/state file or similar...
+		 * We just log a message if a file doesn't fit the pattern.
 		 */
 		if (sscanf(snap_de->d_name, "%X-%X.snap", &hi, &lo) != 2)
 		{
-- 
2.25.1

