From a104f1eb24cbe94b1349f42f5237c7748eca8a4b Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Mon, 9 Mar 2020 15:23:24 -0400
Subject: [PATCH v10 3/4] pg_validatebackup: Validate a backup against the
 backup manifest.

Patch by me; review by Tushar Ahuja and Rajkumar Raghuwanshi, and also
off-list by Mark Dilger, Davinder Singh, and Jeevan Chalke.
---
 src/bin/Makefile                              |   1 +
 src/bin/pg_validatebackup/.gitignore          |   1 +
 src/bin/pg_validatebackup/Makefile            |  33 +
 src/bin/pg_validatebackup/parse_manifest.c    | 546 +++++++++++++
 src/bin/pg_validatebackup/parse_manifest.h    |  40 +
 src/bin/pg_validatebackup/pg_validatebackup.c | 719 ++++++++++++++++++
 6 files changed, 1340 insertions(+)
 create mode 100644 src/bin/pg_validatebackup/.gitignore
 create mode 100644 src/bin/pg_validatebackup/Makefile
 create mode 100644 src/bin/pg_validatebackup/parse_manifest.c
 create mode 100644 src/bin/pg_validatebackup/parse_manifest.h
 create mode 100644 src/bin/pg_validatebackup/pg_validatebackup.c

diff --git a/src/bin/Makefile b/src/bin/Makefile
index 7f4120a34f..77bceea4fe 100644
--- a/src/bin/Makefile
+++ b/src/bin/Makefile
@@ -27,6 +27,7 @@ SUBDIRS = \
 	pg_test_fsync \
 	pg_test_timing \
 	pg_upgrade \
+	pg_validatebackup \
 	pg_waldump \
 	pgbench \
 	psql \
diff --git a/src/bin/pg_validatebackup/.gitignore b/src/bin/pg_validatebackup/.gitignore
new file mode 100644
index 0000000000..3ae1c1f03a
--- /dev/null
+++ b/src/bin/pg_validatebackup/.gitignore
@@ -0,0 +1 @@
+/pg_validatebackup
diff --git a/src/bin/pg_validatebackup/Makefile b/src/bin/pg_validatebackup/Makefile
new file mode 100644
index 0000000000..dde7eb3c02
--- /dev/null
+++ b/src/bin/pg_validatebackup/Makefile
@@ -0,0 +1,33 @@
+# src/bin/pg_validatebackup/Makefile
+
+PGFILEDESC = "pg_validatebackup - validate a backup against a backup manifest"
+PGAPPICON = win32
+
+subdir = src/bin/pg_validatebackup
+top_builddir = ../../..
+include $(top_builddir)/src/Makefile.global
+
+# We need libpq only because fe_utils does.
+LDFLAGS_INTERNAL += -L$(top_builddir)/src/fe_utils -lpgfeutils $(libpq_pgport)
+
+OBJS = \
+	$(WIN32RES) \
+	parse_manifest.o \
+	pg_validatebackup.o
+
+all: pg_validatebackup
+
+pg_validatebackup: $(OBJS) | submake-libpq submake-libpgport submake-libpgfeutils
+	$(CC) $(CFLAGS) $^ $(LDFLAGS) $(LDFLAGS_EX) $(LIBS) -o $@$(X)
+
+install: all installdirs
+	$(INSTALL_PROGRAM) pg_validatebackup$(X) '$(DESTDIR)$(bindir)/pg_validatebackup$(X)'
+
+installdirs:
+	$(MKDIR_P) '$(DESTDIR)$(bindir)'
+
+uninstall:
+	rm -f '$(DESTDIR)$(bindir)/pg_validatebackup$(X)'
+
+clean distclean maintainer-clean:
+	rm -f pg_validatebackup$(X) $(OBJS)
diff --git a/src/bin/pg_validatebackup/parse_manifest.c b/src/bin/pg_validatebackup/parse_manifest.c
new file mode 100644
index 0000000000..d8680f96e1
--- /dev/null
+++ b/src/bin/pg_validatebackup/parse_manifest.c
@@ -0,0 +1,546 @@
+/*-------------------------------------------------------------------------
+ *
+ * parse_manifest.c
+ *	  Parse a backup manifest in JSON format.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_validatebackup/parse_manifest.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include "parse_manifest.h"
+#include "common/jsonapi.h"
+
+/*
+ * Semantic states for JSON manifest parsing; each names what must come next.
+ */
+typedef enum
+{
+	JM_EXPECT_TOPLEVEL_START,	/* opening of the manifest object */
+	JM_EXPECT_TOPLEVEL_END,		/* closing of the manifest object */
+	JM_EXPECT_VERSION_FIELD,	/* "PostgreSQL-Backup-Manifest-Version" key */
+	JM_EXPECT_VERSION_VALUE,	/* the version scalar */
+	JM_EXPECT_FILES_FIELD,		/* "Files" key */
+	JM_EXPECT_FILES_ARRAY_START,	/* opening of the "Files" array */
+	JM_EXPECT_FILES_ARRAY_NEXT, /* a file object, or the end of the array */
+	JM_EXPECT_THIS_FILE_FIELD,	/* a key of a file object, or its end */
+	JM_EXPECT_THIS_FILE_VALUE,	/* scalar value for the key just seen */
+	JM_EXPECT_MANIFEST_CHECKSUM_FIELD,	/* "Manifest-Checksum" key */
+	JM_EXPECT_MANIFEST_CHECKSUM_VALUE,	/* the manifest checksum scalar */
+	JM_EXPECT_EOF				/* nothing further */
+} JsonManifestSemanticState;
+
+/*
+ * Possible fields for one file as described by the manifest.
+ */
+typedef enum
+{
+	JMFF_PATH,					/* "Path" */
+	JMFF_SIZE,					/* "Size" */
+	JMFF_LAST_MODIFIED,			/* "Last-Modified"; its value is discarded */
+	JMFF_CHECKSUM_ALGORITHM,	/* "Checksum-Algorithm" */
+	JMFF_CHECKSUM				/* "Checksum" */
+} JsonManifestFileField;
+
+/*
+ * Internal state used while decoding the JSON-format backup manifest.
+ */
+typedef struct
+{
+	JsonManifestParseContext *context;	/* caller's callbacks */
+	JsonManifestSemanticState state;	/* what the parser expects next */
+	JsonManifestFileField field;	/* file key whose value is pending */
+	char	   *pathname;		/* "Path" of the current file, or NULL */
+	char	   *size;			/* "Size" of the current file, still as text */
+	char	   *algorithm;		/* "Checksum-Algorithm" text, or NULL */
+	pg_checksum_type checksum_algorithm;	/* unused in parse_manifest.c */
+	char	   *checksum;		/* "Checksum" hex text, or NULL */
+	char	   *manifest_checksum;	/* "Manifest-Checksum" hex text */
+} JsonManifestParseState;
+
+static void json_manifest_object_start(void *state);
+static void json_manifest_object_end(void *state);
+static void json_manifest_array_start(void *state);
+static void json_manifest_array_end(void *state);
+static void json_manifest_object_field_start(void *state, char *fname,
+											 bool isnull);
+static void json_manifest_scalar(void *state, char *token,
+								 JsonTokenType tokentype);
+static void json_manifest_finalize_file(JsonManifestParseState *parse);
+static void verify_manifest_checksum(JsonManifestParseState *parse,
+									 char *buffer, size_t size);
+static void json_manifest_parse_failure(JsonManifestParseContext *context,
+										char *msg);
+
+static int	hexdecode_char(char c);
+static bool hexdecode_string(uint8 *result, char *input, int nbytes);
+
+/*
+ * Main entrypoint to parse a JSON-format backup manifest.
+ *
+ * 'buffer' holds the 'size' bytes of the manifest.  context->perfile_cb is
+ * invoked for each file the manifest describes; context->error_cb is
+ * invoked in case of trouble and is expected not to return.
+ */
+void
+json_parse_manifest(JsonManifestParseContext *context, char *buffer,
+					size_t size)
+{
+	JsonLexContext *lex;
+	JsonParseErrorType json_error;
+	JsonSemAction sem;
+	JsonManifestParseState parse;
+
+	/* Zero everything so the string pointers start out NULL, then set up. */
+	memset(&parse, 0, sizeof(parse));
+	parse.state = JM_EXPECT_TOPLEVEL_START;
+	parse.context = context;
+
+	/* Create a JSON lexing context. */
+	lex = makeJsonLexContextCstringLen(buffer, size, PG_UTF8, true);
+
+	/* Set up semantic actions. */
+	sem.semstate = &parse;
+	sem.object_start = json_manifest_object_start;
+	sem.object_end = json_manifest_object_end;
+	sem.array_start = json_manifest_array_start;
+	sem.array_end = json_manifest_array_end;
+	sem.object_field_start = json_manifest_object_field_start;
+	sem.object_field_end = NULL;
+	sem.array_element_start = NULL;
+	sem.array_element_end = NULL;
+	sem.scalar = json_manifest_scalar;
+
+	/* Run the actual JSON parser. */
+	json_error = pg_parse_json(lex, &sem);
+	if (json_error != JSON_SUCCESS)
+		json_manifest_parse_failure(context, json_errdetail(json_error, lex));
+	if (parse.state != JM_EXPECT_EOF)
+		json_manifest_parse_failure(context, "manifest ended unexpectedly");
+
+	/* Validate the checksum, which covers all but the manifest's last line. */
+	verify_manifest_checksum(&parse, buffer, size);
+}
+
+/*
+ * Invoked at the start of each object in the JSON document.
+ *
+ * Only two kinds of object are legal: the toplevel one (with the keys
+ * PostgreSQL-Backup-Manifest-Version, Files, Manifest-Checksum) and the one
+ * describing a particular file (Path, Size, etc.).  For the latter, forget
+ * every per-file field so that a missing key is reliably seen as NULL.
+ */
+static void
+json_manifest_object_start(void *state)
+{
+	JsonManifestParseState *parse = state;
+
+	switch (parse->state)
+	{
+		case JM_EXPECT_TOPLEVEL_START:
+			parse->state = JM_EXPECT_VERSION_FIELD;
+			break;
+		case JM_EXPECT_FILES_ARRAY_NEXT:
+			parse->state = JM_EXPECT_THIS_FILE_FIELD;
+			parse->pathname = NULL;
+			parse->size = NULL;
+			parse->algorithm = NULL;
+			parse->checksum = NULL;
+			break;
+		default:
+			json_manifest_parse_failure(parse->context,
+										"unexpected object start");
+			break;
+	}
+}
+
+/*
+ * Callback for the closing brace of any JSON object.
+ *
+ * Only the two kinds of object accepted by json_manifest_object_start can
+ * be closed here.  Closing the toplevel object needs no extra work, but
+ * closing a file's object means all of its fields have been seen, so
+ * json_manifest_finalize_file() is called to process them.
+ */
+static void
+json_manifest_object_end(void *state)
+{
+	JsonManifestParseState *ps = (JsonManifestParseState *) state;
+
+	if (ps->state == JM_EXPECT_TOPLEVEL_END)
+	{
+		/* The manifest object is closed; only end of input may follow. */
+		ps->state = JM_EXPECT_EOF;
+	}
+	else if (ps->state == JM_EXPECT_THIS_FILE_FIELD)
+	{
+		/* This file's description is complete. */
+		json_manifest_finalize_file(ps);
+		ps->state = JM_EXPECT_FILES_ARRAY_NEXT;
+	}
+	else
+		json_manifest_parse_failure(ps->context,
+									"unexpected object end");
+}
+
+/*
+ * Callback for the opening bracket of any JSON array.
+ *
+ * The manifest contains exactly one array, the value of the toplevel
+ * "Files" key; an array anywhere else is rejected.
+ */
+static void
+json_manifest_array_start(void *state)
+{
+	JsonManifestParseState *ps = (JsonManifestParseState *) state;
+
+	if (ps->state != JM_EXPECT_FILES_ARRAY_START)
+	{
+		/* Not the "Files" array, so not something we understand. */
+		json_manifest_parse_failure(ps->context,
+									"unexpected array start");
+		return;
+	}
+
+	/* From here on we expect file objects, or the end of the array. */
+	ps->state = JM_EXPECT_FILES_ARRAY_NEXT;
+}
+
+/*
+ * Callback for the closing bracket of any JSON array.
+ *
+ * As with json_manifest_array_start, the only array that can legitimately
+ * end is the list of files.
+ */
+static void
+json_manifest_array_end(void *state)
+{
+	JsonManifestParseState *ps = (JsonManifestParseState *) state;
+
+	if (ps->state != JM_EXPECT_FILES_ARRAY_NEXT)
+	{
+		/* Closing bracket in the wrong place. */
+		json_manifest_parse_failure(ps->context,
+									"unexpected array end");
+		return;
+	}
+
+	/* The file list is done; the manifest checksum key must follow. */
+	ps->state = JM_EXPECT_MANIFEST_CHECKSUM_FIELD;
+}
+
+/*
+ * Callback for each key of any JSON object; 'fname' is the key's name.
+ *
+ * Toplevel keys must arrive in a fixed order, which the parse state
+ * enforces.  Keys of a file's object may come in any order, so we only
+ * note which one this is and let json_manifest_scalar store its value.
+ */
+static void
+json_manifest_object_field_start(void *state, char *fname, bool isnull)
+{
+	JsonManifestParseState *ps = (JsonManifestParseState *) state;
+	JsonManifestParseContext *cxt = ps->context;
+
+	if (ps->state == JM_EXPECT_VERSION_FIELD)
+	{
+		/* First key of the toplevel object must be the version indicator. */
+		if (strcmp(fname, "PostgreSQL-Backup-Manifest-Version") != 0)
+			json_manifest_parse_failure(cxt, "expected version indicator");
+		ps->state = JM_EXPECT_VERSION_VALUE;
+	}
+	else if (ps->state == JM_EXPECT_FILES_FIELD)
+	{
+		/* Second key of the toplevel object must be the file list. */
+		if (strcmp(fname, "Files") != 0)
+			json_manifest_parse_failure(cxt, "expected file list");
+		ps->state = JM_EXPECT_FILES_ARRAY_START;
+	}
+	else if (ps->state == JM_EXPECT_THIS_FILE_FIELD)
+	{
+		/* A key of one file's object; remember which one it is. */
+		if (strcmp(fname, "Path") == 0)
+			ps->field = JMFF_PATH;
+		else if (strcmp(fname, "Size") == 0)
+			ps->field = JMFF_SIZE;
+		else if (strcmp(fname, "Last-Modified") == 0)
+			ps->field = JMFF_LAST_MODIFIED;
+		else if (strcmp(fname, "Checksum-Algorithm") == 0)
+			ps->field = JMFF_CHECKSUM_ALGORITHM;
+		else if (strcmp(fname, "Checksum") == 0)
+			ps->field = JMFF_CHECKSUM;
+		else
+			json_manifest_parse_failure(cxt, "unexpected file field");
+		ps->state = JM_EXPECT_THIS_FILE_VALUE;
+	}
+	else if (ps->state == JM_EXPECT_MANIFEST_CHECKSUM_FIELD)
+	{
+		/* Last key of the toplevel object must be the manifest checksum. */
+		if (strcmp(fname, "Manifest-Checksum") != 0)
+			json_manifest_parse_failure(cxt, "expected manifest checksum");
+		ps->state = JM_EXPECT_MANIFEST_CHECKSUM_VALUE;
+	}
+	else
+		json_manifest_parse_failure(cxt, "unexpected object field");
+}
+
+/*
+ * Callback for each scalar value in the JSON document.
+ *
+ * Keys are not scalars for this purpose; json_manifest_object_field_start
+ * sees those.  For a value inside a file's object, that function has
+ * recorded in the parse state's 'field' member which key it belongs to.
+ * For a value in the toplevel object, the state alone identifies the key.
+ *
+ * The manifest version is checked right away.  Everything else is merely
+ * stashed in the parse state (the unused Last-Modified value is freed
+ * instead); the stashed strings are acted upon only once the enclosing
+ * file object, or the whole manifest, has been read.
+ */
+static void
+json_manifest_scalar(void *state, char *token, JsonTokenType tokentype)
+{
+	JsonManifestParseState *ps = (JsonManifestParseState *) state;
+
+	if (ps->state == JM_EXPECT_VERSION_VALUE)
+	{
+		/* "1" is the only version text accepted here. */
+		if (strcmp(token, "1") != 0)
+			json_manifest_parse_failure(ps->context,
+										"unexpected manifest version");
+		ps->state = JM_EXPECT_FILES_FIELD;
+	}
+	else if (ps->state == JM_EXPECT_THIS_FILE_VALUE)
+	{
+		/* Route the value to the slot chosen when its key was seen. */
+		switch (ps->field)
+		{
+			case JMFF_PATH:
+				ps->pathname = token;
+				break;
+			case JMFF_SIZE:
+				ps->size = token;
+				break;
+			case JMFF_CHECKSUM_ALGORITHM:
+				ps->algorithm = token;
+				break;
+			case JMFF_CHECKSUM:
+				ps->checksum = token;
+				break;
+			case JMFF_LAST_MODIFIED:
+				pfree(token);	/* unused */
+				break;
+		}
+		ps->state = JM_EXPECT_THIS_FILE_FIELD;
+	}
+	else if (ps->state == JM_EXPECT_MANIFEST_CHECKSUM_VALUE)
+	{
+		/* Keep the checksum text; it is verified after parsing ends. */
+		ps->manifest_checksum = token;
+		ps->state = JM_EXPECT_TOPLEVEL_END;
+	}
+	else
+		json_manifest_parse_failure(ps->context, "unexpected scalar");
+}
+
+/*
+ * Called when the JSON object for one file is complete.  Checks and decodes
+ * the strings collected in 'parse', passes the result to the per-file
+ * callback (which receives the pathname and decoded checksum pointers), and
+ * frees the strings that are no longer needed.
+ */
+static void
+json_manifest_finalize_file(JsonManifestParseState *parse)
+{
+	JsonManifestParseContext *context = parse->context;
+	size_t		size;
+	char	   *ep;
+	int			checksum_string_length;
+	pg_checksum_type checksum_type;
+	int			checksum_length;
+	uint8	   *checksum_payload;
+
+	/* Pathname and size are required; a checksum needs an algorithm. */
+	if (parse->pathname == NULL)
+		json_manifest_parse_failure(parse->context, "missing pathname");
+	if (parse->size == NULL)
+		json_manifest_parse_failure(parse->context, "missing size");
+	if (parse->algorithm == NULL && parse->checksum != NULL)
+		json_manifest_parse_failure(parse->context,
+									"checksum without algorithm");
+
+	/* Parse size; an empty string or trailing junk is not an integer. */
+	size = strtoul(parse->size, &ep, 10);
+	if (ep == parse->size || *ep)
+		json_manifest_parse_failure(parse->context,
+									"file size is not an integer");
+
+	/* Parse the checksum algorithm, if it's present. */
+	if (parse->algorithm == NULL)
+		checksum_type = CHECKSUM_TYPE_NONE;
+	else if (!pg_checksum_parse_type(parse->algorithm, &checksum_type))
+		context->error_cb(context, "unrecognized checksum algorithm: \"%s\"",
+						  parse->algorithm);
+
+	/* Decode the hex checksum, if present; two hex digits make one byte. */
+	checksum_string_length = parse->checksum == NULL ? 0
+		: strlen(parse->checksum);
+	if (checksum_string_length == 0)
+	{
+		checksum_length = 0;
+		checksum_payload = NULL;
+	}
+	else
+	{
+		checksum_length = checksum_string_length / 2;
+		checksum_payload = palloc(checksum_length);
+		if (checksum_string_length % 2 != 0 ||
+			!hexdecode_string(checksum_payload, parse->checksum,
+							  checksum_length))
+			context->error_cb(context,
+							  "invalid checksum for file \"%s\": \"%s\"",
+							  parse->pathname, parse->checksum);
+	}
+
+	/* Invoke the callback with the details we've gathered. */
+	context->perfile_cb(context, parse->pathname, size,
+						checksum_type, checksum_length, checksum_payload);
+
+	/* Free the text forms; pathname and payload were handed to the callback. */
+	if (parse->size != NULL)
+	{
+		pfree(parse->size);
+		parse->size = NULL;
+	}
+	if (parse->algorithm != NULL)
+	{
+		pfree(parse->algorithm);
+		parse->algorithm = NULL;
+	}
+	if (parse->checksum != NULL)
+	{
+		pfree(parse->checksum);
+		parse->checksum = NULL;
+	}
+}
+
+/*
+ * Check the manifest's own checksum.
+ *
+ * The final line of the manifest carries a SHA-256 checksum of everything
+ * that precedes it, so that line itself is left out when we recompute the
+ * checksum here.
+ */
+static void
+verify_manifest_checksum(JsonManifestParseState *parse, char *buffer,
+						 size_t size)
+{
+	JsonManifestParseContext *cxt = parse->context;
+	size_t		pos;
+	size_t		nlines = 0;
+	size_t		last_nl = 0;
+	size_t		prev_nl = 0;
+	pg_sha256_ctx sha;
+	uint8		computed[PG_SHA256_DIGEST_LENGTH];
+	uint8		claimed[PG_SHA256_DIGEST_LENGTH];
+
+	/* Scan the buffer, tracking the positions of the final two newlines. */
+	for (pos = 0; pos < size; ++pos)
+	{
+		if (buffer[pos] != '\n')
+			continue;
+
+		++nlines;
+		prev_nl = last_nl;
+		last_nl = pos;
+	}
+
+	/*
+	 * The checksum computation below relies on the manifest having at
+	 * least two lines, the last of which ends exactly at the end of the
+	 * buffer.  Complain if either of those things is not so.
+	 */
+	if (nlines < 2)
+		json_manifest_parse_failure(cxt,
+									"expected at least 2 lines");
+	if (last_nl != size - 1)
+		json_manifest_parse_failure(cxt,
+									"last line not newline-terminated");
+
+	/* Hash everything up to and including the next-to-last newline. */
+	pg_sha256_init(&sha);
+	pg_sha256_update(&sha, (uint8 *) buffer, prev_nl + 1);
+	pg_sha256_final(&sha, computed);
+
+	/* Decode the recorded checksum and compare it with what we computed. */
+	if (parse->manifest_checksum == NULL)
+		cxt->error_cb(cxt, "manifest has no checksum");
+	if (strlen(parse->manifest_checksum) != PG_SHA256_DIGEST_LENGTH * 2 ||
+		!hexdecode_string(claimed, parse->manifest_checksum,
+						  PG_SHA256_DIGEST_LENGTH))
+		cxt->error_cb(cxt, "invalid manifest checksum: \"%s\"",
+					  parse->manifest_checksum);
+	if (memcmp(computed, claimed,
+			   PG_SHA256_DIGEST_LENGTH) != 0)
+		cxt->error_cb(cxt, "manifest checksum mismatch");
+}
+
+/*
+ * Report a low-level problem with the manifest's structure.
+ *
+ * 'msg' is a brief hint at what went wrong.  Such failures are not expected
+ * unless the manifest was tampered with or the server produced a bad one.
+ */
+static void
+json_manifest_parse_failure(JsonManifestParseContext *context, char *msg)
+{
+	json_manifest_error_callback report = context->error_cb;
+
+	report(context, "could not parse backup manifest: %s", msg);
+}
+
+/*
+ * Map one hexadecimal digit, in either case, to its numeric value; anything
+ * that is not a hexadecimal digit yields -1.
+ */
+static int
+hexdecode_char(char c)
+{
+	int			value = -1;
+
+	if (c >= '0' && c <= '9')
+		value = c - '0';
+	else if (c >= 'a' && c <= 'f')
+		value = 10 + (c - 'a');
+	else if (c >= 'A' && c <= 'F')
+		value = 10 + (c - 'A');
+	return value;
+}
+
+/*
+ * Convert 2 * nbytes hex digits from 'input' into nbytes bytes at 'result'.
+ *
+ * Stops and returns false at the first non-hex character; otherwise true.
+ */
+static bool
+hexdecode_string(uint8 *result, char *input, int nbytes)
+{
+	int			left;
+
+	for (left = nbytes; left > 0; --left)
+	{
+		int			hi = hexdecode_char(*input++);
+		int			lo = hexdecode_char(*input++);
+
+		if (hi < 0 || lo < 0)
+			return false;
+		*result++ = hi * 16 + lo;
+	}
+
+	return true;
+}
diff --git a/src/bin/pg_validatebackup/parse_manifest.h b/src/bin/pg_validatebackup/parse_manifest.h
new file mode 100644
index 0000000000..b0b18a57ca
--- /dev/null
+++ b/src/bin/pg_validatebackup/parse_manifest.h
@@ -0,0 +1,40 @@
+/*-------------------------------------------------------------------------
+ *
+ * parse_manifest.h
+ *	  Parse a backup manifest in JSON format.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_validatebackup/parse_manifest.h
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#ifndef PARSE_MANIFEST_H
+#define PARSE_MANIFEST_H
+
+#include "common/checksum_helper.h"
+#include "mb/pg_wchar.h"
+
+struct JsonManifestParseContext;
+typedef struct JsonManifestParseContext JsonManifestParseContext;
+
+typedef void (*json_manifest_perfile_callback)(JsonManifestParseContext *,
+								 char *pathname,	/* value of the "Path" key */
+								 size_t size, pg_checksum_type checksum_type,
+								 int checksum_length, uint8 *checksum_payload);
+typedef void (*json_manifest_error_callback)(JsonManifestParseContext *,
+								 char *fmt, ...);	/* expected not to return */
+
+struct JsonManifestParseContext
+{
+	void	   *private_data;	/* for the callbacks; parse_manifest.c ignores it */
+	json_manifest_perfile_callback perfile_cb;	/* called once per file */
+	json_manifest_error_callback error_cb;	/* called on any parse problem */
+};
+
+extern void json_parse_manifest(JsonManifestParseContext *context,
+								char *buffer, size_t size);
+
+#endif
diff --git a/src/bin/pg_validatebackup/pg_validatebackup.c b/src/bin/pg_validatebackup/pg_validatebackup.c
new file mode 100644
index 0000000000..e75e74c8ad
--- /dev/null
+++ b/src/bin/pg_validatebackup/pg_validatebackup.c
@@ -0,0 +1,719 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_validatebackup.c
+ *	  Validate a backup against a backup manifest.
+ *
+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/bin/pg_validatebackup/pg_validatebackup.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres_fe.h"
+
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/stat.h>
+
+#include "common/hashfn.h"
+#include "common/logging.h"
+#include "fe_utils/simple_list.h"
+#include "getopt_long.h"
+#include "parse_manifest.h"
+
+/*
+ * For efficiency, we'd like our hash table containing information about the
+ * manifest to start out with approximately the correct number of entries.
+ * There's no way to know the exact number of entries without reading the whole
+ * file, but we can get an estimate by dividing the file size by the estimated
+ * number of bytes per line.
+ *
+ * This could be off by about a factor of two in either direction, because the
+ * checksum algorithm has a big impact on the line lengths; e.g. a SHA512
+ * checksum is 128 hex digits, whereas a CRC-32C value is only 8, and there
+ * might be no checksum at all.
+ */
+#define ESTIMATED_BYTES_PER_MANIFEST_LINE	100
+
+/*
+ * How many bytes should we try to read from a file at once?
+ */
+#define READ_CHUNK_SIZE				4096
+
+/*
+ * Information about each file described by the manifest file is parsed to
+ * produce an object like this.
+ */
+typedef struct manifestfile
+{
+	uint32		status;			/* hash status */
+	char	   *pathname;		/* hash key; path as given in the manifest */
+	size_t		size;			/* size according to the manifest */
+	pg_checksum_type checksum_type; /* CHECKSUM_TYPE_NONE if no algorithm */
+	int			checksum_length;	/* bytes in checksum_payload; may be 0 */
+	uint8	   *checksum_payload;	/* decoded checksum, or NULL if none */
+	bool		matched;		/* initially false; set for files found on disk */
+	bool		bad;			/* initially false */
+} manifestfile;
+
+/*
+ * Define a hash table, keyed by pathname and compared with strcmp(), in
+ * which to store information about the files in the backup manifest.
+ */
+static uint32 hash_string_pointer(char *s);
+#define SH_PREFIX		manifestfiles
+#define SH_ELEMENT_TYPE	manifestfile
+#define SH_KEY_TYPE		char *
+#define	SH_KEY			pathname
+#define SH_HASH_KEY(tb, key)	hash_string_pointer(key)
+#define SH_EQUAL(tb, a, b)		(strcmp(a, b) == 0)
+#define	SH_SCOPE		static inline
+#define SH_RAW_ALLOCATOR	pg_malloc0
+#define SH_DECLARE
+#define SH_DEFINE
+#include "lib/simplehash.h"
+
+/*
+ * All of the context information we need while checking a backup manifest.
+ */
+typedef struct validator_context
+{
+	manifestfiles_hash *ht;		/* files listed in the manifest */
+	char	   *backup_directory;	/* canonicalized path of the backup */
+	SimpleStringList ignore_list;	/* relative paths not to be checked */
+	bool		exit_on_error;	/* --exit-on-error was given */
+	bool		saw_any_error;	/* if set, the exit status is 1 */
+} validator_context;
+
+static manifestfiles_hash *parse_manifest_file(char *manifest_path);
+
+static void record_manifest_details_for_file(JsonManifestParseContext *context,
+											 char *pathname, size_t size,
+											 pg_checksum_type checksum_type,
+											 int checksum_length,
+											 uint8 *checksum_payload);
+static void report_manifest_error(JsonManifestParseContext *context,
+								  char *fmt, ...);
+
+static void validate_backup_directory(validator_context *context,
+									  char *relpath, char *fullpath);
+static void validate_backup_file(validator_context *context,
+								 char *relpath, char *fullpath);
+static void report_extra_backup_files(validator_context *context);
+static void validate_backup_checksums(validator_context *context);
+static void validate_file_checksum(validator_context *context,
+								   manifestfile *tabent, char *pathname);
+
+static void pg_validator_error(validator_context *context,
+							   const char *pg_restrict fmt,...)
+			pg_attribute_printf(2, 3);
+static void pg_validator_fatal(const char *pg_restrict fmt,...)
+			pg_attribute_printf(1, 2) pg_attribute_noreturn();
+static bool should_ignore_relpath(validator_context *context, char *relpath);
+
+static void usage(void);
+
+static const char *progname;
+
+/*
+ * Main entry point: parse options, read the manifest, check the backup.
+ */
+int
+main(int argc, char **argv)
+{
+	static struct option long_options[] = {
+		{"exit-on-error", no_argument, NULL, 'e'},
+		{"ignore", required_argument, NULL, 'i'},
+		{"manifest-path", required_argument, NULL, 'm'},
+		{"quiet", no_argument, NULL, 'q'},
+		{"skip-checksums", no_argument, NULL, 's'},
+		{NULL, 0, NULL, 0}
+	};
+
+	int			c;
+	validator_context context;
+	char	   *manifest_path = NULL;
+	bool		quiet = false;
+	bool		skip_checksums = false;
+
+	pg_logging_init(argv[0]);
+	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_validatebackup"));
+	progname = get_progname(argv[0]);
+
+	memset(&context, 0, sizeof(context));
+
+	if (argc > 1)
+	{
+		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
+		{
+			usage();
+			exit(0);
+		}
+		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
+		{
+			puts("pg_validatebackup (PostgreSQL) " PG_VERSION);
+			exit(0);
+		}
+	}
+
+	/*
+	 * Skip certain files in the toplevel directory.
+	 *
+	 * Ignore the backup_manifest file, because it's not included in the
+	 * backup manifest.
+	 *
+	 * Ignore the pg_wal directory, because those files are not included in
+	 * the backup manifest either, since they are fetched separately from the
+	 * backup itself.
+	 *
+	 * Ignore postgresql.auto.conf, recovery.signal, and standby.signal,
+	 * because we expect that those files may sometimes be created or changed
+	 * as part of the backup process. For example, pg_basebackup -R will
+	 * modify postgresql.auto.conf and create standby.signal.
+	 */
+	simple_string_list_append(&context.ignore_list, "backup_manifest");
+	simple_string_list_append(&context.ignore_list, "pg_wal");
+	simple_string_list_append(&context.ignore_list, "postgresql.auto.conf");
+	simple_string_list_append(&context.ignore_list, "recovery.signal");
+	simple_string_list_append(&context.ignore_list, "standby.signal");
+
+	while ((c = getopt_long(argc, argv, "ei:m:qs", long_options, NULL)) != -1)
+	{
+		switch (c)
+		{
+			case 'e':
+				context.exit_on_error = true;
+				break;
+			case 'i':
+				{
+					char	   *arg = pstrdup(optarg);
+
+					canonicalize_path(arg);
+					simple_string_list_append(&context.ignore_list, arg);
+					break;
+				}
+			case 'm':
+				manifest_path = pstrdup(optarg);
+				canonicalize_path(manifest_path);
+				break;
+			case 'q':
+				quiet = true;
+				break;
+			case 's':
+				skip_checksums = true;
+				break;
+			default:
+				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+						progname);
+				exit(1);
+		}
+	}
+
+	/* The first non-option argument names the backup directory. */
+	if (optind >= argc)
+	{
+		pg_log_fatal("no backup directory specified");
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+	context.backup_directory = pstrdup(argv[optind++]);
+	canonicalize_path(context.backup_directory);
+
+	/* Complain if any arguments remain */
+	if (optind < argc)
+	{
+		pg_log_fatal("too many command-line arguments (first is \"%s\")",
+					 argv[optind]);
+		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
+				progname);
+		exit(1);
+	}
+
+	/* By default, look for the manifest in the backup directory. */
+	if (manifest_path == NULL)
+		manifest_path = psprintf("%s/backup_manifest",
+								 context.backup_directory);
+
+	/*
+	 * Try to read the manifest. We treat any errors encountered while parsing
+	 * the manifest as fatal; there doesn't seem to be much point in trying to
+	 * validate the backup directory against a corrupted manifest.
+	 */
+	context.ht = parse_manifest_file(manifest_path);
+
+	/*
+	 * Now scan the files in the backup directory. At this stage, we verify
+	 * that every file on disk is present in the manifest and that the sizes
+	 * match. We also set the "matched" flag on every manifest entry that
+	 * corresponds to a file on disk.
+	 */
+	validate_backup_directory(&context, NULL, context.backup_directory);
+
+	/*
+	 * The "matched" flag should now be set on every entry in the hash table.
+	 * Any entries for which the bit is not set are files mentioned in the
+	 * manifest that don't exist on disk.
+	 */
+	report_extra_backup_files(&context);
+
+	/*
+	 * Finally, do the expensive work of verifying file checksums, unless we
+	 * were told to skip it.
+	 */
+	if (!skip_checksums)
+		validate_backup_checksums(&context);
+
+	/*
+	 * If everything looks OK, tell the user this, unless we were asked to
+	 * work quietly.
+	 */
+	if (!context.saw_any_error && !quiet)
+		pg_log_info("backup successfully verified");
+
+	exit(context.saw_any_error ? 1 : 0);
+}
+
+/*
+ * Read the manifest at 'manifest_path' and return a hash table holding an
+ * entry for every file it describes.  Any problem opening, reading, or
+ * parsing the manifest is reported as a fatal error.
+ */
+static manifestfiles_hash *
+parse_manifest_file(char *manifest_path)
+{
+	JsonManifestParseContext	callbacks;
+	manifestfiles_hash *table;
+	struct stat st;
+	off_t		nentries_guess;
+	uint32		table_size;
+	char	   *contents;
+	int			manifest_fd;
+	int			nread;
+
+	/* Open the manifest. */
+	manifest_fd = open(manifest_path, O_RDONLY | PG_BINARY, 0);
+	if (manifest_fd < 0)
+		pg_validator_fatal("could not open file \"%s\": %m", manifest_path);
+
+	/* Its size tells us how much to read and how big a table to make. */
+	if (fstat(manifest_fd, &st) != 0)
+		pg_validator_fatal("could not stat file \"%s\": %m", manifest_path);
+
+	/*
+	 * Size the hash table from the manifest's length, never going below 256
+	 * entries nor above what fits in a uint32.
+	 */
+	nentries_guess = st.st_size / ESTIMATED_BYTES_PER_MANIFEST_LINE;
+	table_size = Min(PG_UINT32_MAX, Max(nentries_guess, 256));
+	table = manifestfiles_create(table_size, NULL);
+
+	/*
+	 * pg_parse_json() cannot currently be fed its input piecemeal in any
+	 * easy way, so read the entire manifest into memory, undesirable though
+	 * that is.
+	 */
+	contents = pg_malloc(st.st_size);
+	nread = read(manifest_fd, contents, st.st_size);
+	if (nread < 0)
+		pg_validator_fatal("could not read file \"%s\": %m",
+						   manifest_path);
+	/* A short read is just as fatal as a failed one. */
+	if (nread != st.st_size)
+		pg_validator_fatal("could not read file \"%s\": read %d of %zu",
+						   manifest_path, nread, (size_t) st.st_size);
+
+	/* Everything is in memory now, so the descriptor can go. */
+	close(manifest_fd);
+
+	/* Let the JSON parser fill the table via our per-file callback. */
+	callbacks.private_data = table;
+	callbacks.perfile_cb = record_manifest_details_for_file;
+	callbacks.error_cb = report_manifest_error;
+	json_parse_manifest(&callbacks, contents, st.st_size);
+
+	/* The raw manifest text is no longer needed. */
+	pfree(contents);
+
+	/* Hand back the table that the callback populated. */
+	return table;
+}
+
+static void
+report_manifest_error(JsonManifestParseContext *context, char *fmt, ...)
+{
+	va_list		ap;			/* needs a v-style logger; "..." can't take it */
+	va_start(ap, fmt);
+	pg_log_generic_v(PG_LOG_FATAL, _(fmt), ap);
+	va_end(ap);
+	exit(1);				/* the parser expects error_cb not to return */
+}
+
+static void
+record_manifest_details_for_file(JsonManifestParseContext *context,
+								 char *pathname, size_t size,
+								 pg_checksum_type checksum_type,
+								 int checksum_length, uint8 *checksum_payload)
+{
+	manifestfiles_hash *table = context->private_data;
+	manifestfile *entry;
+	bool		already_present;
+
+	/* Per-file callback: each pathname may appear in the manifest once. */
+	entry = manifestfiles_insert(table, pathname, &already_present);
+	if (already_present)
+		pg_validator_fatal("duplicate pathname in backup manifest: \"%s\"",
+						   pathname);
+
+	/* Nothing has been compared with the backup directory yet. */
+	entry->matched = false;
+	entry->bad = false;
+	entry->checksum_payload = checksum_payload;
+	entry->checksum_length = checksum_length;
+	entry->checksum_type = checksum_type;
+	entry->size = size;
+}
+
+/*
+ * Validate one directory.
+ *
+ * 'relpath' is NULL if we are to validate the top-level backup directory,
+ * and otherwise the relative path to the directory that is to be validated.
+ *
+ * 'fullpath' is the backup directory with 'relpath' appended; i.e. the actual
+ * filesystem path at which it can be found.
+ *
+ * Every entry other than "." and ".." is handed to validate_backup_file(),
+ * unless its relative path is covered by the ignore list. Failure to open
+ * the top-level directory is fatal; all other problems are reported through
+ * pg_validator_error().
+ */
+static void
+validate_backup_directory(validator_context *context, char *relpath,
+						  char *fullpath)
+{
+	DIR		   *dir;
+	struct dirent *dirent;
+	int			readdir_errno;
+
+	dir = opendir(fullpath);
+	if (dir == NULL)
+	{
+		/*
+		 * If even the toplevel backup directory cannot be found, treat this
+		 * as a fatal error.
+		 */
+		if (relpath == NULL)
+			pg_validator_fatal("could not open directory \"%s\": %m", fullpath);
+
+		/*
+		 * Otherwise, treat this as a non-fatal error, but ignore any further
+		 * errors related to this path and anything beneath it.
+		 */
+		pg_validator_error(context,
+						   "could not open directory \"%s\": %m", fullpath);
+		simple_string_list_append(&context->ignore_list, relpath);
+
+		return;
+	}
+
+	/* errno is cleared before each readdir() so failure can be told from EOF. */
+	while (errno = 0, (dirent = readdir(dir)) != NULL)
+	{
+		char	   *filename = dirent->d_name;
+		char	   *newfullpath;
+		char	   *newrelpath;
+
+		/*
+		 * Skip "." and "..". Do this before allocating anything, so that
+		 * nothing is leaked when we move on to the next entry.
+		 */
+		if (strcmp(filename, ".") == 0 || strcmp(filename, "..") == 0)
+			continue;
+
+		newfullpath = psprintf("%s/%s", fullpath, filename);
+		if (relpath == NULL)
+			newrelpath = pstrdup(filename);
+		else
+			newrelpath = psprintf("%s/%s", relpath, filename);
+
+		if (!should_ignore_relpath(context, newrelpath))
+			validate_backup_file(context, newrelpath, newfullpath);
+
+		pfree(newfullpath);
+		pfree(newrelpath);
+	}
+
+	/*
+	 * readdir() returns NULL both at end of directory and on failure. Since
+	 * errno was zeroed immediately before the final call, a nonzero value
+	 * here means the scan stopped early. Save it before doing anything else
+	 * that might change errno.
+	 */
+	readdir_errno = errno;
+	if (readdir_errno != 0)
+	{
+		errno = readdir_errno;
+		pg_validator_error(context,
+						   "could not read directory \"%s\": %m", fullpath);
+	}
+
+	if (closedir(dir))
+		pg_validator_error(context,
+						   "could not close directory \"%s\": %m", fullpath);
+}
+
+/*
+ * Validate one file (which might actually be a directory or a symlink).
+ *
+ * The arguments to this function have the same meaning as the arguments to
+ * validate_backup_directory.
+ *
+ * Only the file's presence in the manifest and its size are checked here.
+ * Checksums are deliberately left for a later pass: computing them may take
+ * a while, and the more obvious problems should be reported quickly.
+ */
+static void
+validate_backup_file(validator_context *context, char *relpath, char *fullpath)
+{
+	struct stat st;
+	manifestfile *entry;
+
+	if (stat(fullpath, &st) != 0)
+	{
+		pg_validator_error(context,
+						   "could not stat file or directory \"%s\": %m",
+						   relpath);
+
+		/*
+		 * Add the path to the ignore list so that no further errors are
+		 * raised for it or, if it's a directory, for anything beneath it.
+		 */
+		simple_string_list_append(&context->ignore_list, relpath);
+	}
+	else if (S_ISDIR(st.st_mode))
+	{
+		/* Directories are handled by recursing into them. */
+		validate_backup_directory(context, relpath, fullpath);
+	}
+	else if (!S_ISREG(st.st_mode))
+	{
+		/* Anything that is not a directory must be a plain file. */
+		pg_validator_error(context,
+						   "\"%s\" is not a file or directory",
+						   relpath);
+	}
+	else if ((entry = manifestfiles_lookup(context->ht, relpath)) == NULL)
+	{
+		/* A plain file that the manifest knows nothing about. */
+		pg_validator_error(context,
+						   "\"%s\" is present on disk but not in the manifest",
+						   relpath);
+	}
+	else
+	{
+		/* Note that this manifest entry was encountered on disk. */
+		entry->matched = true;
+
+		/* A size mismatch marks the entry bad. */
+		if (entry->size != st.st_size)
+		{
+			pg_validator_error(context,
+							   "\"%s\" has size %zu on disk but size %zu in the manifest",
+							   relpath, (size_t) st.st_size, entry->size);
+			entry->bad = true;
+		}
+	}
+}
+
+/*
+ * Report manifest entries that were never encountered on disk.
+ *
+ * Walks the whole hash table. Any entry whose 'matched' flag is still unset,
+ * and whose pathname is not covered by the ignore list, is reported as being
+ * present in the manifest but not on disk.
+ */
+static void
+report_extra_backup_files(validator_context *context)
+{
+	manifestfiles_iterator iter;
+	manifestfile *entry;
+
+	manifestfiles_start_iterate(context->ht, &iter);
+	for (;;)
+	{
+		entry = manifestfiles_iterate(context->ht, &iter);
+		if (entry == NULL)
+			break;
+
+		/* Seen on disk: nothing to report. */
+		if (entry->matched)
+			continue;
+
+		/* Covered by the ignore list: stay quiet. */
+		if (should_ignore_relpath(context, entry->pathname))
+			continue;
+
+		pg_validator_error(context,
+						   "\"%s\" is present in the manifest but not on disk",
+						   entry->pathname);
+	}
+}
+
+/*
+ * Validate checksums for every hash table entry that is still in good
+ * standing.
+ *
+ * An entry is skipped if it was never matched against a file on disk, if a
+ * problem has already been flagged for it, if the manifest carries no
+ * checksum for it, or if its pathname is covered by the ignore list.
+ */
+static void
+validate_backup_checksums(validator_context *context)
+{
+	manifestfiles_iterator iter;
+	manifestfile *entry;
+
+	manifestfiles_start_iterate(context->ht, &iter);
+	for (;;)
+	{
+		char	   *fullpath;
+
+		entry = manifestfiles_iterate(context->ht, &iter);
+		if (entry == NULL)
+			break;
+
+		/* Not found on disk, or already reported as bad. */
+		if (!entry->matched || entry->bad)
+			continue;
+
+		/* The manifest has no checksum for this file. */
+		if (entry->checksum_type == CHECKSUM_TYPE_NONE)
+			continue;
+
+		/* Covered by the ignore list. */
+		if (should_ignore_relpath(context, entry->pathname))
+			continue;
+
+		/* Build the on-disk path, check the file, then release the path. */
+		fullpath = psprintf("%s/%s", context->backup_directory,
+							entry->pathname);
+		validate_file_checksum(context, entry, fullpath);
+		pfree(fullpath);
+	}
+}
+
+/*
+ * Validate the checksum of a single file.
+ *
+ * 'tabent' is the manifest entry for the file and 'fullpath' is where the
+ * file can be opened. The file is read in READ_CHUNK_SIZE pieces, and the
+ * resulting checksum is compared with the one stored in the entry. Any
+ * failure to open, read, or close the file is reported and ends the check.
+ */
+static void
+validate_file_checksum(validator_context *context, manifestfile *tabent,
+					   char *fullpath)
+{
+	pg_checksum_context checksum_ctx;
+	char	   *relpath = tabent->pathname;
+	uint8		chunk[READ_CHUNK_SIZE];
+	uint8		computed[PG_CHECKSUM_MAX_LENGTH];
+	int			computed_len;
+	int			fd;
+	int			nread;
+	bool		read_failed;
+
+	fd = open(fullpath, O_RDONLY | PG_BINARY, 0);
+	if (fd < 0)
+	{
+		pg_validator_error(context, "could not open file \"%s\": %m",
+						   relpath);
+		return;
+	}
+
+	pg_checksum_init(&checksum_ctx, tabent->checksum_type);
+
+	/* Feed the file to the checksum code one chunk at a time. */
+	for (;;)
+	{
+		nread = read(fd, chunk, READ_CHUNK_SIZE);
+		if (nread <= 0)
+			break;
+		pg_checksum_update(&checksum_ctx, chunk, nread);
+	}
+
+	/* A negative result means read() failed rather than hitting EOF. */
+	read_failed = (nread < 0);
+	if (read_failed)
+		pg_validator_error(context, "could not read file \"%s\": %m",
+						   relpath);
+
+	/* The file is closed even after a failed read. */
+	if (close(fd) != 0)
+	{
+		pg_validator_error(context, "could not close file \"%s\": %m",
+						   relpath);
+		return;
+	}
+
+	/* Without the complete contents there is no checksum to compare. */
+	if (read_failed)
+		return;
+
+	computed_len = pg_checksum_final(&checksum_ctx, computed);
+
+	/*
+	 * Compare against the manifest. In the length message, the first number
+	 * is the length recorded in the manifest entry and the second is the
+	 * length that was just computed.
+	 */
+	if (computed_len != tabent->checksum_length)
+	{
+		pg_validator_error(context,
+						   "file \"%s\" has checksum of length %d, but expected %d",
+						   relpath, tabent->checksum_length, computed_len);
+		return;
+	}
+
+	if (memcmp(computed, tabent->checksum_payload, computed_len) != 0)
+		pg_validator_error(context,
+						   "checksum mismatch for file \"%s\"",
+						   relpath);
+}
+
+/*
+ * Print out usage information.
+ *
+ * The help text is written to standard output with printf(). This function
+ * does not exit; it returns once the text has been printed.
+ */
+static void
+usage(void)
+{
+	/* Summary line and invocation synopsis; progname is substituted for %s. */
+	printf(_("%s validates a backup against the backup manifest.\n\n"), progname);
+	printf(_("Usage:\n  %s [OPTION]... BACKUPDIR\n\n"), progname);
+	/* One line per command-line option, in column-aligned form. */
+	printf(_("Options:\n"));
+	printf(_("  -e, --exit-on-error         exit immediately on error\n"));
+	printf(_("  -i, --ignore=RELATIVE_PATH  ignore indicated path\n"));
+	printf(_("  -m, --manifest=PATH         use specified path for manifest\n"));
+	printf(_("  -s, --skip-checksums        skip checksum verification\n"));
+	printf(_("  -V, --version               output version information, then exit\n"));
+	printf(_("  -?, --help                  show this help, then exit\n"));
+	/* Trailer telling users where to send bug reports. */
+	printf(_("\nReport bugs to <pgsql-bugs@lists.postgresql.org>.\n"));
+}
+
+/*
+ * Report a non-fatal error.
+ *
+ * The printf-style message is logged at PG_LOG_ERROR level and the context
+ * is marked as having seen an error. If the context asks for exit-on-error,
+ * the process then exits with status 1 instead of returning.
+ */
+static void
+pg_validator_error(validator_context *context, const char *pg_restrict fmt,...)
+{
+	va_list		args;
+
+	va_start(args, fmt);
+	pg_log_generic_v(PG_LOG_ERROR, fmt, args);
+	va_end(args);
+
+	/* Remember that at least one problem has been reported. */
+	context->saw_any_error = true;
+
+	if (!context->exit_on_error)
+		return;
+
+	exit(1);
+}
+
+/*
+ * Report a fatal error and exit.
+ *
+ * The printf-style message is logged at PG_LOG_FATAL level, after which the
+ * process exits with status 1; this function never returns to its caller.
+ */
+static void
+pg_validator_fatal(const char *pg_restrict fmt,...)
+{
+	va_list		args;
+
+	/* Forward the caller's arguments to the va_list logging entry point. */
+	va_start(args, fmt);
+	pg_log_generic_v(PG_LOG_FATAL, fmt, args);
+	va_end(args);
+
+	exit(1);
+}
+
+/*
+ * Decide whether a relative path is covered by the ignore list.
+ *
+ * A path is covered when it is itself listed, or when one of its parent
+ * directories is. Matching is done on whole path components: an entry of
+ * "aa/bb" covers "aa/bb" and "aa/bb/cc", but not "aa/bbb".
+ */
+static bool
+should_ignore_relpath(validator_context *context, char *relpath)
+{
+	SimpleStringListCell *cell = context->ignore_list.head;
+
+	while (cell != NULL)
+	{
+		size_t		len = strlen(cell->val);
+
+		/*
+		 * The entry must match the start of relpath, and the match must end
+		 * either at the end of relpath or at a directory separator.
+		 */
+		if (strncmp(relpath, cell->val, len) == 0 &&
+			(relpath[len] == '\0' || relpath[len] == '/'))
+			return true;
+
+		cell = cell->next;
+	}
+
+	return false;
+}
+
+/*
+ * Helper function for manifestfiles hash table: hash a NUL-terminated
+ * string. The terminator itself is not included in the hashed bytes.
+ */
+static uint32
+hash_string_pointer(char *s)
+{
+	size_t		len = strlen(s);
+
+	return hash_bytes((unsigned char *) s, len);
+}
-- 
2.17.2 (Apple Git-113)

