From e63f83896fce4ae2068071e8aa50b18ffa8a595d Mon Sep 17 00:00:00 2001
From: Jelte Fennema <github-tech@jeltef.nl>
Date: Mon, 12 Sep 2022 09:44:06 +0200
Subject: [PATCH v13 3/5] Support load balancing in libpq

This adds support for load balancing to libpq using the newly added
load_balance_hosts parameter. When setting the load_balance_hosts
parameter to random, hosts and addresses will be connected to in a
random order. This then results in load balancing across these
hosts/addresses if multiple clients do this at the same time.

This patch implements two levels of random load balancing:
1. The given hosts are randomly shuffled, before resolving them
    one-by-one.
2. Once a host's addresses have been resolved, those addresses are shuffled
    before trying to connect to them one-by-one.
---
 doc/src/sgml/libpq.sgml                       |  74 ++++++++++++
 src/interfaces/libpq/fe-connect.c             | 105 ++++++++++++++++++
 src/interfaces/libpq/libpq-int.h              |  17 ++-
 src/interfaces/libpq/meson.build              |   2 +
 .../libpq/t/003_load_balance_host_list.pl     |  82 ++++++++++++++
 src/test/perl/PostgreSQL/Test/Cluster.pm      |  16 +++
 src/tools/pgindent/typedefs.list              |   1 +
 7 files changed, 296 insertions(+), 1 deletion(-)
 create mode 100644 src/interfaces/libpq/t/003_load_balance_host_list.pl

diff --git a/doc/src/sgml/libpq.sgml b/doc/src/sgml/libpq.sgml
index 3706d349abc..0ada127a73b 100644
--- a/doc/src/sgml/libpq.sgml
+++ b/doc/src/sgml/libpq.sgml
@@ -2069,6 +2069,80 @@ postgresql://%2Fvar%2Flib%2Fpostgresql/dbname
        </para>
       </listitem>
      </varlistentry>
+
+     <varlistentry id="libpq-connect-load-balance-hosts" xreflabel="load_balance_hosts">
+      <term><literal>load_balance_hosts</literal></term>
+      <listitem>
+       <para>
+        Controls the order in which the client tries to connect to the available
+        hosts and addresses. Once a connection attempt is successful no other
+        hosts and addresses will be tried. This parameter is typically used in
+        combination with multiple host names or a DNS record that returns
+        multiple IPs. This parameter can be used in combination with
+        <xref linkend="libpq-connect-target-session-attrs"/>
+        to, for example, load balance over standby servers only. Once successfully
+        connected, subsequent queries on the returned connection will all be
+        sent to the same server. There are currently two modes:
+        <variablelist>
+         <varlistentry>
+          <term><literal>disable</literal> (default)</term>
+          <listitem>
+           <para>
+            No load balancing across hosts is performed. The order in which
+            hosts and addresses are tried is the same for every connection
+            attempt: Hosts are tried in the order in which they are provided and
+            addresses are tried in the order they are received from DNS or a
+            hosts file.
+           </para>
+
+           <para>
+            While this may sound similar to round-robin load balancing, it is
+            not. Round-robin load balancing requires that subsequent connection
+            attempts start iterating over hosts where the previous connection
+            attempt stopped. This is not done when using <literal>disable</literal>.
+            Instead every connection attempt starts at <emphasis>the same</emphasis>
+            first host. So, if that host is online and accepting connections, all
+            clients will connect to it and all of the other hosts in the
+            list get no connections at all.
+           </para>
+          </listitem>
+         </varlistentry>
+
+         <varlistentry>
+          <term><literal>random</literal></term>
+          <listitem>
+           <para>
+            Hosts and addresses are tried in random order. This value is mostly
+            useful when opening multiple connections at the same time, possibly
+            from different machines. This way connections can be load balanced
+            across multiple <productname>PostgreSQL</productname> servers.
+           </para>
+           <para>
+            While random load balancing, due to its random nature, will almost
+            never result in a completely uniform distribution, it statistically
+            gets quite close. One important aspect here is that this algorithm
+            uses two levels of random choices: First the hosts
+            will be resolved in random order. Then secondly, before resolving
+            the next host, all resolved addresses for the current host will be
+            tried in random order. This behavior can skew the number of
+            connections each node gets greatly in certain cases, for instance
+            when some hosts resolve to more addresses than others. But such a
+            skew can also be used on purpose, e.g. to increase the number of
+            connections a larger server gets by providing its hostname multiple
+            times in the host string.
+           </para>
+           <para>
+            When using this value it's recommended to also configure a reasonable
+            value for <xref linkend="libpq-connect-connect-timeout"/>, so that
+            if one of the nodes used for load balancing is not responding,
+            a new node will be tried.
+           </para>
+          </listitem>
+         </varlistentry>
+        </variablelist>
+       </para>
+      </listitem>
+     </varlistentry>
     </variablelist>
    </para>
   </sect2>
diff --git a/src/interfaces/libpq/fe-connect.c b/src/interfaces/libpq/fe-connect.c
index b085892feac..c180ebb26d1 100644
--- a/src/interfaces/libpq/fe-connect.c
+++ b/src/interfaces/libpq/fe-connect.c
@@ -123,6 +123,7 @@ static int	ldapServiceLookup(const char *purl, PQconninfoOption *options,
 #define DefaultChannelBinding	"disable"
 #endif
 #define DefaultTargetSessionAttrs	"any"
+#define DefaultLoadBalanceHosts	"disable"
 #ifdef USE_SSL
 #define DefaultSSLMode "prefer"
 #else
@@ -345,6 +346,11 @@ static const internalPQconninfoOption PQconninfoOptions[] = {
 		"Target-Session-Attrs", "", 15, /* sizeof("prefer-standby") = 15 */
 	offsetof(struct pg_conn, target_session_attrs)},
 
+	{"load_balance_hosts", "PGLOADBALANCEHOSTS",
+		DefaultLoadBalanceHosts, NULL,
+		"Load-Balance-Hosts", "", 8,	/* sizeof("disable") = 8 */
+	offsetof(struct pg_conn, load_balance_hosts)},
+
 	/* Terminating entry --- MUST BE LAST */
 	{NULL, NULL, NULL, NULL,
 	NULL, NULL, 0}
@@ -429,6 +435,8 @@ static void pgpassfileWarning(PGconn *conn);
 static void default_threadlock(int acquire);
 static bool sslVerifyProtocolVersion(const char *version);
 static bool sslVerifyProtocolRange(const char *min, const char *max);
+static bool parse_int_param(const char *value, int *result, PGconn *conn,
+							const char *context);
 
 
 /* global variable because fe-auth.c needs to access it */
@@ -1013,6 +1021,32 @@ parse_comma_separated_list(char **startptr, bool *more)
 	return p;
 }
 
+/*
+ * Seed conn->prng_state for load balancing. Prefer strong random bits; if
+ * unavailable, fall back to mixing the connection address (cast through
+ * uintptr_t so the cast is pointer-sized on every platform), PID and current
+ * time. This currently always returns true.
+ */
+static bool
+libpq_prng_init(PGconn *conn)
+{
+	if (unlikely(!pg_prng_strong_seed(&conn->prng_state)))
+	{
+		uint64		rseed;
+		struct timeval tval = {0};
+
+		gettimeofday(&tval, NULL);
+
+		rseed = ((uintptr_t) conn) ^
+			((uint64) getpid()) ^
+			((uint64) tval.tv_usec) ^
+			((uint64) tval.tv_sec);
+
+		pg_prng_seed(&conn->prng_state, rseed);
+	}
+	return true;
+}
+
 /*
  *		connectOptions2
  *
@@ -1570,6 +1604,50 @@ connectOptions2(PGconn *conn)
 	else
 		conn->target_server_type = SERVER_TYPE_ANY;
 
+	/*
+	 * validate load_balance_hosts option, and set load_balance_type
+	 */
+	if (conn->load_balance_hosts)
+	{
+		if (strcmp(conn->load_balance_hosts, "disable") == 0)
+			conn->load_balance_type = LOAD_BALANCE_DISABLE;
+		else if (strcmp(conn->load_balance_hosts, "random") == 0)
+			conn->load_balance_type = LOAD_BALANCE_RANDOM;
+		else
+		{
+			conn->status = CONNECTION_BAD;
+			libpq_append_conn_error(conn, "invalid %s value: \"%s\"",
+									"load_balance_hosts",
+									conn->load_balance_hosts);
+			return false;
+		}
+	}
+	else
+		conn->load_balance_type = LOAD_BALANCE_DISABLE;
+
+	if (conn->load_balance_type == LOAD_BALANCE_RANDOM)
+	{
+		if (!libpq_prng_init(conn))
+			return false;
+
+		/*
+		 * This is the "inside-out" variant of the Fisher-Yates shuffle
+		 * algorithm. Notionally, we append each new value to the array and
+		 * then swap it with a randomly-chosen array element (possibly
+		 * including itself, else we fail to generate permutations with the
+		 * last integer last).  The swap step can be optimized by combining it
+		 * with the insertion.
+		 */
+		for (i = 1; i < conn->nconnhost; i++)
+		{
+			int			j = pg_prng_uint64_range(&conn->prng_state, 0, i);
+			pg_conn_host temp = conn->connhost[j];
+
+			conn->connhost[j] = conn->connhost[i];
+			conn->connhost[i] = temp;
+		}
+	}
+
 	/*
 	 * Resolve special "auto" client_encoding from the locale
 	 */
@@ -2576,6 +2654,32 @@ keep_going:						/* We will come back to here until there is
 		}
 		pg_freeaddrinfo_all(hint.ai_family, addrlist);
 
+		/*
+		 * If random load balancing is enabled we shuffle the addresses.
+		 */
+		if (conn->load_balance_type == LOAD_BALANCE_RANDOM)
+		{
+			/*
+			 * This is the "inside-out" variant of the Fisher-Yates shuffle
+			 * algorithm. Notionally, we append each new value to the array
+			 * and then swap it with a randomly-chosen array element (possibly
+			 * including itself, else we fail to generate permutations with
+			 * the last integer last).  The swap step can be optimized by
+			 * combining it with the insertion.
+			 *
+			 * We don't need to initialize conn->prng_state here, because that
+			 * already happened in connectOptions2.
+			 */
+			for (int i = 1; i < conn->naddr; i++)
+			{
+				int			j = pg_prng_uint64_range(&conn->prng_state, 0, i);
+				AddrInfo	temp = conn->addr[j];
+
+				conn->addr[j] = conn->addr[i];
+				conn->addr[i] = temp;
+			}
+		}
+
 		reset_connection_state_machine = true;
 		conn->try_next_host = false;
 	}
@@ -4244,6 +4348,7 @@ freePGconn(PGconn *conn)
 	free(conn->outBuffer);
 	free(conn->rowBuf);
 	free(conn->target_session_attrs);
+	free(conn->load_balance_hosts);
 	termPQExpBuffer(&conn->errorMessage);
 	termPQExpBuffer(&conn->workBuffer);
 
diff --git a/src/interfaces/libpq/libpq-int.h b/src/interfaces/libpq/libpq-int.h
index 8f96c52e6c3..ff79396c0be 100644
--- a/src/interfaces/libpq/libpq-int.h
+++ b/src/interfaces/libpq/libpq-int.h
@@ -26,7 +26,8 @@
 #include <netdb.h>
 #include <sys/socket.h>
 #include <time.h>
-#ifndef WIN32
+/* MinGW has sys/time.h, but MSVC doesn't */
+#ifndef _MSC_VER
 #include <sys/time.h>
 #endif
 
@@ -82,6 +83,8 @@ typedef struct
 #endif
 #endif							/* USE_OPENSSL */
 
+#include "common/pg_prng.h"
+
 /*
  * POSTGRES backend dependent Constants.
  */
@@ -242,6 +245,13 @@ typedef enum
 	SERVER_TYPE_PREFER_STANDBY_PASS2	/* second pass - behaves same as ANY */
 } PGTargetServerType;
 
+/* Load balancing mode (decoded value of load_balance_hosts) */
+typedef enum
+{
+	LOAD_BALANCE_DISABLE = 0,	/* Use the existing host order (default) */
+	LOAD_BALANCE_RANDOM,		/* Randomly shuffle the hosts and addresses */
+} PGLoadBalanceType;
+
 /* Boolean value plus a not-known state, for GUCs we might have to fetch */
 typedef enum
 {
@@ -397,6 +407,7 @@ struct pg_conn
 	char	   *ssl_max_protocol_version;	/* maximum TLS protocol version */
 	char	   *target_session_attrs;	/* desired session properties */
 	char	   *require_auth;	/* name of the expected auth method */
+	char	   *load_balance_hosts; /* load balance over hosts */
 
 	/* Optional file to write trace info to */
 	FILE	   *Pfdebug;
@@ -468,6 +479,8 @@ struct pg_conn
 
 	/* Transient state needed while establishing connection */
 	PGTargetServerType target_server_type;	/* desired session properties */
+	PGLoadBalanceType load_balance_type;	/* desired load balancing
+											 * algorithm */
 	bool		try_next_addr;	/* time to advance to next address/host? */
 	bool		try_next_host;	/* time to advance to next connhost[]? */
 	int			naddr;			/* number of addresses returned by getaddrinfo */
@@ -487,6 +500,8 @@ struct pg_conn
 	PGVerbosity verbosity;		/* error/notice message verbosity */
 	PGContextVisibility show_context;	/* whether to show CONTEXT field */
 	PGlobjfuncs *lobjfuncs;		/* private state for large-object access fns */
+	pg_prng_state prng_state;	/* prng state for load balancing connections */
+
 
 	/* Buffer for data received from backend and not yet processed */
 	char	   *inBuffer;		/* currently allocated buffer */
diff --git a/src/interfaces/libpq/meson.build b/src/interfaces/libpq/meson.build
index 3cd0ddb4945..80e6a15adf8 100644
--- a/src/interfaces/libpq/meson.build
+++ b/src/interfaces/libpq/meson.build
@@ -116,6 +116,8 @@ tests += {
     'tests': [
       't/001_uri.pl',
       't/002_api.pl',
+      't/003_load_balance_host_list.pl',
+      't/004_load_balance_dns.pl',
     ],
     'env': {'with_ssl': ssl_library},
   },
diff --git a/src/interfaces/libpq/t/003_load_balance_host_list.pl b/src/interfaces/libpq/t/003_load_balance_host_list.pl
new file mode 100644
index 00000000000..1bdddfdbcfd
--- /dev/null
+++ b/src/interfaces/libpq/t/003_load_balance_host_list.pl
@@ -0,0 +1,82 @@
+# Copyright (c) 2023, PostgreSQL Global Development Group
+use strict;
+use warnings;
+use Config;
+use PostgreSQL::Test::Utils;
+use PostgreSQL::Test::Cluster;
+use Test::More;
+
+# This tests load balancing across the list of different hosts in the host
+# parameter of the connection string.
+
+# Cluster setup shared by all tests below; node2 and node3 pass own_host => 1
+my $node1 = PostgreSQL::Test::Cluster->new('node1');
+my $node2 = PostgreSQL::Test::Cluster->new('node2', own_host => 1);
+my $node3 = PostgreSQL::Test::Cluster->new('node3', own_host => 1);
+
+# Create a data directory for each node with initdb
+$node1->init();
+$node2->init();
+$node3->init();
+
+# Start all three PostgreSQL servers
+$node1->start();
+$node2->start();
+$node3->start();
+
+# Build comma-separated host and port lists covering all three nodes, in order
+my $hostlist = $node1->host . ',' . $node2->host . ',' . $node3->host;
+my $portlist = $node1->port . ',' . $node2->port . ',' . $node3->port;
+
+$node1->connect_fails(
+	"host=$hostlist port=$portlist load_balance_hosts=doesnotexist",
+	"load_balance_hosts doesn't accept unknown values",
+	expected_stderr => qr/invalid load_balance_hosts value: "doesnotexist"/);
+
+# load_balance_hosts=disable should always choose the first host (node1).
+$node1->connect_ok("host=$hostlist port=$portlist load_balance_hosts=disable",
+	"load_balance_hosts=disable connects to the first node",
+	sql => "SELECT 'connect2'",
+	log_like => [qr/statement: SELECT 'connect2'/]);
+
+# Statistically the following loop with load_balance_hosts=random will almost
+# certainly connect at least once to each of the nodes. The chance of a given
+# node never being picked is negligible: (2/3)^50 = 1.56832855e-9
+foreach my $i (1 .. 50) {
+	$node1->connect_ok("host=$hostlist port=$portlist load_balance_hosts=random",
+		"repeated connections with random load balancing",
+		sql => "SELECT 'connect1'");
+}
+
+my $node1_occurrences = () = $node1->log_content() =~ /statement: SELECT 'connect1'/g;
+my $node2_occurrences = () = $node2->log_content() =~ /statement: SELECT 'connect1'/g;
+my $node3_occurrences = () = $node3->log_content() =~ /statement: SELECT 'connect1'/g;
+
+my $total_occurrences = $node1_occurrences + $node2_occurrences + $node3_occurrences;
+
+cmp_ok($node1_occurrences, '>=', 1, "received at least one connection on node1");
+cmp_ok($node2_occurrences, '>=', 1, "received at least one connection on node2");
+cmp_ok($node3_occurrences, '>=', 1, "received at least one connection on node3");
+is($total_occurrences, 50, "received 50 connections across all nodes");
+
+$node1->stop();
+$node2->stop();
+
+# With node1 and node2 stopped, load_balance_hosts=disable should continue
+# trying hosts in list order until it reaches the one still running (node3).
+$node3->connect_ok("host=$hostlist port=$portlist load_balance_hosts=disable",
+	"load_balance_hosts=disable continues until it connects to a working node",
+	sql => "SELECT 'connect3'",
+	log_like => [qr/statement: SELECT 'connect3'/]);
+
+# Also with load_balance_hosts=random we continue to the next nodes if previous
+# ones are down. Connect a few times to make sure it's not just lucky.
+foreach my $i (1 .. 5) {
+	$node3->connect_ok("host=$hostlist port=$portlist load_balance_hosts=random",
+		"load_balance_hosts=random continues until it connects to a working node",
+		sql => "SELECT 'connect4'",
+		log_like => [qr/statement: SELECT 'connect4'/]);
+}
+
+done_testing();
+
diff --git a/src/test/perl/PostgreSQL/Test/Cluster.pm b/src/test/perl/PostgreSQL/Test/Cluster.pm
index 3e2a27fb717..a3aef8b5e91 100644
--- a/src/test/perl/PostgreSQL/Test/Cluster.pm
+++ b/src/test/perl/PostgreSQL/Test/Cluster.pm
@@ -2567,6 +2567,22 @@ sub issues_sql_like
 	return;
 }
 
+=pod
+
+=item $node->log_content()
+
+Returns the contents of this node's log file, as read by slurp_file.
+
+=cut
+
+sub log_content
+{
+	my $self = shift;
+	# Hand the node's log file path straight to slurp_file.
+	return PostgreSQL::Test::Utils::slurp_file($self->logfile);
+}
+
+
 =pod
 
 =item $node->run_log(...)
diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list
index 5c5aa8bf4c9..2571d22f96d 100644
--- a/src/tools/pgindent/typedefs.list
+++ b/src/tools/pgindent/typedefs.list
@@ -1702,6 +1702,7 @@ PGFileType
 PGFunction
 PGLZ_HistEntry
 PGLZ_Strategy
+PGLoadBalanceType
 PGMessageField
 PGModuleMagicFunction
 PGNoticeHooks
-- 
2.34.1

