From ba4e26a0086dc5b49405a5ac8b495a64f1c2bb49 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil@determined.ai>
Date: Sun, 2 Jun 2024 14:00:58 -0400
Subject: [PATCH 2/2] Optimize COPY TO ... FORMAT CSV using SIMD instructions

CopyAttributeOutCSV() does one or two byte-by-byte loops over the text of each
attribute, depending on whether quotation is required. Implementing these loops
using SIMD yields a significant speedup for long attribute values. For short
attribute values, performance is roughly unchanged.

Using SIMD when encoding_embeds_ascii is true seems quite complex, so for now we
just use the old code path for such encodings.
---
 src/backend/commands/copyto.c | 145 +++++++++++++++++++++++++++++++++-
 1 file changed, 142 insertions(+), 3 deletions(-)

diff --git a/src/backend/commands/copyto.c b/src/backend/commands/copyto.c
index 48957d8c3e..a62c4ec120 100644
--- a/src/backend/commands/copyto.c
+++ b/src/backend/commands/copyto.c
@@ -29,6 +29,7 @@
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "pgstat.h"
+#include "port/simd.h"
 #include "storage/fd.h"
 #include "tcop/tcopprot.h"
 #include "utils/lsyscache.h"
@@ -1124,6 +1125,137 @@ CopyAttributeOutText(CopyToState cstate, const char *string)
 	DUMPSOFAR();
 }
 
+/*
+ * Send text representation of one attribute, with conversion and CSV-style
+ * escaping. This variant uses SIMD instructions to optimize processing, but
+ * we can only use this approach when encoding_embeds_ascii is false.
+ *
+ * "ptr" is the NUL-terminated text to emit. If "use_quote" is false, the value
+ * is still quoted when it contains the delimiter, the quote character, a
+ * newline or a carriage return, or is a lone "\." and only one column is sent.
+ */
+static void
+CopyAttributeOutCSVFast(CopyToState cstate, const char *ptr,
+						bool use_quote)
+{
+	int			len = strlen(ptr);
+	/* length of the prefix that consists of whole Vector8-sized chunks */
+	int			vlen = len & (int) (~(sizeof(Vector8) - 1));
+	char		delimc = cstate->opts.delim[0];
+	char		quotec = cstate->opts.quote[0];
+	char		escapec = cstate->opts.escape[0];
+
+	/* Make a preliminary pass to discover if it needs quoting */
+	if (!use_quote)
+	{
+		bool	single_attr = (list_length(cstate->attnumlist) == 1);
+
+		/*
+		 * Because '\.' can be a data value, quote it if it appears alone on a
+		 * line so it is not interpreted as the end-of-data marker.
+		 */
+		if (single_attr && strcmp(ptr, "\\.") == 0)
+			use_quote = true;
+		else
+		{
+			int		i;
+			Vector8 chunk;
+
+			for (i = 0; i < vlen; i += sizeof(Vector8))
+			{
+				vector8_load(&chunk, (const uint8 *) &ptr[i]);
+
+				if (vector8_has(chunk, (unsigned char) delimc) ||
+					vector8_has(chunk, (unsigned char) quotec) ||
+					vector8_has(chunk, (unsigned char) '\n') ||
+					vector8_has(chunk, (unsigned char) '\r'))
+				{
+					use_quote = true;
+					break;
+				}
+			}
+
+			/* Check the tail of the string */
+			if (!use_quote)
+			{
+				for (; i < len; i++)
+				{
+					char c = ptr[i];
+
+					if (c == delimc || c == quotec || c == '\n' || c == '\r')
+					{
+						use_quote = true;
+						break;
+					}
+				}
+			}
+		}
+	}
+
+	if (use_quote)
+	{
+		int		i;
+		int		start_idx = 0;
+		Vector8 chunk;
+
+		CopySendChar(cstate, quotec);
+
+		for (i = 0; i < vlen; i += sizeof(Vector8))
+		{
+			vector8_load(&chunk, (const uint8 *) &ptr[i]);
+
+			if (vector8_has(chunk, (unsigned char) quotec) ||
+				vector8_has(chunk, (unsigned char) escapec))
+			{
+				/*
+				 * This chunk has a quote or escape character, which needs
+				 * escaping, so switch to byte-at-a-time processing
+				 */
+				for (int j = i; j < i + (int) sizeof(Vector8); j++)
+				{
+					char c = ptr[j];
+
+					if (c == quotec || c == escapec)
+					{
+						if (j > start_idx)
+							CopySendData(cstate, ptr + start_idx, j - start_idx);
+
+						CopySendChar(cstate, escapec);
+						start_idx = j;
+					}
+				}
+			}
+		}
+
+		/* Process the tail of the string */
+		for (; i < len; i++)
+		{
+			char c = ptr[i];
+
+			if (c == quotec || c == escapec)
+			{
+				if (i > start_idx)
+					CopySendData(cstate, ptr + start_idx, i - start_idx);
+
+				CopySendChar(cstate, escapec);
+				start_idx = i;
+			}
+		}
+
+		/* Send any remaining text */
+		if (start_idx < len)
+			CopySendData(cstate, ptr + start_idx, len - start_idx);
+
+		CopySendChar(cstate, quotec);
+	}
+	else
+	{
+		/* If it doesn't need quoting, we can just dump it as-is */
+		CopySendData(cstate, ptr, len);
+	}
+}
+
+
 /*
  * Send text representation of one attribute, with conversion and
  * CSV-style escaping
@@ -1138,7 +1270,6 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 	char		delimc = cstate->opts.delim[0];
 	char		quotec = cstate->opts.quote[0];
 	char		escapec = cstate->opts.escape[0];
-	bool		single_attr = (list_length(cstate->attnumlist) == 1);
 
 	/* force quoting if it matches null_print (before conversion!) */
 	if (!use_quote && strcmp(string, cstate->opts.null_print) == 0)
@@ -1149,11 +1280,19 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 	else
 		ptr = string;
 
+	if (!cstate->encoding_embeds_ascii)
+	{
+		CopyAttributeOutCSVFast(cstate, ptr, use_quote);
+		return;
+	}
+
 	/*
 	 * Make a preliminary pass to discover if it needs quoting
 	 */
 	if (!use_quote)
 	{
+		bool	single_attr = (list_length(cstate->attnumlist) == 1);
+
 		/*
 		 * Because '\.' can be a data value, quote it if it appears alone on a
 		 * line so it is not interpreted as the end-of-data marker.
@@ -1171,7 +1310,7 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 					use_quote = true;
 					break;
 				}
-				if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
+				if (IS_HIGHBIT_SET(c))
 					tptr += pg_encoding_mblen(cstate->file_encoding, tptr);
 				else
 					tptr++;
@@ -1195,7 +1334,7 @@ CopyAttributeOutCSV(CopyToState cstate, const char *string,
 				CopySendChar(cstate, escapec);
 				start = ptr;	/* we include char in next run */
 			}
-			if (IS_HIGHBIT_SET(c) && cstate->encoding_embeds_ascii)
+			if (IS_HIGHBIT_SET(c))
 				ptr += pg_encoding_mblen(cstate->file_encoding, ptr);
 			else
 				ptr++;
-- 
2.39.3 (Apple Git-146)

