From 3d4683d2682dc3bdcef532489e851ad16ec9262d Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Fri, 15 Sep 2023 22:25:40 +0200
Subject: [PATCH v2] Update BKI syntax, bootstrap performance

- NULL values are 56% smaller (from 7 bytes to 3 bytes each)
- NULL-only postfixes are not emitted into the BKI file
- Inserts are batched into inserts of up to 1000 rows per catalog, reducing overhead of the 'INSERT' statement
- The insert tuple descriptor is reused across tuple insertions, reducing insert overhead
- OPEN/CLOSE are now implicit and depend on context: no need to manually open/close the relation in the bki.
- Removed some wasteful spaces from the generated file.
---
 doc/src/sgml/bki.sgml               | 13 ++---
 src/backend/bootstrap/bootparse.y   | 73 ++++++++++++++++----------
 src/backend/bootstrap/bootscanner.l |  5 +-
 src/backend/bootstrap/bootstrap.c   | 13 +++--
 src/backend/catalog/genbki.pl       | 80 +++++++++++++++++++++++------
 src/include/bootstrap/bootstrap.h   |  1 +
 6 files changed, 127 insertions(+), 58 deletions(-)

diff --git a/doc/src/sgml/bki.sgml b/doc/src/sgml/bki.sgml
index f71644e398..5c057d0f8a 100644
--- a/doc/src/sgml/bki.sgml
+++ b/doc/src/sgml/bki.sgml
@@ -247,9 +247,9 @@
 
     <listitem>
      <para>
-      Null values are represented by <literal>_null_</literal>.
-      (Note that there is no way to create a value that is just that
-      string.)
+      Null values are represented by <literal>_null_</literal> and
+      <literal>__</literal>. (Note that there is no way to create a value that
+      is just either of these strings.)
      </para>
     </listitem>
 
@@ -1068,11 +1068,8 @@ $ perl  rewrite_dat_with_prokind.pl  pg_proc.dat
    of type <type>oid</type>, <type>int4</type> and <type>text</type>,
    respectively, and insert two rows into the table:
 <programlisting>
-create test_table 420 (oid = oid, cola = int4, colb = text)
-open test_table
-insert ( 421 1 'value 1' )
-insert ( 422 2 _null_ )
-close test_table
+create test_table 420 open (oid = oid, cola = int4, colb = text)
+insert ( 421 1 'value 1' ), ( 422 2 _null_ )
 </programlisting>
   </para>
  </sect1>
diff --git a/src/backend/bootstrap/bootparse.y b/src/backend/bootstrap/bootparse.y
index 81a1b7bfec..c911e26018 100644
--- a/src/backend/bootstrap/bootparse.y
+++ b/src/backend/bootstrap/bootparse.y
@@ -62,6 +62,7 @@ do_end(void)
 	/* Reclaim memory allocated while processing this line */
 	MemoryContextSwitchTo(CurTransactionContext);
 	MemoryContextReset(per_line_ctx);
+	tupDesc = NULL;
 	CHECK_FOR_INTERRUPTS();		/* allow SIGINT to kill bootstrap run */
 	if (isatty(0))
 	{
@@ -91,7 +92,7 @@ static int num_columns_read = 0;
 %type <list>  boot_index_params
 %type <ielem> boot_index_param
 %type <str>   boot_ident
-%type <ival>  optbootstrap optsharedrelation boot_column_nullness
+%type <ival>  optbootstrap optsharedrelation optopen boot_column_nullness
 %type <oidval> oidspec optrowtypeoid
 
 %token <str> ID
@@ -110,6 +111,12 @@ static int num_columns_read = 0;
 
 TopLevel:
 		  Boot_Queries
+				{
+					if (boot_reldesc)
+					{
+						closerel(NULL);
+					}
+				}
 		|
 		;
 
@@ -119,9 +126,7 @@ Boot_Queries:
 		;
 
 Boot_Query :
-		  Boot_OpenStmt
-		| Boot_CloseStmt
-		| Boot_CreateStmt
+		  Boot_CreateStmt
 		| Boot_InsertStmt
 		| Boot_DeclareIndexStmt
 		| Boot_DeclareUniqueIndexStmt
@@ -129,26 +134,8 @@ Boot_Query :
 		| Boot_BuildIndsStmt
 		;
 
-Boot_OpenStmt:
-		  OPEN boot_ident
-				{
-					do_start();
-					boot_openrel($2);
-					do_end();
-				}
-		;
-
-Boot_CloseStmt:
-		  XCLOSE boot_ident
-				{
-					do_start();
-					closerel($2);
-					do_end();
-				}
-		;
-
 Boot_CreateStmt:
-		  XCREATE boot_ident oidspec optbootstrap optsharedrelation optrowtypeoid LPAREN
+		  XCREATE boot_ident oidspec optbootstrap optsharedrelation optrowtypeoid optopen LPAREN
 				{
 					do_start();
 					numattr = 0;
@@ -192,7 +179,6 @@ Boot_CreateStmt:
 
 						if (boot_reldesc)
 						{
-							elog(DEBUG4, "create bootstrap: warning, open relation exists, closing first");
 							closerel(NULL);
 						}
 
@@ -240,6 +226,12 @@ Boot_CreateStmt:
 													  NULL);
 						elog(DEBUG4, "relation created with OID %u", id);
 					}
+
+					if ($7)
+					{
+						boot_openrel($2);
+					}
+
 					do_end();
 				}
 		;
@@ -251,15 +243,37 @@ Boot_InsertStmt:
 					elog(DEBUG4, "inserting row");
 					num_columns_read = 0;
 				}
+		  Boot_InsertTuples
+				{
+					num_columns_read = 0;
+					do_end();
+				}
+		;
+
+Boot_InsertTuples:
+		  Boot_InsertTuple
+		| Boot_InsertTuples COMMA
+				{
+					elog(DEBUG4, "inserting row");
+					num_columns_read = 0;
+				}
+		  Boot_InsertTuple
+		;
+
+Boot_InsertTuple:
 		  LPAREN boot_column_val_list RPAREN
 				{
+					if (boot_reldesc == NULL)
+						elog(FATAL, "relation not open");
+
+					while (num_columns_read < numattr)
+						InsertOneNull(num_columns_read++);
+
 					if (num_columns_read != numattr)
 						elog(ERROR, "incorrect number of columns in row (expected %d, got %d)",
 							 numattr, num_columns_read);
-					if (boot_reldesc == NULL)
-						elog(FATAL, "relation not open");
 					InsertOneTuple();
-					do_end();
+					num_columns_read = 0;
 				}
 		;
 
@@ -427,6 +441,11 @@ optrowtypeoid:
 		|							{ $$ = InvalidOid; }
 		;
 
+optopen:
+			OPEN				{ $$ = true; }
+		|						{ $$ = false; }
+		;
+
 boot_column_list:
 		  boot_column_def
 		| boot_column_list COMMA boot_column_def
diff --git a/src/backend/bootstrap/bootscanner.l b/src/backend/bootstrap/bootscanner.l
index 6a9d4193f2..6564469951 100644
--- a/src/backend/bootstrap/bootscanner.l
+++ b/src/backend/bootstrap/bootscanner.l
@@ -60,8 +60,8 @@ sid		\'([^']|\'\')*\'
 /*
  * Keyword tokens return the keyword text (as a constant string) in boot_yylval.kw,
  * just in case that's needed because we want to treat the keyword as an
- * unreserved identifier.  Note that _null_ is not treated as a keyword
- * for this purpose; it's the one "reserved word" in the bootstrap syntax.
+ * unreserved identifier.  Note that _null_ and __ are not treated as a keyword
+ * for this purpose; they're the only "reserved words" in the bootstrap syntax.
  *
  * Notice that all the keywords are case-sensitive, and for historical
  * reasons some must be upper case.
@@ -85,6 +85,7 @@ rowtype_oid		{ boot_yylval.kw = "rowtype_oid"; return XROWTYPE_OID; }
 insert			{ boot_yylval.kw = "insert"; return INSERT_TUPLE; }
 
 _null_			{ return NULLVAL; }
+__				{ return NULLVAL; }
 
 ","				{ return COMMA; }
 "="				{ return EQUALS; }
diff --git a/src/backend/bootstrap/bootstrap.c b/src/backend/bootstrap/bootstrap.c
index 4cc2efa95c..90c5907d30 100644
--- a/src/backend/bootstrap/bootstrap.c
+++ b/src/backend/bootstrap/bootstrap.c
@@ -64,7 +64,7 @@ Relation	boot_reldesc;		/* current relation descriptor */
 
 Form_pg_attribute attrtypes[MAXATTR];	/* points to attribute info */
 int			numattr;			/* number of attributes for cur. rel */
-
+TupleDesc	tupDesc;			/* cached tupDesc for cur. rel, or NULL */
 
 /*
  * Basic information associated with each type.  This is used before
@@ -506,7 +506,6 @@ DefineAttr(char *name, char *type, int attnum, int nullness)
 
 	if (boot_reldesc != NULL)
 	{
-		elog(WARNING, "no open relations allowed with CREATE command");
 		closerel(NULL);
 	}
 
@@ -612,14 +611,18 @@ void
 InsertOneTuple(void)
 {
 	HeapTuple	tuple;
-	TupleDesc	tupDesc;
 	int			i;
 
 	elog(DEBUG4, "inserting row with %d columns", numattr);
 
-	tupDesc = CreateTupleDesc(numattr, attrtypes);
+	/*
+	 * tupDesc will be freed when the memory context is reset,
+	 * and will be reset to NULL.
+	 */
+	if (tupDesc == NULL)
+		tupDesc = CreateTupleDesc(numattr, attrtypes);
+
 	tuple = heap_form_tuple(tupDesc, values, Nulls);
-	pfree(tupDesc);				/* just free's tupDesc, not the attrtypes */
 
 	simple_heap_insert(boot_reldesc, tuple);
 	heap_freetuple(tuple);
diff --git a/src/backend/catalog/genbki.pl b/src/backend/catalog/genbki.pl
index 4a7205472c..1e6893792c 100644
--- a/src/backend/catalog/genbki.pl
+++ b/src/backend/catalog/genbki.pl
@@ -28,6 +28,8 @@ my $include_path;
 
 my $num_errors = 0;
 
+my $insert_batch_size = 1000;
+
 GetOptions(
 	'output:s' => \$output_path,
 	'set-version:s' => \$major_version,
@@ -503,9 +505,19 @@ EOM
 	  . $catalog->{bootstrap}
 	  . $catalog->{rowtype_oid_clause};
 
+	# Open it, unless it's a bootstrap catalog (create bootstrap does this
+	# automatically)
+	if (!$catalog->{bootstrap})
+	{
+		if (($catalog_data{$catname} || 0) > 0)
+		{
+			print $bki " open";
+		}
+	}
+
 	my $first = 1;
 
-	print $bki "\n (\n";
+	print $bki "\n(\n";
 	my $schema = $catalog->{columns};
 	my %attnames;
 	my $attnum = 0;
@@ -521,7 +533,7 @@ EOM
 		# Emit column definitions
 		if (!$first)
 		{
-			print $bki " ,\n";
+			print $bki ",\n";
 		}
 		$first = 0;
 
@@ -539,7 +551,7 @@ EOM
 		# Emit Anum_* constants
 		printf $def "#define Anum_%s_%s %s\n", $catname, $attname, $attnum;
 	}
-	print $bki "\n )\n";
+	print $bki "\n)\n";
 
 	# Emit Natts_* constant
 	print $def "\n#define Natts_$catname $attnum\n\n";
@@ -550,19 +562,13 @@ EOM
 		print $def $line;
 	}
 
-	# Open it, unless it's a bootstrap catalog (create bootstrap does this
-	# automatically)
-	if (!$catalog->{bootstrap})
-	{
-		print $bki "open $catname\n";
-	}
-
 	# For pg_attribute.h, we generate data entries ourselves.
 	if ($catname eq 'pg_attribute')
 	{
 		gen_pg_attribute($schema);
 	}
 
+	my $rows = 0;
 	# Ordinary catalog with a data file
 	foreach my $row (@{ $catalog_data{$catname} })
 	{
@@ -618,7 +624,7 @@ EOM
 				}
 				elsif ($atttype eq '_oid')
 				{
-					if ($bki_values{$attname} ne '_null_')
+					if ($bki_values{$attname} ne '_null_' && $bki_values{$attname} ne '__')
 					{
 						$bki_values{$attname} =~ s/[{}]//g;
 						@lookupnames = split /,/, $bki_values{$attname};
@@ -654,7 +660,7 @@ EOM
 		}
 
 		# Write to postgres.bki
-		print_bki_insert(\%bki_values, $schema);
+		print_bki_insert(\%bki_values, $schema, ($rows++ % $insert_batch_size) eq 0);
 
 		# Emit OID symbol
 		if (defined $bki_values{oid_symbol})
@@ -671,7 +677,6 @@ EOM
 		}
 	}
 
-	print $bki "close $catname\n";
 	printf $def "\n#endif\t\t\t\t\t\t\t/* %s_D_H */\n", uc $catname;
 
 	# Close and rename definition header
@@ -833,6 +838,8 @@ sub gen_pg_attribute
 		push @attnames, $column->{name};
 	}
 
+	my $attrs_processed = 0;
+
 	foreach my $table_name (@catnames)
 	{
 		my $table = $catalogs{$table_name};
@@ -862,7 +869,10 @@ sub gen_pg_attribute
 				  && ($row{attlen} eq 'NAMEDATALEN' || $row{attlen} > 0));
 
 			# If it's bootstrapped, put an entry in postgres.bki.
-			print_bki_insert(\%row, $schema) if $table->{bootstrap};
+			if ($table->{bootstrap})
+			{
+				print_bki_insert(\%row, $schema, ($attrs_processed++ % $insert_batch_size) eq 0);
+			}
 
 			# Store schemapg entries for later.
 			morph_row_for_schemapg(\%row, $schema);
@@ -892,7 +902,7 @@ sub gen_pg_attribute
 				$row{attstattarget} = '0';
 
 				morph_row_for_pgattr(\%row, $schema, $attr, 1);
-				print_bki_insert(\%row, $schema);
+				print_bki_insert(\%row, $schema, ($attrs_processed++ % $insert_batch_size) eq 0);
 			}
 		}
 	}
@@ -964,6 +974,8 @@ sub print_bki_insert
 {
 	my $row = shift;
 	my $schema = shift;
+	my $insert = shift;
+	my $n_null_skipped = 0;
 
 	my @bki_values;
 
@@ -977,6 +989,31 @@ sub print_bki_insert
 		# since that represents a NUL char in C code.
 		$bki_value = '' if $bki_value eq '\0';
 
+		# We have many nulls in our catalogs, which would use 6 bytes each if
+		# stored as _null_. We compress them by printing __ instead.
+		# While not as readable, the cost of each NULL column's values is
+		# decreased by 57%.
+		$bki_value = '__' if $bki_value eq '_null_';
+
+		# Various catalogs contain trailing NULL values. By defaulting
+		# missing trailing columns to NULL, we can skip emitting those columns,
+		# often removing several column values from the generated BKI.
+		if ($bki_value eq '__')
+		{
+			$n_null_skipped += 1;
+			next;
+		}
+		else
+		{
+			# We're not in a tail of only nulls, so write out the skipped null
+			# values.
+			while ($n_null_skipped > 0)
+			{
+				push @bki_values, '__';
+				$n_null_skipped -= 1;
+			}
+		}
+
 		# Handle single quotes by doubling them, because that's what the
 		# bootstrap scanner requires.  We do not process backslashes
 		# specially; this allows escape-string-style backslash escapes
@@ -991,7 +1028,18 @@ sub print_bki_insert
 
 		push @bki_values, $bki_value;
 	}
-	printf $bki "insert ( %s )\n", join(' ', @bki_values);
+
+	# The "insert" part of the statement is quite large at 6 bytes, so we
+	# combine multiple inserts for the same catalog into one large statement.
+	# This again saves a few bytes per statement.
+	if ($insert != 0)
+	{
+		printf $bki "insert (%s)\n", join(' ', @bki_values);
+	}
+	else
+	{
+		printf $bki ", (%s)\n", join(' ', @bki_values);
+	}
 	return;
 }
 
diff --git a/src/include/bootstrap/bootstrap.h b/src/include/bootstrap/bootstrap.h
index e1cb73c5f2..0228195709 100644
--- a/src/include/bootstrap/bootstrap.h
+++ b/src/include/bootstrap/bootstrap.h
@@ -30,6 +30,7 @@
 extern PGDLLIMPORT Relation boot_reldesc;
 extern PGDLLIMPORT Form_pg_attribute attrtypes[MAXATTR];
 extern PGDLLIMPORT int numattr;
+extern PGDLLIMPORT TupleDesc tupDesc;
 
 
 extern void BootstrapModeMain(int argc, char *argv[], bool check_only) pg_attribute_noreturn();
-- 
2.40.1

