From bf524d3e3b2c3868b241395fc7bb34024643fd64 Mon Sep 17 00:00:00 2001
From: Masahiko Sawada <sawada.mshk@gmail.com>
Date: Wed, 18 Aug 2021 14:55:17 +0900
Subject: [PATCH v3] Track I/O timing for temp buffers

Previously, the output of EXPLAIN with BUFFERS option shows only I/O
timing spent reading and writing shared and local buffers. This
commit shows I/O timing for temp buffers in the output of EXPLAIN. It
is helpful for users in a case where it is the temp buffers I/O that
is using most of the time.

Same as shared/local buffer I/O timings, temp buffers I/O timing is
shown only when track_io_timing is enabled.
---
 doc/src/sgml/ref/explain.sgml         |   8 +-
 src/backend/commands/explain.c        |  42 ++++++++--
 src/backend/executor/instrument.c     |   6 ++
 src/backend/storage/file/buffile.c    |  26 +++++++
 src/include/executor/instrument.h     |   2 +
 src/test/regress/expected/explain.out | 106 ++++++++++++++++++++++++++
 src/test/regress/sql/explain.sql      |   8 ++
 7 files changed, 186 insertions(+), 12 deletions(-)

diff --git a/doc/src/sgml/ref/explain.sgml b/doc/src/sgml/ref/explain.sgml
index 4d758fb237..70c65c4ba1 100644
--- a/doc/src/sgml/ref/explain.sgml
+++ b/doc/src/sgml/ref/explain.sgml
@@ -174,10 +174,10 @@ ROLLBACK;
       Include information on buffer usage. Specifically, include the number of
       shared blocks hit, read, dirtied, and written, the number of local blocks
       hit, read, dirtied, and written, the number of temp blocks read and
-      written, and the time spent reading and writing data file blocks
-      (in milliseconds) if <xref linkend="guc-track-io-timing"/> is enabled.
-      A <emphasis>hit</emphasis> means that a read was avoided because the block was
-      found already in cache when needed.
+      written, and the time spent reading and writing data file blocks and
+      temp file blocks (in milliseconds) if <xref linkend="guc-track-io-timing"/>
+      is enabled.  A <emphasis>hit</emphasis> means that a read was avoided because
+      the block was found already in cache when needed.
       Shared blocks contain data from regular tables and indexes;
       local blocks contain data from temporary tables and indexes;
       while temp blocks contain short-term working data used in sorts, hashes,
diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 10644dfac4..02052af55c 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -3501,8 +3501,11 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 								usage->temp_blks_written > 0);
 		bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
 								  !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+		bool		has_temp_timing = (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time) ||
+									   !INSTR_TIME_IS_ZERO(usage->temp_blk_write_time));
 		bool		show_planning = (planning && (has_shared ||
-												  has_local || has_temp || has_timing));
+												  has_local || has_temp || has_timing ||
+												  has_temp_timing));
 
 		if (show_planning)
 		{
@@ -3567,16 +3570,33 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 		}
 
 		/* As above, show only positive counter values. */
-		if (has_timing)
+		if (has_timing || has_temp_timing)
 		{
 			ExplainIndentText(es);
 			appendStringInfoString(es->str, "I/O Timings:");
-			if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
-				appendStringInfo(es->str, " read=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
-			if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
-				appendStringInfo(es->str, " write=%0.3f",
-								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+
+			if (has_timing)
+			{
+				appendStringInfoString(es->str, " shared/local");
+				if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+				if (has_temp_timing)
+					appendStringInfoChar(es->str, ',');
+			}
+			if (has_temp_timing)
+			{
+				appendStringInfoString(es->str, " temp");
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_read_time))
+					appendStringInfo(es->str, " read=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time));
+				if (!INSTR_TIME_IS_ZERO(usage->temp_blk_write_time))
+					appendStringInfo(es->str, " write=%0.3f",
+									 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time));
+			}
 			appendStringInfoChar(es->str, '\n');
 		}
 
@@ -3613,6 +3633,12 @@ show_buffer_usage(ExplainState *es, const BufferUsage *usage, bool planning)
 			ExplainPropertyFloat("I/O Write Time", "ms",
 								 INSTR_TIME_GET_MILLISEC(usage->blk_write_time),
 								 3, es);
+			ExplainPropertyFloat("Temp I/O Read Time", "ms",
+								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_read_time),
+								 3, es);
+			ExplainPropertyFloat("Temp I/O Write Time", "ms",
+								 INSTR_TIME_GET_MILLISEC(usage->temp_blk_write_time),
+								 3, es);
 		}
 	}
 }
diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 2b106d8473..5c3cc4b1cc 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -237,6 +237,8 @@ BufferUsageAdd(BufferUsage *dst, const BufferUsage *add)
 	dst->temp_blks_written += add->temp_blks_written;
 	INSTR_TIME_ADD(dst->blk_read_time, add->blk_read_time);
 	INSTR_TIME_ADD(dst->blk_write_time, add->blk_write_time);
+	INSTR_TIME_ADD(dst->temp_blk_read_time, add->temp_blk_read_time);
+	INSTR_TIME_ADD(dst->temp_blk_write_time, add->temp_blk_write_time);
 }
 
 /* dst += add - sub */
@@ -259,6 +261,10 @@ BufferUsageAccumDiff(BufferUsage *dst,
 						  add->blk_read_time, sub->blk_read_time);
 	INSTR_TIME_ACCUM_DIFF(dst->blk_write_time,
 						  add->blk_write_time, sub->blk_write_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_read_time,
+						  add->temp_blk_read_time, sub->temp_blk_read_time);
+	INSTR_TIME_ACCUM_DIFF(dst->temp_blk_write_time,
+						  add->temp_blk_write_time, sub->temp_blk_write_time);
 }
 
 /* helper functions for WAL usage accumulation */
diff --git a/src/backend/storage/file/buffile.c b/src/backend/storage/file/buffile.c
index b673150dbe..998c7bffe1 100644
--- a/src/backend/storage/file/buffile.c
+++ b/src/backend/storage/file/buffile.c
@@ -429,6 +429,8 @@ static void
 BufFileLoadBuffer(BufFile *file)
 {
 	File		thisfile;
+	instr_time	io_start;
+	instr_time	io_time;
 
 	/*
 	 * Advance to next component file if necessary and possible.
@@ -440,6 +442,9 @@ BufFileLoadBuffer(BufFile *file)
 		file->curOffset = 0L;
 	}
 
+	if (track_io_timing)
+		INSTR_TIME_SET_CURRENT(io_start);
+
 	/*
 	 * Read whatever we can get, up to a full bufferload.
 	 */
@@ -458,6 +463,13 @@ BufFileLoadBuffer(BufFile *file)
 						FilePathName(thisfile))));
 	}
 
+	if (track_io_timing)
+	{
+		INSTR_TIME_SET_CURRENT(io_time);
+		INSTR_TIME_SUBTRACT(io_time, io_start);
+		INSTR_TIME_ADD(pgBufferUsage.temp_blk_read_time, io_time);
+	}
+
 	/* we choose not to advance curOffset here */
 
 	if (file->nbytes > 0)
@@ -485,6 +497,8 @@ BufFileDumpBuffer(BufFile *file)
 	while (wpos < file->nbytes)
 	{
 		off_t		availbytes;
+		instr_time	io_start;
+		instr_time	io_time;
 
 		/*
 		 * Advance to next component file if necessary and possible.
@@ -507,6 +521,10 @@ BufFileDumpBuffer(BufFile *file)
 			bytestowrite = (int) availbytes;
 
 		thisfile = file->files[file->curFile];
+
+		if (track_io_timing)
+			INSTR_TIME_SET_CURRENT(io_start);
+
 		bytestowrite = FileWrite(thisfile,
 								 file->buffer.data + wpos,
 								 bytestowrite,
@@ -517,6 +535,14 @@ BufFileDumpBuffer(BufFile *file)
 					(errcode_for_file_access(),
 					 errmsg("could not write to file \"%s\": %m",
 							FilePathName(thisfile))));
+
+		if (track_io_timing)
+		{
+			INSTR_TIME_SET_CURRENT(io_time);
+			INSTR_TIME_SUBTRACT(io_time, io_start);
+			INSTR_TIME_ADD(pgBufferUsage.temp_blk_write_time, io_time);
+		}
+
 		file->curOffset += bytestowrite;
 		wpos += bytestowrite;
 
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index 2f9905b7c8..6f3cbcabef 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -35,6 +35,8 @@ typedef struct BufferUsage
 	int64		temp_blks_written;	/* # of temp blocks written */
 	instr_time	blk_read_time;	/* time spent reading */
 	instr_time	blk_write_time; /* time spent writing */
+	instr_time	temp_blk_read_time;	/* time spent reading tmp blocks */
+	instr_time	temp_blk_write_time; /* time spent writing tmp blocks */
 } BufferUsage;
 
 /*
diff --git a/src/test/regress/expected/explain.out b/src/test/regress/expected/explain.out
index 1734dfee8c..2534f92c72 100644
--- a/src/test/regress/expected/explain.out
+++ b/src/test/regress/expected/explain.out
@@ -502,3 +502,109 @@ select explain_filter('explain (verbose) select * from int8_tbl i8');
  Query Identifier: N
 (3 rows)
 
+-- Test I/O timing for temp buffers
+set track_io_timing = on;
+set work_mem to '4MB';
+select explain_filter('explain (analyze, buffers) select count(*) from generate_series(1, 100000)');
+                                                explain_filter                                                
+--------------------------------------------------------------------------------------------------------------
+ Aggregate  (cost=N.N..N.N rows=N width=N) (actual time=N.N..N.N rows=N loops=N)
+   I/O Timings: temp read=N.N write=N.N
+   ->  Function Scan on generate_series  (cost=N.N..N.N rows=N width=N) (actual time=N.N..N.N rows=N loops=N)
+         I/O Timings: temp read=N.N write=N.N
+ Planning Time: N.N ms
+ Execution Time: N.N ms
+(6 rows)
+
+select explain_filter('explain (analyze, buffers, format json) select count(*) from generate_series(1, 100000)');
+                explain_filter                 
+-----------------------------------------------
+ [                                            +
+   {                                          +
+     "Plan": {                                +
+       "Node Type": "Aggregate",              +
+       "Strategy": "Plain",                   +
+       "Partial Mode": "Simple",              +
+       "Parallel Aware": false,               +
+       "Async Capable": false,                +
+       "Startup Cost": N.N,                   +
+       "Total Cost": N.N,                     +
+       "Plan Rows": N,                        +
+       "Plan Width": N,                       +
+       "Actual Startup Time": N.N,            +
+       "Actual Total Time": N.N,              +
+       "Actual Rows": N,                      +
+       "Actual Loops": N,                     +
+       "Shared Hit Blocks": N,                +
+       "Shared Read Blocks": N,               +
+       "Shared Dirtied Blocks": N,            +
+       "Shared Written Blocks": N,            +
+       "Local Hit Blocks": N,                 +
+       "Local Read Blocks": N,                +
+       "Local Dirtied Blocks": N,             +
+       "Local Written Blocks": N,             +
+       "Temp Read Blocks": N,                 +
+       "Temp Written Blocks": N,              +
+       "I/O Read Time": N.N,                  +
+       "I/O Write Time": N.N,                 +
+       "Temp I/O Read Time": N.N,             +
+       "Temp I/O Write Time": N.N,            +
+       "Plans": [                             +
+         {                                    +
+           "Node Type": "Function Scan",      +
+           "Parent Relationship": "Outer",    +
+           "Parallel Aware": false,           +
+           "Async Capable": false,            +
+           "Function Name": "generate_series",+
+           "Alias": "generate_series",        +
+           "Startup Cost": N.N,               +
+           "Total Cost": N.N,                 +
+           "Plan Rows": N,                    +
+           "Plan Width": N,                   +
+           "Actual Startup Time": N.N,        +
+           "Actual Total Time": N.N,          +
+           "Actual Rows": N,                  +
+           "Actual Loops": N,                 +
+           "Shared Hit Blocks": N,            +
+           "Shared Read Blocks": N,           +
+           "Shared Dirtied Blocks": N,        +
+           "Shared Written Blocks": N,        +
+           "Local Hit Blocks": N,             +
+           "Local Read Blocks": N,            +
+           "Local Dirtied Blocks": N,         +
+           "Local Written Blocks": N,         +
+           "Temp Read Blocks": N,             +
+           "Temp Written Blocks": N,          +
+           "I/O Read Time": N.N,              +
+           "I/O Write Time": N.N,             +
+           "Temp I/O Read Time": N.N,         +
+           "Temp I/O Write Time": N.N         +
+         }                                    +
+       ]                                      +
+     },                                       +
+     "Planning": {                            +
+       "Shared Hit Blocks": N,                +
+       "Shared Read Blocks": N,               +
+       "Shared Dirtied Blocks": N,            +
+       "Shared Written Blocks": N,            +
+       "Local Hit Blocks": N,                 +
+       "Local Read Blocks": N,                +
+       "Local Dirtied Blocks": N,             +
+       "Local Written Blocks": N,             +
+       "Temp Read Blocks": N,                 +
+       "Temp Written Blocks": N,              +
+       "I/O Read Time": N.N,                  +
+       "I/O Write Time": N.N,                 +
+       "Temp I/O Read Time": N.N,             +
+       "Temp I/O Write Time": N.N             +
+     },                                       +
+     "Planning Time": N.N,                    +
+     "Triggers": [                            +
+     ],                                       +
+     "Execution Time": N.N                    +
+   }                                          +
+ ]
+(1 row)
+
+reset track_io_timing;
+reset work_mem;
diff --git a/src/test/regress/sql/explain.sql b/src/test/regress/sql/explain.sql
index 84549c78fa..7cf9f4734c 100644
--- a/src/test/regress/sql/explain.sql
+++ b/src/test/regress/sql/explain.sql
@@ -120,3 +120,11 @@ select explain_filter('explain (verbose) select * from t1 where pg_temp.mysin(f1
 -- Test compute_query_id
 set compute_query_id = on;
 select explain_filter('explain (verbose) select * from int8_tbl i8');
+
+-- Test I/O timing for temp buffers
+set track_io_timing = on;
+set work_mem to '4MB';
+select explain_filter('explain (analyze, buffers) select count(*) from generate_series(1, 100000)');
+select explain_filter('explain (analyze, buffers, format json) select count(*) from generate_series(1, 100000)');
+reset track_io_timing;
+reset work_mem;
-- 
2.24.3 (Apple Git-128)

