#!/usr/bin/bash -x

# exit on error
set -e

DATADIR=/mnt/pgdata/data-skip-scan

# number of rows to generate
ROWS="1000000 10000000"

# number of distinct values
DISTINCT="100 1000 10000 100000 1000000 10000000"

# how skewed should the dataset be?
# 1 means no skew (uniform distribution), >1 means skew
SKEW="1 2 4 8"

# number of random queries to run for each combination of parameters
QUERIES=10

# number of runs for each query
RUNS=3

# number of values in the IN() clause
VALUES="1 2 4 8 16 32 64 128 256 512 1024"

RESULTS=results-$(date +%s).csv

# simple deterministic pseudorandom generator - seed and max are enough
# to fully determine the result for a particular value
function prng() {
	seed=$1
	max=$2
	value=$3

	echo $(((seed + 786433 * value) % max))
}

SEQ=0

for r in $ROWS; do

	for d in $DISTINCT; do

		# too many distinct values (more than rows)
		if [[ $d -gt $r ]]; then
			continue
		fi

		for s in $SKEW; do

			echo "========== rows $r distinct $d skew $s ==========" >> debug.log 2>&1

			table="t_$r_$d_$s"

			c=$(psql test -t -A -c "select count(*) from pg_class where relname = '$table'")

			if [ "$c" == "0" ]; then
				psql test -c "create table $table (id bigint, val text)" >> debug.log 2>&1
				psql test -c "insert into $table select pow(random(), $s) * $d, md5(i::text) from generate_series(1,$r) s(i)" >> debug.log 2>&1
				psql test -c "create index on $table (id)" >> debug.log 2>&1
				psql test -c "vacuum analyze" >> debug.log 2>&1
				psql test -c "checkpoint" >> debug.log 2>&1
			fi

			for v in $VALUES; do

				# too many values (more than distinct values)
				if [[ $v -gt $d ]]; then
					continue
				fi

				for q in $(seq 1 $QUERIES); do


					# max parallel workers
					for w in 0 4; do

						# seed for this particular combination of parameters
						seed=$(((389 * r) + (24593 * d) + (s * 769) + (v * 3079) + (q * 98317) + (w * 1543)))

						# generate the right number of random values for the IN clause
						vals=""
						for i in $(seq 1 $v); do

							if [ "$vals" != "" ]; then
								vals="$vals,"
							fi

							x=$(prng $seed $d $i)

							vals="$vals $x"
						done

						for z in $(seq 1 $RUNS); do

							# index scan
							sql="select * from $table where id in ($vals)"

							psql test > explain.log <<EOF
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain $sql;
EOF

							# count different plan types
							is=$(grep 'Index Scan' explain.log | wc -l)
							ios=$(grep 'Index Only Scan' explain.log | wc -l)
							gather=$(grep 'Gather' explain.log | wc -l)

							echo "===== $sql run $r index-scan =====" >> explains.log 2>&1
							cat explain.log  >> explains.log 2>&1

							# hash of results

							psql -t -A test > output <<EOF
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
$sql;
EOF

							h=$(md5sum output | awk '{print $1}')


							# uncached

							pg_ctl -D $DATADIR -l pg.log restart >> debug.log 2>&1
							sudo ./drop-caches.sh >> debug.log 2>&1

							psql test > explain-analyze.log <<EOF
\timing on
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain (analyze, timing off) $sql;
EOF

							workers=$(grep 'Workers Launched' explain-analyze.log | awk '{print $3}')
							if [ "$workers" == "" ]; then workers="0"; fi

							echo "===== $sql run $r index-scan not-ncached =====" >> explains-analyze.log 2>&1
							cat explain-analyze.log  >> explains-analyze.log 2>&1

							t=$(grep '^Time:' explain-analyze.log | awk '{print $2}' | tail -n 1)

							SEQ=$((SEQ+1))
							echo $SEQ $(date +%s) $r $d $s $v $q $seed $w $z index-scan not-cached $h $t $is $ios $gather $workers >> $RESULTS

							# cached

							psql test > explain-analyze.log <<EOF
\timing on
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain (analyze, timing off) $sql;
EOF

							workers=$(grep 'Workers Launched' explain-analyze.log | awk '{print $3}')
							if [ "$workers" == "" ]; then workers="0"; fi

							echo "===== $sql run $r index-scan cached =====" >> explains-analyze.log 2>&1
							cat explain-analyze.log  >> explains-analyze.log 2>&1

							t=$(grep '^Time:' explain-analyze.log | awk '{print $2}' | tail -n 1)

							SEQ=$((SEQ+1))
							echo $SEQ $(date +%s) $r $d $s $v $q $seed $w $z index-scan cached $h $t $is $ios $gather $workers >> $RESULTS

							# index-only scan

							sql="select id from $table where id in ($vals)"

							psql test > explain.log <<EOF
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain $sql;
EOF

							# count different plan types
							is=$(grep 'Index Scan' explain.log | wc -l)
							ios=$(grep 'Index Only Scan' explain.log | wc -l)
							gather=$(grep 'Gather' explain.log | wc -l)

							# hash of results

							psql -t -A test > output <<EOF
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
$sql;
EOF

							h=$(md5sum output | awk '{print $1}')

							echo "===== $sql run $r index-only-scan =====" >> explains.log 2>&1
							cat explain.log  >> explains.log 2>&1

							# uncached

							pg_ctl -D $DATADIR -l pg.log restart >> debug.log 2>&1
							sudo ./drop-caches.sh >> debug.log 2>&1

							psql test > explain-analyze.log <<EOF
\timing on
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain (analyze, timing off) $sql;
EOF

							workers=$(grep 'Workers Launched' explain-analyze.log | awk '{print $3}')
							if [ "$workers" == "" ]; then workers="0"; fi

							echo "===== $sql run $r index-only-scan not-cached =====" >> explains-analyze.log 2>&1
							cat explain-analyze.log  >> explains-analyze.log 2>&1

							t=$(grep '^Time:' explain-analyze.log | awk '{print $2}' | tail -n 1)

							SEQ=$((SEQ+1))
							echo $SEQ $(date +%s) $r $d $s $v $q $seed $w $z index-only-scan not-cached $h $t $is $ios $gather $workers >> $RESULTS

							# cached

							pg_ctl -D $DATADIR -l pg.log restart >> debug.log 2>&1
							sudo ./drop-caches.sh >> debug.log 2>&1

							psql test > explain-analyze.log <<EOF
\timing on
-- to make parallel queries likely (if enabled)
SET parallel_setup_cost = 0;
SET parallel_tuple_cost = 0;
SET min_parallel_table_scan_size = 0;
SET min_parallel_index_scan_size = 0;
-- force index scan
SET max_parallel_workers_per_gather = $w;
SET enable_bitmapscan = off;
SET enable_seqscan = off;
explain (analyze, timing off) $sql;
EOF

							workers=$(grep 'Workers Launched' explain-analyze.log | awk '{print $3}')
							if [ "$workers" == "" ]; then workers="0"; fi

							echo "===== $sql run $r index-only-scan cached =====" >> explains-analyze.log 2>&1
							cat explain-analyze.log  >> explains-analyze.log 2>&1

							t=$(grep '^Time:' explain-analyze.log | awk '{print $2}' | tail -n 1)

							SEQ=$((SEQ+1))
							echo $SEQ $(date +%s) $r $d $s $v $q $seed $w $z index-only-scan cached $h $t $is $ios $gather $workers >> $RESULTS

						done

					done

				done

			done

		done

	done

done
