From 0b7e0fd25b55c8d869afcbc7b86535f87b32442f Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <boekewurm+postgres@gmail.com>
Date: Tue, 4 Feb 2025 17:03:19 +0100
Subject: [PATCH v1 1/2] BufTable: Use optimized hash table for buffer lookups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This uses as little as about 1/3rd of the memory of the current dynahash.c
based table, and removes one level of pointer indirection from lookups. On
64-bit systems the worst case saves only about 16% of the memory, the best
case about 58%.

Future commits will further optimize allocation of entries to reduce cache
misses.
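
As a rough sketch of where those numbers come from (estimated from the struct
sizes, not measured): each HashBucket is 16 bytes (a 4-byte chain head plus a
12-byte embedded element), and the table is sized at ~1.25 buckets per buffer
before rounding up to a power of two, i.e. roughly 20-32 bytes per buffer.
The dynahash table needs about 40 bytes per entry on 64-bit systems (16-byte
HASHELEMENT header plus a 24-byte BufferLookupEnt), plus bucket and segment
pointers, which is roughly consistent with the 16%..58% range above.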
---
 src/backend/storage/buffer/buf_table.c | 459 ++++++++++++++++++++++---
 src/backend/storage/buffer/bufmgr.c    |   4 +-
 src/include/storage/buf_internals.h    |   2 +-
 3 files changed, 414 insertions(+), 51 deletions(-)

diff --git a/src/backend/storage/buffer/buf_table.c b/src/backend/storage/buffer/buf_table.c
index a50955d5286..b237822285a 100644
--- a/src/backend/storage/buffer/buf_table.c
+++ b/src/backend/storage/buffer/buf_table.c
@@ -21,17 +21,117 @@
  */
 #include "postgres.h"
 
+#include "common/hashfn.h"
 #include "storage/buf_internals.h"
+#include "utils/dynahash.h"
 
-/* entry for buffer lookup hashtable */
-typedef struct
+/*
+ * Hash element: an entry in a bucket's singly linked chain.  It stores the
+ * hash of the buffer's tag and a reference to the buffer header that holds
+ * the tagged page; the tag itself lives only in that buffer header.
+ *
+ * Note: when 'buffer' is negative it is a sentinel marking this slot as a
+ * free entry on freelist (-1 - value); e.g. a value of -3 means freelist 2.
+ * To acquire such a slot, lock that freelist, but first confirm that the
+ * value hasn't been updated in the meantime.
+ */
+typedef struct HashElement
+{
+	int32		next;		/* offset of next element in the chain, or -1 */
+	uint32		hashValue;	/* hash of the buffer's tag */
+	pg_atomic_uint32	buffer;	/* buffer id, or a negative freelist sentinel */
+} HashElement;
+
+/*
+ * Free list entry for the hash table.
+ *
+ * Note that this overlays HashElement: 'tag' shares its offset with
+ * HashElement's 'buffer' field and, while the slot is free, holds the
+ * owning freelist's negative sentinel (-1 - freelist index).
+ */
+typedef struct HashFreeListEntry
+{
+	int32		next;		/* next free slot towards firstFree, or sentinel */
+	int32		prev;		/* next free slot towards lastFree, or sentinel */
+	pg_atomic_uint32 tag;	/* owning freelist's sentinel while free */
+} HashFreeListEntry;
+
+StaticAssertDecl(offsetof(HashFreeListEntry, tag) == offsetof(HashElement, buffer),
+				 "tag and buffer should be at the same offset into HashFreeListEntry and HashElement");
+
+/*
+ * A hash bucket: the offset of the first element of its chain (or -1 when
+ * empty), plus one embedded element slot.  The embedded slot is handed out
+ * through the freelists like any other slot, so a bucket's chain does not
+ * necessarily start with, or even contain, its own slot.
+ */
+typedef struct HashBucket
+{
+	int32		bucketStartElement;	/* chain head offset, or -1 if empty */
+	union {
+		HashElement	element;
+		HashFreeListEntry freeEntry;
+	};
+} HashBucket;
+
+
+/*
+ * A spinlock-protected list of free element slots, doubly linked through
+ * HashFreeListEntry.  'myTag' is this list's negative sentinel value
+ * (-1 - freelist index); it terminates the list and is stored in the tag
+ * field of every slot on the list.
+ */
+typedef struct FreeList
+{
+	slock_t	mutex;
+	int32	firstFree;		/* most recently freed slot, or myTag if empty */
+	int32	lastFree;		/* least recently freed slot, or myTag if empty */
+	int32	myTag;			/* (-1 - index of this freelist) */
+} FreeList;
+
+
+typedef union FreeListPadded
+{
+	FreeList	list;
+	char		padding[PG_CACHE_LINE_SIZE];
+} FreeListPadded;
+
+typedef struct BufTableHeader
+{
+	uint32		numBuckets;		/* number of buckets; always a power of two */
+	uint32		mask;			/* numBuckets - 1, for selecting a bucket */
+} BufTableHeader;
+
+/* The buffer lookup table proper, stored in shared memory */
+typedef struct BufTable
+{
+	union {
+		BufTableHeader hdr;
+		char	pad[CACHELINEALIGN(sizeof(BufTableHeader))];
+	};
+	FreeListPadded	freeList[NUM_BUFFER_PARTITIONS];
+	HashBucket	buckets[FLEXIBLE_ARRAY_MEMBER];
+} BufTable;
+
+/* Pointer to the shared lookup table, set up by InitBufTable() */
+static BufTable *bufLookupTable;
+
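+/*
+ * Size the table at ~1.25 buckets per requested entry, rounded up to the
+ * next power of two so that a bucket can be selected with a simple mask.
+ */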
+#define NumBucketsFromSize(size) (1 << my_log2(((size) * 5) / 4))
+
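+/* Map a hash value (or bucket index) to one of the freelists */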
+#define FreeListIdxFromHash(hash) ((hash) % NUM_BUFFER_PARTITIONS)
+
+static inline FreeList *
+BufTableGetFreelist(BufTable *table, uint32 idx)
+{
+	Assert(idx < NUM_BUFFER_PARTITIONS);
+	return &table->freeList[idx].list;
+}
+
+static inline HashBucket *
+BufTableGetBucket(BufTable *table, uint32 hash)
 {
-	BufferTag	key;			/* Tag of a disk page */
-	int			id;				/* Associated buffer ID */
-} BufferLookupEnt;
+	int32		offset = (hash & table->hdr.mask);
+
+	Assert(offset < table->hdr.numBuckets);
+
+	return &table->buckets[offset];
+}
 
-static HTAB *SharedBufHash;
+static inline HashElement *
+BufTableGetElement(BufTable *table, int32 offset)
+{
+	Assert(offset >= 0 && offset < table->hdr.numBuckets);
+	return &table->buckets[offset].element;
+}
 
+static inline HashFreeListEntry *
+BufTableGetFreeEntry(BufTable *table, int32 offset)
+{
+	Assert(offset >= 0 && offset < table->hdr.numBuckets);
+	return &table->buckets[offset].freeEntry;
+}
 
 /*
  * Estimate space needed for mapping hashtable
@@ -40,7 +140,53 @@ static HTAB *SharedBufHash;
 Size
 BufTableShmemSize(int size)
 {
-	return hash_estimate_size(size, sizeof(BufferLookupEnt));
+	uint64		nbuckets = NumBucketsFromSize(size);
+	Size		alloc = 0;
+
+	alloc = add_size(alloc, PG_CACHE_LINE_SIZE);
+	alloc = add_size(alloc, offsetof(BufTable, buckets));
+	alloc = add_size(alloc, mul_size(nbuckets, sizeof(HashBucket)));
+
+	return alloc;
+}
+
+
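+/*
+ * Push the element slot at 'index' onto the head of the freelist selected by
+ * FreeListIdxFromHash(index).  While on a freelist, the slot's tag (which
+ * overlaps HashElement.buffer) holds that freelist's negative sentinel.
+ */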
+static inline void
+add_to_freelist(BufTable *table, HashFreeListEntry *freeEntry, uint32 index)
+{
+	uint32		freeListIdx;
+	FreeList   *freeList;
+	int32		next;
+	int32		freeListTag;
+	Assert(freeEntry == BufTableGetFreeEntry(table, index));
+
+	freeListIdx = FreeListIdxFromHash(index);
+	freeList = BufTableGetFreelist(table, freeListIdx);
+	freeListTag = (-1 - (int32) freeListIdx);
+	freeEntry->next = freeListTag;
+	freeEntry->prev = freeListTag;
+
+	S_LOCK(&freeList->mutex);
+	next = freeList->firstFree;
+	if (next < 0)
+	{
+		pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag);
+		freeList->firstFree = (int32) index;
+		freeList->lastFree = (int32) index;
+	}
+	else
+	{
+		HashFreeListEntry *prevEntry;
+
+		prevEntry = BufTableGetFreeEntry(table, next);
+		Assert(pg_atomic_read_u32(&prevEntry->tag) == (uint32) freeListTag);
+
+		pg_atomic_write_u32(&freeEntry->tag, (uint32) freeListTag);
+
+		freeEntry->prev = freeList->firstFree;
+		freeList->firstFree = prevEntry->next = (int32) index;
+	}
+	S_UNLOCK(&freeList->mutex);
 }
 
 /*
@@ -50,21 +196,132 @@ BufTableShmemSize(int size)
 void
 InitBufTable(int size)
 {
-	HASHCTL		info;
+	void	   *baseptr;
+	bool		found = false;
+	uint32		nbuckets = NumBucketsFromSize(size);
+
+	baseptr = ShmemInitStruct("Shared Buffer Lookup Table",
+							  BufTableShmemSize(size),
+							  &found);
+
+	/* the table proper starts at the first cache line boundary */
+	Assert(PointerIsAligned(baseptr, void *));
+	bufLookupTable = (void *) CACHELINEALIGN((char *) baseptr);
+
+	/*
+	 * The table contents are initialized exactly once, by the postmaster
+	 * during startup.  If the structure already existed (e.g. when
+	 * re-attaching to shared memory), only the pointer above needs setting.
+	 */
+	if (found)
+		return;
+
+	/* nbuckets is always a power of 2, so nbuckets - 1 works as a bucket mask */
+	bufLookupTable->hdr.numBuckets = nbuckets;
+	bufLookupTable->hdr.mask = nbuckets - 1;
+
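+	/* set up the per-partition freelists, initially all empty */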
+	for (int i = 0; i < NUM_BUFFER_PARTITIONS; i++)
+	{
+		FreeList *list = &bufLookupTable->freeList[i].list;
+
+		S_INIT_LOCK(&list->mutex);
+		list->myTag = (-1 - i);
+
+		list->firstFree = list->myTag;
+		list->lastFree = list->myTag;
+	}
+
+	/* mark all buckets empty and put their element slots on the freelists */
+	for (int i = 0; i < nbuckets; i++)
+	{
+		HashBucket *bucket = BufTableGetBucket(bufLookupTable, i);
+		bucket->bucketStartElement = -1;
+		add_to_freelist(bufLookupTable, &bucket->freeEntry, i);
+	}
+}
+
+/* Convert a hash value to a bucket number */
+static inline int32
+calc_bucket(BufTable *table, uint32 hash_val)
+{
+	uint32		bucket;
+
+	bucket = hash_val & table->hdr.mask;
+
+	return (int32) bucket;
+}
+
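+/*
+ * Find the bucket for 'hashvalue' and return the first element of its chain,
+ * or NULL if the chain is empty.  The bucket and the returned element's
+ * offset are passed back through 'hashBucket' and 'idx'.
+ */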
+static inline HashElement *
+initial_lookup(BufTable *table, uint32 hashvalue, HashBucket **hashBucket, int32 *idx)
+{
+	HashBucket *bucket;
+	int32		bucketno;
+	int32		startElementOffset;
+
+	bucketno = calc_bucket(table, hashvalue);
+
+	bucket = BufTableGetBucket(table, bucketno);
+	startElementOffset = bucket->bucketStartElement;
+	*idx = startElementOffset;
+	*hashBucket = bucket;
+
+	if (startElementOffset < 0)
+		return NULL;
+
+	/* fast path: the chain starts in this bucket's own embedded slot */
+	if (likely(startElementOffset == bucketno))
+		return &bucket->element;
+
+	return BufTableGetElement(table, startElementOffset);
+}
+
+/*
+ * Get a new free element from a freelist; the element's offset within the
+ * table is returned in *freeIdx.
+ *
+ * NOTE: This assumes free elements are available.  If that assumption does
+ * not hold, this code will loop forever.
+ */
+static HashElement *
+buftable_get_free_element(BufTable *table, uint32 hashValue, int32 *freeIdx)
+{
+	FreeList   *freeList;
+	HashFreeListEntry *entry;
+
+	/* find a freelist with a free entry, starting at this hash's partition */
+	for (;; hashValue++)
+	{
+		freeList = BufTableGetFreelist(table, FreeListIdxFromHash(hashValue));
+
+		S_LOCK(&freeList->mutex);
+		if (freeList->firstFree != freeList->myTag)
+			break;
+		S_UNLOCK(&freeList->mutex);
+	}
+
+	/* we now hold this freelist's spinlock; take its first entry */
+	entry = BufTableGetFreeEntry(table, freeList->firstFree);
+	*freeIdx = freeList->firstFree;
+
+	Assert(entry->next == freeList->myTag);
 
-	/* assume no locking is needed yet */
+	/* unlink the entry from its freelist */
+	if (entry->prev != freeList->myTag)
+	{
+		HashFreeListEntry *prevEntry = BufTableGetFreeEntry(table, entry->prev);
+		prevEntry->next = entry->next;
+		freeList->firstFree = entry->prev;
+	}
+	else
+	{
+		/* this was the freelist's only entry; mark the list as empty */
+		Assert(freeList->firstFree == freeList->lastFree);
+		Assert(entry->next == entry->prev);
+		freeList->firstFree = freeList->lastFree = freeList->myTag;
+	}
 
-	/* BufferTag maps to Buffer */
-	info.keysize = sizeof(BufferTag);
-	info.entrysize = sizeof(BufferLookupEnt);
-	info.num_partitions = NUM_BUFFER_PARTITIONS;
+	S_UNLOCK(&freeList->mutex);
 
-	SharedBufHash = ShmemInitHash("Shared Buffer Lookup Table",
-								  size, size,
-								  &info,
-								  HASH_ELEM | HASH_BLOBS | HASH_PARTITION);
+	return (HashElement *) entry;
 }
 
+
 /*
  * BufTableHashCode
  *		Compute the hash code associated with a BufferTag
@@ -77,7 +334,7 @@ InitBufTable(int size)
 uint32
 BufTableHashCode(BufferTag *tagPtr)
 {
-	return get_hash_value(SharedBufHash, tagPtr);
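+	/*
+	 * Same as the tag_hash() the old dynahash-based table used via
+	 * HASH_BLOBS, so hash codes are unchanged.
+	 */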
+	return hash_bytes((void *) tagPtr, sizeof(BufferTag));
 }
 
 /*
@@ -89,19 +346,40 @@ BufTableHashCode(BufferTag *tagPtr)
 int
 BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
 {
-	BufferLookupEnt *result;
+	BufTable *table = bufLookupTable;
+	HashBucket *currBucket;
+	HashElement *matchElement;
+	int32 curIdx;
+
+	/*
+	 * Do the initial lookup
+	 */
+	matchElement = initial_lookup(table, hashcode, &currBucket,
+								  &curIdx);
+
+	/* follow the bucket's chain; tags are compared via the buffer descriptors */
+	while (matchElement != NULL)
+	{
+		if (matchElement->hashValue == hashcode &&
+			BufferTagsEqual(tagPtr,
+							&GetBufferDescriptor(pg_atomic_read_u32(&matchElement->buffer))->tag))
+			break;
 
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									tagPtr,
-									hashcode,
-									HASH_FIND,
-									NULL);
+		if (matchElement->next < 0)
+		{
+			matchElement = NULL;
+			break;
+		}
 
-	if (!result)
+		curIdx = matchElement->next;
+		matchElement = BufTableGetElement(table, curIdx);
+	}
+
+	if (!matchElement)
 		return -1;
 
-	return result->id;
+	return (int) pg_atomic_read_u32(&matchElement->buffer);
 }
 
 /*
@@ -117,23 +395,64 @@ BufTableLookup(BufferTag *tagPtr, uint32 hashcode)
 int
 BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
 {
-	BufferLookupEnt *result;
-	bool		found;
+	BufTable *table = bufLookupTable;
+	HashBucket *currBucket;
+	HashElement *matchElement;
+	int32	   *prevNextPtr;
+	int32		matchElemIdx;
 
 	Assert(buf_id >= 0);		/* -1 is reserved for not-in-table */
 	Assert(tagPtr->blockNum != P_NEW);	/* invalid tag */
 
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									tagPtr,
-									hashcode,
-									HASH_ENTER,
-									&found);
+	/*
+	 * Do the initial lookup
+	 */
+	matchElement = initial_lookup(table, hashcode, &currBucket,
+								  &matchElemIdx);
+	prevNextPtr = &currBucket->bucketStartElement;
+
+	while (matchElement != NULL)
+	{
+		/*
+		 * Make sure this is a live element in the chain.  We hold an
+		 * exclusive lock on this partition, so any breakage here means a
+		 * broken invariant.
+		 */
+		Assert(0 <= (int32) pg_atomic_read_u32(&matchElement->buffer));
+
+		if (matchElement->hashValue == hashcode &&
+			BufferTagsEqual(tagPtr,
+							&GetBufferDescriptor(pg_atomic_read_u32(&matchElement->buffer))->tag))
+			break;
 
-	if (found)					/* found something already in the table */
-		return result->id;
+		prevNextPtr = &matchElement->next;
 
-	result->id = buf_id;
+		if (matchElement->next < 0)
+		{
+			Assert(matchElement->next == -1);
+			matchElement = NULL;
+			break;
+		}
+
+		matchElemIdx = matchElement->next;
+		matchElement = BufTableGetElement(table, matchElemIdx);
+	}
+
+	/* Return existing buffer ID if we found a pre-existing buffer */
+	if (matchElement != NULL)
+		return (int) pg_atomic_read_u32(&matchElement->buffer);
+
+	/* else, create a new one */
+	matchElement = buftable_get_free_element(table, hashcode, &matchElemIdx);
+
+	/*
+	 * Link into the bucket's chain.  Filling in the element's fields below
+	 * after linking is fine because we hold this partition's lock
+	 * exclusively, so no one else can walk this chain concurrently.
+	 */
+	*prevNextPtr = matchElemIdx;
+
+	/* copy data into the element */
+	matchElement->next = -1;	/* it's the tail of the bucket */
+	matchElement->hashValue = hashcode;
+
+	pg_atomic_write_u32(&matchElement->buffer, (uint32) ((int32) buf_id));
 
 	return -1;
 }
@@ -145,17 +464,61 @@ BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id)
  * Caller must hold exclusive lock on BufMappingLock for tag's partition
  */
 void
-BufTableDelete(BufferTag *tagPtr, uint32 hashcode)
+BufTableDelete(BufferTag *tagPtr, uint32 hashcode, int buffer)
 {
-	BufferLookupEnt *result;
+	BufTable	   *table = bufLookupTable;
+	HashBucket	   *currBucket;
+	HashElement	   *matchElement;
+	int32		   *prevNextPtr;
+	int32			matchElemIdx;
+
+	/*
+	 * Do the initial lookup
+	 */
+	matchElement = initial_lookup(table, hashcode, &currBucket,
+								  &matchElemIdx);
+	prevNextPtr = &currBucket->bucketStartElement;
+
+	while (matchElement != NULL)
+	{
+		if (matchElement->hashValue == hashcode &&
+			pg_atomic_read_u32(&matchElement->buffer) == buffer)
+			break;
+
+		prevNextPtr = &matchElement->next;
+
+		if (matchElement->next < 0)
+		{
+			matchElement = NULL;
+			break;
+		}
+
+		matchElemIdx = matchElement->next;
+		matchElement = BufTableGetElement(table, matchElemIdx);
+	}
+
+	if (matchElement != NULL)
+	{
+		/*
+		 * We can't test the buffer tag, because that may have been cleared
+		 * by the caller by now.
+		 */
+		Assert(0 <= (int32) pg_atomic_read_u32(&matchElement->buffer));
 
-	result = (BufferLookupEnt *)
-		hash_search_with_hash_value(SharedBufHash,
-									tagPtr,
-									hashcode,
-									HASH_REMOVE,
-									NULL);
+		/* unlink the element from the singly-linked list */
+		*prevNextPtr = matchElement->next;
 
-	if (!result)				/* shouldn't happen */
-		elog(ERROR, "shared buffer hash table corrupted");
+		/*
+		 * The element is already unlinked from the bucket's chain (a singly
+		 * linked list); now return its slot to a freelist (a doubly linked
+		 * list).
+		 */
+		add_to_freelist(table, (HashFreeListEntry *) matchElement,
+						matchElemIdx);
+	}
+	else
+	{
+		elog(PANIC, "shared buffer hash table corrupted");
+	}
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index ee83669992b..362e2d2f134 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -1856,7 +1856,7 @@ retry:
 	 * Remove the buffer from the lookup hashtable, if it was in there.
 	 */
 	if (oldFlags & BM_TAG_VALID)
-		BufTableDelete(&oldTag, oldHash);
+		BufTableDelete(&oldTag, oldHash, buf->buf_id);
 
 	/*
 	 * Done with mapping lock.
@@ -1935,7 +1935,7 @@ InvalidateVictimBuffer(BufferDesc *buf_hdr)
 	Assert(BUF_STATE_GET_REFCOUNT(buf_state) > 0);
 
 	/* finally delete buffer from the buffer mapping table */
-	BufTableDelete(&tag, hash);
+	BufTableDelete(&tag, hash, buf_hdr->buf_id);
 
 	LWLockRelease(partition_lock);
 
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index 1a65342177d..5739bc44c00 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -439,7 +439,7 @@ extern void InitBufTable(int size);
 extern uint32 BufTableHashCode(BufferTag *tagPtr);
 extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
 extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
-extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
+extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode, int buffer);
 
 /* localbuf.c */
 extern bool PinLocalBuffer(BufferDesc *buf_hdr, bool adjust_usagecount);
-- 
2.46.0

