Module: Mesa
Branch: main
Commit: 611545fbfe7648ea95bad28c7099cc775c3024c8
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=611545fbfe7648ea95bad28c7099cc775c3024c8

Author: Tatsuyuki Ishi <[email protected]>
Date:   Fri Nov  3 17:20:30 2023 +0900

radv: Implement helpers for shader part caching.

Currently, shader part caching logic is duplicated between VS prolog and
PS/TCS epilogs. This commit introduces a common abstraction to
deduplicate the code.

Additionally, there are a few design decisions that diverts from the
current implementation:
1. A simple mutex is used instead of reader-writer lock. Prolog/epilog
   constructions are serialized, removing the need to free duplicate
   objects in case of a race.
2. A CS-local cache is used to quickly lookup an entry without holding a
   lock. This eliminates locking in over 99% of cases.
3. A set is used to reduce number of allocations.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26028>

---

 src/amd/vulkan/radv_shader.c | 69 ++++++++++++++++++++++++++++++++++++++++++++
 src/amd/vulkan/radv_shader.h | 29 +++++++++++++++++++
 2 files changed, 98 insertions(+)

diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c
index a73a7b2f73e..25a43b1ec08 100644
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@@ -2207,6 +2207,75 @@ fail:
    return NULL;
 }
 
+bool
+radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct 
radv_shader_part_cache_ops *ops)
+{
+   cache->ops = ops;
+   if (!_mesa_set_init(&cache->entries, NULL, cache->ops->hash, 
cache->ops->equals))
+      return false;
+   simple_mtx_init(&cache->lock, mtx_plain);
+   return true;
+}
+
+void
+radv_shader_part_cache_finish(struct radv_device *device, struct 
radv_shader_part_cache *cache)
+{
+   set_foreach (&cache->entries, entry)
+      radv_shader_part_unref(device, 
radv_shader_part_from_cache_entry(entry->key));
+   simple_mtx_destroy(&cache->lock);
+   ralloc_free(cache->entries.table);
+}
+
+/*
+ * A cache with atomics-free fast path for prolog / epilog lookups.
+ *
+ * VS prologs and PS/TCS epilogs are used to support dynamic states. In
+ * particular dynamic blend state is heavily used by Zink. These are called
+ * every frame as a part of command buffer building, so these functions are
+ * on the hot path.
+ *
+ * Originally this was implemented with a rwlock, but this lead to high
+ * overhead. To avoid locking altogether in the hot path, the cache is done
+ * at two levels: one at device level, and another at each CS. Access to the
+ * CS cache is externally synchronized and do not require a lock.
+ */
+struct radv_shader_part *
+radv_shader_part_cache_get(struct radv_device *device, struct 
radv_shader_part_cache *cache, struct set *local_entries,
+                           const void *key)
+{
+   struct set_entry *local, *global;
+   bool local_found, global_found;
+   uint32_t hash = cache->ops->hash(key);
+
+   local = _mesa_set_search_or_add_pre_hashed(local_entries, hash, key, 
&local_found);
+   if (local_found)
+      return radv_shader_part_from_cache_entry(local->key);
+
+   simple_mtx_lock(&cache->lock);
+   global = _mesa_set_search_or_add_pre_hashed(&cache->entries, hash, key, 
&global_found);
+   if (global_found) {
+      simple_mtx_unlock(&cache->lock);
+      local->key = global->key;
+      return radv_shader_part_from_cache_entry(global->key);
+   }
+
+   struct radv_shader_part *shader_part = cache->ops->create(device, key);
+   if (!shader_part) {
+      _mesa_set_remove(&cache->entries, global);
+      simple_mtx_unlock(&cache->lock);
+      _mesa_set_remove(local_entries, local);
+      return NULL;
+   }
+
+   /* Make the set entry a pointer to the key, so that the hash and equals
+    * functions from radv_shader_part_cache_ops can be directly used.
+    */
+   global->key = &shader_part->key;
+   simple_mtx_unlock(&cache->lock);
+   local->key = &shader_part->key;
+   return shader_part;
+}
+
 static char *
 radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
 {
diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h
index 5a451c71d5c..ea5ab6cc888 100644
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@@ -621,6 +621,12 @@ struct radv_shader {
 struct radv_shader_part {
    uint32_t ref_count;
 
+   union {
+      struct radv_vs_prolog_key vs;
+      struct radv_ps_epilog_key ps;
+      struct radv_tcs_epilog_key tcs;
+   } key;
+
    uint64_t va;
 
    struct radeon_winsys_bo *bo;
@@ -635,6 +641,18 @@ struct radv_shader_part {
    char *disasm_string;
 };
 
+struct radv_shader_part_cache_ops {
+   uint32_t (*hash)(const void *key);
+   bool (*equals)(const void *a, const void *b);
+   struct radv_shader_part *(*create)(struct radv_device *device, const void 
*key);
+};
+
+struct radv_shader_part_cache {
+   simple_mtx_t lock;
+   struct radv_shader_part_cache_ops *ops;
+   struct set entries;
+};
+
 struct radv_pipeline_layout;
 struct radv_shader_stage;
 
@@ -722,6 +740,11 @@ struct radv_shader_part *radv_create_tcs_epilog(struct 
radv_device *device, cons
 
 void radv_shader_part_destroy(struct radv_device *device, struct 
radv_shader_part *shader_part);
 
+bool radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct 
radv_shader_part_cache_ops *ops);
+void radv_shader_part_cache_finish(struct radv_device *device, struct 
radv_shader_part_cache *cache);
+struct radv_shader_part *radv_shader_part_cache_get(struct radv_device 
*device, struct radv_shader_part_cache *cache,
+                                                    struct set *local_entries, 
const void *key);
+
 uint64_t radv_shader_get_va(const struct radv_shader *shader);
 struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc);
 
@@ -776,6 +799,12 @@ radv_shader_part_unref(struct radv_device *device, struct 
radv_shader_part *shad
       radv_shader_part_destroy(device, shader_part);
 }
 
+static inline struct radv_shader_part *
+radv_shader_part_from_cache_entry(const void *key)
+{
+   return container_of(key, struct radv_shader_part, key);
+}
+
 static inline unsigned
 get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
 {

Reply via email to