Module: Mesa Branch: main Commit: 611545fbfe7648ea95bad28c7099cc775c3024c8 URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=611545fbfe7648ea95bad28c7099cc775c3024c8
Author: Tatsuyuki Ishi <[email protected]> Date: Fri Nov 3 17:20:30 2023 +0900 radv: Implement helpers for shader part caching. Currently, shader part caching logic is duplicated between VS prolog and PS/TCS epilogs. This commit introduces a common abstraction to deduplicate the code. Additionally, there are a few design decisions that divert from the current implementation: 1. A simple mutex is used instead of a reader-writer lock. Prolog/epilog constructions are serialized, removing the need to free duplicate objects in case of a race. 2. A CS-local cache is used to quickly look up an entry without holding a lock. This eliminates locking in over 99% of cases. 3. A set is used to reduce the number of allocations. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26028> --- src/amd/vulkan/radv_shader.c | 69 ++++++++++++++++++++++++++++++++++++++++++++ src/amd/vulkan/radv_shader.h | 29 +++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/src/amd/vulkan/radv_shader.c b/src/amd/vulkan/radv_shader.c index a73a7b2f73e..25a43b1ec08 100644 --- a/src/amd/vulkan/radv_shader.c +++ b/src/amd/vulkan/radv_shader.c @@ -2207,6 +2207,75 @@ fail: return NULL; } +bool +radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops) +{ + cache->ops = ops; + if (!_mesa_set_init(&cache->entries, NULL, cache->ops->hash, cache->ops->equals)) + return false; + simple_mtx_init(&cache->lock, mtx_plain); + return true; +} + +void +radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache) +{ + set_foreach (&cache->entries, entry) + radv_shader_part_unref(device, radv_shader_part_from_cache_entry(entry->key)); + simple_mtx_destroy(&cache->lock); + ralloc_free(cache->entries.table); +} + +/* + * A cache with atomics-free fast path for prolog / epilog lookups. + * + * VS prologs and PS/TCS epilogs are used to support dynamic states. 
In + * particular dynamic blend state is heavily used by Zink. These are called + * every frame as a part of command buffer building, so these functions are + * on the hot path. + * + * Originally this was implemented with a rwlock, but this led to high + * overhead. To avoid locking altogether in the hot path, the cache is done + * at two levels: one at device level, and another at each CS. Access to the + * CS cache is externally synchronized and does not require a lock. + */ +struct radv_shader_part * +radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache, struct set *local_entries, + const void *key) +{ + struct set_entry *local, *global; + bool local_found, global_found; + uint32_t hash = cache->ops->hash(key); + + local = _mesa_set_search_or_add_pre_hashed(local_entries, hash, key, &local_found); + if (local_found) + return radv_shader_part_from_cache_entry(local->key); + + simple_mtx_lock(&cache->lock); + global = _mesa_set_search_or_add_pre_hashed(&cache->entries, hash, key, &global_found); + if (global_found) { + simple_mtx_unlock(&cache->lock); + local->key = global->key; + return radv_shader_part_from_cache_entry(global->key); + } + + struct radv_shader_part *shader_part = cache->ops->create(device, key); + if (!shader_part) { + _mesa_set_remove(&cache->entries, global); + simple_mtx_unlock(&cache->lock); + _mesa_set_remove(local_entries, local); + return NULL; + } + + /* Make the set entry a pointer to the key, so that the hash and equals + * functions from radv_shader_part_cache_ops can be directly used. 
+ */ + global->key = &shader_part->key; + simple_mtx_unlock(&cache->lock); + local->key = &shader_part->key; + return shader_part; +} + static char * radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count) { diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 5a451c71d5c..ea5ab6cc888 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -621,6 +621,12 @@ struct radv_shader { struct radv_shader_part { uint32_t ref_count; + union { + struct radv_vs_prolog_key vs; + struct radv_ps_epilog_key ps; + struct radv_tcs_epilog_key tcs; + } key; + uint64_t va; struct radeon_winsys_bo *bo; @@ -635,6 +641,18 @@ struct radv_shader_part { char *disasm_string; }; +struct radv_shader_part_cache_ops { + uint32_t (*hash)(const void *key); + bool (*equals)(const void *a, const void *b); + struct radv_shader_part *(*create)(struct radv_device *device, const void *key); +}; + +struct radv_shader_part_cache { + simple_mtx_t lock; + struct radv_shader_part_cache_ops *ops; + struct set entries; +}; + struct radv_pipeline_layout; struct radv_shader_stage; @@ -722,6 +740,11 @@ struct radv_shader_part *radv_create_tcs_epilog(struct radv_device *device, cons void radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *shader_part); +bool radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops); +void radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache); +struct radv_shader_part *radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache, + struct set *local_entries, const void *key); + uint64_t radv_shader_get_va(const struct radv_shader *shader); struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc); @@ -776,6 +799,12 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad radv_shader_part_destroy(device, shader_part); } 
+static inline struct radv_shader_part * +radv_shader_part_from_cache_entry(const void *key) +{ + return container_of(key, struct radv_shader_part, key); +} + static inline unsigned get_tcs_input_vertex_stride(unsigned tcs_num_inputs) {
