Module: Mesa Branch: main Commit: 9ef621ec2ebc61ce89b6cb05608fb7961b5f67bc URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=9ef621ec2ebc61ce89b6cb05608fb7961b5f67bc
Author: Jesse Natalie <[email protected]> Date: Fri Nov 10 14:48:07 2023 -0800 d3d12: ARB_query_buffer_object and GL4.4 Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26156> --- docs/features.txt | 12 +- src/gallium/drivers/d3d12/d3d12_batch.cpp | 3 + .../drivers/d3d12/d3d12_compute_transforms.cpp | 166 ++++++++++++++++ .../drivers/d3d12/d3d12_compute_transforms.h | 21 +- src/gallium/drivers/d3d12/d3d12_context.cpp | 2 + src/gallium/drivers/d3d12/d3d12_context.h | 1 + src/gallium/drivers/d3d12/d3d12_query.cpp | 216 +++++++++++++-------- src/gallium/drivers/d3d12/d3d12_screen.cpp | 8 +- 8 files changed, 339 insertions(+), 90 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index 6c77e127126..a7bb309c4b3 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -192,12 +192,12 @@ GL 4.3, GLSL 4.30 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v GL_ARB_vertex_attrib_binding DONE (all drivers) -GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+ +GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+, d3d12 GL_MAX_VERTEX_ATTRIB_STRIDE DONE (all drivers) - GL_ARB_buffer_storage DONE (freedreno, nv50, v3d, vc4, lima, panfrost, asahi, d3d12, softpipe, etnaviv, crocus) + GL_ARB_buffer_storage DONE (freedreno, nv50, v3d, vc4, lima, panfrost, asahi, softpipe, etnaviv, crocus) GL_ARB_clear_texture DONE (all drivers) - GL_ARB_enhanced_layouts DONE (freedreno/a3xx+, nv50, softpipe, crocus, d3d12) + GL_ARB_enhanced_layouts DONE (freedreno/a3xx+, nv50, softpipe, crocus) - compile-time constant expressions DONE - explicit byte offsets for blocks DONE - forced alignment within blocks DONE @@ -206,9 +206,9 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v - input/output block locations DONE GL_ARB_multi_bind DONE (all drivers) GL_ARB_query_buffer_object DONE (freedreno/a6xx) - GL_ARB_texture_mirror_clamp_to_edge DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi, crocus, d3d12) - GL_ARB_texture_stencil8 DONE (freedreno, nv50, softpipe, v3d, panfrost, d3d12, asahi) - GL_ARB_vertex_type_10f_11f_11f_rev DONE (freedreno, nv50, softpipe, panfrost, d3d12, asahi, crocus) + GL_ARB_texture_mirror_clamp_to_edge DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi, crocus) + GL_ARB_texture_stencil8 DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi) + GL_ARB_vertex_type_10f_11f_11f_rev DONE (freedreno, nv50, softpipe, panfrost, asahi, crocus) GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+ diff --git a/src/gallium/drivers/d3d12/d3d12_batch.cpp b/src/gallium/drivers/d3d12/d3d12_batch.cpp index 50be53be027..373255a0740 100644 --- a/src/gallium/drivers/d3d12/d3d12_batch.cpp +++ b/src/gallium/drivers/d3d12/d3d12_batch.cpp @@ -230,6 +230,9 @@ d3d12_start_batch(struct d3d12_context *ctx, struct d3d12_batch *batch) batch->has_errors = true; return; } + if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist2)))) { + ctx->cmdlist2 = nullptr; + } if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist8)))) { ctx->cmdlist8 = nullptr; } diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp index 089a47a0404..a444c756b69 100644 --- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp +++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp @@ -212,6 +212,170 @@ get_draw_auto(const nir_shader_compiler_options *options) return b.shader; } +static struct nir_shader * +get_query_resolve(const nir_shader_compiler_options *options, const d3d12_compute_transform_key *key) +{ + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "QueryResolve"); + + uint32_t bit_size = key->query_resolve.is_64bit ? 64 : 32; + const struct glsl_type *value_type = glsl_uintN_t_type(bit_size); + + assert(!key->query_resolve.is_resolve_in_place || + (key->query_resolve.is_64bit && key->query_resolve.num_subqueries == 1)); + assert(key->query_resolve.num_subqueries == 1 || + key->query_resolve.pipe_query_type == PIPE_QUERY_PRIMITIVES_GENERATED); + assert(key->query_resolve.num_subqueries <= 3); /* Fourth state var is an output offset */ + + nir_variable *inputs[3]; + for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) { + /* Inputs are always 64-bit */ + inputs[i] = nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(glsl_uint64_t_type(), 0, 8), "input"); + inputs[i]->data.binding = i; + } + nir_variable *output = inputs[0]; + if (!key->query_resolve.is_resolve_in_place) { + output = nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(value_type, 0, bit_size / 8), "output"); + output->data.binding = key->query_resolve.num_subqueries; + } + + /* How many entries in each sub-query is passed via root constants */ + nir_variable *state_var = nullptr; + nir_def *state_var_data = d3d12_get_state_var(&b, D3D12_STATE_VAR_TRANSFORM_GENERIC0, "state_var", glsl_uvec4_type(), &state_var); + + /* For in-place resolves, we resolve each field of the query. Otherwise, resolve one field into the dest */ + nir_variable *results[sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64)]; + uint32_t num_result_values = 1; + + if (key->query_resolve.is_resolve_in_place) { + if (key->query_resolve.pipe_query_type == PIPE_QUERY_PIPELINE_STATISTICS) + num_result_values = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64); + else if (key->query_resolve.pipe_query_type == PIPE_QUERY_SO_STATISTICS) + num_result_values = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64); + } + + uint32_t var_bit_size = key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED || + key->query_resolve.pipe_query_type == PIPE_QUERY_TIMESTAMP ? 64 : bit_size; + for (uint32_t i = 0; i < num_result_values; ++i) { + results[i] = nir_local_variable_create(b.impl, glsl_uintN_t_type(var_bit_size), "result"); + nir_store_var(&b, results[i], nir_imm_intN_t(&b, 0, var_bit_size), 1); + } + + /* For each subquery... */ + for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) { + nir_def *num_results = nir_channel(&b, state_var_data, i); + + uint32_t subquery_index = key->query_resolve.num_subqueries == 1 ? + key->query_resolve.single_subquery_index : i; + uint32_t base_offset = 0; + uint32_t stride = 0; + switch (key->query_resolve.pipe_query_type) { + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE: + case PIPE_QUERY_TIMESTAMP: + stride = 1; + break; + case PIPE_QUERY_TIME_ELAPSED: + stride = 2; + break; + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_PRIMITIVES_EMITTED: + stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64); + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + if (subquery_index == 0) + stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64); + else + stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64); + if (!key->query_resolve.is_resolve_in_place) { + if (subquery_index == 1) + base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, GSPrimitives) / sizeof(UINT64); + else if (subquery_index == 2) + base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, IAPrimitives) / sizeof(UINT64); + } + break; + case PIPE_QUERY_PIPELINE_STATISTICS: + stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64); + break; + default: + unreachable("Unhandled query resolve"); + } + + if (!key->query_resolve.is_resolve_in_place && key->query_resolve.num_subqueries == 1) + base_offset = key->query_resolve.single_result_field_offset; + + nir_def *base_array_index = nir_imm_int(&b, base_offset); + + /* For each query result in this subquery... */ + nir_variable *loop_counter = nir_local_variable_create(b.impl, glsl_uint_type(), "loop_counter"); + nir_store_var(&b, loop_counter, nir_imm_int(&b, 0), 1); + nir_loop *loop = nir_push_loop(&b); + + nir_def *loop_counter_value = nir_load_var(&b, loop_counter); + nir_if *nif = nir_push_if(&b, nir_ieq(&b, loop_counter_value, num_results)); + nir_jump(&b, nir_jump_break); + nir_pop_if(&b, nif); + + /* For each field in the query result, accumulate */ + nir_def *array_index = nir_iadd(&b, nir_imul_imm(&b, loop_counter_value, stride), base_array_index); + for (uint32_t j = 0; j < num_result_values; ++j) { + nir_def *new_value; + if (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) { + assert(j == 0 && i == 0); + nir_def *start = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, array_index, 8)); + nir_def *end = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, 1), 8)); + new_value = nir_isub(&b, end, start); + } else { + new_value = nir_u2uN(&b, nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, j), 8)), var_bit_size); + } + nir_store_var(&b, results[j], nir_iadd(&b, nir_load_var(&b, results[j]), new_value), 1); + } + + nir_store_var(&b, loop_counter, nir_iadd_imm(&b, loop_counter_value, 1), 1); + nir_pop_loop(&b, loop); + } + + /* Results are accumulated, now store the final values */ + nir_def *output_base_index = nir_channel(&b, state_var_data, 3); + for (uint32_t i = 0; i < num_result_values; ++i) { + /* When resolving in-place, resolve each field, otherwise just write the one result */ + uint32_t field_offset = key->query_resolve.is_resolve_in_place ? + i : key->query_resolve.single_result_field_offset; + + /* When resolving time elapsed in-place, write [0, time], as the only special case */ + if (key->query_resolve.is_resolve_in_place && + key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) { + nir_store_ssbo(&b, nir_imm_int64(&b, 0), nir_imm_int(&b, output->data.binding), + nir_imul_imm(&b, output_base_index, bit_size / 8), 1, (gl_access_qualifier)0, bit_size / 8, 0); + field_offset++; + } + nir_def *result_val = nir_load_var(&b, results[i]); + if (!key->query_resolve.is_resolve_in_place && + (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED || + key->query_resolve.pipe_query_type == PIPE_QUERY_TIMESTAMP)) { + result_val = nir_f2u64(&b, nir_fmul_imm(&b, nir_u2f64(&b, result_val), key->query_resolve.timestamp_multiplier)); + + if (!key->query_resolve.is_64bit) { + nir_alu_type rounding_type = key->query_resolve.is_signed ? nir_type_int : nir_type_uint; + nir_alu_type src_round = (nir_alu_type)(rounding_type | 64); + nir_alu_type dst_round = (nir_alu_type)(rounding_type | bit_size); + result_val = nir_convert_alu_types(&b, bit_size, result_val, src_round, dst_round, nir_rounding_mode_undef, true); + } + } + nir_store_ssbo(&b, result_val, nir_imm_int(&b, output->data.binding), + nir_imul_imm(&b, nir_iadd_imm(&b, output_base_index, field_offset), bit_size / 8), + 1, (gl_access_qualifier)0, bit_size / 8, 0); + } + + nir_validate_shader(b.shader, "creation"); + b.shader->info.num_ssbos = key->query_resolve.num_subqueries + !key->query_resolve.is_resolve_in_place; + b.shader->info.num_ubos = 0; + + NIR_PASS_V(b.shader, nir_lower_convert_alu_types, NULL); + + return b.shader; +} + static struct nir_shader * create_compute_transform(const nir_shader_compiler_options *options, const d3d12_compute_transform_key *key) { @@ -224,6 +388,8 @@ create_compute_transform(const nir_shader_compiler_options *options, const d3d12 return get_fake_so_buffer_vertex_count(options); case d3d12_compute_transform_type::draw_auto: return get_draw_auto(options); + case d3d12_compute_transform_type::query_resolve: + return get_query_resolve(options, key); default: unreachable("Invalid transform"); } diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h index d8daa7d6840..011fbba90cf 100644 --- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h +++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h @@ -45,6 +45,8 @@ enum class d3d12_compute_transform_type fake_so_buffer_vertex_count, /* Append a buffer filled size with (vertex count, 1, 0, 0) */ draw_auto, + /* Accumulate queries together and write a 32-bit or 64-bit result */ + query_resolve, max, }; @@ -67,6 +69,23 @@ struct d3d12_compute_transform_key uint16_t size; } ranges[PIPE_MAX_SO_OUTPUTS]; } fake_so_buffer_copy_back; + + struct { + /* true means the accumulation should be done as uint64, else uint32. */ + uint8_t is_64bit : 1; + /* true means output is written where input[0] was, else output is a separate buffer. + * true also means all fields are accumulated, else single_result_field_offset determines + * which field is resolved. Implies num_subqueries == 1. */ + uint8_t is_resolve_in_place : 1; + /* Indicates how many subqueries to accumulate together into a final result. When + * set to 1, single_subquery_index determines where the data comes from. */ + uint8_t num_subqueries : 2; + uint8_t pipe_query_type : 4; + uint8_t single_subquery_index : 2; + uint8_t single_result_field_offset : 4; + uint8_t is_signed : 1; + double timestamp_multiplier; + } query_resolve; }; }; @@ -83,7 +102,7 @@ struct d3d12_compute_transform_save_restore { struct d3d12_shader_selector *cs; struct pipe_constant_buffer cbuf0; - struct pipe_shader_buffer ssbos[2]; + struct pipe_shader_buffer ssbos[4]; }; void diff --git a/src/gallium/drivers/d3d12/d3d12_context.cpp b/src/gallium/drivers/d3d12/d3d12_context.cpp index af16adf8622..d770b258e74 100644 --- a/src/gallium/drivers/d3d12/d3d12_context.cpp +++ b/src/gallium/drivers/d3d12/d3d12_context.cpp @@ -98,6 +98,8 @@ d3d12_context_destroy(struct pipe_context *pctx) for (unsigned i = 0; i < ARRAY_SIZE(ctx->batches); ++i) d3d12_destroy_batch(ctx, &ctx->batches[i]); ctx->cmdlist->Release(); + if (ctx->cmdlist2) + ctx->cmdlist2->Release(); if (ctx->cmdlist8) ctx->cmdlist8->Release(); d3d12_descriptor_pool_free(ctx->sampler_pool); diff --git a/src/gallium/drivers/d3d12/d3d12_context.h b/src/gallium/drivers/d3d12/d3d12_context.h index 2f201d7b2d5..3429c0bfccd 100644 --- a/src/gallium/drivers/d3d12/d3d12_context.h +++ b/src/gallium/drivers/d3d12/d3d12_context.h @@ -257,6 +257,7 @@ struct d3d12_context { uint64_t submit_id; ID3D12GraphicsCommandList *cmdlist; + ID3D12GraphicsCommandList2 *cmdlist2; ID3D12GraphicsCommandList8 *cmdlist8; ID3D12GraphicsCommandList *state_fixup_cmdlist; diff --git a/src/gallium/drivers/d3d12/d3d12_query.cpp b/src/gallium/drivers/d3d12/d3d12_query.cpp index c596e0434d5..6e50c715342 100644 --- a/src/gallium/drivers/d3d12/d3d12_query.cpp +++ b/src/gallium/drivers/d3d12/d3d12_query.cpp @@ -23,6 +23,7 @@ #include "d3d12_query.h" #include "d3d12_compiler.h" +#include "d3d12_compute_transforms.h" #include "d3d12_context.h" #include "d3d12_resource.h" #include "d3d12_screen.h" @@ -184,9 +185,9 @@ d3d12_release_query(struct pipe_context *pctx, } static bool -accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent, - unsigned sub_query, - union pipe_query_result *result, bool write) +accumulate_subresult_cpu(struct d3d12_context *ctx, struct d3d12_query *q_parent, + unsigned sub_query, + union pipe_query_result *result) { struct pipe_transfer *transfer = NULL; struct d3d12_screen *screen = d3d12_screen(ctx->base.screen); @@ -194,8 +195,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent, unsigned access = PIPE_MAP_READ; void *results; - if (write) - access |= PIPE_MAP_WRITE; access |= PIPE_MAP_UNSYNCHRONIZED; results = pipe_buffer_map_range(&ctx->base, q->buffer, q->buffer_offset, @@ -256,32 +255,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent, } } - if (write) { - if (q->d3d12qtype == D3D12_QUERY_TYPE_PIPELINE_STATISTICS) { - results_stats[0].IAVertices = result->pipeline_statistics.ia_vertices; - results_stats[0].IAPrimitives = result->pipeline_statistics.ia_primitives; - results_stats[0].VSInvocations = result->pipeline_statistics.vs_invocations; - results_stats[0].GSInvocations = result->pipeline_statistics.gs_invocations; - results_stats[0].GSPrimitives = result->pipeline_statistics.gs_primitives; - results_stats[0].CInvocations = result->pipeline_statistics.c_invocations; - results_stats[0].CPrimitives = result->pipeline_statistics.c_primitives; - results_stats[0].PSInvocations = result->pipeline_statistics.ps_invocations; - results_stats[0].HSInvocations = result->pipeline_statistics.hs_invocations; - results_stats[0].DSInvocations = result->pipeline_statistics.ds_invocations; - results_stats[0].CSInvocations = result->pipeline_statistics.cs_invocations; - } else if (d3d12_query_heap_type(q_parent->type, sub_query) == D3D12_QUERY_HEAP_TYPE_SO_STATISTICS) { - results_so[0].NumPrimitivesWritten = result->so_statistics.num_primitives_written; - results_so[0].PrimitivesStorageNeeded = result->so_statistics.primitives_storage_needed; - } else { - if (unlikely(q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP)) { - results_u64[0] = 0; - results_u64[1] = result->u64; - } else { - results_u64[0] = result->u64; - } - } - } - pipe_buffer_unmap(&ctx->base, transfer); if (q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP) @@ -291,33 +264,33 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent, } static bool -accumulate_result(struct d3d12_context *ctx, struct d3d12_query *q, - union pipe_query_result *result, bool write) +accumulate_result_cpu(struct d3d12_context *ctx, struct d3d12_query *q, + union pipe_query_result *result) { union pipe_query_result local_result; switch (q->type) { case PIPE_QUERY_PRIMITIVES_GENERATED: - if (!accumulate_subresult(ctx, q, 0, &local_result, write)) + if (!accumulate_subresult_cpu(ctx, q, 0, &local_result)) return false; result->u64 = local_result.so_statistics.primitives_storage_needed; - if (!accumulate_subresult(ctx, q, 1, &local_result, write)) + if (!accumulate_subresult_cpu(ctx, q, 1, &local_result)) return false; result->u64 += local_result.pipeline_statistics.gs_primitives; - if (!accumulate_subresult(ctx, q, 2, &local_result, write)) + if (!accumulate_subresult_cpu(ctx, q, 2, &local_result)) return false; result->u64 += local_result.pipeline_statistics.ia_primitives; return true; case PIPE_QUERY_PRIMITIVES_EMITTED: - if (!accumulate_subresult(ctx, q, 0, &local_result, write)) + if (!accumulate_subresult_cpu(ctx, q, 0, &local_result)) return false; result->u64 = local_result.so_statistics.num_primitives_written; return true; default: assert(num_sub_queries(q->type) == 1); - return accumulate_subresult(ctx, q, 0, result, write); + return accumulate_subresult_cpu(ctx, q, 0, result); } } @@ -362,21 +335,99 @@ query_ensure_ready(struct d3d12_screen* screen, struct d3d12_context* ctx, struc return true; } +static void +accumulate_subresult_gpu(struct d3d12_context *ctx, struct d3d12_query *q_parent, + unsigned sub_query) +{ + d3d12_compute_transform_save_restore save; + d3d12_save_compute_transform_state(ctx, &save); + + d3d12_compute_transform_key key; + memset(&key, 0, sizeof(key)); + key.type = d3d12_compute_transform_type::query_resolve; + key.query_resolve.is_64bit = true; + key.query_resolve.is_resolve_in_place = true; + key.query_resolve.num_subqueries = 1; + key.query_resolve.pipe_query_type = q_parent->type; + key.query_resolve.single_subquery_index = sub_query; + key.query_resolve.is_signed = false; + key.query_resolve.timestamp_multiplier = 1.0; + ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, &key)); + + ctx->transform_state_vars[0] = q_parent->subqueries[sub_query].curr_query; + ctx->transform_state_vars[1] = 0; + ctx->transform_state_vars[2] = 0; + ctx->transform_state_vars[3] = 0; + + pipe_shader_buffer new_cs_ssbos[1]; + new_cs_ssbos[0].buffer = q_parent->subqueries[sub_query].buffer; + new_cs_ssbos[0].buffer_offset = q_parent->subqueries[sub_query].buffer_offset; + new_cs_ssbos[0].buffer_size = q_parent->subqueries[sub_query].query_size * q_parent->subqueries[sub_query].num_queries; + ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, 1, new_cs_ssbos, 1); + + pipe_grid_info grid = {}; + grid.block[0] = grid.block[1] = grid.block[2] = 1; + grid.grid[0] = grid.grid[1] = grid.grid[2] = 1; + ctx->base.launch_grid(&ctx->base, &grid); + + d3d12_restore_compute_transform_state(ctx, &save); +} + +static void +accumulate_result_gpu(struct d3d12_context *ctx, struct d3d12_query *q, + struct pipe_resource *dst, uint32_t dst_offset, + int index, enum pipe_query_value_type result_type) +{ + d3d12_compute_transform_save_restore save; + d3d12_save_compute_transform_state(ctx, &save); + + d3d12_compute_transform_key key; + memset(&key, 0, sizeof(key)); + key.type = d3d12_compute_transform_type::query_resolve; + key.query_resolve.is_64bit = result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64; + key.query_resolve.is_resolve_in_place = false; + key.query_resolve.num_subqueries = num_sub_queries(q->type); + key.query_resolve.pipe_query_type = q->type; + key.query_resolve.single_result_field_offset = index; + key.query_resolve.is_signed = result_type == PIPE_QUERY_TYPE_I32 || result_type == PIPE_QUERY_TYPE_I64; + key.query_resolve.timestamp_multiplier = d3d12_screen(ctx->base.screen)->timestamp_multiplier; + ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, &key)); + + pipe_shader_buffer new_cs_ssbos[4]; + uint32_t num_ssbos = 0; + for (uint32_t i = 0; i < key.query_resolve.num_subqueries; ++i) { + ctx->transform_state_vars[i] = q->subqueries[i].curr_query; + new_cs_ssbos[num_ssbos].buffer = q->subqueries[i].buffer; + new_cs_ssbos[num_ssbos].buffer_offset = q->subqueries[i].buffer_offset; + new_cs_ssbos[num_ssbos].buffer_size = q->subqueries[i].query_size * q->subqueries[i].num_queries; + num_ssbos++; + } + + assert(dst_offset % (key.query_resolve.is_64bit ? 8 : 4) == 0); + ctx->transform_state_vars[3] = dst_offset / (key.query_resolve.is_64bit ? 8 : 4); + + new_cs_ssbos[num_ssbos].buffer = dst; + new_cs_ssbos[num_ssbos].buffer_offset = 0; + new_cs_ssbos[num_ssbos].buffer_size = dst->width0; + num_ssbos++; + + ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, num_ssbos, new_cs_ssbos, 1 << (num_ssbos - 1)); + + pipe_grid_info grid = {}; + grid.block[0] = grid.block[1] = grid.block[2] = 1; + grid.grid[0] = grid.grid[1] = grid.grid[2] = 1; + ctx->base.launch_grid(&ctx->base, &grid); + + d3d12_restore_compute_transform_state(ctx, &save); +} + static void begin_subquery(struct d3d12_context *ctx, struct d3d12_query *q_parent, unsigned sub_query) { struct d3d12_query_impl *q = &q_parent->subqueries[sub_query]; if (q->curr_query == q->num_queries) { - union pipe_query_result result; - - query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false); - d3d12_foreach_submitted_batch(ctx, old_batch) { - if (old_batch->fence && old_batch->fence->value <= q_parent->fence_value) - d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE); - } - /* Accumulate current results and store in first slot */ - accumulate_subresult(ctx, q_parent, sub_query, &result, true); + accumulate_subresult_gpu(ctx, q_parent, sub_query); q->curr_query = 1; } @@ -412,18 +463,9 @@ begin_timer_query(struct d3d12_context *ctx, struct d3d12_query *q_parent, bool q->curr_query = 0; query_index = 0; } else if (query_index == q->num_queries) { - union pipe_query_result result; - /* Accumulate current results and store in first slot */ - - query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false); - d3d12_foreach_submitted_batch(ctx, old_batch) { - if (old_batch->fence && old_batch->fence->value <= q_parent->fence_value) - d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE); - } - - accumulate_subresult(ctx, q_parent, 0, &result, true); - q->curr_query = 2; + accumulate_subresult_gpu(ctx, q_parent, 0); + q->curr_query = 1; } ctx->cmdlist->EndQuery(q->query_heap, q->d3d12qtype, query_index); @@ -530,7 +572,39 @@ d3d12_get_query_result(struct pipe_context *pctx, if (!query_ensure_ready(screen, ctx, query, wait)) return false; - return accumulate_result(ctx, query, result, false); + return accumulate_result_cpu(ctx, query, result); +} + +static void +d3d12_get_query_result_resource(struct pipe_context *pctx, + struct pipe_query *q, + enum pipe_query_flags flags, + enum pipe_query_value_type result_type, + int index, + struct pipe_resource *resource, + unsigned offset) +{ + struct d3d12_context *ctx = d3d12_context(pctx); + + if (index == -1) { + /* Write the "available" bit, which is always true */ + struct d3d12_resource *res = d3d12_resource(resource); + d3d12_transition_resource_state(ctx, res, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE); + d3d12_apply_resource_states(ctx, false); + + D3D12_GPU_VIRTUAL_ADDRESS gpuva_base = d3d12_resource_gpu_virtual_address(res) + offset; + D3D12_WRITEBUFFERIMMEDIATE_PARAMETER params[2] = { + { gpuva_base, 1 }, + { gpuva_base + sizeof(uint32_t), 0 }, + }; + D3D12_WRITEBUFFERIMMEDIATE_MODE modes[2] = { D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT, D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT }; + ctx->cmdlist8->WriteBufferImmediate(result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64 ? 2 : 1, + params, modes); + return; + } + + struct d3d12_query *query = (struct d3d12_query *)q; + accumulate_result_gpu(ctx, query, resource, offset, index, result_type); } void @@ -598,28 +672,7 @@ d3d12_render_condition(struct pipe_context *pctx, query->predicate = d3d12_resource(pipe_buffer_create(pctx->screen, 0, PIPE_USAGE_DEFAULT, sizeof(uint64_t))); - if (mode == PIPE_RENDER_COND_WAIT) { - - query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, query, false); - d3d12_foreach_submitted_batch(ctx, old_batch) { - if (old_batch->fence && old_batch->fence->value <= query->fence_value) - d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE); - } - - union pipe_query_result result; - accumulate_result(ctx, (d3d12_query *)pquery, &result, true); - } - - struct d3d12_resource *res = (struct d3d12_resource *)query->subqueries[0].buffer; - uint64_t source_offset = 0; - ID3D12Resource *source = d3d12_resource_underlying(res, &source_offset); - source_offset += query->subqueries[0].buffer_offset; - d3d12_transition_resource_state(ctx, res, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_TRANSITION_FLAG_INVALIDATE_BINDINGS); - d3d12_transition_resource_state(ctx, query->predicate, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE); - d3d12_apply_resource_states(ctx, false); - ctx->cmdlist->CopyBufferRegion(d3d12_resource_resource(query->predicate), 0, - source, source_offset, - sizeof(uint64_t)); + accumulate_result_gpu(ctx, query, &query->predicate->base.b, 0, 0, PIPE_QUERY_TYPE_U64); d3d12_transition_resource_state(ctx, query->predicate, D3D12_RESOURCE_STATE_PREDICATION, D3D12_TRANSITION_FLAG_NONE); d3d12_apply_resource_states(ctx, false); @@ -656,6 +709,7 @@ d3d12_context_query_init(struct pipe_context *pctx) pctx->begin_query = d3d12_begin_query; pctx->end_query = d3d12_end_query; pctx->get_query_result = d3d12_get_query_result; + pctx->get_query_result_resource = d3d12_get_query_result_resource; pctx->set_active_query_state = d3d12_set_active_query_state; pctx->render_condition = d3d12_render_condition; } diff --git a/src/gallium/drivers/d3d12/d3d12_screen.cpp b/src/gallium/drivers/d3d12/d3d12_screen.cpp index 17d2de8b0c7..4e9f769bcd1 100644 --- a/src/gallium/drivers/d3d12/d3d12_screen.cpp +++ b/src/gallium/drivers/d3d12/d3d12_screen.cpp @@ -193,9 +193,9 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; case PIPE_CAP_GLSL_FEATURE_LEVEL: - return 430; + return 440; case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY: - return 430; + return 440; case PIPE_CAP_ESSL_FEATURE_LEVEL: return 310; @@ -335,8 +335,12 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_SHADER_ARRAY_COMPONENTS: case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE: + case PIPE_CAP_QUERY_TIME_ELAPSED: return 1; + case PIPE_CAP_QUERY_BUFFER_OBJECT: + return (screen->opts3.WriteBufferImmediateSupportFlags & D3D12_COMMAND_LIST_SUPPORT_FLAG_DIRECT) != 0; + case PIPE_CAP_MAX_VERTEX_STREAMS: return D3D12_SO_BUFFER_SLOT_COUNT;
