Module: Mesa
Branch: main
Commit: 9ef621ec2ebc61ce89b6cb05608fb7961b5f67bc
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=9ef621ec2ebc61ce89b6cb05608fb7961b5f67bc

Author: Jesse Natalie <[email protected]>
Date:   Fri Nov 10 14:48:07 2023 -0800

d3d12: ARB_query_buffer_object and GL4.4

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26156>

---

 docs/features.txt                                  |  12 +-
 src/gallium/drivers/d3d12/d3d12_batch.cpp          |   3 +
 .../drivers/d3d12/d3d12_compute_transforms.cpp     | 166 ++++++++++++++++
 .../drivers/d3d12/d3d12_compute_transforms.h       |  21 +-
 src/gallium/drivers/d3d12/d3d12_context.cpp        |   2 +
 src/gallium/drivers/d3d12/d3d12_context.h          |   1 +
 src/gallium/drivers/d3d12/d3d12_query.cpp          | 216 +++++++++++++--------
 src/gallium/drivers/d3d12/d3d12_screen.cpp         |   8 +-
 8 files changed, 339 insertions(+), 90 deletions(-)

diff --git a/docs/features.txt b/docs/features.txt
index 6c77e127126..a7bb309c4b3 100644
--- a/docs/features.txt
+++ b/docs/features.txt
@@ -192,12 +192,12 @@ GL 4.3, GLSL 4.30 -- all DONE: freedreno/a6xx, nvc0, 
r600, radeonsi, llvmpipe, v
   GL_ARB_vertex_attrib_binding                          DONE (all drivers)
 
 
-GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, 
virgl, zink, iris, crocus/gen7.5+
+GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, 
virgl, zink, iris, crocus/gen7.5+, d3d12
 
   GL_MAX_VERTEX_ATTRIB_STRIDE                           DONE (all drivers)
-  GL_ARB_buffer_storage                                 DONE (freedreno, nv50, 
v3d, vc4, lima, panfrost, asahi, d3d12, softpipe, etnaviv, crocus)
+  GL_ARB_buffer_storage                                 DONE (freedreno, nv50, 
v3d, vc4, lima, panfrost, asahi, softpipe, etnaviv, crocus)
   GL_ARB_clear_texture                                  DONE (all drivers)
-  GL_ARB_enhanced_layouts                               DONE (freedreno/a3xx+, 
nv50, softpipe, crocus, d3d12)
+  GL_ARB_enhanced_layouts                               DONE (freedreno/a3xx+, 
nv50, softpipe, crocus)
   - compile-time constant expressions                   DONE
   - explicit byte offsets for blocks                    DONE
   - forced alignment within blocks                      DONE
@@ -206,9 +206,9 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, 
radeonsi, llvmpipe, v
   - input/output block locations                        DONE
   GL_ARB_multi_bind                                     DONE (all drivers)
   GL_ARB_query_buffer_object                            DONE (freedreno/a6xx)
-  GL_ARB_texture_mirror_clamp_to_edge                   DONE (freedreno, nv50, 
softpipe, v3d, panfrost, asahi, crocus, d3d12)
-  GL_ARB_texture_stencil8                               DONE (freedreno, nv50, 
softpipe, v3d, panfrost, d3d12, asahi)
-  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (freedreno, nv50, 
softpipe, panfrost, d3d12, asahi, crocus)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (freedreno, nv50, 
softpipe, v3d, panfrost, asahi, crocus)
+  GL_ARB_texture_stencil8                               DONE (freedreno, nv50, 
softpipe, v3d, panfrost, asahi)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (freedreno, nv50, 
softpipe, panfrost, asahi, crocus)
 
 GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, 
virgl, zink, iris, crocus/gen7.5+
 
diff --git a/src/gallium/drivers/d3d12/d3d12_batch.cpp 
b/src/gallium/drivers/d3d12/d3d12_batch.cpp
index 50be53be027..373255a0740 100644
--- a/src/gallium/drivers/d3d12/d3d12_batch.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_batch.cpp
@@ -230,6 +230,9 @@ d3d12_start_batch(struct d3d12_context *ctx, struct 
d3d12_batch *batch)
          batch->has_errors = true;
          return;
       }
+      if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist2)))) {
+         ctx->cmdlist2 = nullptr;
+      }
       if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist8)))) {
          ctx->cmdlist8 = nullptr;
       }
diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp 
b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp
index 089a47a0404..a444c756b69 100644
--- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp
@@ -212,6 +212,170 @@ get_draw_auto(const nir_shader_compiler_options *options)
    return b.shader;
 }
 
+/* Build the "QueryResolve" compute shader selected by key->query_resolve.
+ *
+ * Inputs are one SSBO of 64-bit values per subquery, at bindings
+ * 0..num_subqueries-1. The shader sums values across the recorded entries of
+ * every subquery and stores the total either back into binding 0
+ * (is_resolve_in_place) or into a separate output SSBO bound at index
+ * num_subqueries.
+ *
+ * Runtime parameters come from the D3D12_STATE_VAR_TRANSFORM_GENERIC0 uvec4:
+ * channel i (i < num_subqueries) is the number of entries to read from
+ * subquery i, and channel 3 is the output base index, counted in output
+ * elements of bit_size / 8 bytes.
+ *
+ * Returns the new nir_shader.
+ */
+static struct nir_shader *
+get_query_resolve(const nir_shader_compiler_options *options, const 
d3d12_compute_transform_key *key)
+{
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, 
options, "QueryResolve");
+
+   uint32_t bit_size = key->query_resolve.is_64bit ? 64 : 32;
+   const struct glsl_type *value_type = glsl_uintN_t_type(bit_size);
+
+   assert(!key->query_resolve.is_resolve_in_place ||
+          (key->query_resolve.is_64bit && key->query_resolve.num_subqueries == 
1));
+   assert(key->query_resolve.num_subqueries == 1 ||
+          key->query_resolve.pipe_query_type == 
PIPE_QUERY_PRIMITIVES_GENERATED);
+   assert(key->query_resolve.num_subqueries <= 3); /* Fourth state var is an 
output offset */
+
+   nir_variable *inputs[3];
+   for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) {
+      /* Inputs are always 64-bit */
+      inputs[i] = nir_variable_create(b.shader, nir_var_mem_ssbo, 
glsl_array_type(glsl_uint64_t_type(), 0, 8), "input");
+      inputs[i]->data.binding = i;
+   }
+   /* In-place resolves write back through input 0; otherwise a dedicated
+    * output SSBO is declared right after the inputs. */
+   nir_variable *output = inputs[0];
+   if (!key->query_resolve.is_resolve_in_place) {
+      output = nir_variable_create(b.shader, nir_var_mem_ssbo, 
glsl_array_type(value_type, 0, bit_size / 8), "output");
+      output->data.binding = key->query_resolve.num_subqueries;
+   }
+
+   /* How many entries in each sub-query is passed via root constants */
+   nir_variable *state_var = nullptr;
+   nir_def *state_var_data = d3d12_get_state_var(&b, 
D3D12_STATE_VAR_TRANSFORM_GENERIC0, "state_var", glsl_uvec4_type(), &state_var);
+
+   /* For in-place resolves, we resolve each field of the query. Otherwise, 
resolve one field into the dest */
+   nir_variable *results[sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / 
sizeof(UINT64)];
+   uint32_t num_result_values = 1;
+
+   /* NOTE(review): only PIPELINE_STATISTICS and SO_STATISTICS widen
+    * num_result_values. An in-place resolve of any other type (e.g. a
+    * PRIMITIVES_GENERATED or PRIMITIVES_EMITTED subquery, whose entries span
+    * several 64-bit fields) therefore folds field 0 only - confirm that is
+    * sufficient for the callers. */
+   if (key->query_resolve.is_resolve_in_place) {
+      if (key->query_resolve.pipe_query_type == PIPE_QUERY_PIPELINE_STATISTICS)
+         num_result_values = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / 
sizeof(UINT64);
+      else if (key->query_resolve.pipe_query_type == PIPE_QUERY_SO_STATISTICS)
+         num_result_values = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / 
sizeof(UINT64);
+   }
+   
+   /* Time queries accumulate in 64 bits regardless of the output width; any
+    * narrowing happens after the scaling step further down. */
+   uint32_t var_bit_size = key->query_resolve.pipe_query_type == 
PIPE_QUERY_TIME_ELAPSED ||
+                           key->query_resolve.pipe_query_type == 
PIPE_QUERY_TIMESTAMP ? 64 : bit_size;
+   for (uint32_t i = 0; i < num_result_values; ++i) {
+      results[i] = nir_local_variable_create(b.impl, 
glsl_uintN_t_type(var_bit_size), "result");
+      nir_store_var(&b, results[i], nir_imm_intN_t(&b, 0, var_bit_size), 1);
+   }
+
+   /* For each subquery... */
+   for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) {
+      nir_def *num_results = nir_channel(&b, state_var_data, i);
+
+      uint32_t subquery_index = key->query_resolve.num_subqueries == 1 ?
+         key->query_resolve.single_subquery_index : i;
+      /* stride: 64-bit fields per recorded entry.
+       * base_offset: index of the first field to read within an entry. */
+      uint32_t base_offset = 0;
+      uint32_t stride = 0;
+      switch (key->query_resolve.pipe_query_type) {
+      case PIPE_QUERY_OCCLUSION_COUNTER:
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      case PIPE_QUERY_TIMESTAMP:
+         stride = 1;
+         break;
+      case PIPE_QUERY_TIME_ELAPSED:
+         stride = 2;
+         break;
+      case PIPE_QUERY_SO_STATISTICS:
+      case PIPE_QUERY_PRIMITIVES_EMITTED:
+         stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64);
+         break;
+      case PIPE_QUERY_PRIMITIVES_GENERATED:
+         if (subquery_index == 0)
+            stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64);
+         else
+            stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / 
sizeof(UINT64);
+         if (!key->query_resolve.is_resolve_in_place) {
+            if (subquery_index == 1)
+               base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, 
GSPrimitives) / sizeof(UINT64);
+            else if (subquery_index == 2)
+               base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, 
IAPrimitives) / sizeof(UINT64);
+         }
+         break;
+      case PIPE_QUERY_PIPELINE_STATISTICS:
+         stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / 
sizeof(UINT64);
+         break;
+      default:
+         unreachable("Unhandled query resolve");
+      }
+
+      /* A single-subquery resolve into a separate buffer reads exactly the
+       * field the caller asked for. */
+      if (!key->query_resolve.is_resolve_in_place && 
key->query_resolve.num_subqueries == 1)
+         base_offset = key->query_resolve.single_result_field_offset;
+
+      nir_def *base_array_index = nir_imm_int(&b, base_offset);
+
+      /* For each query result in this subquery... */
+      nir_variable *loop_counter = nir_local_variable_create(b.impl, 
glsl_uint_type(), "loop_counter");
+      nir_store_var(&b, loop_counter, nir_imm_int(&b, 0), 1);
+      nir_loop *loop = nir_push_loop(&b);
+
+      /* Leave the loop once every recorded entry has been consumed */
+      nir_def *loop_counter_value = nir_load_var(&b, loop_counter);
+      nir_if *nif = nir_push_if(&b, nir_ieq(&b, loop_counter_value, 
num_results));
+      nir_jump(&b, nir_jump_break);
+      nir_pop_if(&b, nif);
+
+      /* For each field in the query result, accumulate */
+      nir_def *array_index = nir_iadd(&b, nir_imul_imm(&b, loop_counter_value, 
stride), base_array_index);
+      for (uint32_t j = 0; j < num_result_values; ++j) {
+         nir_def *new_value;
+         if (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) {
+            /* Each entry is a (start, end) pair; accumulate end - start */
+            assert(j == 0 && i == 0);
+            nir_def *start = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), 
nir_imul_imm(&b, array_index, 8));
+            nir_def *end = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), 
nir_imul_imm(&b, nir_iadd_imm(&b, array_index, 1), 8));
+            new_value = nir_isub(&b, end, start);
+         } else {
+            new_value = nir_u2uN(&b, nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, 
i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, j), 8)), var_bit_size);
+         }
+         nir_store_var(&b, results[j], nir_iadd(&b, nir_load_var(&b, 
results[j]), new_value), 1);
+      }
+      
+      nir_store_var(&b, loop_counter, nir_iadd_imm(&b, loop_counter_value, 1), 
1);
+      nir_pop_loop(&b, loop);
+   }
+
+   /* Results are accumulated, now store the final values */
+   nir_def *output_base_index = nir_channel(&b, state_var_data, 3);
+   for (uint32_t i = 0; i < num_result_values; ++i) {
+      /* When resolving in-place, resolve each field, otherwise just write the 
one result.
+       * single_result_field_offset only selects which *input* field is read
+       * (already applied through base_offset above); the single output value
+       * always belongs at output_base_index itself, so it must not be added
+       * again here. */
+      uint32_t field_offset = key->query_resolve.is_resolve_in_place ?
+         i : 0;
+
+      /* When resolving time elapsed in-place, write [0, time], as the only 
special case */
+      if (key->query_resolve.is_resolve_in_place &&
+          key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) {
+         nir_store_ssbo(&b, nir_imm_int64(&b, 0), nir_imm_int(&b, 
output->data.binding),
+                        nir_imul_imm(&b, output_base_index, bit_size / 8), 1, 
(gl_access_qualifier)0, bit_size / 8, 0);
+         field_offset++;
+      }
+      nir_def *result_val = nir_load_var(&b, results[i]);
+      if (!key->query_resolve.is_resolve_in_place &&
+          (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED ||
+           key->query_resolve.pipe_query_type == PIPE_QUERY_TIMESTAMP)) {
+         /* Scale the accumulated value by timestamp_multiplier in double
+          * precision before it leaves the shader. */
+         result_val = nir_f2u64(&b, nir_fmul_imm(&b, nir_u2f64(&b, 
result_val), key->query_resolve.timestamp_multiplier));
+
+         if (!key->query_resolve.is_64bit) {
+            /* Narrow to the 32-bit output type; the trailing `true` is
+             * presumably the saturate flag - confirm against nir_builder. */
+            nir_alu_type rounding_type = key->query_resolve.is_signed ? 
nir_type_int : nir_type_uint;
+            nir_alu_type src_round = (nir_alu_type)(rounding_type | 64);
+            nir_alu_type dst_round = (nir_alu_type)(rounding_type | bit_size);
+            result_val = nir_convert_alu_types(&b, bit_size, result_val, 
src_round, dst_round, nir_rounding_mode_undef, true);
+         }
+      }
+      nir_store_ssbo(&b, result_val, nir_imm_int(&b, output->data.binding),
+                     nir_imul_imm(&b, nir_iadd_imm(&b, output_base_index, 
field_offset), bit_size / 8),
+                     1, (gl_access_qualifier)0, bit_size / 8, 0);
+   }
+
+   nir_validate_shader(b.shader, "creation");
+   b.shader->info.num_ssbos = key->query_resolve.num_subqueries + 
!key->query_resolve.is_resolve_in_place;
+   b.shader->info.num_ubos = 0;
+
+   NIR_PASS_V(b.shader, nir_lower_convert_alu_types, NULL);
+
+   return b.shader;
+}
+
 static struct nir_shader *
 create_compute_transform(const nir_shader_compiler_options *options, const 
d3d12_compute_transform_key *key)
 {
@@ -224,6 +388,8 @@ create_compute_transform(const nir_shader_compiler_options 
*options, const d3d12
       return get_fake_so_buffer_vertex_count(options);
    case d3d12_compute_transform_type::draw_auto:
       return get_draw_auto(options);
+   case d3d12_compute_transform_type::query_resolve:
+      return get_query_resolve(options, key);
    default:
       unreachable("Invalid transform");
    }
diff --git a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h 
b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h
index d8daa7d6840..011fbba90cf 100644
--- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h
+++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h
@@ -45,6 +45,8 @@ enum class d3d12_compute_transform_type
    fake_so_buffer_vertex_count,
    /* Append a buffer filled size with (vertex count, 1, 0, 0) */
    draw_auto,
+   /* Accumulate queries together and write a 32-bit or 64-bit result */
+   query_resolve,
    max,
 };
 
@@ -67,6 +69,23 @@ struct d3d12_compute_transform_key
             uint16_t size;
          } ranges[PIPE_MAX_SO_OUTPUTS];
       } fake_so_buffer_copy_back;
+
+      struct {
+         /* true means the accumulation should be done as uint64, else uint32. 
*/
+         uint8_t is_64bit : 1;
+         /* true means output is written where input[0] was, else output is a 
separate buffer.
+          * true also means all fields are accumulated, else 
single_result_field_offset determines
+          * which field is resolved. Implies num_subqueries == 1. */
+         uint8_t is_resolve_in_place : 1;
+         /* Indicates how many subqueries to accumulate together into a final 
result. When
+          * set to 1, single_subquery_index determines where the data comes 
from. */
+         uint8_t num_subqueries : 2;
+         /* The PIPE_QUERY_* type being resolved; get_query_resolve() switches
+          * on it to pick the per-entry stride and field layout. */
+         uint8_t pipe_query_type : 4;
+         /* Only used when num_subqueries == 1: which subquery of the query
+          * the single input buffer holds. */
+         uint8_t single_subquery_index : 2;
+         /* Only used for a separate output with num_subqueries == 1: index,
+          * in 64-bit fields, of the field read from each recorded entry. */
+         uint8_t single_result_field_offset : 4;
+         /* Picks a signed rather than unsigned conversion when a time result
+          * is narrowed to a 32-bit output. */
+         uint8_t is_signed : 1;
+         /* Factor applied to TIME_ELAPSED / TIMESTAMP results that are
+          * written to a separate output. */
+         double timestamp_multiplier;
+      } query_resolve;
    };
 };
 
@@ -83,7 +102,7 @@ struct d3d12_compute_transform_save_restore
 {
    struct d3d12_shader_selector *cs;
    struct pipe_constant_buffer cbuf0;
-   struct pipe_shader_buffer ssbos[2];
+   struct pipe_shader_buffer ssbos[4];
 };
 
 void
diff --git a/src/gallium/drivers/d3d12/d3d12_context.cpp 
b/src/gallium/drivers/d3d12/d3d12_context.cpp
index af16adf8622..d770b258e74 100644
--- a/src/gallium/drivers/d3d12/d3d12_context.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_context.cpp
@@ -98,6 +98,8 @@ d3d12_context_destroy(struct pipe_context *pctx)
    for (unsigned i = 0; i < ARRAY_SIZE(ctx->batches); ++i)
       d3d12_destroy_batch(ctx, &ctx->batches[i]);
    ctx->cmdlist->Release();
+   if (ctx->cmdlist2)
+      ctx->cmdlist2->Release();
    if (ctx->cmdlist8)
       ctx->cmdlist8->Release();
    d3d12_descriptor_pool_free(ctx->sampler_pool);
diff --git a/src/gallium/drivers/d3d12/d3d12_context.h 
b/src/gallium/drivers/d3d12/d3d12_context.h
index 2f201d7b2d5..3429c0bfccd 100644
--- a/src/gallium/drivers/d3d12/d3d12_context.h
+++ b/src/gallium/drivers/d3d12/d3d12_context.h
@@ -257,6 +257,7 @@ struct d3d12_context {
 
    uint64_t submit_id;
    ID3D12GraphicsCommandList *cmdlist;
+   ID3D12GraphicsCommandList2 *cmdlist2;
    ID3D12GraphicsCommandList8 *cmdlist8;
    ID3D12GraphicsCommandList *state_fixup_cmdlist;
 
diff --git a/src/gallium/drivers/d3d12/d3d12_query.cpp 
b/src/gallium/drivers/d3d12/d3d12_query.cpp
index c596e0434d5..6e50c715342 100644
--- a/src/gallium/drivers/d3d12/d3d12_query.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_query.cpp
@@ -23,6 +23,7 @@
 
 #include "d3d12_query.h"
 #include "d3d12_compiler.h"
+#include "d3d12_compute_transforms.h"
 #include "d3d12_context.h"
 #include "d3d12_resource.h"
 #include "d3d12_screen.h"
@@ -184,9 +185,9 @@ d3d12_release_query(struct pipe_context *pctx,
 }
 
 static bool
-accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent,
-                     unsigned sub_query,
-                     union pipe_query_result *result, bool write)
+accumulate_subresult_cpu(struct d3d12_context *ctx, struct d3d12_query 
*q_parent,
+                         unsigned sub_query,
+                         union pipe_query_result *result)
 {
    struct pipe_transfer *transfer = NULL;
    struct d3d12_screen *screen = d3d12_screen(ctx->base.screen);
@@ -194,8 +195,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct 
d3d12_query *q_parent,
    unsigned access = PIPE_MAP_READ;
    void *results;
 
-   if (write)
-      access |= PIPE_MAP_WRITE;
    access |= PIPE_MAP_UNSYNCHRONIZED;
 
    results = pipe_buffer_map_range(&ctx->base, q->buffer, q->buffer_offset,
@@ -256,32 +255,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct 
d3d12_query *q_parent,
       }
    }
 
-   if (write) {
-      if (q->d3d12qtype == D3D12_QUERY_TYPE_PIPELINE_STATISTICS) {
-         results_stats[0].IAVertices = result->pipeline_statistics.ia_vertices;
-         results_stats[0].IAPrimitives = 
result->pipeline_statistics.ia_primitives;
-         results_stats[0].VSInvocations = 
result->pipeline_statistics.vs_invocations;
-         results_stats[0].GSInvocations = 
result->pipeline_statistics.gs_invocations;
-         results_stats[0].GSPrimitives = 
result->pipeline_statistics.gs_primitives;
-         results_stats[0].CInvocations = 
result->pipeline_statistics.c_invocations;
-         results_stats[0].CPrimitives = 
result->pipeline_statistics.c_primitives;
-         results_stats[0].PSInvocations = 
result->pipeline_statistics.ps_invocations;
-         results_stats[0].HSInvocations = 
result->pipeline_statistics.hs_invocations;
-         results_stats[0].DSInvocations = 
result->pipeline_statistics.ds_invocations;
-         results_stats[0].CSInvocations = 
result->pipeline_statistics.cs_invocations;
-      } else if (d3d12_query_heap_type(q_parent->type, sub_query) == 
D3D12_QUERY_HEAP_TYPE_SO_STATISTICS) {
-         results_so[0].NumPrimitivesWritten = 
result->so_statistics.num_primitives_written;
-         results_so[0].PrimitivesStorageNeeded = 
result->so_statistics.primitives_storage_needed;
-      } else {
-         if (unlikely(q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP)) {
-            results_u64[0] = 0;
-            results_u64[1] = result->u64;
-         } else {
-            results_u64[0] = result->u64;
-         }
-      }
-   }
-
    pipe_buffer_unmap(&ctx->base, transfer);
 
    if (q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP)
@@ -291,33 +264,33 @@ accumulate_subresult(struct d3d12_context *ctx, struct 
d3d12_query *q_parent,
 }
 
 static bool
-accumulate_result(struct d3d12_context *ctx, struct d3d12_query *q,
-                  union pipe_query_result *result, bool write)
+accumulate_result_cpu(struct d3d12_context *ctx, struct d3d12_query *q,
+                      union pipe_query_result *result)
 {
    union pipe_query_result local_result;
 
    switch (q->type) {
    case PIPE_QUERY_PRIMITIVES_GENERATED:
-      if (!accumulate_subresult(ctx, q, 0, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 0, &local_result))
          return false;
       result->u64 = local_result.so_statistics.primitives_storage_needed;
 
-      if (!accumulate_subresult(ctx, q, 1, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 1, &local_result))
          return false;
       result->u64 += local_result.pipeline_statistics.gs_primitives;
 
-      if (!accumulate_subresult(ctx, q, 2, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 2, &local_result))
          return false;
       result->u64 += local_result.pipeline_statistics.ia_primitives;
       return true;
    case PIPE_QUERY_PRIMITIVES_EMITTED:
-      if (!accumulate_subresult(ctx, q, 0, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 0, &local_result))
          return false;
       result->u64 = local_result.so_statistics.num_primitives_written;
       return true;
    default:
       assert(num_sub_queries(q->type) == 1);
-      return accumulate_subresult(ctx, q, 0, result, write);
+      return accumulate_subresult_cpu(ctx, q, 0, result);
    }
 }
 
@@ -362,21 +335,99 @@ query_ensure_ready(struct d3d12_screen* screen, struct 
d3d12_context* ctx, struc
    return true;
 }
 
+/* Fold the entries recorded so far for one subquery into its first slot
+ * using the query_resolve compute transform in its 64-bit in-place form.
+ *
+ * ctx:       context whose compute state is borrowed (saved first, restored
+ *            before returning).
+ * q_parent:  query owning the subquery.
+ * sub_query: index into q_parent->subqueries.
+ *
+ * The caller is responsible for resetting curr_query afterwards.
+ */
+static void
+accumulate_subresult_gpu(struct d3d12_context *ctx, struct d3d12_query 
*q_parent,
+                         unsigned sub_query)
+{
+   d3d12_compute_transform_save_restore save;
+   d3d12_save_compute_transform_state(ctx, &save);
+
+   /* Zero the whole key first; presumably it is hashed/compared bytewise, so
+    * padding and unused union members must be deterministic - confirm. */
+   d3d12_compute_transform_key key;
+   memset(&key, 0, sizeof(key));
+   key.type = d3d12_compute_transform_type::query_resolve;
+   key.query_resolve.is_64bit = true;
+   key.query_resolve.is_resolve_in_place = true;
+   key.query_resolve.num_subqueries = 1;
+   key.query_resolve.pipe_query_type = q_parent->type;
+   key.query_resolve.single_subquery_index = sub_query;
+   key.query_resolve.is_signed = false;
+   key.query_resolve.timestamp_multiplier = 1.0;
+   ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, 
&key));
+
+   /* [0] = number of entries to fold; [3] = output base index, i.e. the
+    * start of the bound range. [1] and [2] are unused with one subquery. */
+   ctx->transform_state_vars[0] = q_parent->subqueries[sub_query].curr_query;
+   ctx->transform_state_vars[1] = 0;
+   ctx->transform_state_vars[2] = 0;
+   ctx->transform_state_vars[3] = 0;
+
+   /* The subquery's whole result range is bound as SSBO 0 and marked
+    * writable (final argument), since the shader stores back into it. */
+   pipe_shader_buffer new_cs_ssbos[1];
+   new_cs_ssbos[0].buffer = q_parent->subqueries[sub_query].buffer;
+   new_cs_ssbos[0].buffer_offset = 
q_parent->subqueries[sub_query].buffer_offset;
+   new_cs_ssbos[0].buffer_size = q_parent->subqueries[sub_query].query_size * 
q_parent->subqueries[sub_query].num_queries;
+   ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, 1, 
new_cs_ssbos, 1);
+
+   /* One workgroup of one invocation: the shader loops over entries itself */
+   pipe_grid_info grid = {};
+   grid.block[0] = grid.block[1] = grid.block[2] = 1;
+   grid.grid[0] = grid.grid[1] = grid.grid[2] = 1;
+   ctx->base.launch_grid(&ctx->base, &grid);
+
+   d3d12_restore_compute_transform_state(ctx, &save);
+}
+
+/* Resolve query `q` on the GPU and store one value into `dst`.
+ *
+ * ctx:         context whose compute state is borrowed (saved first,
+ *              restored before returning).
+ * q:           query to resolve; all of its subqueries are bound as inputs.
+ * dst:         destination buffer, bound in full as the output SSBO.
+ * dst_offset:  byte offset of the result inside dst; must be a multiple of
+ *              the result size (asserted).
+ * index:       stored as single_result_field_offset in the transform key.
+ * result_type: selects a 32- or 64-bit, signed or unsigned result.
+ */
+static void
+accumulate_result_gpu(struct d3d12_context *ctx, struct d3d12_query *q,
+                      struct pipe_resource *dst, uint32_t dst_offset,
+                      int index, enum pipe_query_value_type result_type)
+{
+   d3d12_compute_transform_save_restore save;
+   d3d12_save_compute_transform_state(ctx, &save);
+
+   d3d12_compute_transform_key key;
+   memset(&key, 0, sizeof(key));
+   key.type = d3d12_compute_transform_type::query_resolve;
+   key.query_resolve.is_64bit = result_type == PIPE_QUERY_TYPE_I64 || 
result_type == PIPE_QUERY_TYPE_U64;
+   key.query_resolve.is_resolve_in_place = false;
+   key.query_resolve.num_subqueries = num_sub_queries(q->type);
+   key.query_resolve.pipe_query_type = q->type;
+   key.query_resolve.single_result_field_offset = index;
+   key.query_resolve.is_signed = result_type == PIPE_QUERY_TYPE_I32 || 
result_type == PIPE_QUERY_TYPE_I64;
+   key.query_resolve.timestamp_multiplier = 
d3d12_screen(ctx->base.screen)->timestamp_multiplier;
+   ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, 
&key));
+
+   /* One input SSBO per subquery, each paired with its entry count in the
+    * state var channel of the same index. */
+   pipe_shader_buffer new_cs_ssbos[4];
+   uint32_t num_ssbos = 0;
+   for (uint32_t i = 0; i < key.query_resolve.num_subqueries; ++i) {
+      ctx->transform_state_vars[i] = q->subqueries[i].curr_query;
+      new_cs_ssbos[num_ssbos].buffer = q->subqueries[i].buffer;
+      new_cs_ssbos[num_ssbos].buffer_offset = q->subqueries[i].buffer_offset;
+      new_cs_ssbos[num_ssbos].buffer_size = q->subqueries[i].query_size * 
q->subqueries[i].num_queries;
+      num_ssbos++;
+   }
+
+   /* Channel 3 is the destination index in result-sized elements, not bytes */
+   assert(dst_offset % (key.query_resolve.is_64bit ? 8 : 4) == 0);
+   ctx->transform_state_vars[3] = dst_offset / (key.query_resolve.is_64bit ? 8 
: 4);
+
+   /* The destination goes in the slot right after the inputs */
+   new_cs_ssbos[num_ssbos].buffer = dst;
+   new_cs_ssbos[num_ssbos].buffer_offset = 0;
+   new_cs_ssbos[num_ssbos].buffer_size = dst->width0;
+   num_ssbos++;
+   
+   /* Only the last slot (the destination) is flagged writable */
+   ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, num_ssbos, 
new_cs_ssbos, 1 << (num_ssbos - 1));
+
+   /* One workgroup of one invocation: the shader loops over entries itself */
+   pipe_grid_info grid = {};
+   grid.block[0] = grid.block[1] = grid.block[2] = 1;
+   grid.grid[0] = grid.grid[1] = grid.grid[2] = 1;
+   ctx->base.launch_grid(&ctx->base, &grid);
+
+   d3d12_restore_compute_transform_state(ctx, &save);
+}
+
 static void
 begin_subquery(struct d3d12_context *ctx, struct d3d12_query *q_parent, 
unsigned sub_query)
 {
    struct d3d12_query_impl *q = &q_parent->subqueries[sub_query];
    if (q->curr_query == q->num_queries) {
-      union pipe_query_result result;
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= 
q_parent->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
       /* Accumulate current results and store in first slot */
-      accumulate_subresult(ctx, q_parent, sub_query, &result, true);
+      accumulate_subresult_gpu(ctx, q_parent, sub_query);
       q->curr_query = 1;
    }
 
@@ -412,18 +463,9 @@ begin_timer_query(struct d3d12_context *ctx, struct 
d3d12_query *q_parent, bool
       q->curr_query = 0;
       query_index = 0;
    } else if (query_index == q->num_queries) {
-      union pipe_query_result result;
-
       /* Accumulate current results and store in first slot */
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= 
q_parent->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
-      accumulate_subresult(ctx, q_parent, 0, &result, true);
-      q->curr_query = 2;
+      accumulate_subresult_gpu(ctx, q_parent, 0);
+      q->curr_query = 1;
    }
 
    ctx->cmdlist->EndQuery(q->query_heap, q->d3d12qtype, query_index);
@@ -530,7 +572,39 @@ d3d12_get_query_result(struct pipe_context *pctx,
    if (!query_ensure_ready(screen, ctx, query, wait))
       return false;
 
-   return accumulate_result(ctx, query, result, false);
+   return accumulate_result_cpu(ctx, query, result);
+}
+
+/* pipe_context::get_query_result_resource implementation: record GPU work
+ * that stores a value for query `q` into `resource` at byte `offset`.
+ *
+ * index == -1 requests the availability flag. This driver always reports
+ * "available": it writes the constant 1 (plus a zero high dword for 64-bit
+ * result types) with WriteBufferImmediate. Any other index is forwarded to
+ * accumulate_result_gpu(), which resolves the query with a compute shader.
+ * `flags` is not consulted.
+ */
+static void
+d3d12_get_query_result_resource(struct pipe_context *pctx,
+                                struct pipe_query *q,
+                                enum pipe_query_flags flags,
+                                enum pipe_query_value_type result_type,
+                                int index,
+                                struct pipe_resource *resource,
+                                unsigned offset)
+{
+   struct d3d12_context *ctx = d3d12_context(pctx);
+
+   if (index == -1) {
+      /* Write the "available" bit, which is always true */
+      struct d3d12_resource *res = d3d12_resource(resource);
+      d3d12_transition_resource_state(ctx, res, 
D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE);
+      d3d12_apply_resource_states(ctx, false);
+
+      /* Low dword = 1, high dword = 0; the second write is only issued for
+       * 64-bit result types. */
+      D3D12_GPU_VIRTUAL_ADDRESS gpuva_base = 
d3d12_resource_gpu_virtual_address(res) + offset;
+      D3D12_WRITEBUFFERIMMEDIATE_PARAMETER params[2] = {
+         { gpuva_base, 1 },
+         { gpuva_base + sizeof(uint32_t), 0 },
+      };
+      D3D12_WRITEBUFFERIMMEDIATE_MODE modes[2] = { 
D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT, 
D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT };
+      /* WriteBufferImmediate is an ID3D12GraphicsCommandList2 method.
+       * ctx->cmdlist8 is left null when that newer interface is unavailable
+       * (see d3d12_start_batch), so the call has to go through cmdlist2. */
+      assert(ctx->cmdlist2);
+      ctx->cmdlist2->WriteBufferImmediate(result_type == PIPE_QUERY_TYPE_I64 
|| result_type == PIPE_QUERY_TYPE_U64 ? 2 : 1,
+                                          params, modes);
+      return;
+   }
+
+   struct d3d12_query *query = (struct d3d12_query *)q;
+   accumulate_result_gpu(ctx, query, resource, offset, index, result_type);
 }
 
 void
@@ -598,28 +672,7 @@ d3d12_render_condition(struct pipe_context *pctx,
       query->predicate = d3d12_resource(pipe_buffer_create(pctx->screen, 0,
                                                            PIPE_USAGE_DEFAULT, 
sizeof(uint64_t)));
 
-   if (mode == PIPE_RENDER_COND_WAIT) {
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, query, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= query->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
-      union pipe_query_result result;
-      accumulate_result(ctx, (d3d12_query *)pquery, &result, true);
-   }
-
-   struct d3d12_resource *res = (struct d3d12_resource 
*)query->subqueries[0].buffer;
-   uint64_t source_offset = 0;
-   ID3D12Resource *source = d3d12_resource_underlying(res, &source_offset);
-   source_offset += query->subqueries[0].buffer_offset;
-   d3d12_transition_resource_state(ctx, res, D3D12_RESOURCE_STATE_COPY_SOURCE, 
D3D12_TRANSITION_FLAG_INVALIDATE_BINDINGS);
-   d3d12_transition_resource_state(ctx, query->predicate, 
D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE);
-   d3d12_apply_resource_states(ctx, false);
-   ctx->cmdlist->CopyBufferRegion(d3d12_resource_resource(query->predicate), 0,
-                                  source, source_offset,
-                                  sizeof(uint64_t));
+   accumulate_result_gpu(ctx, query, &query->predicate->base.b, 0, 0, 
PIPE_QUERY_TYPE_U64);
 
    d3d12_transition_resource_state(ctx, query->predicate, 
D3D12_RESOURCE_STATE_PREDICATION, D3D12_TRANSITION_FLAG_NONE);
    d3d12_apply_resource_states(ctx, false);
@@ -656,6 +709,7 @@ d3d12_context_query_init(struct pipe_context *pctx)
    pctx->begin_query = d3d12_begin_query;
    pctx->end_query = d3d12_end_query;
    pctx->get_query_result = d3d12_get_query_result;
+   pctx->get_query_result_resource = d3d12_get_query_result_resource;
    pctx->set_active_query_state = d3d12_set_active_query_state;
    pctx->render_condition = d3d12_render_condition;
 }
diff --git a/src/gallium/drivers/d3d12/d3d12_screen.cpp 
b/src/gallium/drivers/d3d12/d3d12_screen.cpp
index 17d2de8b0c7..4e9f769bcd1 100644
--- a/src/gallium/drivers/d3d12/d3d12_screen.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_screen.cpp
@@ -193,9 +193,9 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap 
param)
       return 1;
 
    case PIPE_CAP_GLSL_FEATURE_LEVEL:
-      return 430;
+      return 440;
    case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
-      return 430;
+      return 440;
    case PIPE_CAP_ESSL_FEATURE_LEVEL:
       return 310;
 
@@ -335,8 +335,12 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap 
param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHADER_ARRAY_COMPONENTS:
    case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
       return 1;
 
+   case PIPE_CAP_QUERY_BUFFER_OBJECT:
+      return (screen->opts3.WriteBufferImmediateSupportFlags & 
D3D12_COMMAND_LIST_SUPPORT_FLAG_DIRECT) != 0;
+
    case PIPE_CAP_MAX_VERTEX_STREAMS:
       return D3D12_SO_BUFFER_SLOT_COUNT;
 

Reply via email to