Mesa (main): nvk: Move the guts of nvk_compile_nir() to nvk_codegen.c

GitLab Mirror Tue, 14 Nov 2023 18:41:06 -0800

Module: Mesa
Branch: main
Commit: 67bb8e8165db3717a8515fa931dd3776610713e8
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=67bb8e8165db3717a8515fa931dd3776610713e8


Author: Faith Ekstrand <[email protected]>
Date:   Tue Nov 14 12:08:38 2023 -0600

nvk: Move the guts of nvk_compile_nir() to nvk_codegen.c

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26197>

---

 src/nouveau/vulkan/nvk_codegen.c | 655 +++++++++++++++++++++++++++++++++++++++
 src/nouveau/vulkan/nvk_shader.c  | 647 +-------------------------------------
 src/nouveau/vulkan/nvk_shader.h  |   4 +
 3 files changed, 660 insertions(+), 646 deletions(-)

diff --git a/src/nouveau/vulkan/nvk_codegen.c b/src/nouveau/vulkan/nvk_codegen.c
index 6b416514152..8acd67fc85b 100644
--- a/src/nouveau/vulkan/nvk_codegen.c
+++ b/src/nouveau/vulkan/nvk_codegen.c
@@ -2,13 +2,19 @@
  * Copyright © 2022 Collabora Ltd. and Red Hat Inc.
  * SPDX-License-Identifier: MIT
  */
+#include "nvk_cmd_buffer.h"
 #include "nvk_physical_device.h"
 #include "nvk_shader.h"
 
 #include "nir.h"
 #include "nir_builder.h"
+#include "nir_xfb_info.h"
 
 #include "nv50_ir_driver.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "nvk_cl9097.h"
 
 uint64_t
 nvk_cg_get_prog_debug(void)
@@ -194,3 +200,652 @@ nvk_cg_optimize_nir(nir_shader *nir)
    NIR_PASS(progress, nir, nir_remove_dead_variables,
             nir_var_function_temp | nir_var_shader_in | nir_var_shader_out, 
NULL);
 }
+
+/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
+ * 124 scalar varying values.
+ */
+static uint32_t
+nvc0_shader_input_address(unsigned sn, unsigned si)
+{
+   switch (sn) {
+   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
+   case TGSI_SEMANTIC_PRIMID:       return 0x060;
+   case TGSI_SEMANTIC_LAYER:        return 0x064;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
+   case TGSI_SEMANTIC_PSIZE:        return 0x06c;
+   case TGSI_SEMANTIC_POSITION:     return 0x070;
+   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
+   case TGSI_SEMANTIC_FOG:          return 0x2e8;
+   case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
+   case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
+   case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
+   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
+   case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
+   case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
+   case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
+   default:
+      assert(!"invalid TGSI input semantic");
+      return ~0;
+   }
+}
+
+static uint32_t
+nvc0_shader_output_address(unsigned sn, unsigned si)
+{
+   switch (sn) {
+   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
+   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
+   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
+   case TGSI_SEMANTIC_PRIMID:        return 0x060;
+   case TGSI_SEMANTIC_LAYER:         return 0x064;
+   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
+   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
+   case TGSI_SEMANTIC_POSITION:      return 0x070;
+   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
+   case TGSI_SEMANTIC_FOG:           return 0x2e8;
+   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
+   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
+   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
+   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
+   case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;
+   case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
+   default:
+      assert(!"invalid TGSI output semantic");
+      return ~0;
+   }
+}
+
+static int
+nvc0_vp_assign_input_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned i, c, n;
+
+   for (n = 0, i = 0; i < info->numInputs; ++i) {
+      switch (info->in[i].sn) {
+      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
+      case TGSI_SEMANTIC_VERTEXID:
+         info->in[i].mask = 0x1;
+         info->in[i].slot[0] =
+            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
+         continue;
+      default:
+         break;
+      }
+      for (c = 0; c < 4; ++c)
+         info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
+      ++n;
+   }
+
+   return 0;
+}
+
+static int
+nvc0_sp_assign_input_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned offset;
+   unsigned i, c;
+
+   for (i = 0; i < info->numInputs; ++i) {
+      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
+
+      for (c = 0; c < 4; ++c)
+         info->in[i].slot[c] = (offset + c * 0x4) / 4;
+   }
+
+   return 0;
+}
+
+static int
+nvc0_fp_assign_output_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned count = info->prop.fp.numColourResults * 4;
+   unsigned i, c;
+
+   /* Compute the relative position of each color output, since skipped MRT
+    * positions will not have registers allocated to them.
+    */
+   unsigned colors[8] = {0};
+   for (i = 0; i < info->numOutputs; ++i)
+      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+         colors[info->out[i].si] = 1;
+   for (i = 0, c = 0; i < 8; i++)
+      if (colors[i])
+         colors[i] = c++;
+   for (i = 0; i < info->numOutputs; ++i)
+      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+         for (c = 0; c < 4; ++c)
+            info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;
+
+   if (info->io.sampleMask < NV50_CODEGEN_MAX_VARYINGS)
+      info->out[info->io.sampleMask].slot[0] = count++;
+   else
+   if (info->target >= 0xe0)
+      count++; /* on Kepler, depth is always last colour reg + 2 */
+
+   if (info->io.fragDepth < NV50_CODEGEN_MAX_VARYINGS)
+      info->out[info->io.fragDepth].slot[2] = count;
+
+   return 0;
+}
+
+static int
+nvc0_sp_assign_output_slots(struct nv50_ir_prog_info_out *info)
+{
+   unsigned offset;
+   unsigned i, c;
+
+   for (i = 0; i < info->numOutputs; ++i) {
+      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
+
+      for (c = 0; c < 4; ++c)
+         info->out[i].slot[c] = (offset + c * 0x4) / 4;
+   }
+
+   return 0;
+}
+
+static int
+nvc0_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
+{
+   int ret;
+
+   if (info->type == PIPE_SHADER_VERTEX)
+      ret = nvc0_vp_assign_input_slots(info);
+   else
+      ret = nvc0_sp_assign_input_slots(info);
+   if (ret)
+      return ret;
+
+   if (info->type == PIPE_SHADER_FRAGMENT)
+      ret = nvc0_fp_assign_output_slots(info);
+   else
+      ret = nvc0_sp_assign_output_slots(info);
+   return ret;
+}
+
+static inline void
+nvk_vtgs_hdr_update_oread(struct nvk_shader *vs, uint8_t slot)
+{
+   uint8_t min = (vs->hdr[4] >> 12) & 0xff;
+   uint8_t max = (vs->hdr[4] >> 24);
+
+   min = MIN2(min, slot);
+   max = MAX2(max, slot);
+
+   vs->hdr[4] = (max << 24) | (min << 12);
+}
+
+static int
+nvk_vtgp_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
+{
+   unsigned i, c, a;
+
+   for (i = 0; i < info->numInputs; ++i) {
+      if (info->in[i].patch)
+         continue;
+      for (c = 0; c < 4; ++c) {
+         a = info->in[i].slot[c];
+         if (info->in[i].mask & (1 << c))
+            vs->hdr[5 + a / 32] |= 1 << (a % 32);
+      }
+   }
+
+   for (i = 0; i < info->numOutputs; ++i) {
+      if (info->out[i].patch)
+         continue;
+      for (c = 0; c < 4; ++c) {
+         if (!(info->out[i].mask & (1 << c)))
+            continue;
+         assert(info->out[i].slot[c] >= 0x40 / 4);
+         a = info->out[i].slot[c] - 0x40 / 4;
+         vs->hdr[13 + a / 32] |= 1 << (a % 32);
+         if (info->out[i].oread)
+            nvk_vtgs_hdr_update_oread(vs, info->out[i].slot[c]);
+      }
+   }
+
+   for (i = 0; i < info->numSysVals; ++i) {
+      switch (info->sv[i].sn) {
+      case SYSTEM_VALUE_PRIMITIVE_ID:
+         vs->hdr[5] |= 1 << 24;
+         break;
+      case SYSTEM_VALUE_INSTANCE_ID:
+         vs->hdr[10] |= 1 << 30;
+         break;
+      case SYSTEM_VALUE_VERTEX_ID:
+         vs->hdr[10] |= 1 << 31;
+         break;
+      case SYSTEM_VALUE_TESS_COORD:
+         /* We don't have the mask, nor the slots populated. While this could
+          * be achieved, the vast majority of the time if either of the coords
+          * are read, then both will be read.
+          */
+         nvk_vtgs_hdr_update_oread(vs, 0x2f0 / 4);
+         nvk_vtgs_hdr_update_oread(vs, 0x2f4 / 4);
+         break;
+      default:
+         break;
+      }
+   }
+
+   vs->vs.clip_enable = (1 << info->io.clipDistances) - 1;
+   vs->vs.cull_enable =
+      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
+   for (i = 0; i < info->io.cullDistances; ++i)
+      vs->vs.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
+
+   vs->vs.layer_viewport_relative = info->io.layer_viewport_relative;
+
+   return 0;
+}
+
+static int
+nvk_vs_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
+{
+   vs->hdr[0] = 0x20061 | (1 << 10);
+   vs->hdr[4] = 0xff000;
+
+   return nvk_vtgp_gen_header(vs, info);
+}
+
+static int
+nvk_gs_gen_header(struct nvk_shader *gs,
+                  const struct nir_shader *nir,
+                  struct nv50_ir_prog_info_out *info)
+{
+   gs->hdr[0] = 0x20061 | (4 << 10);
+
+   gs->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;
+
+   switch (info->prop.gp.outputPrim) {
+   case MESA_PRIM_POINTS:
+      gs->hdr[3] = 0x01000000;
+      break;
+   case MESA_PRIM_LINE_STRIP:
+      gs->hdr[3] = 0x06000000;
+      break;
+   case MESA_PRIM_TRIANGLE_STRIP:
+      gs->hdr[3] = 0x07000000;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+
+   gs->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);
+
+   gs->hdr[0] |= nir->info.gs.active_stream_mask << 28;
+
+   return nvk_vtgp_gen_header(gs, info);
+}
+
+static void
+nvk_generate_tessellation_parameters(const struct nv50_ir_prog_info_out *info,
+                                     struct nvk_shader *shader)
+{
+   // TODO: this is a little confusing because nouveau codegen uses
+   // MESA_PRIM_POINTS for unspecified domain and
+   // MESA_PRIM_POINTS = 0, the same as NV9097 ISOLINE enum
+   uint32_t domain_type;
+   switch (info->prop.tp.domain) {
+   case MESA_PRIM_LINES:
+      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_ISOLINE;
+      break;
+   case MESA_PRIM_TRIANGLES:
+      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_TRIANGLE;
+      break;
+   case MESA_PRIM_QUADS:
+      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_QUAD;
+      break;
+   default:
+      domain_type = ~0;
+      break;
+   }
+   shader->tp.domain_type = domain_type;
+   if (domain_type == ~0) {
+      return;
+   }
+
+   uint32_t spacing;
+   switch (info->prop.tp.partitioning) {
+   case PIPE_TESS_SPACING_EQUAL:
+      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_INTEGER;
+      break;
+   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
+      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_FRACTIONAL_ODD;
+      break;
+   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
+      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_FRACTIONAL_EVEN;
+      break;
+   default:
+      assert(!"invalid tessellator partitioning");
+      break;
+   }
+   shader->tp.spacing = spacing;
+
+   uint32_t output_prims;
+   if (info->prop.tp.outputPrim == MESA_PRIM_POINTS) { // point_mode
+      output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_POINTS;
+   } else if (info->prop.tp.domain == MESA_PRIM_LINES) { // isoline domain
+      output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_LINES;
+   } else {  // triangle/quad domain
+      if (info->prop.tp.winding > 0) {
+         output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_TRIANGLES_CW;
+      } else {
+         output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_TRIANGLES_CCW;
+      }
+   }
+   shader->tp.output_prims = output_prims;
+}
+
+static int
+nvk_tcs_gen_header(struct nvk_shader *tcs, struct nv50_ir_prog_info_out *info)
+{
+   unsigned opcs = 6; /* output patch constants (at least the TessFactors) */
+
+   if (info->numPatchConstants)
+      opcs = 8 + info->numPatchConstants * 4;
+
+   tcs->hdr[0] = 0x20061 | (2 << 10);
+
+   tcs->hdr[1] = opcs << 24;
+   tcs->hdr[2] = info->prop.tp.outputPatchSize << 24;
+
+   tcs->hdr[4] = 0xff000; /* initial min/max parallel output read address */
+
+   nvk_vtgp_gen_header(tcs, info);
+
+   if (info->target >= NVISA_GM107_CHIPSET) {
+      /* On GM107+, the number of output patch components has moved in the TCP
+       * header, but it seems like blob still also uses the old position.
+       * Also, the high 8-bits are located in between the min/max parallel
+       * field and has to be set after updating the outputs. */
+      tcs->hdr[3] = (opcs & 0x0f) << 28;
+      tcs->hdr[4] |= (opcs & 0xf0) << 16;
+   }
+
+   nvk_generate_tessellation_parameters(info, tcs);
+
+   return 0;
+}
+
+static int
+nvk_tes_gen_header(struct nvk_shader *tes, struct nv50_ir_prog_info_out *info)
+{
+   tes->hdr[0] = 0x20061 | (3 << 10);
+   tes->hdr[4] = 0xff000;
+
+   nvk_vtgp_gen_header(tes, info);
+
+   nvk_generate_tessellation_parameters(info, tes);
+
+   tes->hdr[18] |= 0x3 << 12; /* ? */
+
+   return 0;
+}
+
+#define NVC0_INTERP_FLAT          (1 << 0)
+#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
+#define NVC0_INTERP_LINEAR        (3 << 0)
+#define NVC0_INTERP_CENTROID      (1 << 2)
+
+static uint8_t
+nvk_hdr_interp_mode(const struct nv50_ir_varying *var)
+{
+   if (var->linear)
+      return NVC0_INTERP_LINEAR;
+   if (var->flat)
+      return NVC0_INTERP_FLAT;
+   return NVC0_INTERP_PERSPECTIVE;
+}
+
+
+static int
+nvk_fs_gen_header(struct nvk_shader *fs, const struct nvk_fs_key *key,
+                  struct nv50_ir_prog_info_out *info)
+{
+   unsigned i, c, a, m;
+
+   /* just 00062 on Kepler */
+   fs->hdr[0] = 0x20062 | (5 << 10);
+   fs->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
+
+   if (info->prop.fp.usesDiscard || key->zs_self_dep)
+      fs->hdr[0] |= 0x8000;
+   if (!info->prop.fp.separateFragData)
+      fs->hdr[0] |= 0x4000;
+   if (info->io.sampleMask < 80 /* PIPE_MAX_SHADER_OUTPUTS */)
+      fs->hdr[19] |= 0x1;
+   if (info->prop.fp.writesDepth) {
+      fs->hdr[19] |= 0x2;
+      fs->flags[0] = 0x11; /* deactivate ZCULL */
+   }
+
+   for (i = 0; i < info->numInputs; ++i) {
+      m = nvk_hdr_interp_mode(&info->in[i]);
+      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
+         fs->fs.colors |= 1 << info->in[i].si;
+         if (info->in[i].sc)
+            fs->fs.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
+      }
+      for (c = 0; c < 4; ++c) {
+         if (!(info->in[i].mask & (1 << c)))
+            continue;
+         a = info->in[i].slot[c];
+         if (info->in[i].slot[0] >= (0x060 / 4) &&
+             info->in[i].slot[0] <= (0x07c / 4)) {
+            fs->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
+         } else
+         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
+             info->in[i].slot[0] <= (0x2fc / 4)) {
+            fs->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
+         } else {
+            if (info->in[i].slot[c] < (0x040 / 4) ||
+                info->in[i].slot[c] > (0x380 / 4))
+               continue;
+            a *= 2;
+            if (info->in[i].slot[0] >= (0x300 / 4))
+               a -= 32;
+            fs->hdr[4 + a / 32] |= m << (a % 32);
+         }
+      }
+   }
+   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
+   if (info->prop.fp.readsSampleLocations && info->target >= 
NVISA_GM200_CHIPSET)
+      fs->hdr[5] |= 0x30000000;
+
+   for (i = 0; i < info->numOutputs; ++i) {
+      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
+         fs->hdr[18] |= 0xf << (4 * info->out[i].si);
+   }
+
+   /* There are no "regular" attachments, but the shader still needs to be
+    * executed. It seems like it wants to think that it has some color
+    * outputs in order to actually run.
+    */
+   if (info->prop.fp.numColourResults == 0 &&
+       !info->prop.fp.writesDepth &&
+       info->io.sampleMask >= 80 /* PIPE_MAX_SHADER_OUTPUTS */)
+      fs->hdr[18] |= 0xf;
+
+   fs->fs.early_z = info->prop.fp.earlyFragTests;
+   fs->fs.sample_mask_in = info->prop.fp.usesSampleMaskIn;
+   fs->fs.reads_framebuffer = info->prop.fp.readsFramebuffer;
+   fs->fs.post_depth_coverage = info->prop.fp.postDepthCoverage;
+
+   /* Mark position xy and layer as read */
+   if (fs->fs.reads_framebuffer)
+      fs->hdr[5] |= 0x32000000;
+
+   return 0;
+}
+
+static uint8_t find_register_index_for_xfb_output(const struct nir_shader *nir,
+                                                  nir_xfb_output_info output)
+{
+   nir_foreach_shader_out_variable(var, nir) {
+      uint32_t slots = glsl_count_vec4_slots(var->type, false, false);
+      for (uint32_t i = 0; i < slots; ++i) {
+         if (output.location == (var->data.location+i)) {
+            return var->data.driver_location+i;
+         }
+      }
+   }
+   // should not be reached
+   return 0;
+}
+
+static struct nvk_transform_feedback_state *
+nvk_fill_transform_feedback_state(struct nir_shader *nir,
+                                  const struct nv50_ir_prog_info_out *info)
+{
+   const uint8_t max_buffers = 4;
+   const uint8_t dw_bytes = 4;
+   const struct nir_xfb_info *nx = nir->xfb_info;
+   //nir_print_xfb_info(nx, stdout);
+
+   struct nvk_transform_feedback_state *xfb =
+      malloc(sizeof(struct nvk_transform_feedback_state));
+
+   if (!xfb)
+      return NULL;
+
+   for (uint8_t b = 0; b < max_buffers; ++b) {
+      xfb->stride[b] = b < nx->buffers_written ? nx->buffers[b].stride : 0;
+      xfb->varying_count[b] = 0;
+      xfb->stream[b] = nx->buffer_to_stream[b];
+   }
+   memset(xfb->varying_index, 0xff, sizeof(xfb->varying_index)); /* = skip */
+
+   if (info->numOutputs == 0)
+      return xfb;
+
+   for (uint32_t i = 0; i < nx->output_count; ++i) {
+      const nir_xfb_output_info output = nx->outputs[i];
+      const uint8_t b = output.buffer;
+      const uint8_t r = find_register_index_for_xfb_output(nir, output);
+      uint32_t p = output.offset / dw_bytes;
+
+      assert(r < info->numOutputs && p < ARRAY_SIZE(xfb->varying_index[b]));
+
+      u_foreach_bit(c, nx->outputs[i].component_mask)
+         xfb->varying_index[b][p++] = info->out[r].slot[c];
+
+      xfb->varying_count[b] = MAX2(xfb->varying_count[b], p);
+   }
+
+   /* zero unused indices */
+   for (uint8_t b = 0; b < 4; ++b)
+      for (uint32_t c = xfb->varying_count[b]; c & 3; ++c)
+         xfb->varying_index[b][c] = 0;
+
+   return xfb;
+}
+
+
+VkResult
+nvk_cg_compile_nir(struct nvk_physical_device *pdev, nir_shader *nir,
+                   const struct nvk_fs_key *fs_key,
+                   struct nvk_shader *shader)
+{
+   struct nv50_ir_prog_info *info;
+   struct nv50_ir_prog_info_out info_out = {};
+   int ret;
+
+   info = CALLOC_STRUCT(nv50_ir_prog_info);
+   if (!info)
+      return false;
+
+   info->type = pipe_shader_type_from_mesa(nir->info.stage);
+   info->target = pdev->info.chipset;
+   info->bin.nir = nir;
+
+   for (unsigned i = 0; i < 3; i++)
+      shader->cp.block_size[i] = nir->info.workgroup_size[i];
+
+   info->bin.smemSize = shader->cp.smem_size;
+   info->dbgFlags = nvk_cg_get_prog_debug();
+   info->optLevel = nvk_cg_get_prog_optimize();
+   info->io.auxCBSlot = 1;
+   info->io.uboInfoBase = 0;
+   info->io.drawInfoBase = nvk_root_descriptor_offset(draw.base_vertex);
+   if (nir->info.stage == MESA_SHADER_COMPUTE) {
+      info->prop.cp.gridInfoBase = 0;
+   } else {
+      info->assignSlots = nvc0_program_assign_varying_slots;
+   }
+   ret = nv50_ir_generate_code(info, &info_out);
+   if (ret)
+      return VK_ERROR_UNKNOWN;
+
+   if (info_out.bin.fixupData) {
+      nv50_ir_apply_fixups(info_out.bin.fixupData, info_out.bin.code,
+                           fs_key && fs_key->force_per_sample,
+                           false /* flatshade */, false /* alphatest */,
+                           fs_key && fs_key->msaa);
+   }
+
+   shader->stage = nir->info.stage;
+   shader->code_ptr = (uint8_t *)info_out.bin.code;
+   shader->code_size = info_out.bin.codeSize;
+
+   if (info_out.target >= NVISA_GV100_CHIPSET)
+      shader->num_gprs = MAX2(4, info_out.bin.maxGPR + 3);
+   else
+      shader->num_gprs = MAX2(4, info_out.bin.maxGPR + 1);
+   shader->cp.smem_size = info_out.bin.smemSize;
+   shader->num_barriers = info_out.numBarriers;
+
+   switch (info->type) {
+   case PIPE_SHADER_VERTEX:
+      ret = nvk_vs_gen_header(shader, &info_out);
+      break;
+   case PIPE_SHADER_FRAGMENT:
+      ret = nvk_fs_gen_header(shader, fs_key, &info_out);
+      shader->fs.uses_sample_shading = nir->info.fs.uses_sample_shading;
+      break;
+   case PIPE_SHADER_GEOMETRY:
+      ret = nvk_gs_gen_header(shader, nir, &info_out);
+      break;
+   case PIPE_SHADER_TESS_CTRL:
+      ret = nvk_tcs_gen_header(shader, &info_out);
+      break;
+   case PIPE_SHADER_TESS_EVAL:
+      ret = nvk_tes_gen_header(shader, &info_out);
+      break;
+   case PIPE_SHADER_COMPUTE:
+      break;
+   default:
+      unreachable("Invalid shader stage");
+      break;
+   }
+   assert(ret == 0);
+
+   if (info_out.bin.tlsSpace) {
+      assert(info_out.bin.tlsSpace < (1 << 24));
+      shader->hdr[0] |= 1 << 26;
+      shader->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
+      shader->slm_size = info_out.bin.tlsSpace;
+   }
+
+   if (info_out.io.globalAccess)
+      shader->hdr[0] |= 1 << 26;
+   if (info_out.io.globalAccess & 0x2)
+      shader->hdr[0] |= 1 << 16;
+   if (info_out.io.fp64)
+      shader->hdr[0] |= 1 << 27;
+
+   if (nir->xfb_info) {
+      shader->xfb = nvk_fill_transform_feedback_state(nir, &info_out);
+      if (shader->xfb == NULL) {
+         return VK_ERROR_OUT_OF_HOST_MEMORY;
+      }
+   }
+
+   return VK_SUCCESS;
+}
diff --git a/src/nouveau/vulkan/nvk_shader.c b/src/nouveau/vulkan/nvk_shader.c
index 99551587e4e..86e58236040 100644
--- a/src/nouveau/vulkan/nvk_shader.c
+++ b/src/nouveau/vulkan/nvk_shader.c
@@ -4,7 +4,6 @@
  */
 #include "nvk_shader.h"
 
-#include "nvk_cmd_buffer.h"
 #include "nvk_descriptor_set_layout.h"
 #include "nvk_device.h"
 #include "nvk_physical_device.h"
@@ -21,7 +20,6 @@
 #include "nak.h"
 #include "nir.h"
 #include "nir_builder.h"
-#include "nir_xfb_info.h"
 #include "compiler/spirv/nir_spirv.h"
 
 #include "nv50_ir_driver.h"
@@ -33,7 +31,6 @@
 #include "clb097.h"
 #include "clc397.h"
 #include "clc597.h"
-#include "nvk_cl9097.h"
 
 static void
 shared_var_info(const struct glsl_type *type, unsigned *size, unsigned *align)
@@ -488,554 +485,6 @@ nvk_shader_dump(struct nvk_shader *shader)
 }
 #endif
 
-#include "tgsi/tgsi_ureg.h"
-
-/* NOTE: Using a[0x270] in FP may cause an error even if we're using less than
- * 124 scalar varying values.
- */
-static uint32_t
-nvc0_shader_input_address(unsigned sn, unsigned si)
-{
-   switch (sn) {
-   case TGSI_SEMANTIC_TESSOUTER:    return 0x000 + si * 0x4;
-   case TGSI_SEMANTIC_TESSINNER:    return 0x010 + si * 0x4;
-   case TGSI_SEMANTIC_PATCH:        return 0x020 + si * 0x10;
-   case TGSI_SEMANTIC_PRIMID:       return 0x060;
-   case TGSI_SEMANTIC_LAYER:        return 0x064;
-   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
-   case TGSI_SEMANTIC_PSIZE:        return 0x06c;
-   case TGSI_SEMANTIC_POSITION:     return 0x070;
-   case TGSI_SEMANTIC_GENERIC:      return 0x080 + si * 0x10;
-   case TGSI_SEMANTIC_FOG:          return 0x2e8;
-   case TGSI_SEMANTIC_COLOR:        return 0x280 + si * 0x10;
-   case TGSI_SEMANTIC_BCOLOR:       return 0x2a0 + si * 0x10;
-   case TGSI_SEMANTIC_CLIPDIST:     return 0x2c0 + si * 0x10;
-   case TGSI_SEMANTIC_CLIPVERTEX:   return 0x270;
-   case TGSI_SEMANTIC_PCOORD:       return 0x2e0;
-   case TGSI_SEMANTIC_TESSCOORD:    return 0x2f0;
-   case TGSI_SEMANTIC_INSTANCEID:   return 0x2f8;
-   case TGSI_SEMANTIC_VERTEXID:     return 0x2fc;
-   case TGSI_SEMANTIC_TEXCOORD:     return 0x300 + si * 0x10;
-   default:
-      assert(!"invalid TGSI input semantic");
-      return ~0;
-   }
-}
-
-static uint32_t
-nvc0_shader_output_address(unsigned sn, unsigned si)
-{
-   switch (sn) {
-   case TGSI_SEMANTIC_TESSOUTER:     return 0x000 + si * 0x4;
-   case TGSI_SEMANTIC_TESSINNER:     return 0x010 + si * 0x4;
-   case TGSI_SEMANTIC_PATCH:         return 0x020 + si * 0x10;
-   case TGSI_SEMANTIC_PRIMID:        return 0x060;
-   case TGSI_SEMANTIC_LAYER:         return 0x064;
-   case TGSI_SEMANTIC_VIEWPORT_INDEX:return 0x068;
-   case TGSI_SEMANTIC_PSIZE:         return 0x06c;
-   case TGSI_SEMANTIC_POSITION:      return 0x070;
-   case TGSI_SEMANTIC_GENERIC:       return 0x080 + si * 0x10;
-   case TGSI_SEMANTIC_FOG:           return 0x2e8;
-   case TGSI_SEMANTIC_COLOR:         return 0x280 + si * 0x10;
-   case TGSI_SEMANTIC_BCOLOR:        return 0x2a0 + si * 0x10;
-   case TGSI_SEMANTIC_CLIPDIST:      return 0x2c0 + si * 0x10;
-   case TGSI_SEMANTIC_CLIPVERTEX:    return 0x270;
-   case TGSI_SEMANTIC_TEXCOORD:      return 0x300 + si * 0x10;
-   case TGSI_SEMANTIC_VIEWPORT_MASK: return 0x3a0;
-   case TGSI_SEMANTIC_EDGEFLAG:      return ~0;
-   default:
-      assert(!"invalid TGSI output semantic");
-      return ~0;
-   }
-}
-
-static int
-nvc0_vp_assign_input_slots(struct nv50_ir_prog_info_out *info)
-{
-   unsigned i, c, n;
-
-   for (n = 0, i = 0; i < info->numInputs; ++i) {
-      switch (info->in[i].sn) {
-      case TGSI_SEMANTIC_INSTANCEID: /* for SM4 only, in TGSI they're SVs */
-      case TGSI_SEMANTIC_VERTEXID:
-         info->in[i].mask = 0x1;
-         info->in[i].slot[0] =
-            nvc0_shader_input_address(info->in[i].sn, 0) / 4;
-         continue;
-      default:
-         break;
-      }
-      for (c = 0; c < 4; ++c)
-         info->in[i].slot[c] = (0x80 + n * 0x10 + c * 0x4) / 4;
-      ++n;
-   }
-
-   return 0;
-}
-
-static int
-nvc0_sp_assign_input_slots(struct nv50_ir_prog_info_out *info)
-{
-   unsigned offset;
-   unsigned i, c;
-
-   for (i = 0; i < info->numInputs; ++i) {
-      offset = nvc0_shader_input_address(info->in[i].sn, info->in[i].si);
-
-      for (c = 0; c < 4; ++c)
-         info->in[i].slot[c] = (offset + c * 0x4) / 4;
-   }
-
-   return 0;
-}
-
-static int
-nvc0_fp_assign_output_slots(struct nv50_ir_prog_info_out *info)
-{
-   unsigned count = info->prop.fp.numColourResults * 4;
-   unsigned i, c;
-
-   /* Compute the relative position of each color output, since skipped MRT
-    * positions will not have registers allocated to them.
-    */
-   unsigned colors[8] = {0};
-   for (i = 0; i < info->numOutputs; ++i)
-      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
-         colors[info->out[i].si] = 1;
-   for (i = 0, c = 0; i < 8; i++)
-      if (colors[i])
-         colors[i] = c++;
-   for (i = 0; i < info->numOutputs; ++i)
-      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
-         for (c = 0; c < 4; ++c)
-            info->out[i].slot[c] = colors[info->out[i].si] * 4 + c;
-
-   if (info->io.sampleMask < NV50_CODEGEN_MAX_VARYINGS)
-      info->out[info->io.sampleMask].slot[0] = count++;
-   else
-   if (info->target >= 0xe0)
-      count++; /* on Kepler, depth is always last colour reg + 2 */
-
-   if (info->io.fragDepth < NV50_CODEGEN_MAX_VARYINGS)
-      info->out[info->io.fragDepth].slot[2] = count;
-
-   return 0;
-}
-
-static int
-nvc0_sp_assign_output_slots(struct nv50_ir_prog_info_out *info)
-{
-   unsigned offset;
-   unsigned i, c;
-
-   for (i = 0; i < info->numOutputs; ++i) {
-      offset = nvc0_shader_output_address(info->out[i].sn, info->out[i].si);
-
-      for (c = 0; c < 4; ++c)
-         info->out[i].slot[c] = (offset + c * 0x4) / 4;
-   }
-
-   return 0;
-}
-
-static int
-nvc0_program_assign_varying_slots(struct nv50_ir_prog_info_out *info)
-{
-   int ret;
-
-   if (info->type == PIPE_SHADER_VERTEX)
-      ret = nvc0_vp_assign_input_slots(info);
-   else
-      ret = nvc0_sp_assign_input_slots(info);
-   if (ret)
-      return ret;
-
-   if (info->type == PIPE_SHADER_FRAGMENT)
-      ret = nvc0_fp_assign_output_slots(info);
-   else
-      ret = nvc0_sp_assign_output_slots(info);
-   return ret;
-}
-
-static inline void
-nvk_vtgs_hdr_update_oread(struct nvk_shader *vs, uint8_t slot)
-{
-   uint8_t min = (vs->hdr[4] >> 12) & 0xff;
-   uint8_t max = (vs->hdr[4] >> 24);
-
-   min = MIN2(min, slot);
-   max = MAX2(max, slot);
-
-   vs->hdr[4] = (max << 24) | (min << 12);
-}
-
-static int
-nvk_vtgp_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
-{
-   unsigned i, c, a;
-
-   for (i = 0; i < info->numInputs; ++i) {
-      if (info->in[i].patch)
-         continue;
-      for (c = 0; c < 4; ++c) {
-         a = info->in[i].slot[c];
-         if (info->in[i].mask & (1 << c))
-            vs->hdr[5 + a / 32] |= 1 << (a % 32);
-      }
-   }
-
-   for (i = 0; i < info->numOutputs; ++i) {
-      if (info->out[i].patch)
-         continue;
-      for (c = 0; c < 4; ++c) {
-         if (!(info->out[i].mask & (1 << c)))
-            continue;
-         assert(info->out[i].slot[c] >= 0x40 / 4);
-         a = info->out[i].slot[c] - 0x40 / 4;
-         vs->hdr[13 + a / 32] |= 1 << (a % 32);
-         if (info->out[i].oread)
-            nvk_vtgs_hdr_update_oread(vs, info->out[i].slot[c]);
-      }
-   }
-
-   for (i = 0; i < info->numSysVals; ++i) {
-      switch (info->sv[i].sn) {
-      case SYSTEM_VALUE_PRIMITIVE_ID:
-         vs->hdr[5] |= 1 << 24;
-         break;
-      case SYSTEM_VALUE_INSTANCE_ID:
-         vs->hdr[10] |= 1 << 30;
-         break;
-      case SYSTEM_VALUE_VERTEX_ID:
-         vs->hdr[10] |= 1 << 31;
-         break;
-      case SYSTEM_VALUE_TESS_COORD:
-         /* We don't have the mask, nor the slots populated. While this could
-          * be achieved, the vast majority of the time if either of the coords
-          * are read, then both will be read.
-          */
-         nvk_vtgs_hdr_update_oread(vs, 0x2f0 / 4);
-         nvk_vtgs_hdr_update_oread(vs, 0x2f4 / 4);
-         break;
-      default:
-         break;
-      }
-   }
-
-   vs->vs.clip_enable = (1 << info->io.clipDistances) - 1;
-   vs->vs.cull_enable =
-      ((1 << info->io.cullDistances) - 1) << info->io.clipDistances;
-   for (i = 0; i < info->io.cullDistances; ++i)
-      vs->vs.clip_mode |= 1 << ((info->io.clipDistances + i) * 4);
-
-   vs->vs.layer_viewport_relative = info->io.layer_viewport_relative;
-
-   return 0;
-}
-
-static int
-nvk_vs_gen_header(struct nvk_shader *vs, struct nv50_ir_prog_info_out *info)
-{
-   vs->hdr[0] = 0x20061 | (1 << 10);
-   vs->hdr[4] = 0xff000;
-
-   return nvk_vtgp_gen_header(vs, info);
-}
-
-static int
-nvk_gs_gen_header(struct nvk_shader *gs,
-                  const struct nir_shader *nir,
-                  struct nv50_ir_prog_info_out *info)
-{
-   gs->hdr[0] = 0x20061 | (4 << 10);
-
-   gs->hdr[2] = MIN2(info->prop.gp.instanceCount, 32) << 24;
-
-   switch (info->prop.gp.outputPrim) {
-   case MESA_PRIM_POINTS:
-      gs->hdr[3] = 0x01000000;
-      break;
-   case MESA_PRIM_LINE_STRIP:
-      gs->hdr[3] = 0x06000000;
-      break;
-   case MESA_PRIM_TRIANGLE_STRIP:
-      gs->hdr[3] = 0x07000000;
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   gs->hdr[4] = CLAMP(info->prop.gp.maxVertices, 1, 1024);
-
-   gs->hdr[0] |= nir->info.gs.active_stream_mask << 28;
-
-   return nvk_vtgp_gen_header(gs, info);
-}
-
-static void
-nvk_generate_tessellation_parameters(const struct nv50_ir_prog_info_out *info,
-                                     struct nvk_shader *shader)
-{
-   // TODO: this is a little confusing because nouveau codegen uses
-   // MESA_PRIM_POINTS for unspecified domain and
-   // MESA_PRIM_POINTS = 0, the same as NV9097 ISOLINE enum
-   uint32_t domain_type;
-   switch (info->prop.tp.domain) {
-   case MESA_PRIM_LINES:
-      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_ISOLINE;
-      break;
-   case MESA_PRIM_TRIANGLES:
-      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_TRIANGLE;
-      break;
-   case MESA_PRIM_QUADS:
-      domain_type = NV9097_SET_TESSELLATION_PARAMETERS_DOMAIN_TYPE_QUAD;
-      break;
-   default:
-      domain_type = ~0;
-      break;
-   }
-   shader->tp.domain_type = domain_type;
-   if (domain_type == ~0) {
-      return;
-   }
-
-   uint32_t spacing;
-   switch (info->prop.tp.partitioning) {
-   case PIPE_TESS_SPACING_EQUAL:
-      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_INTEGER;
-      break;
-   case PIPE_TESS_SPACING_FRACTIONAL_ODD:
-      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_FRACTIONAL_ODD;
-      break;
-   case PIPE_TESS_SPACING_FRACTIONAL_EVEN:
-      spacing = NV9097_SET_TESSELLATION_PARAMETERS_SPACING_FRACTIONAL_EVEN;
-      break;
-   default:
-      assert(!"invalid tessellator partitioning");
-      break;
-   }
-   shader->tp.spacing = spacing;
-
-   uint32_t output_prims;
-   if (info->prop.tp.outputPrim == MESA_PRIM_POINTS) { // point_mode
-      output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_POINTS;
-   } else if (info->prop.tp.domain == MESA_PRIM_LINES) { // isoline domain
-      output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_LINES;
-   } else {  // triangle/quad domain
-      if (info->prop.tp.winding > 0) {
-         output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_TRIANGLES_CW;
-      } else {
-         output_prims = 
NV9097_SET_TESSELLATION_PARAMETERS_OUTPUT_PRIMITIVES_TRIANGLES_CCW;
-      }
-   }
-   shader->tp.output_prims = output_prims;
-}
-
-static int
-nvk_tcs_gen_header(struct nvk_shader *tcs, struct nv50_ir_prog_info_out *info)
-{
-   unsigned opcs = 6; /* output patch constants (at least the TessFactors) */
-
-   if (info->numPatchConstants)
-      opcs = 8 + info->numPatchConstants * 4;
-
-   tcs->hdr[0] = 0x20061 | (2 << 10);
-
-   tcs->hdr[1] = opcs << 24;
-   tcs->hdr[2] = info->prop.tp.outputPatchSize << 24;
-
-   tcs->hdr[4] = 0xff000; /* initial min/max parallel output read address */
-
-   nvk_vtgp_gen_header(tcs, info);
-
-   if (info->target >= NVISA_GM107_CHIPSET) {
-      /* On GM107+, the number of output patch components has moved in the TCP
-       * header, but it seems like blob still also uses the old position.
-       * Also, the high 8-bits are located in between the min/max parallel
-       * field and has to be set after updating the outputs. */
-      tcs->hdr[3] = (opcs & 0x0f) << 28;
-      tcs->hdr[4] |= (opcs & 0xf0) << 16;
-   }
-
-   nvk_generate_tessellation_parameters(info, tcs);
-
-   return 0;
-}
-
-static int
-nvk_tes_gen_header(struct nvk_shader *tes, struct nv50_ir_prog_info_out *info)
-{
-   tes->hdr[0] = 0x20061 | (3 << 10);
-   tes->hdr[4] = 0xff000;
-
-   nvk_vtgp_gen_header(tes, info);
-
-   nvk_generate_tessellation_parameters(info, tes);
-
-   tes->hdr[18] |= 0x3 << 12; /* ? */
-
-   return 0;
-}
-
-#define NVC0_INTERP_FLAT          (1 << 0)
-#define NVC0_INTERP_PERSPECTIVE   (2 << 0)
-#define NVC0_INTERP_LINEAR        (3 << 0)
-#define NVC0_INTERP_CENTROID      (1 << 2)
-
-static uint8_t
-nvk_hdr_interp_mode(const struct nv50_ir_varying *var)
-{
-   if (var->linear)
-      return NVC0_INTERP_LINEAR;
-   if (var->flat)
-      return NVC0_INTERP_FLAT;
-   return NVC0_INTERP_PERSPECTIVE;
-}
-
-
-static int
-nvk_fs_gen_header(struct nvk_shader *fs, const struct nvk_fs_key *key,
-                  struct nv50_ir_prog_info_out *info)
-{
-   unsigned i, c, a, m;
-
-   /* just 00062 on Kepler */
-   fs->hdr[0] = 0x20062 | (5 << 10);
-   fs->hdr[5] = 0x80000000; /* getting a trap if FRAG_COORD_UMASK.w = 0 */
-
-   if (info->prop.fp.usesDiscard || key->zs_self_dep)
-      fs->hdr[0] |= 0x8000;
-   if (!info->prop.fp.separateFragData)
-      fs->hdr[0] |= 0x4000;
-   if (info->io.sampleMask < 80 /* PIPE_MAX_SHADER_OUTPUTS */)
-      fs->hdr[19] |= 0x1;
-   if (info->prop.fp.writesDepth) {
-      fs->hdr[19] |= 0x2;
-      fs->flags[0] = 0x11; /* deactivate ZCULL */
-   }
-
-   for (i = 0; i < info->numInputs; ++i) {
-      m = nvk_hdr_interp_mode(&info->in[i]);
-      if (info->in[i].sn == TGSI_SEMANTIC_COLOR) {
-         fs->fs.colors |= 1 << info->in[i].si;
-         if (info->in[i].sc)
-            fs->fs.color_interp[info->in[i].si] = m | (info->in[i].mask << 4);
-      }
-      for (c = 0; c < 4; ++c) {
-         if (!(info->in[i].mask & (1 << c)))
-            continue;
-         a = info->in[i].slot[c];
-         if (info->in[i].slot[0] >= (0x060 / 4) &&
-             info->in[i].slot[0] <= (0x07c / 4)) {
-            fs->hdr[5] |= 1 << (24 + (a - 0x060 / 4));
-         } else
-         if (info->in[i].slot[0] >= (0x2c0 / 4) &&
-             info->in[i].slot[0] <= (0x2fc / 4)) {
-            fs->hdr[14] |= (1 << (a - 0x280 / 4)) & 0x07ff0000;
-         } else {
-            if (info->in[i].slot[c] < (0x040 / 4) ||
-                info->in[i].slot[c] > (0x380 / 4))
-               continue;
-            a *= 2;
-            if (info->in[i].slot[0] >= (0x300 / 4))
-               a -= 32;
-            fs->hdr[4 + a / 32] |= m << (a % 32);
-         }
-      }
-   }
-   /* GM20x+ needs TGSI_SEMANTIC_POSITION to access sample locations */
-   if (info->prop.fp.readsSampleLocations && info->target >= 
NVISA_GM200_CHIPSET)
-      fs->hdr[5] |= 0x30000000;
-
-   for (i = 0; i < info->numOutputs; ++i) {
-      if (info->out[i].sn == TGSI_SEMANTIC_COLOR)
-         fs->hdr[18] |= 0xf << (4 * info->out[i].si);
-   }
-
-   /* There are no "regular" attachments, but the shader still needs to be
-    * executed. It seems like it wants to think that it has some color
-    * outputs in order to actually run.
-    */
-   if (info->prop.fp.numColourResults == 0 &&
-       !info->prop.fp.writesDepth &&
-       info->io.sampleMask >= 80 /* PIPE_MAX_SHADER_OUTPUTS */)
-      fs->hdr[18] |= 0xf;
-
-   fs->fs.early_z = info->prop.fp.earlyFragTests;
-   fs->fs.sample_mask_in = info->prop.fp.usesSampleMaskIn;
-   fs->fs.reads_framebuffer = info->prop.fp.readsFramebuffer;
-   fs->fs.post_depth_coverage = info->prop.fp.postDepthCoverage;
-
-   /* Mark position xy and layer as read */
-   if (fs->fs.reads_framebuffer)
-      fs->hdr[5] |= 0x32000000;
-
-   return 0;
-}
-
-static uint8_t find_register_index_for_xfb_output(const struct nir_shader *nir,
-                                                  nir_xfb_output_info output)
-{
-   nir_foreach_shader_out_variable(var, nir) {
-      uint32_t slots = glsl_count_vec4_slots(var->type, false, false);
-      for (uint32_t i = 0; i < slots; ++i) {
-         if (output.location == (var->data.location+i)) {
-            return var->data.driver_location+i;
-         }
-      }
-   }
-   // should not be reached
-   return 0;
-}
-
-static struct nvk_transform_feedback_state *
-nvk_fill_transform_feedback_state(struct nir_shader *nir,
-                                  const struct nv50_ir_prog_info_out *info)
-{
-   const uint8_t max_buffers = 4;
-   const uint8_t dw_bytes = 4;
-   const struct nir_xfb_info *nx = nir->xfb_info;
-   //nir_print_xfb_info(nx, stdout);
-
-   struct nvk_transform_feedback_state *xfb =
-      malloc(sizeof(struct nvk_transform_feedback_state));
-
-   if (!xfb)
-      return NULL;
-
-   for (uint8_t b = 0; b < max_buffers; ++b) {
-      xfb->stride[b] = b < nx->buffers_written ? nx->buffers[b].stride : 0;
-      xfb->varying_count[b] = 0;
-      xfb->stream[b] = nx->buffer_to_stream[b];
-   }
-   memset(xfb->varying_index, 0xff, sizeof(xfb->varying_index)); /* = skip */
-
-   if (info->numOutputs == 0)
-      return xfb;
-
-   for (uint32_t i = 0; i < nx->output_count; ++i) {
-      const nir_xfb_output_info output = nx->outputs[i];
-      const uint8_t b = output.buffer;
-      const uint8_t r = find_register_index_for_xfb_output(nir, output);
-      uint32_t p = output.offset / dw_bytes;
-
-      assert(r < info->numOutputs && p < ARRAY_SIZE(xfb->varying_index[b]));
-
-      u_foreach_bit(c, nx->outputs[i].component_mask)
-         xfb->varying_index[b][p++] = info->out[r].slot[c];
-
-      xfb->varying_count[b] = MAX2(xfb->varying_count[b], p);
-   }
-
-   /* zero unused indices */
-   for (uint8_t b = 0; b < 4; ++b)
-      for (uint32_t c = xfb->varying_count[b]; c & 3; ++c)
-         xfb->varying_index[b][c] = 0;
-
-   return xfb;
-}
-
 static VkResult
 nvk_compile_nir_with_nak(struct nvk_physical_device *pdev,
                          nir_shader *nir,
@@ -1128,104 +577,10 @@ nvk_compile_nir(struct nvk_physical_device *pdev, 
nir_shader *nir,
                 const struct nvk_fs_key *fs_key,
                 struct nvk_shader *shader)
 {
-   struct nv50_ir_prog_info *info;
-   struct nv50_ir_prog_info_out info_out = {};
-   int ret;
-
    if (use_nak(pdev, nir->info.stage))
       return nvk_compile_nir_with_nak(pdev, nir, fs_key, shader);
-
-   info = CALLOC_STRUCT(nv50_ir_prog_info);
-   if (!info)
-      return false;
-
-   info->type = pipe_shader_type_from_mesa(nir->info.stage);
-   info->target = pdev->info.chipset;
-   info->bin.nir = nir;
-
-   for (unsigned i = 0; i < 3; i++)
-      shader->cp.block_size[i] = nir->info.workgroup_size[i];
-
-   info->bin.smemSize = shader->cp.smem_size;
-   info->dbgFlags = nvk_cg_get_prog_debug();
-   info->optLevel = nvk_cg_get_prog_optimize();
-   info->io.auxCBSlot = 1;
-   info->io.uboInfoBase = 0;
-   info->io.drawInfoBase = nvk_root_descriptor_offset(draw.base_vertex);
-   if (nir->info.stage == MESA_SHADER_COMPUTE) {
-      info->prop.cp.gridInfoBase = 0;
-   } else {
-      info->assignSlots = nvc0_program_assign_varying_slots;
-   }
-   ret = nv50_ir_generate_code(info, &info_out);
-   if (ret)
-      return VK_ERROR_UNKNOWN;
-
-   if (info_out.bin.fixupData) {
-      nv50_ir_apply_fixups(info_out.bin.fixupData, info_out.bin.code,
-                           fs_key && fs_key->force_per_sample,
-                           false /* flatshade */, false /* alphatest */,
-                           fs_key && fs_key->msaa);
-   }
-
-   shader->stage = nir->info.stage;
-   shader->code_ptr = (uint8_t *)info_out.bin.code;
-   shader->code_size = info_out.bin.codeSize;
-
-   if (info_out.target >= NVISA_GV100_CHIPSET)
-      shader->num_gprs = MAX2(4, info_out.bin.maxGPR + 3);
    else
-      shader->num_gprs = MAX2(4, info_out.bin.maxGPR + 1);
-   shader->cp.smem_size = info_out.bin.smemSize;
-   shader->num_barriers = info_out.numBarriers;
-
-   switch (info->type) {
-   case PIPE_SHADER_VERTEX:
-      ret = nvk_vs_gen_header(shader, &info_out);
-      break;
-   case PIPE_SHADER_FRAGMENT:
-      ret = nvk_fs_gen_header(shader, fs_key, &info_out);
-      shader->fs.uses_sample_shading = nir->info.fs.uses_sample_shading;
-      break;
-   case PIPE_SHADER_GEOMETRY:
-      ret = nvk_gs_gen_header(shader, nir, &info_out);
-      break;
-   case PIPE_SHADER_TESS_CTRL:
-      ret = nvk_tcs_gen_header(shader, &info_out);
-      break;
-   case PIPE_SHADER_TESS_EVAL:
-      ret = nvk_tes_gen_header(shader, &info_out);
-      break;
-   case PIPE_SHADER_COMPUTE:
-      break;
-   default:
-      unreachable("Invalid shader stage");
-      break;
-   }
-   assert(ret == 0);
-
-   if (info_out.bin.tlsSpace) {
-      assert(info_out.bin.tlsSpace < (1 << 24));
-      shader->hdr[0] |= 1 << 26;
-      shader->hdr[1] |= align(info_out.bin.tlsSpace, 0x10); /* l[] size */
-      shader->slm_size = info_out.bin.tlsSpace;
-   }
-
-   if (info_out.io.globalAccess)
-      shader->hdr[0] |= 1 << 26;
-   if (info_out.io.globalAccess & 0x2)
-      shader->hdr[0] |= 1 << 16;
-   if (info_out.io.fp64)
-      shader->hdr[0] |= 1 << 27;
-
-   if (nir->xfb_info) {
-      shader->xfb = nvk_fill_transform_feedback_state(nir, &info_out);
-      if (shader->xfb == NULL) {
-         return VK_ERROR_OUT_OF_HOST_MEMORY;
-      }
-   }
-
-   return VK_SUCCESS;
+      return nvk_cg_compile_nir(pdev, nir, fs_key, shader);
 }
 
 VkResult
diff --git a/src/nouveau/vulkan/nvk_shader.h b/src/nouveau/vulkan/nvk_shader.h
index 29bf3d2ae17..d8a174a266d 100644
--- a/src/nouveau/vulkan/nvk_shader.h
+++ b/src/nouveau/vulkan/nvk_shader.h
@@ -168,4 +168,8 @@ nvk_cg_nir_options(const struct nvk_physical_device *pdev,
 void nvk_cg_preprocess_nir(nir_shader *nir);
 void nvk_cg_optimize_nir(nir_shader *nir);
 
+VkResult nvk_cg_compile_nir(struct nvk_physical_device *pdev, nir_shader *nir,
+                            const struct nvk_fs_key *fs_key,
+                            struct nvk_shader *shader);
+
 #endif

Mesa (main): nvk: Move the guts of nvk_compile_nir() to nvk_codegen.c

Reply via email to