Module: Mesa
Branch: main
Commit: c0cfa4f53bafca6cda90f64bd102f22a812c7f4c
URL:    
http://cgit.freedesktop.org/mesa/mesa/commit/?id=c0cfa4f53bafca6cda90f64bd102f22a812c7f4c

Author: Alejandro PiƱeiro <[email protected]>
Date:   Sat Nov 18 10:56:37 2023 +0100

nir: add new opcodes to map new v71 packing/conversion instructions

Since v71, broadcom hw include specific packing/conversion
instructions, so this commit adds opcodes to be able to make use of
them, specially for image stores:

   * pack_2x16_to_unorm_2x8 (on backend vftounorm8/vftosnorm8):
     2x16-bit floating point to 2x8-bit unorm/snorm

   * f2unorm_16/f2snorm_16 (on backend ftounorm16/ftosnorm16):
     floating point to 16-bit unorm/snorm

   * pack_2x16_to_unorm_2x10/pack_2x16_to_unorm_10_2 (on backend
     vftounorm10lo/vftounorm10hi): used to convert a floating point to
     a r10g10b10a2 unorm

   * pack_32_to_r11g11b10 (on backend v11fpack): packs 2 2x16 FP into
     R11G11B10.

   * pack_uint_32_to_r10g10b10a2 (on backend v10pack): pack 2 2x16
     integer into R10G10B10A2

   * pack_4x16_to_4x8 (on backend v8pack): packs 2 2x16 bit integer
     into 4x8 bits.

   * pack_2x32_to_2x16 (on backend vpack): 2x32 bit to 2x16 integer
     pack

For the latter, it can be easly confused with the existing
pack_32_2x16_split. But note that this one receives two 16bit integer,
and packs them on a 32bit integer. But broadcom opcode takes two 32bit
integer, takes the lower halfword, and packs them as 2x16 on a 32bit
integer.

Interestingly broadcom also defines a similar one that packs the
higher halfword. Not used yet.

Note that at this point we use agnostic names, even if we add a _v3d
suffix as they are only available for broadcom, in order to follow
current NIR conventions.

Reviewed-by: Iago Toral Quiroga <[email protected]>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25726>

---

 src/compiler/nir/nir_constant_expressions.py | 98 ++++++++++++++++++++++++++++
 src/compiler/nir/nir_opcodes.py              | 52 +++++++++++++++
 2 files changed, 150 insertions(+)

diff --git a/src/compiler/nir/nir_constant_expressions.py 
b/src/compiler/nir/nir_constant_expressions.py
index e6383b67737..ac302ba3165 100644
--- a/src/compiler/nir/nir_constant_expressions.py
+++ b/src/compiler/nir/nir_constant_expressions.py
@@ -62,6 +62,8 @@ template = """\
 #include "util/softfloat.h"
 #include "util/bigmath.h"
 #include "util/format/format_utils.h"
+#include "util/format_r11g11b10f.h"
+#include "util/u_math.h"
 #include "nir_constant_expressions.h"
 
 /**
@@ -277,6 +279,102 @@ unpack_half_1x16(uint16_t u)
    return _mesa_half_to_float(u);
 }
 
+/* Broadcom v3d specific instructions */
+/**
+ * Packs 2 2x16 floating split into a r11g11b10f:
+ *
+ * dst[10:0]  = float16_to_float11 (src0[15:0])
+ * dst[21:11] = float16_to_float11 (src0[31:16])
+ * dst[31:22] = float16_to_float10 (src1[15:0])
+ */
+static uint32_t pack_32_to_r11g11b10_v3d(const uint32_t src0,
+                                         const uint32_t src1)
+{
+   float rgb[3] = {
+      unpack_half_1x16((src0 & 0xffff)),
+      unpack_half_1x16((src0 >> 16)),
+      unpack_half_1x16((src1 & 0xffff)),
+   };
+
+   return float3_to_r11g11b10f(rgb);
+}
+
+/**
+  * The three methods below are basically wrappers over pack_s/unorm_1x8/1x16,
+  * as they receives a uint16_t val instead of a float
+  */
+static inline uint8_t _mesa_half_to_snorm8(uint16_t val)
+{
+   return pack_snorm_1x8(_mesa_half_to_float(val));
+}
+
+static uint16_t _mesa_float_to_snorm16(uint32_t val)
+{
+   union fi aux;
+   aux.ui = val;
+   return pack_snorm_1x16(aux.f);
+}
+
+static uint16_t _mesa_float_to_unorm16(uint32_t val)
+{
+   union fi aux;
+   aux.ui = val;
+   return pack_unorm_1x16(aux.f);
+}
+
+static inline uint32_t float_pack16_v3d(uint32_t f32)
+{
+   return _mesa_float_to_half(uif(f32));
+}
+
+static inline uint32_t float_unpack16_v3d(uint32_t f16)
+{
+   return fui(_mesa_half_to_float(f16));
+}
+
+static inline uint32_t vfpack_v3d(uint32_t a, uint32_t b)
+{
+   return float_pack16_v3d(b) << 16 | float_pack16_v3d(a);
+}
+
+static inline uint32_t vfsat_v3d(uint32_t a)
+{
+   const uint32_t low = fui(SATURATE(_mesa_half_to_float(a & 0xffff)));
+   const uint32_t high = fui(SATURATE(_mesa_half_to_float(a >> 16)));
+
+   return vfpack_v3d(low, high);
+}
+
+static inline uint32_t fmul_v3d(uint32_t a, uint32_t b)
+{
+   return fui(uif(a) * uif(b));
+}
+
+static uint32_t vfmul_v3d(uint32_t a, uint32_t b)
+{
+   const uint32_t low = fmul_v3d(float_unpack16_v3d(a & 0xffff),
+                                 float_unpack16_v3d(b & 0xffff));
+   const uint32_t high = fmul_v3d(float_unpack16_v3d(a >> 16),
+                                  float_unpack16_v3d(b >> 16));
+
+   return vfpack_v3d(low, high);
+}
+
+/* Convert 2x16-bit floating point to 2x10-bit unorm */
+static uint32_t pack_2x16_to_unorm_2x10(uint32_t src0)
+{
+   return vfmul_v3d(vfsat_v3d(src0), 0x03ff03ff);
+}
+
+/*
+ * Convert 2x16-bit floating point to one 2-bit and one
+ * 10-bit unorm
+ */
+static uint32_t pack_2x16_to_unorm_10_2(uint32_t src0)
+{
+   return vfmul_v3d(vfsat_v3d(src0), 0x000303ff);
+}
+
 /* Some typed vector structures to make things like src0.y work */
 typedef int8_t int1_t;
 typedef uint8_t uint1_t;
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 0f81328f441..da53ea56f43 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -1413,6 +1413,58 @@ for (int i = 0; i < 32; i += 8) {
 }
 """)
 
+# v3d-specific opcodes
+
+# v3d-specific (v71) instruction that packs bits of 2 2x16 floating point into
+# r11g11b10 bits, rounding to nearest even, so
+#  dst[10:0]  = float16_to_float11 (src0[15:0])
+#  dst[21:11] = float16_to_float11 (src0[31:16])
+#  dst[31:22] = float16_to_float10 (src1[15:0])
+binop_convert("pack_32_to_r11g11b10_v3d", tuint32, tuint32, "",
+              "pack_32_to_r11g11b10_v3d(src0, src1)")
+
+# v3d-specific (v71) instruction that packs 2x32 bit to 2x16 bit integer. The
+# difference with pack_32_2x16_split is that the sources are 32bit too. So it
+# receives 2 32-bit integer, and packs the lower halfword as 2x16 on a 32-bit
+# integer.
+binop_horiz("pack_2x32_to_2x16_v3d", 1, tuint32, 1, tuint32, 1, tuint32,
+            "(src0.x & 0xffff) | (src1.x << 16)")
+
+# v3d-specific (v71) instruction that packs bits of 2 2x16 integers into
+# r10g10b10a2:
+#   dst[9:0]   = src0[9:0]
+#   dst[19:10] = src0[25:16]
+#   dst[29:20] = src1[9:0]
+#   dst[31:30] = src1[17:16]
+binop_convert("pack_uint_32_to_r10g10b10a2_v3d", tuint32, tuint32, "",
+              "(src0 & 0x3ff) | ((src0 >> 16) & 0x3ff) << 10 | (src1 & 0x3ff) 
<< 20 | ((src1 >> 16) & 0x3ff) << 30")
+
+# v3d-specific (v71) instruction that packs 2 2x16 bit integers into 4x8 bits:
+#   dst[7:0]   = src0[7:0]
+#   dst[15:8]  = src0[23:16]
+#   dst[23:16] = src1[7:0]
+#   dst[31:24] = src1[23:16]
+opcode("pack_4x16_to_4x8_v3d", 0, tuint32, [0, 0], [tuint32, tuint32],
+       False, "",
+       "(src0 & 0x000000ff) | (src0 & 0x00ff0000) >> 8 | (src1 & 0x000000ff) 
<< 16 | (src1 & 0x00ff0000) << 8")
+
+# v3d-specific (v71) instructions to convert 2x16 floating point to 2x8 bit 
unorm/snorm
+unop("pack_2x16_to_unorm_2x8_v3d", tuint32,
+     "_mesa_half_to_unorm(src0 & 0xffff, 8) | (_mesa_half_to_unorm(src0 >> 16, 
8) << 16)")
+unop("pack_2x16_to_snorm_2x8_v3d", tuint32,
+     "_mesa_half_to_snorm(src0 & 0xffff, 8) | (_mesa_half_to_snorm(src0 >> 16, 
8) << 16)")
+
+# v3d-specific (v71) instructions to convert 32-bit floating point to 16 bit 
unorm/snorm
+unop("f2unorm_16_v3d", tuint32, "_mesa_float_to_unorm16(src0)")
+unop("f2snorm_16_v3d", tuint32, "_mesa_float_to_snorm16(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to 2x10 
bit unorm
+unop("pack_2x16_to_unorm_2x10_v3d", tuint32, "pack_2x16_to_unorm_2x10(src0)")
+
+# v3d-specific (v71) instructions to convert 2x16 bit floating points to one 
2-bit
+# and one 10 bit unorm
+unop("pack_2x16_to_unorm_10_2_v3d", tuint32, "pack_2x16_to_unorm_10_2(src0)")
+
 # Mali-specific opcodes
 unop("fsat_signed_mali", tfloat, ("fmin(fmax(src0, -1.0), 1.0)"))
 unop("fclamp_pos_mali", tfloat, ("fmax(src0, 0.0)"))

Reply via email to