In `store_bit_field_1`, when the value to be written to the bitfield and/or the bitfield itself have vector modes, non-canonical subregs are generated, like `(subreg:V4SI (reg:V8SI x) 0)`. If one of them is a scalar, this happens only when the scalar mode is different from the vector's inner mode.
This patch tries to prevent this, using vec_set patterns when possible. Bootstrapped/regtested on AArch64 and x86_64. PR rtl-optimization/118873 gcc/ChangeLog: * expmed.cc (generate_vec_concat): New function. (store_bit_field_1): Check for cases where the value to be written and/or the bitfield have vector modes and try to generate the corresponding vec_set patterns instead of subregs. gcc/testsuite/ChangeLog: * gcc.target/i386/pr118873.c: New test. --- gcc/expmed.cc | 174 ++++++++++++++++++++++- gcc/testsuite/gcc.target/i386/pr118873.c | 33 +++++ 2 files changed, 200 insertions(+), 7 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr118873.c diff --git a/gcc/expmed.cc b/gcc/expmed.cc index 8cf10d9c73bf..8c641f55b9c6 100644 --- a/gcc/expmed.cc +++ b/gcc/expmed.cc @@ -740,6 +740,42 @@ store_bit_field_using_insv (const extraction_insn *insv, rtx op0, return false; } +/* Helper function for store_bit_field_1, used in the case that the bitfield + and the destination are both vectors. It extracts the elements of OP from + LOWER_BOUND to UPPER_BOUND using a vec_select and uses a vec_concat to + concatenate the extracted elements with the VALUE. 
*/ + +rtx +generate_vec_concat (machine_mode fieldmode, rtx op, rtx value, + HOST_WIDE_INT lower_bound, + HOST_WIDE_INT upper_bound) +{ + if (!VECTOR_MODE_P (fieldmode)) + return NULL_RTX; + + rtvec vec = rtvec_alloc (GET_MODE_NUNITS (fieldmode).to_constant ()); + machine_mode outermode = GET_MODE (op); + + for (HOST_WIDE_INT i = lower_bound; i < upper_bound; ++i) + RTVEC_ELT (vec, i) = GEN_INT (i); + rtx par = gen_rtx_PARALLEL (VOIDmode, vec); + rtx select = gen_rtx_VEC_SELECT (fieldmode, op, par); + if (BYTES_BIG_ENDIAN) + { + if (lower_bound > 0) + return gen_rtx_VEC_CONCAT (outermode, select, value); + else + return gen_rtx_VEC_CONCAT (outermode, value, select); + } + else + { + if (lower_bound > 0) + return gen_rtx_VEC_CONCAT (outermode, value, select); + else + return gen_rtx_VEC_CONCAT (outermode, select, value); + } +} + /* A subroutine of store_bit_field, with the same arguments. Return true if the operation could be implemented. @@ -778,18 +814,142 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum, if (VECTOR_MODE_P (outermode) && !MEM_P (op0) && optab_handler (vec_set_optab, outermode) != CODE_FOR_nothing - && fieldmode == innermode - && known_eq (bitsize, GET_MODE_PRECISION (innermode)) && multiple_p (bitnum, GET_MODE_PRECISION (innermode), &pos)) { + /* Cases where the destination's inner mode is not equal to the + value's mode need special treatment. */ + class expand_operand ops[3]; enum insn_code icode = optab_handler (vec_set_optab, outermode); - create_fixed_operand (&ops[0], op0); - create_input_operand (&ops[1], value, innermode); - create_integer_operand (&ops[2], pos); - if (maybe_expand_insn (icode, 3, ops)) - return true; + /* Subreg expressions should operate on scalars only. Subregs on + vectors are not canonical. Extractions from vectors should use + vector operations instead. 
*/ + bool is_non_canon_subreg = GET_CODE (value) == SUBREG + && VECTOR_MODE_P (fieldmode) + && !VECTOR_MODE_P ( + GET_MODE (SUBREG_REG (value))); + + /* If the value to be written is a memory expression or a non-canonical + scalar to vector subreg, don't try to generate a vec_set pattern. + Instead, fall back and try to generate an instruction without + touching the operands. */ + if (!MEM_P (value) && !is_non_canon_subreg) + { + if (VECTOR_MODE_P (fieldmode)) + { + /* Handle the case where both the value to be written and the + destination are vectors. */ + + HOST_WIDE_INT op_elem_num + = GET_MODE_NUNITS (outermode).to_constant (); + rtx concat_rtx = value; + rtx_insn *last_insn = get_last_insn (); + HOST_WIDE_INT index = 0; + /* If the store position is not at the start of the bitfield, + store the value by selecting the first pos elements of the + vector and then placing the value after them, using + a vec_concat. */ + if (pos.to_constant () > 0) + { + concat_rtx = generate_vec_concat (fieldmode, op0, value, 0, + pos.to_constant ()); + + index = pos.to_constant () + bitsize.to_constant () + / GET_MODE_UNIT_BITSIZE (outermode); + } + + /* Reconstruct the rest of the vector, after the value. */ + if (index < op_elem_num) + concat_rtx = generate_vec_concat (fieldmode, op0, concat_rtx, + index, op_elem_num); + + rtx_insn *set_insn = emit_insn (gen_rtx_SET (op0, concat_rtx)); + + if (recog_memoized (set_insn) >= 0) + return true; + else + delete_insns_since (last_insn); + } + else if (fieldmode != innermode) + { + /* Handle the case where the destination is a vector and + the value's mode is different than the vector's inner + mode. We have to treat the bitfield insertion differently + depending on which of those modes is wider than the other. 
*/ + + if (known_gt (GET_MODE_SIZE (fieldmode), + GET_MODE_SIZE (innermode))) + { + /* If the value's mode is wider than the vector's inner + mode, extract a part from the value with size equal + to the vector's inner mode size and write it in the + appropriate position inside the vector, using a vec_set + pattern. Repeat, until the whole value is written. */ + + unsigned int curr_pos = 0; + bool failed = false; + rtx_insn *last_insn = get_last_insn (); + while (curr_pos < bitsize.to_constant ()) + { + + rtx subreg = gen_reg_rtx (innermode); + unsigned int innermode_size = GET_MODE_BITSIZE (innermode); + rtx bitfield + = extract_bit_field (value, innermode_size, + curr_pos, 0, NULL_RTX, innermode, + innermode, false, 0); + + store_bit_field_1 (subreg, innermode_size, 0, 0, + 0, innermode, bitfield, false, false, + false); + + HOST_WIDE_INT index + = pos.to_constant () + curr_pos / innermode_size; + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], subreg, innermode); + create_integer_operand (&ops[2], index); + if (!maybe_expand_insn (icode, 3, ops)) + { + failed = true; + break; + } + + curr_pos += innermode_size; + } + + if (!failed) + return true; + else + delete_insns_since (last_insn); + } + else if (known_lt (GET_MODE_SIZE (fieldmode), + GET_MODE_SIZE (innermode))) + { + /* If the value's mode is narrower than the vector's inner + mode, extend the value's mode to the vector's inner + mode and use a vec_set pattern for the insertion. 
*/ + + rtx ext_value = gen_rtx_ZERO_EXTEND (innermode, value); + + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], ext_value, innermode); + create_integer_operand (&ops[2], pos); + if (maybe_expand_insn (icode, 3, ops)) + return true; + } + } + } + + if (fieldmode == innermode + && known_eq (bitsize, GET_MODE_PRECISION (innermode))) + { + create_fixed_operand (&ops[0], op0); + create_input_operand (&ops[1], value, innermode); + create_integer_operand (&ops[2], pos); + if (maybe_expand_insn (icode, 3, ops)) + return true; + } } /* If the target is a register, overwriting the entire object, or storing diff --git a/gcc/testsuite/gcc.target/i386/pr118873.c b/gcc/testsuite/gcc.target/i386/pr118873.c new file mode 100644 index 000000000000..3a07c7cc87f9 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr118873.c @@ -0,0 +1,33 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -mavx2 -favoid-store-forwarding" } */ +/* { dg-final { check-function-bodies "**" "" "" } } */ + +typedef int v4si __attribute__((vector_size(16))); +typedef int v8si __attribute__((vector_size(32))); + +v8si a; +v4si b; + +/* +** foo: +** ... +** vmovdqa a\(%rip\), %ymm0 +** vmovdqa b\(%rip\), %xmm1 +** vmovdqa %ymm0, \(%rdi\) +** vmovdqa 16\(%rdi\), %ymm0 +** vmovdqa %xmm1, 32\(%rdi\) +** vinserti128 \$0x1, %xmm1, %ymm0, %ymm0 +** vmovdqa %ymm0, a\(%rip\) +** vzeroupper +** ret +** ... +*/ +void foo (int *p) +{ + v8si aa = a; + v4si bb = b; + *(v8si *)p = a; + *(v4si *)(p + 8) = b; + a = *(v8si *)(p + 4); +} + -- 2.49.0