The call to `gen_lowpart` in `store_bit_field_1` may copy the destination
register into a new register.  The subsequent bit insertions then update that
copy instead of `str_rtx`, so the stored value is lost and wrong code is
generated.

This patch copies back the new destination register into `str_rtx` when needed.

Bootstrapped/regtested on AArch64 and x86-64.

        PR rtl-optimization/125988

gcc/ChangeLog:

        * expmed.cc (store_bit_field_1): Copy back the new destination
        register into `str_rtx` when needed.

gcc/testsuite/ChangeLog:

        * gcc.target/aarch64/pr125988.c: New test.
---
 gcc/expmed.cc                               | 22 +++++++--
 gcc/testsuite/gcc.target/aarch64/pr125988.c | 51 +++++++++++++++++++++
 2 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/pr125988.c

diff --git a/gcc/expmed.cc b/gcc/expmed.cc
index da1b5b632876..1f4611a6ed89 100644
--- a/gcc/expmed.cc
+++ b/gcc/expmed.cc
@@ -888,9 +888,25 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 bitsize, poly_uint64 bitnum,
        op0 = gen_lowpart (op0_mode.require (), op0);
     }
 
-  return store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
-                                  bitregion_start, bitregion_end,
-                                  fieldmode, value, reverse, fallback_p);
+  if (!store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum,
+                                bitregion_start, bitregion_end,
+                                fieldmode, value, reverse, fallback_p))
+    return false;
+
+  rtx op0_reg = op0;
+  rtx str_rtx_reg = str_rtx;
+  while (SUBREG_P (op0_reg))
+    op0_reg = SUBREG_REG (op0_reg);
+  while (SUBREG_P (str_rtx_reg))
+    str_rtx_reg = SUBREG_REG (str_rtx_reg);
+
+  /* If a new destination register has been generated, copy the value back
+     into str_rtx.  */
+  if (REG_P (op0_reg) && REG_P (str_rtx_reg)
+      && REGNO (op0_reg) != REGNO (str_rtx_reg))
+    emit_move_insn (str_rtx_reg, op0_reg);
+
+  return true;
 }
 
 /* Subroutine of store_bit_field_1, with the same arguments, except
diff --git a/gcc/testsuite/gcc.target/aarch64/pr125988.c b/gcc/testsuite/gcc.target/aarch64/pr125988.c
new file mode 100644
index 000000000000..3ac7be9b7b99
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/pr125988.c
@@ -0,0 +1,51 @@
+/* PR rtl-optimization/125988 */
+/* { dg-do run } */
+/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */
+/* { dg-options "-O3 -favoid-store-forwarding" } */
+/* { dg-add-options arm_v8_2a_bf16_neon } */
+
+/* Verify that the lane inserted by vld4_lane_bf16 survives
+   avoid-store-forwarding's bit-insert rewrite.  */
+
+#include <arm_neon.h>
+
+extern void abort (void);
+
+typedef union  /* Same storage viewed as bfloat16_t or unsigned short.  */
+{
+  bfloat16_t bf;  /* Form passed to the NEON intrinsics.  */
+  unsigned short u;  /* Form used to set and compare values.  */
+} bf16_u;
+
+__attribute__((noipa)) static int  /* 0 if every lane matches, else 1.  */
+test (const bf16_u *data, const bf16_u *overwrite)
+{
+  bfloat16x4x4_t v;
+  bf16_u t[4];
+  int i, j;
+  for (i = 0; i < 4; i++, data += 4)  /* Load 4 vectors of 4 elements.  */
+    v.val[i] = vld1_bf16 (&data->bf);
+  v = vld4_lane_bf16 (&overwrite->bf, v, 3);
+  while (--i >= 0)  /* i == 4 here, so this checks val[3]..val[0].  */
+    {
+      vst1_bf16 (&t[0].bf, v.val[i]);
+      data -= 4;  /* Back to the elements loaded into v.val[i].  */
+      for (j = 0; j < 4; j++)  /* Lane 3 is overwrite[i], rest is data[j].  */
+       if (t[j].u != (j == 3 ? overwrite[i].u : data[j].u))
+         return 1;  /* Mismatch.  */
+    }
+  return 0;  /* All lanes matched.  */
+}
+
+int
+main (void)
+{
+  bf16_u d[16];
+  for (int i = 0; i < 16; i++)
+    d[i].u = 0x1000 + i;  /* Distinct value per element.  */
+  bf16_u ov[4] = { {.u = 0xABCD}, {.u = 0x1234},
+                  {.u = 0xCAFE}, {.u = 0xBEEF} };  /* Lane-3 values.  */
+  if (test (d, ov))
+    abort ();  /* Nonzero means a lane held the wrong value.  */
+  return 0;
+}
-- 
2.52.0

Reply via email to