Hi Richard, We've been sending this as part of the ASF default enablement patchset for nearly a year now. We are splitting it now as we judged that it requires separate attention.
The idea is to overwrite the original to-be-updated register -- stripping the subregs to extract it -- with the newly generated one. Otherwise, str_rtx is left unchanged. The copy is done by `emit_mov_insn`. Any ideas on how we could handle this better? It definitely looks like the right place for the fix though. Thanks, Konstantinos On Tue, Jun 30, 2026 at 10:22 AM Richard Biener <[email protected]> wrote: > On Mon, Jun 29, 2026 at 4:13 PM Konstantinos Eleftheriou > <[email protected]> wrote: > > > > The call to `gen_lowpart` in `store_bit_field_1` might copy the > destination > > register into a new one, which may lead to wrong code generation, as the > bit > > insertions update the new register instead of updating `str_rtx`. > > > > This patch copies back the new destination register into `str_rtx` when > needed. > > > > Bootstrapped/regtested on AArch64 and x86-64. > > > > PR rtl-optimization/125988 > > > > gcc/ChangeLog: > > > > * expmed.cc (store_bit_field_1): Copy back the new destination > > register into `str_rtx` when needed. > > > > gcc/testsuite/ChangeLog: > > > > * gcc.target/aarch64/pr125988.c: New test. > > --- > > gcc/expmed.cc | 22 +++++++-- > > gcc/testsuite/gcc.target/aarch64/pr125988.c | 51 +++++++++++++++++++++ > > 2 files changed, 70 insertions(+), 3 deletions(-) > > create mode 100644 gcc/testsuite/gcc.target/aarch64/pr125988.c > > > > diff --git a/gcc/expmed.cc b/gcc/expmed.cc > > index da1b5b632876..1f4611a6ed89 100644 > > --- a/gcc/expmed.cc > > +++ b/gcc/expmed.cc > > @@ -888,9 +888,25 @@ store_bit_field_1 (rtx str_rtx, poly_uint64 > bitsize, poly_uint64 bitnum, > > op0 = gen_lowpart (op0_mode.require (), op0); > > } > > > > - return store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum, > > - bitregion_start, bitregion_end, > > - fieldmode, value, reverse, > fallback_p); > > + if (!store_integral_bit_field (op0, op0_mode, ibitsize, ibitnum, > > + bitregion_start, bitregion_end, > > + fieldmode, value, reverse, fallback_p)) > > + return false; > > + > > + rtx op0_reg = op0; > > + rtx str_rtx_reg = str_rtx; > > + while (SUBREG_P (op0_reg)) > > + op0_reg = SUBREG_REG (op0_reg); > > + while (SUBREG_P (str_rtx_reg)) > > + str_rtx_reg = SUBREG_REG (str_rtx_reg); > > That looks definitely fishy. > > You do not quote the part that does the copy, but stripping all subregs > and then copying looks wrong. It also looks this was produced by an AI? > > > + > > + /* If a new destination register has been generated, copy the value > back > > + into str_rtx. */ > > + if (REG_P (op0_reg) && REG_P (str_rtx_reg) > > + && REGNO (op0_reg) != REGNO (str_rtx_reg)) > > + emit_move_insn (str_rtx_reg, op0_reg); > > + > > + return true; > > } > > > > /* Subroutine of store_bit_field_1, with the same arguments, except > > diff --git a/gcc/testsuite/gcc.target/aarch64/pr125988.c > b/gcc/testsuite/gcc.target/aarch64/pr125988.c > > new file mode 100644 > > index 000000000000..3ac7be9b7b99 > > --- /dev/null > > +++ b/gcc/testsuite/gcc.target/aarch64/pr125988.c > > @@ -0,0 +1,51 @@ > > +/* PR rtl-optimization/125988 */ > > +/* { dg-do run } */ > > +/* { dg-require-effective-target arm_v8_2a_bf16_neon_ok } */ > > +/* { dg-options "-O3 -favoid-store-forwarding" } */ > > +/* { dg-add-options arm_v8_2a_bf16_neon } */ > > + > > +/* Verify that the lane inserted by vld4_lane_bf16 survives > > + avoid-store-forwarding's bit-insert rewrite. */ > > + > > +#include <arm_neon.h> > > + > > +extern void abort (void); > > + > > +typedef union > > +{ > > + bfloat16_t bf; > > + unsigned short u; > > +} bf16_u; > > + > > +__attribute__((noipa)) static int > > +test (const bf16_u *data, const bf16_u *overwrite) > > +{ > > + bfloat16x4x4_t v; > > + bf16_u t[4]; > > + int i, j; > > + for (i = 0; i < 4; i++, data += 4) > > + v.val[i] = vld1_bf16 (&data->bf); > > + v = vld4_lane_bf16 (&overwrite->bf, v, 3); > > + while (--i >= 0) > > + { > > + vst1_bf16 (&t[0].bf, v.val[i]); > > + data -= 4; > > + for (j = 0; j < 4; j++) > > + if (t[j].u != (j == 3 ? overwrite[i].u : data[j].u)) > > + return 1; > > + } > > + return 0; > > +} > > + > > +int > > +main (void) > > +{ > > + bf16_u d[16]; > > + for (int i = 0; i < 16; i++) > > + d[i].u = 0x1000 + i; > > + bf16_u ov[4] = { {.u = 0xABCD}, {.u = 0x1234}, > > + {.u = 0xCAFE}, {.u = 0xBEEF} }; > > + if (test (d, ov)) > > + abort (); > > + return 0; > > +} > > -- > > 2.52.0 > > >
