On Tue, Dec 9, 2025 at 9:38 PM Andrew Pinski <[email protected]> wrote:
> On Tue, Dec 9, 2025 at 8:55 AM Konstantinos Eleftheriou
> <[email protected]> wrote:
> >
> > From: kelefth <[email protected]>
> >
> > This patch enables the avoid-store-forwarding pass by default at O2 or
> > higher on AArch64.
>
> I am not 100% convinced why this should be enabled specifically for
> aarch64. It seems like a generic issue, and looking into the testcases,
> all of this should happen at the gimple level, much earlier than RTL.
>

We just don't have enough data to show that this would be beneficial on
other architectures. There has been some discussion about introducing a
cost function so that targets can decide for themselves (see the threads
of the previous versions), but there was no clear decision, so we went
with this approach for now.

> >
> > The assembly patterns in `bitfield-bitint-abi-align16.c` and
> > `bitfield-bitint-abi-align8.c` have been updated to account for
> > the asf transformations.
> >
> > gcc/ChangeLog:
> >
> > * common/config/aarch64/aarch64-common.cc: Enable asf at O2.
> > * doc/invoke.texi: Document asf as an O2-enabled option for AArch64.
> >
> > gcc/testsuite/ChangeLog:
> >
> > * gcc.target/aarch64/bitfield-bitint-abi-align16.c:
> > Modify testcases to account for the asf transformations.
> > * gcc.target/aarch64/bitfield-bitint-abi-align8.c: Likewise.
> > * gcc.target/aarch64/avoid-store-forwarding-6.c: New test.
> >
> > Co-Authored-By: Christoph Müllner <[email protected]>
> > Signed-off-by: Konstantinos Eleftheriou <[email protected]>
> > ---
> >
> > Changes in v5:
> > - Move `store_bit_field` destination register fix to `store_bit_field_1`.
> >
> > Changes in v4:
> > - Enable asf on AArch64 only.
> > - Remove `avoid_store_forwarding_reorder_cost_p` function.
> > - Remove `asf-default-cost-value` parameter.
> >
> > Changes in v3:
> > - Add `avoid_store_forwarding_reorder_cost_p` function.
> > - Add `asf-default-cost-value` parameter.
> > - Use `avoid_store_forwarding_reorder_cost_p` for each store in ASF.
> > - Update x86_64 testcases.
> > - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> >   `bitfield-bitint-abi-align8.c`.
> > - Fix bug where the result of `store_bit_field` is not used in the
> >   following instructions.
> >
> > Changes in v2:
> > - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> >   `bitfield-bitint-abi-align8.c`.
> >
> > Changes in v1:
> > - Enable asf by default at O2 or higher.
> >
> >  gcc/common/config/aarch64/aarch64-common.cc   |  2 ++
> >  gcc/doc/invoke.texi                           |  2 +-
> >  .../aarch64/avoid-store-forwarding-6.c        | 29 +++++++++++++++++++
> >  .../aarch64/bitfield-bitint-abi-align16.c     | 25 +++++++++-------
> >  .../aarch64/bitfield-bitint-abi-align8.c      | 25 +++++++++-------
> >  5 files changed, 60 insertions(+), 23 deletions(-)
> >  create mode 100644 gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> >
> > diff --git a/gcc/common/config/aarch64/aarch64-common.cc b/gcc/common/config/aarch64/aarch64-common.cc
> > index 1488697c6ce4..c3cd3eca9b84 100644
> > --- a/gcc/common/config/aarch64/aarch64-common.cc
> > +++ b/gcc/common/config/aarch64/aarch64-common.cc
> > @@ -60,6 +60,8 @@ static const struct default_options aarch_option_optimization_table[] =
> >      /* Enable redundant extension instructions removal at -O2 and higher.  */
> >      { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
> >      { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
> > +    /* Enable store forwarding avoidance at -O2 and higher.  */
> > +    { OPT_LEVELS_2_PLUS, OPT_favoid_store_forwarding, NULL, 1 },
> >  #if (TARGET_DEFAULT_ASYNC_UNWIND_TABLES == 1)
> >    { OPT_LEVELS_ALL, OPT_fasynchronous_unwind_tables, NULL, 1 },
> >    { OPT_LEVELS_ALL, OPT_funwind_tables, NULL, 1},
> > diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> > index 2da2a27acd68..ef73a2b6302a 100644
> > --- a/gcc/doc/invoke.texi
> > +++ b/gcc/doc/invoke.texi
> > @@ -13500,7 +13500,7 @@ Many CPUs will stall for many cycles when a load partially depends on previous
> >  smaller stores.  This pass tries to detect such cases and avoid the penalty by
> >  changing the order of the load and store and then fixing up the loaded value.
> >
> > -Disabled by default.
> > +Enabled by default at @option{-O2} and higher on AArch64.
> >
> >  @opindex ffp-contract
> >  @item -ffp-contract=@var{style}
> > diff --git a/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> > new file mode 100644
> > index 000000000000..320fa5e101e6
> > --- /dev/null
> > +++ b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> > @@ -0,0 +1,29 @@
> > +/* { dg-do compile } */
> > +/* { dg-options "-O2" } */
> > +
> > +/* Same as avoid-store-forwarding-1.c but without -favoid-store-forwarding. */
> > +
> > +typedef union {
> > +  char arr_8[8];
> > +  long long_value;
> > +} DataUnion;
> > +
> > +long ssll_1 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[0] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_2 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[1] = x;
> > +  return data->long_value;
> > +}
> > +
> > +long ssll_3 (DataUnion *data, char x)
> > +{
> > +  data->arr_8[7] = x;
> > +  return data->long_value;
> > +}
> > +
> > +/* { dg-final { scan-assembler-times {ldr\tx[0-9]+, \[x[0-9]+\]\n\tstrb\tw[0-9]+, \[x[0-9]+(, \d+)?\]\n\tbfi\tx[0-9]+, x[0-9]+, \d+, \d+} 3 } } */
>
> Can you use check-function-bodies instead of scan-assembler-times?
>

Will do.

> Also, this seems like an optimization that should be done at the gimple
> level instead of the RTL level, for the above case and even the bitfield
> case.
>
> > diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > index c29a230a7713..34f3d7f9653c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> > @@ -87,14 +87,15 @@
> >  **     sub     sp, sp, #16
> >  **     mov     (x[0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     sbfx    x([0-9]+), \1, 0, 63
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\2, 0, 1
> >  **     and     x3, x\2, 9223372036854775807
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
>
> This is worse code generation here, where x1 is zero. So why is it the
> case that x1 is set to 0 above and then stored?
>
> >  **     strb    \3, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
>
> This is not good code generation at all. Shouldn't this just be `and
> x1, x2, 0xff`?
>
> More of the same below.
> I am not 100% sure if this is a win either.
> Why is pass_rtl_avoid_store_forwarding placed right before RA instead of
> before combine, where the above optimizations could happen?
>

These issues are indeed fixed if we move the pass before 'combine'. That is
something we had already considered in the past (as well as running the pass
twice). We were thinking of sending a separate patch about this in the
future, as this one is only about enabling the pass by default in its
current state.
We can change it now, though.

Thanks,
Konstantinos

>
> Thanks,
> Andrew Pinski
>
> >
> > +**     mov     x2, 0
> >  **     add     sp, sp, 16
> >  **     b       fp
> >  */
> > @@ -183,19 +184,20 @@
> >  **     sxtw    (x[0-9]+), w1
> >  **     mov     x0, \2
> >  **     and     x7, \2, 9223372036854775807
> > +**     mov     x2, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\1, 0, 1
> >  **     strb    wzr, \[sp, 16\]
> >  **     mov     x6, x7
> >  **     mov     x5, x7
> >  **     mov     x4, x7
> > +**     mov     x1, x7
> > +**     str     x2, \[sp, 48\]
> > +**     strb    \3, \[sp, 48\]
> > +**     bfi     x2, x3, 0, 8
> > +**     stp     x7, x2, \[sp\]
> >  **     mov     x3, x7
> >  **     mov     x2, x7
> > -**     str     xzr, \[sp, 48\]
> > -**     strb    \3, \[sp, 48\]
> > -**     ldr     (x[0-9]+), \[sp, 48\]
> > -**     stp     x7, \4, \[sp\]
> > -**     mov     x1, x7
> >  **     bl      fp_stack
> >  **     sbfx    x0, x0, 0, 63
> >  **...
> > @@ -341,12 +343,13 @@
> >  **...
> >  **     mov     x([0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \2, w\1, 0, 1
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \2, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **...
> >  **     b       fp_stdarg
> >  */
> > diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > index 13ffbf416cab..d9cefbabb80c 100644
> > --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> > @@ -87,14 +87,15 @@
> >  **     sub     sp, sp, #16
> >  **     mov     (x[0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     sbfx    x([0-9]+), \1, 0, 63
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\2, 0, 1
> >  **     and     x3, x\2, 9223372036854775807
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \3, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **     add     sp, sp, 16
> >  **     b       fp
> >  */
> > @@ -183,19 +184,20 @@
> >  **     sxtw    (x[0-9]+), w1
> >  **     mov     x0, \2
> >  **     and     x7, \2, 9223372036854775807
> > +**     mov     x2, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \3, w\1, 0, 1
> >  **     strb    wzr, \[sp, 16\]
> >  **     mov     x6, x7
> >  **     mov     x5, x7
> >  **     mov     x4, x7
> > +**     mov     x1, x7
> > +**     str     x2, \[sp, 48\]
> > +**     strb    \3, \[sp, 48\]
> > +**     bfi     x2, x3, 0, 8
> > +**     stp     x7, x2, \[sp\]
> >  **     mov     x3, x7
> >  **     mov     x2, x7
> > -**     str     xzr, \[sp, 48\]
> > -**     strb    \3, \[sp, 48\]
> > -**     ldr     (x[0-9]+), \[sp, 48\]
> > -**     stp     x7, \4, \[sp\]
> > -**     mov     x1, x7
> >  **     bl      fp_stack
> >  **     sbfx    x0, x0, 0, 63
> >  **...
> > @@ -343,12 +345,13 @@
> >  **...
> >  **     mov     x([0-9]+), x0
> >  **     mov     w0, w1
> > +**     mov     x1, 0
> >  **     mov     (w[0-9]+), 0
> >  **     bfi     \2, w\1, 0, 1
> > -**     mov     x2, 0
> > -**     str     xzr, \[sp\]
> > +**     str     x1, \[sp\]
> >  **     strb    \2, \[sp\]
> > -**     ldr     x1, \[sp\]
> > +**     bfi     x1, x2, 0, 8
> > +**     mov     x2, 0
> >  **...
> >  **     b       fp_stdarg
> >  */
> > --
> > 2.51.1
> >
>
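For reference, this is roughly what the transformation looks like for the
ssll_1 case from the new test. The instruction sequences below are only an
approximation of the -O2 output (exact registers and scheduling may differ);
they are meant to illustrate the reordering plus fix-up that the pass
performs:

    typedef union {
      char arr_8[8];
      long long_value;
    } DataUnion;

    long ssll_1 (DataUnion *data, char x)
    {
      data->arr_8[0] = x;       /* narrow (1-byte) store ...              */
      return data->long_value;  /* ... immediately followed by an
                                   overlapping 8-byte load, which many
                                   cores cannot forward from the store
                                   buffer.                                */
    }

    /* Without -favoid-store-forwarding (approximate):

           strb    w1, [x0]        // byte store
           ldr     x0, [x0]        // wide load depends on the narrower store,
           ret                     // so many CPUs take a store-forwarding stall

       With -favoid-store-forwarding (approximate):

           ldr     x2, [x0]        // load moved before the store
           strb    w1, [x0]        // store is still emitted
           bfi     x2, x1, 0, 8    // loaded value fixed up with the stored byte
           mov     x0, x2
           ret                                                            */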

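As for converting the new test to check-function-bodies, something along the
following lines should work. The function bodies below are only a sketch of
the expected -O2 code; the exact register numbering and addressing modes
still need to be verified against the actual assembler output before
committing:

    /* { dg-do compile } */
    /* { dg-options "-O2" } */
    /* { dg-final { check-function-bodies "**" "" } } */

    typedef union {
      char arr_8[8];
      long long_value;
    } DataUnion;

    /*
    ** ssll_1:
    **	ldr	(x[0-9]+), \[x0\]
    **	strb	w1, \[x0\]
    **	bfi	\1, x1, 0, 8
    **	mov	x0, \1
    **	ret
    */
    long ssll_1 (DataUnion *data, char x)
    {
      data->arr_8[0] = x;
      return data->long_value;
    }

    /*
    ** ssll_2:
    **	ldr	(x[0-9]+), \[x0\]
    **	strb	w1, \[x0, 1\]
    **	bfi	\1, x1, 8, 8
    **	mov	x0, \1
    **	ret
    */
    long ssll_2 (DataUnion *data, char x)
    {
      data->arr_8[1] = x;
      return data->long_value;
    }

    /* ssll_3 would get a similar body, with the store at offset 7 and the
       fix-up at bit position 56.  */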