On Tue, Dec 9, 2025 at 8:55 AM Konstantinos Eleftheriou
<[email protected]> wrote:
>
> From: kelefth <[email protected]>
>
> This patch enables the avoid-store-forwarding patch by default at O2 or
> higher on AArch64.

I am not 100% convinced this should be enabled specifically for
aarch64; it seems like a generic issue. And looking into the testcases,
all of them look like they should be handled at the gimple level, much
earlier than RTL.
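
For example, for ssll_1 in the new testcase below, the transformation
the pass does could just as well be expressed in source form, roughly
like this (a hand-written sketch of the idea, assuming little-endian;
this is not the output of any existing pass):

    long ssll_1 (DataUnion *data, char x)
    {
      /* Do the wide load first, then the narrow store, and patch the
         stored byte into the loaded value instead of reloading it.  */
      long tmp = data->long_value;
      data->arr_8[0] = x;
      return (tmp & ~0xffl) | (unsigned char) x;
    }

That kind of rewrite is something the gimple passes could do before we
ever get to RTL.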

>
> The assembly patterns in `bitfield-bitint-abi-align16.c` and
> `bitfield-bitint-abi-align8.c` have been updated to account for
> the asf transformations.
>
> gcc/ChangeLog:
>
>         * common/config/aarch64/aarch64-common.cc: Enable asf at O2.
>         * doc/invoke.texi: Document asf as an O2 enabled option for AArch64.
>
> gcc/testsuite/ChangeLog:
>
>         * gcc.target/aarch64/bitfield-bitint-abi-align16.c:
>         Modify testcases to account for the asf transformations.
>         * gcc.target/aarch64/bitfield-bitint-abi-align8.c: Likewise.
>         * gcc.target/aarch64/avoid-store-forwarding-6.c: New test.
>
> Co-Authored-By: Christoph Müllner <[email protected]>
> Signed-off-by: Konstantinos Eleftheriou <[email protected]>
> ---
>
> Changes in v5:
> - Move `store_bit_field` destination register fix to `store_bit_field_1`.
>
> Changes in v4:
> - Enable asf on AArch64 only.
> - Remove `avoid_store_forwarding_reorder_cost_p` function.
> - Remove `asf-default-cost-value` parameter.
>
> Changes in v3:
> - Add `avoid_store_forwarding_reorder_cost_p` function.
> - Add `asf-default-cost-value` parameter.
> - Use `avoid_store_forwarding_reorder_cost_p` for each store in ASF.
> - Update x86_64 testcases.
> - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> `bitfield-bitint-abi-align8.c`.
> - Fix bug where the result of `store_bit_field` is not used in the
> following instructions.
>
> Changes in v2:
> - Update assembly patterns in `bitfield-bitint-abi-align16.c` and
> `bitfield-bitint-abi-align8.c`.
>
> Changes in v1:
> - Enable asf by default at O2 or higher.
>
>  gcc/common/config/aarch64/aarch64-common.cc   |  2 ++
>  gcc/doc/invoke.texi                           |  2 +-
>  .../aarch64/avoid-store-forwarding-6.c        | 29 +++++++++++++++++++
>  .../aarch64/bitfield-bitint-abi-align16.c     | 25 +++++++++-------
>  .../aarch64/bitfield-bitint-abi-align8.c      | 25 +++++++++-------
>  5 files changed, 60 insertions(+), 23 deletions(-)
>  create mode 100644 
> gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
>
> diff --git a/gcc/common/config/aarch64/aarch64-common.cc 
> b/gcc/common/config/aarch64/aarch64-common.cc
> index 1488697c6ce4..c3cd3eca9b84 100644
> --- a/gcc/common/config/aarch64/aarch64-common.cc
> +++ b/gcc/common/config/aarch64/aarch64-common.cc
> @@ -60,6 +60,8 @@ static const struct default_options 
> aarch_option_optimization_table[] =
>      /* Enable redundant extension instructions removal at -O2 and higher.  */
>      { OPT_LEVELS_2_PLUS, OPT_free, NULL, 1 },
>      { OPT_LEVELS_2_PLUS, OPT_mearly_ra_, NULL, AARCH64_EARLY_RA_ALL },
> +    /* Enable store forwarding avoidance at -O2 and higher.  */
> +    { OPT_LEVELS_2_PLUS, OPT_favoid_store_forwarding, NULL, 1 },
>  #if (TARGET_DEFAULT_ASYNC_UNWIND_TABLES == 1)
>      { OPT_LEVELS_ALL, OPT_fasynchronous_unwind_tables, NULL, 1 },
>      { OPT_LEVELS_ALL, OPT_funwind_tables, NULL, 1},
> diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
> index 2da2a27acd68..ef73a2b6302a 100644
> --- a/gcc/doc/invoke.texi
> +++ b/gcc/doc/invoke.texi
> @@ -13500,7 +13500,7 @@ Many CPUs will stall for many cycles when a load 
> partially depends on previous
>  smaller stores.  This pass tries to detect such cases and avoid the penalty 
> by
>  changing the order of the load and store and then fixing up the loaded value.
>
> -Disabled by default.
> +Enabled by default at @option{-O2} and higher on AArch64.
>
>  @opindex ffp-contract
>  @item -ffp-contract=@var{style}
> diff --git a/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c 
> b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> new file mode 100644
> index 000000000000..320fa5e101e6
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/aarch64/avoid-store-forwarding-6.c
> @@ -0,0 +1,29 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2" } */
> +
> +/* Same as avoid-store-forwarding-1.c but without -favoid-store-forwarding.  
> */
> +
> +typedef union {
> +    char arr_8[8];
> +    long long_value;
> +} DataUnion;
> +
> +long ssll_1 (DataUnion *data, char x)
> +{
> +  data->arr_8[0] = x;
> +  return data->long_value;
> +}
> +
> +long ssll_2 (DataUnion *data, char x)
> +{
> +  data->arr_8[1] = x;
> +  return data->long_value;
> +}
> +
> +long ssll_3 (DataUnion *data, char x)
> +{
> +  data->arr_8[7] = x;
> +  return data->long_value;
> +}
> +
> +/* { dg-final { scan-assembler-times {ldr\tx[0-9]+, 
> \[x[0-9]+\]\n\tstrb\tw[0-9]+, \[x[0-9]+(, \d+)?\]\n\tbfi\tx[0-9]+, x[0-9]+, 
> \d+, \d+} 3 } } */

Can you use check-function-bodies instead of scan-assembler-times?
Also, it seems like this optimization should happen at the gimple level
instead of the RTL level, for the above case and even for the bitfield
cases.
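
Something along these lines would be clearer (untested sketch only; the
exact instruction sequences and operand patterns would need to be taken
from the real -O2 output):

    /* { dg-do compile } */
    /* { dg-options "-O2" } */
    /* { dg-final { check-function-bodies "**" "" } } */

    /*
    ** ssll_1:
    **	...
    **	ldr	x[0-9]+, \[x[0-9]+\]
    **	strb	w[0-9]+, \[x[0-9]+\]
    **	bfi	x[0-9]+, x[0-9]+, 0, 8
    **	...
    */
    long ssll_1 (DataUnion *data, char x)
    {
      data->arr_8[0] = x;
      return data->long_value;
    }

and likewise for ssll_2/ssll_3, with the strb offset and the bfi
position adjusted per function.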


> diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c 
> b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> index c29a230a7713..34f3d7f9653c 100644
> --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align16.c
> @@ -87,14 +87,15 @@
>  **     sub     sp, sp, #16
>  **     mov     (x[0-9]+), x0
>  **     mov     w0, w1
> +**     mov     x1, 0
>  **     sbfx    x([0-9]+), \1, 0, 63
>  **     mov     (w[0-9]+), 0
>  **     bfi     \3, w\2, 0, 1
>  **     and     x3, x\2, 9223372036854775807
> -**     mov     x2, 0
> -**     str     xzr, \[sp\]
> +**     str     x1, \[sp\]

This is worse code generation: x1 is zero here. Why is x1 set to 0
above and then stored, instead of just storing xzr as before?

>  **     strb    \3, \[sp\]
> -**     ldr     x1, \[sp\]
> +**     bfi     x1, x2, 0, 8

This is not good code generation at all. Shouldn't this just be `and
x1, x2, 0xff`?

More of the same below.
I am not 100% sure this is a win either.
Why does pass_rtl_avoid_store_forwarding run right before RA instead of
before combine, where the above cleanups could happen?
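
If the pass has to stay at the RTL level, moving it earlier, e.g. (a
rough sketch against gcc/passes.def, ignoring whatever ordering
constraints the pass actually has):

      NEXT_PASS (pass_rtl_avoid_store_forwarding);
      NEXT_PASS (pass_combine);

would at least give combine a chance to clean up the fixup sequences.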

Thanks,
Andrew Pinski


> +**     mov     x2, 0
>  **     add     sp, sp, 16
>  **     b       fp
>  */
> @@ -183,19 +184,20 @@
>  **     sxtw    (x[0-9]+), w1
>  **     mov     x0, \2
>  **     and     x7, \2, 9223372036854775807
> +**     mov     x2, 0
>  **     mov     (w[0-9]+), 0
>  **     bfi     \3, w\1, 0, 1
>  **     strb    wzr, \[sp, 16\]
>  **     mov     x6, x7
>  **     mov     x5, x7
>  **     mov     x4, x7
> +**     mov     x1, x7
> +**     str     x2, \[sp, 48\]
> +**     strb    \3, \[sp, 48\]
> +**     bfi     x2, x3, 0, 8
> +**     stp     x7, x2, \[sp\]
>  **     mov     x3, x7
>  **     mov     x2, x7
> -**     str     xzr, \[sp, 48\]
> -**     strb    \3, \[sp, 48\]
> -**     ldr     (x[0-9]+), \[sp, 48\]
> -**     stp     x7, \4, \[sp\]
> -**     mov     x1, x7
>  **     bl      fp_stack
>  **     sbfx    x0, x0, 0, 63
>  **...
> @@ -341,12 +343,13 @@
>  **...
>  **     mov     x([0-9]+), x0
>  **     mov     w0, w1
> +**     mov     x1, 0
>  **     mov     (w[0-9]+), 0
>  **     bfi     \2, w\1, 0, 1
> -**     mov     x2, 0
> -**     str     xzr, \[sp\]
> +**     str     x1, \[sp\]
>  **     strb    \2, \[sp\]
> -**     ldr     x1, \[sp\]
> +**     bfi     x1, x2, 0, 8
> +**     mov     x2, 0
>  **...
>  **     b       fp_stdarg
>  */
> diff --git a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c 
> b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> index 13ffbf416cab..d9cefbabb80c 100644
> --- a/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> +++ b/gcc/testsuite/gcc.target/aarch64/bitfield-bitint-abi-align8.c
> @@ -87,14 +87,15 @@
>  **     sub     sp, sp, #16
>  **     mov     (x[0-9]+), x0
>  **     mov     w0, w1
> +**     mov     x1, 0
>  **     sbfx    x([0-9]+), \1, 0, 63
>  **     mov     (w[0-9]+), 0
>  **     bfi     \3, w\2, 0, 1
>  **     and     x3, x\2, 9223372036854775807
> -**     mov     x2, 0
> -**     str     xzr, \[sp\]
> +**     str     x1, \[sp\]
>  **     strb    \3, \[sp\]
> -**     ldr     x1, \[sp\]
> +**     bfi     x1, x2, 0, 8
> +**     mov     x2, 0
>  **     add     sp, sp, 16
>  **     b       fp
>  */
> @@ -183,19 +184,20 @@
>  **     sxtw    (x[0-9]+), w1
>  **     mov     x0, \2
>  **     and     x7, \2, 9223372036854775807
> +**     mov     x2, 0
>  **     mov     (w[0-9]+), 0
>  **     bfi     \3, w\1, 0, 1
>  **     strb    wzr, \[sp, 16\]
>  **     mov     x6, x7
>  **     mov     x5, x7
>  **     mov     x4, x7
> +**     mov     x1, x7
> +**     str     x2, \[sp, 48\]
> +**     strb    \3, \[sp, 48\]
> +**     bfi     x2, x3, 0, 8
> +**     stp     x7, x2, \[sp\]
>  **     mov     x3, x7
>  **     mov     x2, x7
> -**     str     xzr, \[sp, 48\]
> -**     strb    \3, \[sp, 48\]
> -**     ldr     (x[0-9]+), \[sp, 48\]
> -**     stp     x7, \4, \[sp\]
> -**     mov     x1, x7
>  **     bl      fp_stack
>  **     sbfx    x0, x0, 0, 63
>  **...
> @@ -343,12 +345,13 @@
>  **...
>  **     mov     x([0-9]+), x0
>  **     mov     w0, w1
> +**     mov     x1, 0
>  **     mov     (w[0-9]+), 0
>  **     bfi     \2, w\1, 0, 1
> -**     mov     x2, 0
> -**     str     xzr, \[sp\]
> +**     str     x1, \[sp\]
>  **     strb    \2, \[sp\]
> -**     ldr     x1, \[sp\]
> +**     bfi     x1, x2, 0, 8
> +**     mov     x2, 0
>  **...
>  **     b       fp_stdarg
>  */
> --
> 2.51.1
>
