Jennifer Schmitz <[email protected]> writes:
> Dear Richard,
> Thanks for the feedback! I made the changes as suggested. Here is the updated
> patch, bootstrapped and tested again.
> Best,
> Jennifer
LGTM, thanks. Pushed to trunk with some minor reindentation:
if ((prev_type == TYPE_ALUS_SREG || prev_type == TYPE_ALUS_IMM)
&& ((aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL)
&& GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
&& aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 1), VOIDmode)
&& aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), 2), VOIDmode)
&& SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set), 1))))
|| (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET)
&& GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set)))
== RTX_COMPARE
&& REG_P (SET_DEST (curr_set)))))
return true;
Richard
>
>
>
>> On 31 Jul 2024, at 21:21, Richard Sandiford <[email protected]>
>> wrote:
>>
>> External email: Use caution opening links or attachments
>>
>>
>> Jennifer Schmitz <[email protected]> writes:
>>> Thanks for the feedback! I updated the patch based on your comments, more
>>> detailed comments inline below. The updated version was bootstrapped and
>>> tested again, no regression.
>>> Best,
>>> Jennifer
>>>
>>> From 89936b7bc2de7a1e4bc55c3a1e8d5e6ac0de579d Mon Sep 17 00:00:00 2001
>>> From: Jennifer Schmitz <[email protected]>
>>> Date: Wed, 24 Jul 2024 06:13:59 -0700
>>> Subject: [PATCH] AArch64: Fuse CMP+CSEL and CMP+CSET for -mcpu=neoverse-v2
>>>
>>> According to the Neoverse V2 Software Optimization Guide (section 4.14), the
>>> instruction pairs CMP+CSEL and CMP+CSET can be fused, which had not been
>>> implemented so far. This patch implements and tests the two fusion pairs.
>>>
>>> The patch was bootstrapped and regtested on aarch64-linux-gnu, no
>>> regression.
>>> There was also no non-noise impact on SPEC CPU2017 benchmark.
>>> OK for mainline?
>>>
>>> Signed-off-by: Jennifer Schmitz <[email protected]>
>>>
>>> gcc/
>>>
>>> * config/aarch64/aarch64.cc (aarch_macro_fusion_pair_p): Implement
>>> fusion logic.
>>> * config/aarch64/aarch64-fusion-pairs.def (cmp+csel): New entry.
>>> (cmp+cset): Likewise.
>>> * config/aarch64/tuning_models/neoversev2.h: Enable logic in
>>> field fusible_ops.
>>>
>>> gcc/testsuite/
>>>
>>> * gcc.target/aarch64/fuse_cmp_csel.c: New test.
>>> * gcc.target/aarch64/fuse_cmp_cset.c: Likewise.
>>> ---
>>> gcc/config/aarch64/aarch64-fusion-pairs.def | 2 ++
>>> gcc/config/aarch64/aarch64.cc | 20 +++++++++++
>>> gcc/config/aarch64/tuning_models/neoversev2.h | 5 ++-
>>> .../gcc.target/aarch64/fuse_cmp_csel.c | 33 +++++++++++++++++++
>>> .../gcc.target/aarch64/fuse_cmp_cset.c | 31 +++++++++++++++++
>>> 5 files changed, 90 insertions(+), 1 deletion(-)
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
>>> create mode 100644 gcc/testsuite/gcc.target/aarch64/fuse_cmp_cset.c
>>>
>>> diff --git a/gcc/config/aarch64/aarch64-fusion-pairs.def
>>> b/gcc/config/aarch64/aarch64-fusion-pairs.def
>>> index 9a43b0c8065..bf5e85ba8fe 100644
>>> --- a/gcc/config/aarch64/aarch64-fusion-pairs.def
>>> +++ b/gcc/config/aarch64/aarch64-fusion-pairs.def
>>> @@ -37,5 +37,7 @@ AARCH64_FUSION_PAIR ("aes+aesmc", AES_AESMC)
>>> AARCH64_FUSION_PAIR ("alu+branch", ALU_BRANCH)
>>> AARCH64_FUSION_PAIR ("alu+cbz", ALU_CBZ)
>>> AARCH64_FUSION_PAIR ("addsub_2reg_const1", ADDSUB_2REG_CONST1)
>>> +AARCH64_FUSION_PAIR ("cmp+csel", CMP_CSEL)
>>> +AARCH64_FUSION_PAIR ("cmp+cset", CMP_CSET)
>>>
>>> #undef AARCH64_FUSION_PAIR
>>> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
>>> index e0cf382998c..d42c153443e 100644
>>> --- a/gcc/config/aarch64/aarch64.cc
>>> +++ b/gcc/config/aarch64/aarch64.cc
>>> @@ -27345,6 +27345,26 @@ aarch_macro_fusion_pair_p (rtx_insn *prev,
>>> rtx_insn *curr)
>>> && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
>>> return true;
>>>
>>> + /* Fuse CMP and CSEL/CSET. */
>>> + if (prev_set && curr_set
>>> + && GET_CODE (SET_SRC (prev_set)) == COMPARE
>>> + && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
>>> + && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
>>> + {
>>> + enum attr_type prev_type = get_attr_type (prev);
>>> + if ((prev_type == TYPE_ALUS_SREG || prev_type == TYPE_ALUS_IMM)
>>> + && (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSEL)
>>> + && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
>>> + && REG_P (XEXP (SET_SRC (curr_set), 1))
>>> + && REG_P (XEXP (SET_SRC (curr_set), 2))
>>
>> Gah, I'd meant to say this in a previous review, but now realise
>> that I forgot. I think these REG_Ps can be relaxed to:
>>
>> && aarch64_reg_or_zero (XEXP (SET_SRC (curr_set), N), VOIDmode)
>>
>> since CSEL can take w/xzr.
> Done.
>>
>>> + && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (curr_set), 1))))
>>> + || (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_CSET)
>>> + && GET_RTX_CLASS (GET_CODE (SET_SRC (curr_set)))
>>> + == RTX_COMPARE
>>> + && REG_P (SET_DEST (curr_set))))
>>> + return true;
>>> + }
>>
>> It looks like there might be a missing pair of brackets here. AFAICT,
>> the current bracketing works out as:
>>
>> if ((prev_type conditions)
>> && (CMP_CSEL conditions)
>> || (CMP_CSET conditions))
>>
>> and since || binds less tightly than &&, I think the CMP_CSET condition
>> doesn't include the prev_type restriction. Enclosing everything after
>> the first && in an extra set of brackets would fix that.
>>
> Done.
>>
>>
>> OK with those changes from my POV (no need for another round of review,
>> unless you'd prefer one). Please give others a day or so to comment though.
>>
>> Thanks,
>> Richard
>>
>>> +
>>> /* Fuse flag-setting ALU instructions and conditional branch. */
>>> if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
>>> && any_condjump_p (curr))
>>> diff --git a/gcc/config/aarch64/tuning_models/neoversev2.h
>>> b/gcc/config/aarch64/tuning_models/neoversev2.h
>>> index f76e4ef358f..ae99fab22d8 100644
>>> --- a/gcc/config/aarch64/tuning_models/neoversev2.h
>>> +++ b/gcc/config/aarch64/tuning_models/neoversev2.h
>>> @@ -221,7 +221,10 @@ static const struct tune_params neoversev2_tunings =
>>> 2 /* store_pred. */
>>> }, /* memmov_cost. */
>>> 5, /* issue_rate */
>>> - (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
>>> + (AARCH64_FUSE_AES_AESMC
>>> + | AARCH64_FUSE_CMP_BRANCH
>>> + | AARCH64_FUSE_CMP_CSEL
>>> + | AARCH64_FUSE_CMP_CSET), /* fusible_ops */
>>> "32:16", /* function_align. */
>>> "4", /* jump_align. */
>>> "32:16", /* loop_align. */
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
>>> b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
>>> new file mode 100644
>>> index 00000000000..85f302bab98
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_csel.c
>>> @@ -0,0 +1,33 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -mcpu=neoverse-v2" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +
>>> +/*
>>> +** f1:
>>> +** ...
>>> +** cmp w[0-9]+, w[0-9]+
>>> +** csel w[0-9]+, w[0-9]+, w[0-9]+, le
>>> +** ret
>>> +*/
>>> +int f1 (int a, int b, int c)
>>> +{
>>> + int cmp = a > b;
>>> + int add1 = c + 3;
>>> + int add2 = c + 8;
>>> + return cmp ? add1 : add2;
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** ...
>>> +** cmp x[0-9]+, x[0-9]+
>>> +** csel x[0-9]+, x[0-9]+, x[0-9]+, le
>>> +** ret
>>> +*/
>>> +long long f2 (long long a, long long b, long long c)
>>> +{
>>> + long long cmp = a > b;
>>> + long long add1 = c + 3;
>>> + long long add2 = c + 8;
>>> + return cmp ? add1 : add2;
>>> +}
>>> diff --git a/gcc/testsuite/gcc.target/aarch64/fuse_cmp_cset.c
>>> b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_cset.c
>>> new file mode 100644
>>> index 00000000000..04f1ce2773b
>>> --- /dev/null
>>> +++ b/gcc/testsuite/gcc.target/aarch64/fuse_cmp_cset.c
>>> @@ -0,0 +1,31 @@
>>> +/* { dg-do compile } */
>>> +/* { dg-options "-O2 -mcpu=neoverse-v2" } */
>>> +/* { dg-final { check-function-bodies "**" "" } } */
>>> +
>>> +/*
>>> +** f1:
>>> +** cmp w[0-9]+, w[0-9]+
>>> +** cset w[0-9]+, gt
>>> +** ...
>>> +*/
>>> +int g;
>>> +int f1 (int a, int b)
>>> +{
>>> + int cmp = a > b;
>>> + g = cmp + 1;
>>> + return cmp;
>>> +}
>>> +
>>> +/*
>>> +** f2:
>>> +** cmp x[0-9]+, x[0-9]+
>>> +** cset x[0-9]+, gt
>>> +** ...
>>> +*/
>>> +long long h;
>>> +long long f2 (long long a, long long b)
>>> +{
>>> + long long cmp = a > b;
>>> + h = cmp + 1;
>>> + return cmp;
>>> +}