Hi Tamar,
> -----Original Message-----
> From: Tamar Christina <[email protected]>
> Sent: Monday, November 6, 2023 7:43 AM
> To: [email protected]
> Cc: nd <[email protected]>; Ramana Radhakrishnan
> <[email protected]>; Richard Earnshaw
> <[email protected]>; [email protected]; Kyrylo Tkachov
> <[email protected]>
> Subject: [PATCH 21/21]Arm: Add MVE cbranch implementation
>
> Hi All,
>
> This adds an implementation for conditional branch optab for MVE.
>
> Unfortunately MVE has rather limited operations on VPT.P0: we are missing the
> ability to do P0 comparisons and logical OR on P0.
>
> For that reason we can only support cbranch with 0: when comparing against a 0
> predicate we don't need to actually do a comparison, we only have to check
> whether any bit is set within P0.
>
> Because we can only do P0 comparisons with 0, the cost of the comparison was
> reduced so that the compiler does not try to force the 0 into a register,
> thinking the constant is too expensive. For the cbranch implementation to be
> safe we must see the constant 0 vector.
>
> We can't really work around the lack of logical OR on P0. This means MVE
> can't support cases where the sizes of operands in the comparison don't match,
> i.e. when one operand has been unpacked.
>
> For example:
>
> void f1 ()
> {
> for (int i = 0; i < N; i++)
> {
> b[i] += a[i];
> if (a[i] > 0)
> break;
> }
> }
>
> For 128-bit vectors we generate:
>
> vcmp.s32 gt, q3, q1
> vmrs r3, p0 @ movhi
> cbnz r3, .L2
>
> MVE does not have 64-bit vector comparisons, so those are also not
> supported.
>
> Bootstrapped on arm-none-linux-gnueabihf and regtested with
> -march=armv8.1-m.main+mve -mfpu=auto with no issues.
>
> Ok for master?
>
This is okay once the rest goes in.
Thanks,
Kyrill
> Thanks,
> Tamar
>
> gcc/ChangeLog:
>
> * config/arm/arm.cc (arm_rtx_costs_internal): Update costs for pred 0
> compares.
> * config/arm/mve.md (cbranch<mode>4): New.
>
> gcc/testsuite/ChangeLog:
>
> * lib/target-supports.exp (vect_early_break): Add MVE.
> * gcc.target/arm/mve/vect-early-break-cbranch.c: New test.
>
> --- inline copy of patch --
> diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
> index
> 38f0839de1c75547c259ac3d655fcfc14e7208a2..15e65c15cb3cb6f70161787e84
> b255a24eb51e32 100644
> --- a/gcc/config/arm/arm.cc
> +++ b/gcc/config/arm/arm.cc
> @@ -11883,6 +11883,15 @@ arm_rtx_costs_internal (rtx x, enum rtx_code code,
> enum rtx_code outer_code,
> || TARGET_HAVE_MVE)
> && simd_immediate_valid_for_move (x, mode, NULL, NULL))
> *cost = COSTS_N_INSNS (1);
> + else if (TARGET_HAVE_MVE
> + && outer_code == COMPARE
> + && VALID_MVE_PRED_MODE (mode))
> + /* MVE allows very limited instructions on VPT.P0, however comparisons
> + to 0 do not require us to materialize this constant or require a
> + predicate comparison as we can go through SImode. For that reason
> + allow P0 CMP 0 as a cheap operation such that the 0 isn't forced to
> + registers as we can't compare two predicates. */
> + *cost = COSTS_N_INSNS (1);
> else
> *cost = COSTS_N_INSNS (4);
> return true;
> diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
> index
> 74909ce47e132c22a94f7d9cd3a0921b38e33051..95d40770ecc25f9eb251eba38
> 306dd43cbebfb3f 100644
> --- a/gcc/config/arm/mve.md
> +++ b/gcc/config/arm/mve.md
> @@ -6880,6 +6880,21 @@ (define_expand
> "vcond_mask_<mode><MVE_vpred>"
> DONE;
> })
>
> +(define_expand "cbranch<mode>4"
> + [(set (pc) (if_then_else
> + (match_operator 0 "expandable_comparison_operator"
> + [(match_operand:MVE_7 1 "register_operand")
> + (match_operand:MVE_7 2 "zero_operand")])
> + (label_ref (match_operand 3 "" ""))
> + (pc)))]
> + "TARGET_HAVE_MVE"
> +{
> + rtx val = gen_reg_rtx (SImode);
> + emit_move_insn (val, gen_lowpart (SImode, operands[1]));
> + emit_jump_insn (gen_cbranchsi4 (operands[0], val, const0_rtx,
> operands[3]));
> + DONE;
> +})
> +
> ;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
> (define_expand "@arm_mve_reinterpret<mode>"
> [(set (match_operand:MVE_vecs 0 "register_operand")
> diff --git a/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
> b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
> new file mode 100644
> index
> 0000000000000000000000000000000000000000..c3b8506dca0b2b044e6869a6
> c8259d663c1ff930
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/arm/mve/vect-early-break-cbranch.c
> @@ -0,0 +1,117 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target arm_v8_1m_mve_ok } */
> +/* { dg-add-options arm_v8_1m_mve } */
> +/* { dg-options "-O3" } */
> +/* { dg-final { check-function-bodies "**" "" "" } } */
> +
> +#define N 640
> +int a[N] = {0};
> +int b[N] = {0};
> +
> +/*
> +** f1:
> +** ...
> +** vcmp.s32 gt, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f1 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] > 0)
> + break;
> + }
> +}
> +
> +/*
> +** f2:
> +** ...
> +** vcmp.s32 ge, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f2 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] >= 0)
> + break;
> + }
> +}
> +
> +/*
> +** f3:
> +** ...
> +** vcmp.i32 eq, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f3 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] == 0)
> + break;
> + }
> +}
> +
> +/*
> +** f4:
> +** ...
> +** vcmp.i32 ne, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f4 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] != 0)
> + break;
> + }
> +}
> +
> +/*
> +** f5:
> +** ...
> +** vcmp.s32 lt, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f5 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] < 0)
> + break;
> + }
> +}
> +
> +/*
> +** f6:
> +** ...
> +** vcmp.s32 le, q[0-9]+, q[0-9]+
> +** vmrs r[0-9]+, p0 @ movhi
> +** cbnz r[0-9]+, \.L[0-9]+
> +** ...
> +*/
> +void f6 ()
> +{
> + for (int i = 0; i < N; i++)
> + {
> + b[i] += a[i];
> + if (a[i] <= 0)
> + break;
> + }
> +}
> diff --git a/gcc/testsuite/lib/target-supports.exp b/gcc/testsuite/lib/target-
> supports.exp
> index
> 8f58671e6cfd3546c6a98e40341fe31c6492594b..1eef764542a782786e27ed935a
> 06243e319ae3fc 100644
> --- a/gcc/testsuite/lib/target-supports.exp
> +++ b/gcc/testsuite/lib/target-supports.exp
> @@ -3785,6 +3785,8 @@ proc check_effective_target_vect_early_break { } {
> expr {
> [istarget aarch64*-*-*]
> || [check_effective_target_arm_neon_ok]
> + || ([check_effective_target_arm_v8_1m_mve_fp_ok]
> + && [check_effective_target_arm_little_endian])
> }}]
> }
> # Return 1 if the target supports hardware vectorization of complex
> additions of
>
>
>
>
> --