Re: [PATCH]middle-end Add optimized float addsub without needing VEC_PERM_EXPR.
Richard Biener via Gcc-patches  Sat, 18 Jun 2022 03:49:17 -0700

> On 17.06.2022, at 22:34, Andrew Pinski via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>
> On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>>
>> Hi All,
>>
>> For IEEE 754 floating point formats we can replace a sequence of
>> alternating +/- with an fneg of a wider type followed by an fadd.  This
>> eliminates the need for using a permutation.  This patch adds a match.pd
>> rule to recognize and do this rewriting.
>
> I don't think this is correct.  You don't check the format of the
> floating point to make sure this is valid (e.g. REAL_MODE_FORMAT's
> signbit_rw/signbit_ro field).
> Also, wouldn't it just be better to do the xor in integer mode (using
> the signbit_rw field for the correct bit)?
> And then make sure the target optimizes the xor to the neg
> instruction when needed?

I'm also worried about using FP operations for the negate here.  When @1
is constant, do we still constant fold this correctly?

For costing purposes it would be nice to make this visible to the
vectorizer.

Also, is this really good for all targets?  Can there be issues with
reformatting when using FP ops as in your patch, or with using integer
XOR as suggested, making this more expensive than the blend?

Richard.

> Thanks,
> Andrew Pinski
>
>
>>
>> For
>>
>> void f (float *restrict a, float *restrict b, float *res, int n)
>> {
>>    for (int i = 0; i < (n & -4); i+=2)
>>     {
>>       res[i+0] = a[i+0] + b[i+0];
>>       res[i+1] = a[i+1] - b[i+1];
>>     }
>> }
>>
>> we generate:
>>
>> .L3:
>>         ldr     q1, [x1, x3]
>>         ldr     q0, [x0, x3]
>>         fneg    v1.2d, v1.2d
>>         fadd    v0.4s, v0.4s, v1.4s
>>         str     q0, [x2, x3]
>>         add     x3, x3, 16
>>         cmp     x3, x4
>>         bne     .L3
>>
>> now instead of:
>>
>> .L3:
>>         ldr     q1, [x0, x3]
>>         ldr     q2, [x1, x3]
>>         fadd    v0.4s, v1.4s, v2.4s
>>         fsub    v1.4s, v1.4s, v2.4s
>>         tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
>>         str     q0, [x2, x3]
>>         add     x3, x3, 16
>>         cmp     x3, x4
>>         bne     .L3
>>
>> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>>
>> Thanks to George Steed for the idea.
>>
>> Ok for master?
>>
>> Thanks,
>> Tamar
>>
>> gcc/ChangeLog:
>>
>>         * match.pd: Add fneg/fadd rule.
>>
>> gcc/testsuite/ChangeLog:
>>
>>         * gcc.target/aarch64/simd/addsub_1.c: New test.
>>         * gcc.target/aarch64/sve/addsub_1.c: New test.
>>
>> --- inline copy of patch --
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -7612,6 +7612,49 @@ and,
>>  (simplify (reduc (op @0 VECTOR_CST@1))
>>    (op (reduc:type @0) (reduc:type @1))))
>>
>> +/* Simplify vector floating point operations of alternating sub/add pairs
>> +   into using an fneg of a wider element type followed by a normal add.
>> +   under IEEE 754 the fneg of the wider type will negate every even entry
>> +   and when doing an add we get a sub of the even and add of every odd
>> +   elements.  */
>> +(simplify
>> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
>> + (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
>> +  (with
>> +   {
>> +     /* Build a vector of integers from the tree mask.  */
>> +     vec_perm_builder builder;
>> +     if (!tree_to_vec_perm_builder (&builder, @2))
>> +       return NULL_TREE;
>> +
>> +     /* Create a vec_perm_indices for the integer vector.  */
>> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
>> +     vec_perm_indices sel (builder, 2, nelts);
>> +   }
>> +   (if (sel.series_p (0, 2, 0, 2))
>> +    (with
>> +     {
>> +       machine_mode vec_mode = TYPE_MODE (type);
>> +       auto elem_mode = GET_MODE_INNER (vec_mode);
>> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
>> +       tree stype;
>> +       switch (elem_mode)
>> +         {
>> +         case E_HFmode:
>> +           stype = float_type_node;
>> +           break;
>> +         case E_SFmode:
>> +           stype = double_type_node;
>> +           break;
>> +         default:
>> +           return NULL_TREE;
>> +         }
>> +       tree ntype = build_vector_type (stype, nunits);
>> +       if (!ntype)
>> +         return NULL_TREE;
>> +     }
>> +     (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))
>> +
>>  (simplify
>>   (vec_perm @0 @1 VECTOR_CST@2)
>>   (with
>> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> @@ -0,0 +1,56 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
>> +/* { dg-options "-Ofast" } */
>> +/* { dg-add-options arm_v8_2a_fp16_neon } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +/*
>> +** f1:
>> +** ...
>> +** fneg v[0-9]+.2d, v[0-9]+.2d
>> +** fadd v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
>> +** ...
>> +*/
>> +void f1 (float *restrict a, float *restrict b, float *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** d1:
>> +** ...
>> +** fneg v[0-9]+.4s, v[0-9]+.4s
>> +** fadd v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
>> +** ...
>> +*/
>> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -8); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** e1:
>> +** ...
>> +** fadd v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> +** fsub v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> +** ins v[0-9]+.d\[1\], v[0-9]+.d\[1\]
>> +** ...
>> +*/
>> +void e1 (double *restrict a, double *restrict b, double *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> @@ -0,0 +1,52 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +/*
>> +** f1:
>> +** ...
>> +** fneg z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> +** fadd z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
>> +** ...
>> +*/
>> +void f1 (float *restrict a, float *restrict b, float *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** d1:
>> +** ...
>> +** fneg z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
>> +** fadd z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
>> +** ...
>> +*/
>> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -8); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** e1:
>> +** ...
>> +** fsub z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
>> +** movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> +** fadd z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
>> +** ...
>> +*/
>> +void e1 (double *restrict a, double *restrict b, double *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>>
>>
>>
>>
>> --
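To illustrate the equivalence being discussed: on a little-endian target, negating a 64-bit lane flips only bit 63, which is the sign bit of the odd-indexed float packed into the top half of that lane. So an fneg on the wider element type (or, per Andrew's suggestion, an integer XOR of that sign bit) followed by a plain fadd yields res[even] = a + b and res[odd] = a - b. Below is a minimal standalone C sketch of that bit-level view; it is not part of the patch, and the helper name negate_odd_lanes is purely illustrative.

    /* Sketch only: shows why flipping bit 63 of each 64-bit lane negates
       every odd-indexed float on a little-endian machine, turning a
       uniform add into the alternating add/sub pattern.  */
    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    /* Flip the sign of every odd float element by XORing the top bit of
       each 64-bit lane; n is assumed even.  This mirrors what
       "fneg v.2d" does to a register viewed as v.4s.  */
    static void negate_odd_lanes (float *b, int n)
    {
      for (int i = 0; i < n; i += 2)
        {
          uint64_t lane;
          memcpy (&lane, &b[i], sizeof lane);   /* b[i], b[i+1] share one lane.  */
          lane ^= UINT64_C (1) << 63;           /* sign bit of b[i+1] on LE.  */
          memcpy (&b[i], &lane, sizeof lane);
        }
    }

    int main (void)
    {
      float a[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
      float b[4] = { 0.5f, 0.25f, 0.125f, 0.0625f };
      negate_odd_lanes (b, 4);
      for (int i = 0; i < 4; i++)
        printf ("%f\n", a[i] + b[i]);  /* a[0]+b[0], a[1]-b[1], a[2]+b[2], a[3]-b[3] */
      return 0;
    }

The same per-element sign-bit XOR is presumably what an integer-mode implementation keyed off signbit_rw would emit directly, which is why the validity of the trick depends on the floating-point format and on endianness (the match.pd rule above guards with !BYTES_BIG_ENDIAN).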