> On 17.06.2022 at 22:34, Andrew Pinski via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
> 
> On Thu, Jun 16, 2022 at 3:59 AM Tamar Christina via Gcc-patches
> <gcc-patches@gcc.gnu.org> wrote:
>> 
>> Hi All,
>> 
>> For IEEE 754 floating point formats we can replace a sequence of alternating
>> +/- with an fneg of a wider type followed by an fadd.  This eliminates the
>> need for using a permutation.  This patch adds a match.pd rule to recognize
>> and do this rewriting.
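
A minimal stand-alone sketch (not part of the patch) of why the trick is
sound on a little-endian IEEE 754 target: negating the double that overlays
a pair of adjacent floats flips only bit 63, i.e. the sign of the second
float of the pair, so a plain vector add afterwards yields the add/sub
interleave.

  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    float pair[2] = { 3.0f, 4.0f };
    double d;

    /* Reinterpret the two adjacent floats as one double.  */
    memcpy (&d, pair, sizeof d);
    /* IEEE negation flips only the sign bit of the double (bit 63),
       which on little-endian is the sign bit of pair[1].  */
    d = -d;
    memcpy (pair, &d, sizeof d);

    printf ("%f %f\n", pair[0], pair[1]);  /* expect: 3.000000 -4.000000 */
    return 0;
  }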
> 
> I don't think this is correct.  You don't check the format of the
> floating point type to make sure this is valid (e.g. REAL_MODE_FORMAT's
> signbit_rw/signbit_ro fields).
> Also, wouldn't it just be better to do the xor in integer mode (using
> the signbit_rw field for the correct bit), and then make sure the
> target optimizes the xor to the neg instruction when needed?
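
As a rough illustration of that alternative, the same sign flip can be done
as an integer XOR of the sign bit of the wider lane; for a standard IEEE
double that is bit 63, which is what REAL_MODE_FORMAT's signbit_rw field
would report (sketch only, not tested in the tree):

  #include <stdint.h>
  #include <stdio.h>
  #include <string.h>

  int main (void)
  {
    float pair[2] = { 3.0f, 4.0f };
    uint64_t bits;

    memcpy (&bits, pair, sizeof bits);
    /* XOR in the sign bit of the wider (64-bit) lane; assumes little
       endian and an IEEE double sign bit at position 63.  */
    bits ^= UINT64_C (1) << 63;
    memcpy (pair, &bits, sizeof bits);

    printf ("%f %f\n", pair[0], pair[1]);  /* expect: 3.000000 -4.000000 */
    return 0;
  }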

I'm also worried about using FP operations for the negate here.  When @1 is
constant, do we still constant fold this correctly?
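
A concrete case for that question, as a hypothetical extra test (not in the
patch): with one input known at compile time the vectorizer would present
@1 as a VECTOR_CST, and the negate-through-the-wider-type detour should
still fold down to a single constant vector added to @0:

  void f_const (float *restrict a, float *restrict res, int n)
  {
    for (int i = 0; i < (n & -4); i += 2)
      {
        /* Vectorized, @1 would be the constant {2.0f, 4.0f, 2.0f, 4.0f};
           after the rewrite we would want this to fold to adding the
           constant {2.0f, -4.0f, 2.0f, -4.0f}.  */
        res[i+0] = a[i+0] + 2.0f;
        res[i+1] = a[i+1] - 4.0f;
      }
  }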

For costing purposes it would be nice to make this visible to the vectorizer.

Also, is this really good for all targets?  Can there be issues with
re-formatting, either when using FP ops as in your patch or when using the
integer XOR as suggested, that make this more expensive than the blend?

Richard.

> Thanks,
> Andrew Pinski
> 
> 
> 
>> 
>> For
>> 
>> void f (float *restrict a, float *restrict b, float *res, int n)
>> {
>>   for (int i = 0; i < (n & -4); i+=2)
>>    {
>>      res[i+0] = a[i+0] + b[i+0];
>>      res[i+1] = a[i+1] - b[i+1];
>>    }
>> }
>> 
>> we generate:
>> 
>> .L3:
>>        ldr     q1, [x1, x3]
>>        ldr     q0, [x0, x3]
>>        fneg    v1.2d, v1.2d
>>        fadd    v0.4s, v0.4s, v1.4s
>>        str     q0, [x2, x3]
>>        add     x3, x3, 16
>>        cmp     x3, x4
>>        bne     .L3
>> 
>> now instead of:
>> 
>> .L3:
>>        ldr     q1, [x0, x3]
>>        ldr     q2, [x1, x3]
>>        fadd    v0.4s, v1.4s, v2.4s
>>        fsub    v1.4s, v1.4s, v2.4s
>>        tbl     v0.16b, {v0.16b - v1.16b}, v3.16b
>>        str     q0, [x2, x3]
>>        add     x3, x3, 16
>>        cmp     x3, x4
>>        bne     .L3
>> 
>> Bootstrapped and regtested on aarch64-none-linux-gnu with no issues.
>> 
>> Thanks to George Steed for the idea.
>> 
>> Ok for master?
>> 
>> Thanks,
>> Tamar
>> 
>> gcc/ChangeLog:
>> 
>>        * match.pd: Add fneg/fadd rule.
>> 
>> gcc/testsuite/ChangeLog:
>> 
>>        * gcc.target/aarch64/simd/addsub_1.c: New test.
>>        * gcc.target/aarch64/sve/addsub_1.c: New test.
>> 
>> --- inline copy of patch --
>> diff --git a/gcc/match.pd b/gcc/match.pd
>> index 51b0a1b562409af535e53828a10c30b8a3e1ae2e..af1c98d4a2831f38258d6fc1bbe811c8ee6c7c6e 100644
>> --- a/gcc/match.pd
>> +++ b/gcc/match.pd
>> @@ -7612,6 +7612,49 @@ and,
>>   (simplify (reduc (op @0 VECTOR_CST@1))
>>     (op (reduc:type @0) (reduc:type @1))))
>> 
>> +/* Simplify vector floating point operations of alternating sub/add pairs
>> +   into using an fneg of a wider element type followed by a normal add.
>> +   Under IEEE 754 the fneg of the wider type will negate every other
>> +   entry, and when doing an add we get a sub of those entries and an
>> +   add of the remaining ones.  */
>> +(simplify
>> + (vec_perm (plus:c @0 @1) (minus @0 @1) VECTOR_CST@2)
>> + (if (!VECTOR_INTEGER_TYPE_P (type) && !BYTES_BIG_ENDIAN)
>> +  (with
>> +   {
>> +     /* Build a vector of integers from the tree mask.  */
>> +     vec_perm_builder builder;
>> +     if (!tree_to_vec_perm_builder (&builder, @2))
>> +       return NULL_TREE;
>> +
>> +     /* Create a vec_perm_indices for the integer vector.  */
>> +     poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (type);
>> +     vec_perm_indices sel (builder, 2, nelts);
>> +   }
>> +   (if (sel.series_p (0, 2, 0, 2))
>> +    (with
>> +     {
>> +       machine_mode vec_mode = TYPE_MODE (type);
>> +       auto elem_mode = GET_MODE_INNER (vec_mode);
>> +       auto nunits = exact_div (GET_MODE_NUNITS (vec_mode), 2);
>> +       tree stype;
>> +       switch (elem_mode)
>> +        {
>> +        case E_HFmode:
>> +          stype = float_type_node;
>> +          break;
>> +        case E_SFmode:
>> +          stype = double_type_node;
>> +          break;
>> +        default:
>> +          return NULL_TREE;
>> +        }
>> +       tree ntype = build_vector_type (stype, nunits);
>> +       if (!ntype)
>> +        return NULL_TREE;
>> +     }
>> +     (plus (view_convert:type (negate (view_convert:ntype @1))) @0))))))
>> +
>> (simplify
>>  (vec_perm @0 @1 VECTOR_CST@2)
>>  (with
>> diff --git a/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..1fb91a34c421bbd2894faa0dbbf1b47ad43310c4
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/simd/addsub_1.c
>> @@ -0,0 +1,56 @@
>> +/* { dg-do compile } */
>> +/* { dg-require-effective-target arm_v8_2a_fp16_neon_ok } */
>> +/* { dg-options "-Ofast" } */
>> +/* { dg-add-options arm_v8_2a_fp16_neon } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +#pragma GCC target "+nosve"
>> +
>> +/*
>> +** f1:
>> +** ...
>> +**     fneg    v[0-9]+.2d, v[0-9]+.2d
>> +**     fadd    v[0-9]+.4s, v[0-9]+.4s, v[0-9]+.4s
>> +** ...
>> +*/
>> +void f1 (float *restrict a, float *restrict b, float *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** d1:
>> +** ...
>> +**     fneg    v[0-9]+.4s, v[0-9]+.4s
>> +**     fadd    v[0-9]+.8h, v[0-9]+.8h, v[0-9]+.8h
>> +** ...
>> +*/
>> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -8); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** e1:
>> +** ...
>> +**     fadd    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> +**     fsub    v[0-9]+.2d, v[0-9]+.2d, v[0-9]+.2d
>> +**     ins     v[0-9]+.d\[1\], v[0-9]+.d\[1\]
>> +** ...
>> +*/
>> +void e1 (double *restrict a, double *restrict b, double *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> diff --git a/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..ea7f9d9db2c8c9a3efe5c7951a314a29b7a7a922
>> --- /dev/null
>> +++ b/gcc/testsuite/gcc.target/aarch64/sve/addsub_1.c
>> @@ -0,0 +1,52 @@
>> +/* { dg-do compile } */
>> +/* { dg-options "-Ofast" } */
>> +/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
>> +
>> +/*
>> +** f1:
>> +** ...
>> +**     fneg    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> +**     fadd    z[0-9]+.s, z[0-9]+.s, z[0-9]+.s
>> +** ...
>> +*/
>> +void f1 (float *restrict a, float *restrict b, float *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** d1:
>> +** ...
>> +**     fneg    z[0-9]+.s, p[0-9]+/m, z[0-9]+.s
>> +**     fadd    z[0-9]+.h, z[0-9]+.h, z[0-9]+.h
>> +** ...
>> +*/
>> +void d1 (_Float16 *restrict a, _Float16 *restrict b, _Float16 *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -8); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> +
>> +/*
>> +** e1:
>> +** ...
>> +**     fsub    z[0-9]+.d, z[0-9]+.d, z[0-9]+.d
>> +**     movprfx z[0-9]+.d, p[0-9]+/m, z[0-9]+.d
>> +**     fadd    z[0-9]+.d, p[0-9]+/m, z[0-9]+.d, z[0-9]+.d
>> +** ...
>> +*/
>> +void e1 (double *restrict a, double *restrict b, double *res, int n)
>> +{
>> +   for (int i = 0; i < (n & -4); i+=2)
>> +    {
>> +      res[i+0] = a[i+0] + b[i+0];
>> +      res[i+1] = a[i+1] - b[i+1];
>> +    }
>> +}
>> 
>> 
>> 
>> 
>> --
