On Tue, 29 Aug 2023, Jakub Jelinek wrote:

> Hi!
> 
> The usual uaddc/usubc matching is of the .{ADD,SUB}_OVERFLOW pair for a
> middle limb, which adds/subtracts the carry-in (from lower limbs) and
> computes the carry-out (to higher limbs).  Before optimizations (unless
> the user intentionally writes it that way already), all the limbs look
> the same, but optimizations simplify the handling of the least
> significant limb (the one which adds/subtracts a 0 carry-in) to just a
> single .{ADD,SUB}_OVERFLOW, and the handling of the most significant
> limb, if the computed carry-out is ignored, to a normal
> addition/subtraction of multiple operands.
> Now, match_uaddc_usubc has code to turn that least significant
> .{ADD,SUB}_OVERFLOW call into a .U{ADD,SUB}C call with 0 carry-in if
> a more significant limb above it is matched into .U{ADD,SUB}C; this
> isn't necessary for functionality, as .ADD_OVERFLOW (x, y) is
> functionally equal to .UADDC (x, y, 0) (provided the types of the
> operands are the same and the result is a complex type with that type
> as element).  It also has code to match the most significant limb with
> ignored carry-out (in that case one pattern match turns both the
> penultimate limb pair of .{ADD,SUB}_OVERFLOW into .U{ADD,SUB}C and the
> addition/subtraction of the 4 values (2 carries) into another
> .U{ADD,SUB}C).
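> 
> For illustration (a sketch, not taken from the patch), a hand-written
> 3 limb addition of that shape could look like:
>   void
>   add3 (unsigned long *p, const unsigned long *q)
>   {
>     /* Least significant limb: 0 carry-in, so after optimization only a
>        single .ADD_OVERFLOW remains.  */
>     unsigned long c1 = __builtin_add_overflow (p[0], q[0], &p[0]);
>     /* Middle limb: the .ADD_OVERFLOW pair which adds the carry-in from
>        below and computes the carry-out; this is what normally gets
>        matched into .UADDC.  */
>     unsigned long c2 = __builtin_add_overflow (p[1], q[1], &p[1]);
>     unsigned long c3 = __builtin_add_overflow (p[1], c1, &p[1]);
>     /* Most significant limb: the carry-out is ignored, so it is
>        simplified to a plain multi-operand addition.  */
>     p[2] = p[2] + q[2] + c2 + c3;
>   }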
> 
> As the following patch shows, what we weren't handling is the case where
> one uses either the __builtin_{add,sub}c builtins or hand-written forms
> thereof (using __builtin_*_overflow or even fully open-coded) for just
> 2 limbs, where the least significant limb has a 0 carry-in and the most
> significant one ignores the carry-out.  The following patch matches
> that, turning e.g.
>   _16 = .ADD_OVERFLOW (_1, _2);
>   _17 = REALPART_EXPR <_16>;
>   _18 = IMAGPART_EXPR <_16>;
>   _15 = _3 + _4;
>   _12 = _15 + _18;
> into
>   _16 = .UADDC (_1, _2, 0);
>   _17 = REALPART_EXPR <_16>;
>   _18 = IMAGPART_EXPR <_16>;
>   _19 = .UADDC (_3, _4, _18);
>   _12 = IMAGPART_EXPR <_19>;
> so that we can emit better code.
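> 
> Such GIMPLE arises e.g. from source along the lines of (a sketch,
> essentially what the uaddc helper in the new testcase below boils
> down to):
>   void
>   add2 (unsigned long *p, const unsigned long *q)
>   {
>     /* Low limb: 0 carry-in, only a single .ADD_OVERFLOW is left.  */
>     unsigned long c = __builtin_add_overflow (p[0], q[0], &p[0]);
>     /* High limb: the carry-out is ignored, so it is a plain addition.  */
>     p[1] = p[1] + q[1] + c;
>   }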
> 
> As the two comments added by the patch show, we must do that carefully,
> because the pass walks the IL from the first to the last stmt in a bb
> and we must avoid pattern matching something this way that should
> instead be matched differently on a later statement.
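> 
> For example (a sketch, not part of the patch), for source like:
>   void
>   add2c (unsigned long *p, const unsigned long *q, unsigned long cin)
>   {
>     /* Low limb with a real carry-in: a chained .ADD_OVERFLOW pair.  */
>     unsigned long c1 = __builtin_add_overflow (p[0], q[0], &p[0]);
>     unsigned long c2 = __builtin_add_overflow (p[0], cin, &p[0]);
>     /* High limb which ignores the carry-out.  */
>     p[1] = p[1] + q[1] + c1 + c2;
>   }
> the low limb pair should be matched into a single .UADDC with cin as
> the carry-in once the pass reaches the final addition, rather than
> prematurely turning just one of the .ADD_OVERFLOW calls into .UADDC
> with a 0 carry-in; the two comments added by the patch show the
> corresponding GIMPLE.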
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

OK.

Richard.

> 2023-08-29  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR middle-end/79173
>       PR middle-end/111209
>       * tree-ssa-math-opts.cc (match_uaddc_usubc): Match also
>       just 2 limb uaddc/usubc with 0 carry-in on lower limb and ignored
>       carry-out on higher limb.  Don't match it though if it could be
>       matched later on 4 argument addition/subtraction.
> 
>       * gcc.target/i386/pr79173-12.c: New test.
> 
> --- gcc/tree-ssa-math-opts.cc.jj      2023-08-08 15:55:09.498122557 +0200
> +++ gcc/tree-ssa-math-opts.cc 2023-08-28 20:51:31.893886862 +0200
> @@ -4641,8 +4641,135 @@ match_uaddc_usubc (gimple_stmt_iterator
>                      __imag__ of something, verify it is .UADDC/.USUBC.  */
>                   tree rhs1 = gimple_assign_rhs1 (im);
>                   gimple *ovf = SSA_NAME_DEF_STMT (TREE_OPERAND (rhs1, 0));
> +                 tree ovf_lhs = NULL_TREE;
> +                 tree ovf_arg1 = NULL_TREE, ovf_arg2 = NULL_TREE;
>                   if (gimple_call_internal_p (ovf, code == PLUS_EXPR
> -                                                  ? IFN_UADDC : IFN_USUBC)
> +                                                  ? IFN_ADD_OVERFLOW
> +                                                  : IFN_SUB_OVERFLOW))
> +                   {
> +                     /* Or verify it is .ADD_OVERFLOW/.SUB_OVERFLOW.
> +                        This is for the case of 2 chained .UADDC/.USUBC,
> +                        where the first one uses 0 carry-in and the second
> +                        one ignores the carry-out.
> +                        So, something like:
> +                        _16 = .ADD_OVERFLOW (_1, _2);
> +                        _17 = REALPART_EXPR <_16>;
> +                        _18 = IMAGPART_EXPR <_16>;
> +                        _15 = _3 + _4;
> +                        _12 = _15 + _18;
> +                        where the first 3 statements come from the lower
> +                        limb addition and the last 2 from the higher limb
> +                        which ignores carry-out.  */
> +                     ovf_lhs = gimple_call_lhs (ovf);
> +                     tree ovf_lhs_type = TREE_TYPE (TREE_TYPE (ovf_lhs));
> +                     ovf_arg1 = gimple_call_arg (ovf, 0);
> +                     ovf_arg2 = gimple_call_arg (ovf, 1);
> +                     /* In that case we need to punt if the types
> +                        mismatch.  */
> +                     if (!types_compatible_p (type, ovf_lhs_type)
> +                         || !types_compatible_p (type, TREE_TYPE (ovf_arg1))
> +                         || !types_compatible_p (type,
> +                                                 TREE_TYPE (ovf_arg2)))
> +                       ovf_lhs = NULL_TREE;
> +                     else
> +                       {
> +                         for (int i = (code == PLUS_EXPR ? 1 : 0);
> +                              i >= 0; --i)
> +                           {
> +                             tree r = gimple_call_arg (ovf, i);
> +                             if (TREE_CODE (r) != SSA_NAME)
> +                               continue;
> +                             if (uaddc_is_cplxpart (SSA_NAME_DEF_STMT (r),
> +                                                    REALPART_EXPR))
> +                               {
> +                                 /* Punt if one of the args which isn't
> +                                    subtracted is a __real__; matching it
> +                                    here could prevent a better match later.
> +                                    Consider:
> +                                    _3 = .ADD_OVERFLOW (_1, _2);
> +                                    _4 = REALPART_EXPR <_3>;
> +                                    _5 = IMAGPART_EXPR <_3>;
> +                                    _7 = .ADD_OVERFLOW (_4, _6);
> +                                    _8 = REALPART_EXPR <_7>;
> +                                    _9 = IMAGPART_EXPR <_7>;
> +                                    _12 = _10 + _11;
> +                                    _13 = _12 + _9;
> +                                    _14 = _13 + _5;
> +                                    We want to match this when called on
> +                                    the last stmt as a pair of .UADDC calls,
> +                                    but without this check we could turn
> +                                    that prematurely on _13 = _12 + _9;
> +                                    stmt into .UADDC with 0 carry-in just
> +                                    on the second .ADD_OVERFLOW call and
> +                                    another replacing the _12 and _13
> +                                    additions.  */
> +                                 ovf_lhs = NULL_TREE;
> +                                 break;
> +                               }
> +                           }
> +                       }
> +                     if (ovf_lhs)
> +                       {
> +                         use_operand_p use_p;
> +                         imm_use_iterator iter;
> +                         tree re_lhs = NULL_TREE;
> +                         FOR_EACH_IMM_USE_FAST (use_p, iter, ovf_lhs)
> +                           {
> +                             gimple *use_stmt = USE_STMT (use_p);
> +                             if (is_gimple_debug (use_stmt))
> +                               continue;
> +                             if (use_stmt == im)
> +                               continue;
> +                             if (!uaddc_is_cplxpart (use_stmt,
> +                                                     REALPART_EXPR))
> +                               {
> +                                 ovf_lhs = NULL_TREE;
> +                                 break;
> +                               }
> +                             re_lhs = gimple_assign_lhs (use_stmt);
> +                           }
> +                         if (ovf_lhs && re_lhs)
> +                           {
> +                             FOR_EACH_IMM_USE_FAST (use_p, iter, re_lhs)
> +                               {
> +                                 gimple *use_stmt = USE_STMT (use_p);
> +                                 if (is_gimple_debug (use_stmt))
> +                                   continue;
> +                                 internal_fn ifn
> +                                   = gimple_call_internal_fn (ovf);
> +                                 /* Punt if the __real__ of lhs is used
> +                                    in the same .*_OVERFLOW call.
> +                                    Consider:
> +                                    _3 = .ADD_OVERFLOW (_1, _2);
> +                                    _4 = REALPART_EXPR <_3>;
> +                                    _5 = IMAGPART_EXPR <_3>;
> +                                    _7 = .ADD_OVERFLOW (_4, _6);
> +                                    _8 = REALPART_EXPR <_7>;
> +                                    _9 = IMAGPART_EXPR <_7>;
> +                                    _12 = _10 + _11;
> +                                    _13 = _12 + _5;
> +                                    _14 = _13 + _9;
> +                                    We want to match this when called on
> +                                    the last stmt as a pair of .UADDC calls,
> +                                    but without this check we could turn
> +                                    that prematurely on _13 = _12 + _5;
> +                                    stmt into .UADDC with 0 carry-in just
> +                                    on the first .ADD_OVERFLOW call and
> +                                    another replacing the _12 and _13
> +                                    additions.  */
> +                                 if (gimple_call_internal_p (use_stmt, ifn))
> +                                   {
> +                                     ovf_lhs = NULL_TREE;
> +                                     break;
> +                                   }
> +                               }
> +                           }
> +                       }
> +                   }
> +                 if ((ovf_lhs
> +                      || gimple_call_internal_p (ovf,
> +                                                 code == PLUS_EXPR
> +                                                 ? IFN_UADDC : IFN_USUBC))
>                       && (optab_handler (code == PLUS_EXPR
>                                          ? uaddc5_optab : usubc5_optab,
>                                          TYPE_MODE (type))
> @@ -4668,6 +4795,26 @@ match_uaddc_usubc (gimple_stmt_iterator
>                                                        TREE_TYPE (ilhs),
>                                                        nlhs));
>                       gsi_replace (gsi, g, true);
> +                     /* And if it is initialized from the result of __imag__
> +                        of a .{ADD,SUB}_OVERFLOW call, replace that
> +                        call with a .U{ADD,SUB}C call with the same arguments,
> +                        just 0 added as the third argument.  This isn't strictly
> +                        necessary, as .ADD_OVERFLOW (x, y) and .UADDC (x, y, 0)
> +                        produce the same result, but it may result in better
> +                        generated code on some targets where the backend can
> +                        better prepare for how the result will be used.  */
> +                     if (ovf_lhs)
> +                       {
> +                         tree zero = build_zero_cst (type);
> +                         g = gimple_build_call_internal (code == PLUS_EXPR
> +                                                         ? IFN_UADDC
> +                                                         : IFN_USUBC,
> +                                                         3, ovf_arg1,
> +                                                         ovf_arg2, zero);
> +                         gimple_call_set_lhs (g, ovf_lhs);
> +                         gimple_stmt_iterator gsi2 = gsi_for_stmt (ovf);
> +                         gsi_replace (&gsi2, g, true);
> +                       }
>                       return true;
>                     }
>                 }
> --- gcc/testsuite/gcc.target/i386/pr79173-12.c.jj     2023-08-28 19:51:23.574518679 +0200
> +++ gcc/testsuite/gcc.target/i386/pr79173-12.c        2023-08-28 19:51:52.260124877 +0200
> @@ -0,0 +1,48 @@
> +/* PR middle-end/79173 */
> +/* PR middle-end/111209 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -fno-stack-protector -masm=att" } */
> +/* { dg-final { scan-assembler-times "addq\t%r\[^\n\r]*, \\\(%rdi\\\)" 1 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "adcq\t%r\[^\n\r]*, 8\\\(%rdi\\\)" 1 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "subq\t%r\[^\n\r]*, \\\(%rdi\\\)" 1 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "sbbq\t%r\[^\n\r]*, 8\\\(%rdi\\\)" 1 { target lp64 } } } */
> +/* { dg-final { scan-assembler-times "addl\t%e\[^\n\r]*, \\\(%e\[^\n\r]*\\\)" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "adcl\t%e\[^\n\r]*, 4\\\(%e\[^\n\r]*\\\)" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "subl\t%e\[^\n\r]*, \\\(%e\[^\n\r]*\\\)" 1 { target ia32 } } } */
> +/* { dg-final { scan-assembler-times "sbbl\t%e\[^\n\r]*, 4\\\(%e\[^\n\r]*\\\)" 1 { target ia32 } } } */
> +
> +static unsigned long
> +uaddc (unsigned long x, unsigned long y, unsigned long carry_in, unsigned long *carry_out)
> +{
> +  unsigned long r;
> +  unsigned long c1 = __builtin_add_overflow (x, y, &r);
> +  unsigned long c2 = __builtin_add_overflow (r, carry_in, &r);
> +  *carry_out = c1 + c2;
> +  return r;
> +}
> +
> +static unsigned long
> +usubc (unsigned long x, unsigned long y, unsigned long carry_in, unsigned long *carry_out)
> +{
> +  unsigned long r;
> +  unsigned long c1 = __builtin_sub_overflow (x, y, &r);
> +  unsigned long c2 = __builtin_sub_overflow (r, carry_in, &r);
> +  *carry_out = c1 + c2;
> +  return r;
> +}
> +
> +void
> +foo (unsigned long *p, unsigned long *q)
> +{
> +  unsigned long c;
> +  p[0] = uaddc (p[0], q[0], 0, &c);
> +  p[1] = uaddc (p[1], q[1], c, &c);
> +}
> +
> +void
> +bar (unsigned long *p, unsigned long *q)
> +{
> +  unsigned long c;
> +  p[0] = usubc (p[0], q[0], 0, &c);
> +  p[1] = usubc (p[1], q[1], c, &c);
> +}
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE Software Solutions Germany GmbH,
Frankenstrasse 146, 90461 Nuernberg, Germany;
GF: Ivo Totev, Andrew McDonald, Werner Knoblich; (HRB 36809, AG Nuernberg)
