On Tue, Mar 6, 2012 at 9:49 PM, William J. Schmidt
<wschm...@linux.vnet.ibm.com> wrote:
> Hi,
>
> This is a re-post of the patch I posted for comments in January to
> address http://gcc.gnu.org/bugzilla/show_bug.cgi?id=18589.  The patch
> modifies reassociation to expose repeated factors from __builtin_pow*
> calls, optimally reassociate repeated factors, and possibly reconstitute
> __builtin_powi calls from the results of reassociation.
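
(For concreteness, if I read the patch and the testcases right: in
pr18589-1.c below, x * x * x * x * y * y * y * y should end up as the
equivalent of

  t = x * y;
  t2 = t * t;
  result = t2 * t2;   /* i.e. __builtin_powi (x * y, 4), later expanded */

which accounts for the three multiplies the dump scan expects.)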
>
> Bootstrapped and passes regression tests for powerpc64-linux-gnu.  I
> expect there may need to be some small changes, but I am targeting this
> for trunk approval.
>
> Thanks very much for the review,

Hmm.  How much work would it be to extend the reassoc 'IL' to allow
a repeat factor per op?  I realize what you do stays within what reassoc
already does, though ideally we would not require any GIMPLE IL changes
while building up / optimizing the reassoc IL, but only make them when we
commit the changes.
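
Something along these lines is what I have in mind -- just a sketch, the
'count' field name is made up and all the bookkeeping around it is
omitted:

  /* Operator, rank pair.  */
  typedef struct operand_entry
  {
    unsigned int rank;
    int id;
    tree op;
    /* How often OP is repeated in the chain; x * x * x would become a
       single entry for x with count 3.  Repeat factors would only be
       materialized on GIMPLE (as multiplies or a __builtin_powi call)
       when we commit the optimized operand list.  */
    HOST_WIDE_INT count;
  } *operand_entry_t;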

Thanks,
Richard.

> Bill
>
>
> gcc:
>
> 2012-03-06  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
>
>        * tree-pass.h: Replace pass_reassoc with pass_early_reassoc and
>        pass_late_reassoc.
>        * passes.c (init_optimization_passes): Change pass_reassoc calls to
>        pass_early_reassoc and pass_late_reassoc.
>        * tree-ssa-reassoc.c (reassociate_stats): Add two fields.
>        (early_reassoc): New static var.
>        (MAX_POW_EXPAND): New #define'd constant.
>        (linear_expand_pow_common): New function.
>        (linear_expand_powi): Likewise.
>        (linear_expand_pow): Likewise.
>        (break_up_subtract_bb): Attempt to expand __builtin_pow[i].
>        (repeat_factor_d): New struct and associated typedefs.
>        (repeat_factor_vec): New static vector variable.
>        (compare_repeat_factors): New function.
>        (get_reassoc_pow_ssa_name): Likewise.
>        (attempt_builtin_powi): Likewise.
>        (reassociate_bb): Attempt to reconstitute __builtin_powi calls, and
>        multiply their results by any leftover reassociated factors.
>        (fini_reassoc): Two new calls to statistics_counter_event.
>        (execute_early_reassoc): New function.
>        (execute_late_reassoc): Likewise.
>        (pass_early_reassoc): Replace pass_reassoc, renamed to reassoc1,
>        call execute_early_reassoc.
>        (pass_late_reassoc): New gimple_opt_pass named reassoc2 that calls
>        execute_late_reassoc.
>
> gcc/testsuite:
>
> 2012-03-06  Bill Schmidt  <wschm...@linux.vnet.ibm.com>
>
>        * gcc.dg/pr46309.c: Change -fdump-tree-reassoc-details to
>        -fdump-tree-reassoc[12]-details.
>        * gcc.dg/tree-ssa/pr18589-1.c: New test.
>        * gcc.dg/tree-ssa/pr18589-2.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-3.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-4.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-5.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-6.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-7.c: Likewise.
>        * gcc.dg/tree-ssa/pr18589-8.c: Likewise.
>
>
> Index: gcc/tree-pass.h
> ===================================================================
> --- gcc/tree-pass.h     (revision 184997)
> +++ gcc/tree-pass.h     (working copy)
> @@ -440,7 +440,8 @@ extern struct gimple_opt_pass pass_copy_prop;
>  extern struct gimple_opt_pass pass_vrp;
>  extern struct gimple_opt_pass pass_uncprop;
>  extern struct gimple_opt_pass pass_return_slot;
> -extern struct gimple_opt_pass pass_reassoc;
> +extern struct gimple_opt_pass pass_early_reassoc;
> +extern struct gimple_opt_pass pass_late_reassoc;
>  extern struct gimple_opt_pass pass_rebuild_cgraph_edges;
>  extern struct gimple_opt_pass pass_remove_cgraph_callee_edges;
>  extern struct gimple_opt_pass pass_build_cgraph_edges;
> Index: gcc/testsuite/gcc.dg/pr46309.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/pr46309.c      (revision 184997)
> +++ gcc/testsuite/gcc.dg/pr46309.c      (working copy)
> @@ -1,6 +1,6 @@
>  /* PR tree-optimization/46309 */
>  /* { dg-do compile } */
> -/* { dg-options "-O2 -fdump-tree-reassoc-details" } */
> +/* { dg-options "-O2 -fdump-tree-reassoc1-details 
> -fdump-tree-reassoc2-details" } */
>  /* The transformation depends on BRANCH_COST being greater than 1
>    (see the notes in the PR), so try to force that.  */
>  /* { dg-additional-options "-mtune=octeon2" { target mips*-*-* } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-4.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z, double u)
> +{
> +  return x * x * y * y * y * z * z * z * z * u;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 7 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-5.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z, double u)
> +{
> +  return x * x * x * y * y * y * z * z * z * z * u * u * u * u;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-6.c   (revision 0)
> @@ -0,0 +1,11 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return __builtin_pow (x, -4.0) * __builtin_pow (y, -4.0);
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { scan-tree-dump-times " / " 1 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-7.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +float baz (float x, float y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-8.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +long double baz (long double x, long double y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-1.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return x * x * x * x * y * y * y * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-2.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y)
> +{
> +  return x * y * y * x * y * x * x * y;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 3 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c
> ===================================================================
> --- gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
> +++ gcc/testsuite/gcc.dg/tree-ssa/pr18589-3.c   (revision 0)
> @@ -0,0 +1,10 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O3 -ffast-math -fdump-tree-optimized" } */
> +
> +double baz (double x, double y, double z)
> +{
> +  return x * x * y * y * y * z * z * z * z;
> +}
> +
> +/* { dg-final { scan-tree-dump-times " \\* " 6 "optimized" } } */
> +/* { dg-final { cleanup-tree-dump "optimized" } } */
> Index: gcc/passes.c
> ===================================================================
> --- gcc/passes.c        (revision 184997)
> +++ gcc/passes.c        (working copy)
> @@ -1325,7 +1325,7 @@ init_optimization_passes (void)
>         opportunities.  */
>       NEXT_PASS (pass_phi_only_cprop);
>       NEXT_PASS (pass_dse);
> -      NEXT_PASS (pass_reassoc);
> +      NEXT_PASS (pass_early_reassoc);
>       NEXT_PASS (pass_dce);
>       NEXT_PASS (pass_forwprop);
>       NEXT_PASS (pass_phiopt);
> @@ -1377,7 +1377,7 @@ init_optimization_passes (void)
>        }
>       NEXT_PASS (pass_lower_vector_ssa);
>       NEXT_PASS (pass_cse_reciprocals);
> -      NEXT_PASS (pass_reassoc);
> +      NEXT_PASS (pass_late_reassoc);
>       NEXT_PASS (pass_vrp);
>       NEXT_PASS (pass_dominator);
>       /* The only const/copy propagation opportunities left after
> Index: gcc/tree-ssa-reassoc.c
> ===================================================================
> --- gcc/tree-ssa-reassoc.c      (revision 184997)
> +++ gcc/tree-ssa-reassoc.c      (working copy)
> @@ -53,6 +53,9 @@ along with GCC; see the file COPYING3.  If not see
>     1. Breaking up subtract operations into addition + negate, where
>     it would promote the reassociation of adds.
>
> +    1a. During the same pass, expanding __builtin_pow variants with
> +    small integer exponents into linearized multiply trees.
> +
>     2. Left linearization of the expression trees, so that (A+B)+(C+D)
>     becomes (((A+B)+C)+D), which is easier for us to rewrite later.
>     During linearization, we place the operands of the binary
> @@ -61,6 +64,10 @@ along with GCC; see the file COPYING3.  If not see
>     3. Optimization of the operand lists, eliminating things like a +
>     -a, a & a, etc.
>
> +    3a. Combine repeated factors with the same occurrence counts
> +    into a __builtin_powi call that will later be optimized into
> +    an optimal number of multiplies.
> +
>     4. Rewrite the expression trees we linearized and optimized so
>     they are in proper rank order.
>
> @@ -169,6 +176,8 @@ static struct
>   int constants_eliminated;
>   int ops_eliminated;
>   int rewritten;
> +  int pows_expanded;
> +  int pows_created;
>  } reassociate_stats;
>
>  /* Operator, rank pair.  */
> @@ -190,6 +199,12 @@ static int next_operand_entry_id;
>    depth.  */
>  static long *bb_rank;
>
> +/* Distinguish between early and late reassociation passes.  Early
> +   reassociation expands and rebuilds __builtin_pow* calls.  This
> +   is not done in late reassociation to preserve the builtin
> +   optimizations performed in pass_cse_sincos.  */
> +static bool early_reassoc;
> +
>  /* Operand->rank hashtable.  */
>  static struct pointer_map_t *operand_rank;
>
> @@ -2759,6 +2774,164 @@ can_reassociate_p (tree op)
>   return false;
>  }
>
> +/* Define a limit on the magnitude of the exponent for which a
> +   __builtin_pow or __builtin_powi call will be converted into a
> +   linear chain of multiplies prior to reassociation.  */
> +
> +#define MAX_POW_EXPAND 32
> +
> +/* Given STMT at *GSIP that is a __builtin_pow or __builtin_powi
> +   operation with base ARG0 and integer power EXPONENT, transform
> +   it into a linear chain of multiplies, provided that EXPONENT is
> +   of sufficiently small magnitude. */
> +static void
> +linear_expand_pow_common (gimple stmt, gimple_stmt_iterator *gsip,
> +                         tree arg0, HOST_WIDE_INT exponent)
> +{
> +  tree lhs = gimple_call_lhs (stmt);
> +  HOST_WIDE_INT abs_exp = abs (exponent);
> +  location_t loc = gimple_location (stmt);
> +  gimple mul_stmt = NULL;
> +  gimple div_stmt = NULL;
> +  tree result, type, target;
> +  gimple copy_stmt;
> +
> +  if (abs_exp > MAX_POW_EXPAND)
> +    return;
> +
> +  if (dump_file && (dump_flags & TDF_DETAILS))
> +    {
> +      fprintf (dump_file, "Expanding builtin pow/powi: ");
> +      print_gimple_stmt (dump_file, stmt, 0, 0);
> +    }
> +
> +  reassociate_stats.pows_expanded++;
> +
> +  type = TREE_TYPE (arg0);
> +
> +  if (exponent == 0)
> +    result = build_real (type, dconst1);
> +  else if (exponent == 1)
> +    result = arg0;
> +  else
> +    {
> +      tree target_ssa;
> +
> +      target = create_tmp_reg (type, "reassocpow");
> +      add_referenced_var (target);
> +
> +      if (exponent == -1)
> +       result = arg0;
> +      else
> +       {
> +         unsigned i;
> +
> +         target_ssa = make_ssa_name (target, NULL);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                  arg0, arg0);
> +         gimple_set_location (mul_stmt, loc);
> +         gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
> +
> +         for (i = abs_exp - 2; i > 0; i--)
> +           {
> +             tree prev_target_ssa = target_ssa;
> +             target_ssa = make_ssa_name (target, NULL);
> +             mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                      prev_target_ssa, arg0);
> +             gimple_set_location (mul_stmt, loc);
> +             gsi_insert_before (gsip, mul_stmt, GSI_SAME_STMT);
> +           }
> +
> +         result = target_ssa;
> +       }
> +
> +      /* Possible TODO:  If we expand two __builtin_pow's with different
> +        negative exponents, the introduction of the RDIVs for inversion
> +        prevents combining their factors.  (If they have the same negative
> +        exponent, expression folding combines them as expected.)  I'm
> +        not worrying about this as it should be quite rare in practice.  */
> +      if (exponent < 0)
> +       {
> +         target_ssa = make_ssa_name (target, NULL);
> +         div_stmt = gimple_build_assign_with_ops (RDIV_EXPR, target_ssa,
> +                                                  build_real (type, dconst1),
> +                                                  result);
> +         gimple_set_location (div_stmt, loc);
> +         gsi_insert_before (gsip, div_stmt, GSI_SAME_STMT);
> +         result = target_ssa;
> +       }
> +    }
> +
> +  /* If we introduced multiplies but no inversion, avoid introducing a
> +     copy so that the copy doesn't truncate a larger reassociation chain.  */
> +  if (mul_stmt && !div_stmt)
> +    {
> +      unlink_stmt_vdef (stmt);
> +      gimple_set_lhs (mul_stmt, lhs);
> +      gsi_remove (gsip, true);
> +      update_stmt (mul_stmt);
> +      /* If we're at the end of the block, leave the iterator in a
> +        usable state.  */
> +      if (gsi_end_p (*gsip))
> +       *gsip = gsi_for_stmt (mul_stmt);
> +    }
> +  else
> +    {
> +      copy_stmt = gimple_build_assign (lhs, result);
> +      gimple_set_location (copy_stmt, loc);
> +      unlink_stmt_vdef (stmt);
> +      gsi_replace (gsip, copy_stmt, true);
> +    }
> +}
> +
> +/* Transform __builtin_powi into a linear chain of multiplies,
> +   if the exponent is of sufficiently small magnitude.  */
> +
> +static void
> +linear_expand_powi (gimple stmt, gimple_stmt_iterator *gsip)
> +{
> +  tree arg0 = gimple_call_arg (stmt, 0);
> +  tree arg1 = gimple_call_arg (stmt, 1);
> +  HOST_WIDE_INT exponent;
> +
> +  if (TREE_CODE (arg1) != INTEGER_CST)
> +    return;
> +
> +  if (host_integerp (arg1, 0))
> +    {
> +      exponent = TREE_INT_CST_LOW (arg1);
> +      linear_expand_pow_common (stmt, gsip, arg0, exponent);
> +    }
> +}
> +
> +/* Transform __builtin_pow into a linear chain of multiplies,
> +   if the exponent is constant and equivalent to an integer of
> +   sufficiently small magnitude.  */
> +
> +static void
> +linear_expand_pow (gimple stmt, gimple_stmt_iterator *gsip)
> +{
> +  tree arg0 = gimple_call_arg (stmt, 0);
> +  tree arg1 = gimple_call_arg (stmt, 1);
> +  REAL_VALUE_TYPE c, cint;
> +  HOST_WIDE_INT n;
> +
> +  /* The exponent must be a constant equivalent to an integer that
> +     fits in a HOST_WIDE_INT.  */
> +  if (TREE_CODE (arg1) != REAL_CST)
> +    return;
> +
> +  c = TREE_REAL_CST (arg1);
> +  if (REAL_EXP (&c) > HOST_BITS_PER_WIDE_INT)
> +    return;
> +
> +  n = real_to_integer (&c);
> +  real_from_integer (&cint, VOIDmode, n, n < 0 ? -1 : 0, 0);
> +
> +  if (real_identical (&c, &cint))
> +    linear_expand_pow_common (stmt, gsip, arg0, n);
> +}
> +
>  /* Break up subtract operations in block BB.
>
>    We do this top down because we don't know whether the subtract is
> @@ -2784,9 +2957,32 @@ break_up_subtract_bb (basic_block bb)
>
>   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
>     {
> +      tree fndecl;
>       gimple stmt = gsi_stmt (gsi);
>       gimple_set_visited (stmt, false);
>
> +      /* Look for __builtin_pow and __builtin_powi calls,
> +        and expand them to multiplies if possible.  */
> +      if (early_reassoc
> +         && flag_unsafe_math_optimizations
> +         && is_gimple_call (stmt)
> +         && (fndecl = gimple_call_fndecl (stmt))
> +         && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +       {
> +         switch (DECL_FUNCTION_CODE (fndecl))
> +           {
> +           CASE_FLT_FN (BUILT_IN_POW):
> +             linear_expand_pow (stmt, &gsi);
> +             break;
> +           CASE_FLT_FN (BUILT_IN_POWI):
> +             linear_expand_powi (stmt, &gsi);
> +             break;
> +           default:
> +             ;
> +           }
> +         continue;
> +       }
> +
>       if (!is_gimple_assign (stmt)
>          || !can_reassociate_p (gimple_assign_lhs (stmt)))
>        continue;
> @@ -2815,6 +3011,342 @@ break_up_subtract_bb (basic_block bb)
>     break_up_subtract_bb (son);
>  }
>
> +/* Used for repeated factor analysis.  */
> +struct repeat_factor_d
> +{
> +  /* An SSA name that occurs in a multiply chain.  */
> +  tree factor;
> +
> +  /* Cached rank of the factor.  */
> +  unsigned rank;
> +
> +  /* Number of occurrences of the factor in the chain.  */
> +  HOST_WIDE_INT count;
> +
> +  /* An SSA name representing the product of this factor and
> +     all factors appearing later in the repeated factor vector.  */
> +  tree repr;
> +};
> +
> +typedef struct repeat_factor_d repeat_factor, *repeat_factor_t;
> +typedef const struct repeat_factor_d *const_repeat_factor_t;
> +
> +DEF_VEC_O (repeat_factor);
> +DEF_VEC_ALLOC_O (repeat_factor, heap);
> +
> +static VEC (repeat_factor, heap) *repeat_factor_vec;
> +
> +/* Used for sorting the repeat factor vector.  Sort primarily by
> +   ascending occurrence count, secondarily by descending rank.  */
> +
> +static int
> +compare_repeat_factors (const void *x1, const void *x2)
> +{
> +  const_repeat_factor_t rf1 = (const_repeat_factor_t) x1;
> +  const_repeat_factor_t rf2 = (const_repeat_factor_t) x2;
> +
> +  if (rf1->count != rf2->count)
> +    return rf1->count - rf2->count;
> +
> +  return rf2->rank - rf1->rank;
> +}
> +
> +/* Get a new SSA name for register variable *TARGET of type TYPE.
> +   If *TARGET is null or incompatible with TYPE, create the variable
> +   first.  */
> +
> +static tree
> +get_reassoc_pow_ssa_name (tree *target, tree type)
> +{
> +  if (!*target || !types_compatible_p (type, TREE_TYPE (*target)))
> +    {
> +      *target = create_tmp_reg (type, "reassocpow");
> +      add_referenced_var (*target);
> +    }
> +
> +  return make_ssa_name (*target, NULL);
> +}
> +
> +/* Look for repeated operands in OPS in the multiply tree rooted at
> +   STMT.  Replace them with an optimal sequence of multiplies and powi
> +   builtin calls, and remove the used operands from OPS.  Return an
> +   SSA name representing the value of the replacement sequence.  */
> +
> +static tree
> +attempt_builtin_powi (gimple stmt, VEC(operand_entry_t, heap) **ops,
> +                     tree *target)
> +{
> +  unsigned i, j, cached, vec_len;
> +  int ii;
> +  operand_entry_t oe;
> +  repeat_factor_t rf1, rf2;
> +  repeat_factor rfnew;
> +  tree target_ssa;
> +  tree result = NULL_TREE;
> +  tree type = TREE_TYPE (gimple_get_lhs (stmt));
> +  tree powi_fndecl = mathfn_built_in (type, BUILT_IN_POWI);
> +  gimple_stmt_iterator gsi = gsi_for_stmt (stmt);
> +  gimple mul_stmt, pow_stmt;
> +
> +  /* Nothing to do if BUILT_IN_POWI doesn't exist for this type and
> +     target.  */
> +  if (!powi_fndecl)
> +    return NULL_TREE;
> +
> +  /* Allocate the repeated factor vector.  */
> +  repeat_factor_vec = VEC_alloc (repeat_factor, heap, 10);
> +
> +  /* Scan the OPS vector for all SSA names in the product and build
> +     up a vector of occurrence counts for each factor.  */
> +  FOR_EACH_VEC_ELT (operand_entry_t, *ops, i, oe)
> +    {
> +      if (TREE_CODE (oe->op) == SSA_NAME)
> +       {
> +         FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
> +           {
> +             if (rf1->factor == oe->op)
> +               {
> +                 rf1->count++;
> +                 break;
> +               }
> +           }
> +
> +         if (j >= VEC_length (repeat_factor, repeat_factor_vec))
> +           {
> +             rfnew.factor = oe->op;
> +             rfnew.rank = oe->rank;
> +             rfnew.count = 1;
> +             rfnew.repr = NULL_TREE;
> +             VEC_safe_push (repeat_factor, heap, repeat_factor_vec, &rfnew);
> +           }
> +       }
> +    }
> +
> +  vec_len = VEC_length (repeat_factor, repeat_factor_vec);
> +
> +  /* Sort the repeated factor vector by (a) increasing occurrence count,
> +     and (b) decreasing rank.  */
> +  VEC_qsort (repeat_factor, repeat_factor_vec, compare_repeat_factors);
> +
> +  /* There are a variety of ways we could combine factors with different
> +     occurrence counts.  It seems best in practice to try to combine as
> +     many base factors as possible into a product before applying
> +     builtin_powi to the result.  The sort order chosen for the repeated
> +     factor vector allows us to cache partial results for the product
> +     of the base factors for subsequent use.
> +
> +     As an example, consider x * x * y * y * y * y * z * z * z * z.
> +     We want to first compose the product x * y * z, raise it to the
> +     second power, and then multiply this by the product y * z raised
> +     to the second power.  This can be done in 5 multiplies provided
> +     we cache y * z for use in both expressions:
> +
> +        t1 = y * z
> +       t2 = t1 * x
> +       t3 = t2 * t2
> +       t4 = t1 * t1
> +       result = t3 * t4
> +
> +     Alternatively we could also do this in 5 multiplies by first
> +     calculating y * z, squaring it twice, and multiplying by x * x.
> +     However, if the occurrence counts were not powers of 2 as in
> +     this example, combining higher occurrence counts first would
> +     require more multiplies than combining lower ones first.  */
> +
> +  /* Repeatedly look for opportunities to create a builtin_powi call.  */
> +  while (true)
> +    {
> +      HOST_WIDE_INT power;
> +
> +      /* Find the first factor in the repeated factor vector whose
> +        occurrence count is at least 2.  If no such factor exists,
> +        there are no builtin_powi opportunities remaining.  */
> +      FOR_EACH_VEC_ELT (repeat_factor, repeat_factor_vec, j, rf1)
> +       {
> +         if (rf1->count >= 2)
> +           break;
> +       }
> +
> +      if (j >= vec_len)
> +       break;
> +
> +      power = rf1->count;
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         unsigned elt;
> +         repeat_factor_t rf;
> +         fputs ("Building __builtin_pow call for (", dump_file);
> +         for (elt = j; elt < vec_len; elt++)
> +           {
> +             rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
> +             print_generic_expr (dump_file, rf->factor, 0);
> +             if (elt < vec_len - 1)
> +               fputs (" * ", dump_file);
> +           }
> +         fprintf (dump_file, ")^" HOST_WIDE_INT_PRINT_DEC "\n", power);
> +       }
> +
> +      reassociate_stats.pows_created++;
> +
> +      /* Visit each element of the vector in reverse order (so that
> +        high-occurrence elements are visited first, and within the
> +        same occurrence count, lower-ranked elements are visited
> +        first).  Form a linear product of all elements in this order
> +        whose occurrence count is at least that of element J.
> +        Record the SSA name representing the product of each element
> +        with all subsequent elements in the vector.  */
> +      if (j == vec_len - 1)
> +       rf1->repr = rf1->factor;
> +      else
> +       {
> +         for (ii = vec_len - 2; ii >= (int)j; ii--)
> +           {
> +             tree op1, op2;
> +
> +             rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
> +             rf2 = VEC_index (repeat_factor, repeat_factor_vec, ii + 1);
> +
> +             /* Init the last factor's representative to be itself.  */
> +             if (!rf2->repr)
> +               rf2->repr = rf2->factor;
> +
> +             op1 = rf1->factor;
> +             op2 = rf2->repr;
> +
> +             target_ssa = get_reassoc_pow_ssa_name (target, type);
> +             mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, target_ssa,
> +                                                      op1, op2);
> +             gimple_set_location (mul_stmt, gimple_location (stmt));
> +             gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +             rf1->repr = target_ssa;
> +           }
> +       }
> +
> +      /* Form a call to __builtin_powi for the maximum product
> +        just formed, raised to the power obtained earlier.  */
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, j);
> +      target_ssa = get_reassoc_pow_ssa_name (target, type);
> +      pow_stmt = gimple_build_call (powi_fndecl, 2, rf1->repr,
> +                                   build_int_cst (integer_type_node, power));
> +      gimple_call_set_lhs (pow_stmt, target_ssa);
> +      gimple_set_location (pow_stmt, gimple_location (stmt));
> +      gsi_insert_before (&gsi, pow_stmt, GSI_SAME_STMT);
> +
> +      /* Decrement the occurrence count of each element in the product
> +        by the count found above, and remove this many copies of each
> +        factor from OPS.  */
> +      for (i = j; i < vec_len; i++)
> +       {
> +         unsigned k = power;
> +         unsigned n;
> +
> +         rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
> +         rf1->count -= power;
> +
> +         FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
> +           {
> +             if (oe->op == rf1->factor)
> +               {
> +                 VEC_ordered_remove (operand_entry_t, *ops, n);
> +                 if (--k == 0)
> +                   break;
> +               }
> +           }
> +       }
> +
> +      /* If we previously formed at least one other builtin_powi call,
> +        form the product of this one and those others.  */
> +      if (result)
> +       {
> +         tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
> +                                                  target_ssa, result);
> +         gimple_set_location (mul_stmt, gimple_location (stmt));
> +         gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +         result = new_target_ssa;
> +       }
> +      else
> +       result = target_ssa;
> +    }
> +
> +  /* At this point all elements in the repeated factor vector have a
> +     remaining occurrence count of either 0 or 1.  Scan the vector in
> +     reverse order and stop at the first element whose count is 0;
> +     CACHED indexes the element just past it.  If such an element
> +     exists, its SSA representative is the already-computed product of
> +     that factor and all factors following it in the vector.  Multiply
> +     the result so far by this SSA name, and remove one copy of each
> +     of those factors from OPS.  */
> +  cached = vec_len;
> +
> +  for (ii = vec_len - 1; ii >= 0; ii--)
> +    {
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, ii);
> +      if (rf1->count == 0)
> +       {
> +         cached = ii + 1;
> +         break;
> +       }
> +    }
> +
> +  if (cached < vec_len)
> +    {
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, cached);
> +
> +      if (dump_file && (dump_flags & TDF_DETAILS))
> +       {
> +         unsigned elt;
> +         repeat_factor_t rf;
> +         fputs ("Multiplying by cached product ", dump_file);
> +         for (elt = cached; elt < vec_len; elt++)
> +           {
> +             rf = VEC_index (repeat_factor, repeat_factor_vec, elt);
> +             print_generic_expr (dump_file, rf->factor, 0);
> +             if (elt < vec_len - 1)
> +               fputs (" * ", dump_file);
> +           }
> +         fputs ("\n", dump_file);
> +       }
> +
> +      if (!result)
> +       result = rf1->repr;
> +      else
> +       {
> +         tree new_target_ssa = get_reassoc_pow_ssa_name (target, type);
> +         mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, new_target_ssa,
> +                                                  rf1->repr, result);
> +         gimple_set_location (mul_stmt, gimple_location (stmt));
> +         gsi_insert_before (&gsi, mul_stmt, GSI_SAME_STMT);
> +         result = new_target_ssa;
> +       }
> +    }
> +
> +  for (i = cached; i < vec_len; i++)
> +    {
> +      unsigned n;
> +
> +      rf1 = VEC_index (repeat_factor, repeat_factor_vec, i);
> +      rf1->count--;
> +
> +      FOR_EACH_VEC_ELT_REVERSE (operand_entry_t, *ops, n, oe)
> +       {
> +         if (oe->op == rf1->factor)
> +           {
> +             VEC_ordered_remove (operand_entry_t, *ops, n);
> +             break;
> +           }
> +       }
> +    }
> +
> +  /* Clean up.  */
> +  VEC_free (repeat_factor, heap, repeat_factor_vec);
> +
> +  /* Return the final product as computed herein.  Note that there
> +     may still be some elements with single occurrence count left
> +     in OPS; those will be handled by the normal reassociation logic.  */
> +  return result;
> +}
> +
>  /* Reassociate expressions in basic block BB and its post-dominator as
>    children.  */
>
> @@ -2823,6 +3355,7 @@ reassociate_bb (basic_block bb)
>  {
>   gimple_stmt_iterator gsi;
>   basic_block son;
> +  tree target = NULL_TREE;
>
>   for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi))
>     {
> @@ -2883,6 +3416,8 @@ reassociate_bb (basic_block bb)
>
>          if (associative_tree_code (rhs_code))
>            {
> +             tree powi_result = NULL_TREE;
> +
>              VEC(operand_entry_t, heap) *ops = NULL;
>
>              /* There may be no immediate uses left by the time we
> @@ -2904,7 +3439,14 @@ reassociate_bb (basic_block bb)
>              if (rhs_code == BIT_IOR_EXPR || rhs_code == BIT_AND_EXPR)
>                optimize_range_tests (rhs_code, &ops);
>
> -             if (VEC_length (operand_entry_t, ops) == 1)
> +             if (early_reassoc
> +                 && rhs_code == MULT_EXPR
> +                 && flag_unsafe_math_optimizations)
> +               powi_result = attempt_builtin_powi (stmt, &ops, &target);
> +
> +             /* If the operand vector is now empty, all operands were
> +                consumed by the __builtin_pow optimization.  */
> +             if (VEC_length (operand_entry_t, ops) == 0)
>                {
>                  if (dump_file && (dump_flags & TDF_DETAILS))
>                    {
> @@ -2913,9 +3455,7 @@ reassociate_bb (basic_block bb)
>                    }
>
>                  rhs1 = gimple_assign_rhs1 (stmt);
> -                 gimple_assign_set_rhs_from_tree (&gsi,
> -                                                  VEC_last (operand_entry_t,
> -                                                            ops)->op);
> +                 gimple_assign_set_rhs_from_tree (&gsi, powi_result);
>                  update_stmt (stmt);
>                  remove_visited_stmt_chain (rhs1);
>
> @@ -2925,6 +3465,39 @@ reassociate_bb (basic_block bb)
>                      print_gimple_stmt (dump_file, stmt, 0, 0);
>                    }
>                }
> +
> +             else if (VEC_length (operand_entry_t, ops) == 1)
> +               {
> +                 tree last_op = VEC_last (operand_entry_t, ops)->op;
> +                 if (dump_file && (dump_flags & TDF_DETAILS))
> +                   {
> +                     fprintf (dump_file, "Transforming ");
> +                     print_gimple_stmt (dump_file, stmt, 0, 0);
> +                   }
> +
> +                 if (powi_result)
> +                   {
> +                     rhs1 = gimple_assign_rhs1 (stmt);
> +                     gimple_assign_set_rhs_with_ops_1 (&gsi, MULT_EXPR,
> +                                                       powi_result, last_op,
> +                                                       NULL_TREE);
> +                     update_stmt (gsi_stmt (gsi));
> +                     remove_visited_stmt_chain (rhs1);
> +                   }
> +                 else
> +                   {
> +                     rhs1 = gimple_assign_rhs1 (stmt);
> +                     gimple_assign_set_rhs_from_tree (&gsi, last_op);
> +                     update_stmt (stmt);
> +                     remove_visited_stmt_chain (rhs1);
> +                   }
> +
> +                 if (dump_file && (dump_flags & TDF_DETAILS))
> +                   {
> +                     fprintf (dump_file, " into ");
> +                     print_gimple_stmt (dump_file, stmt, 0, 0);
> +                   }
> +               }
>              else
>                {
>                  enum machine_mode mode = TYPE_MODE (TREE_TYPE (lhs));
> @@ -2940,6 +3513,24 @@ reassociate_bb (basic_block bb)
>                    rewrite_expr_tree_parallel (stmt, width, ops);
>                  else
>                    rewrite_expr_tree (stmt, 0, ops, false);
> +
> +                 /* If we combined some repeated factors into a
> +                    __builtin_powi call, multiply that result by the
> +                    reassociated operands.  */
> +                 if (powi_result)
> +                   {
> +                     gimple mul_stmt;
> +                     tree type = TREE_TYPE (gimple_get_lhs (stmt));
> +                     tree target_ssa = get_reassoc_pow_ssa_name (&target,
> +                                                                 type);
> +                     gimple_set_lhs (stmt, target_ssa);
> +                     update_stmt (stmt);
> +                     mul_stmt = gimple_build_assign_with_ops (MULT_EXPR, lhs,
> +                                                              powi_result,
> +                                                              target_ssa);
> +                     gimple_set_location (mul_stmt, gimple_location (stmt));
> +                     gsi_insert_after (&gsi, mul_stmt, GSI_NEW_STMT);
> +                   }
>                }
>
>              VEC_free (operand_entry_t, heap, ops);
> @@ -3054,6 +3645,10 @@ fini_reassoc (void)
>                            reassociate_stats.ops_eliminated);
>   statistics_counter_event (cfun, "Statements rewritten",
>                            reassociate_stats.rewritten);
> +  statistics_counter_event (cfun, "Built-in pow[i] calls expanded",
> +                           reassociate_stats.pows_expanded);
> +  statistics_counter_event (cfun, "Built-in powi calls created",
> +                           reassociate_stats.pows_created);
>
>   pointer_map_destroy (operand_rank);
>   free_alloc_pool (operand_entry_pool);
> @@ -3077,19 +3672,33 @@ execute_reassoc (void)
>   return 0;
>  }
>
> +static unsigned int
> +execute_early_reassoc (void)
> +{
> +  early_reassoc = true;
> +  return execute_reassoc ();
> +}
> +
> +static unsigned int
> +execute_late_reassoc (void)
> +{
> +  early_reassoc = false;
> +  return execute_reassoc ();
> +}
> +
>  static bool
>  gate_tree_ssa_reassoc (void)
>  {
>   return flag_tree_reassoc != 0;
>  }
>
> -struct gimple_opt_pass pass_reassoc =
> +struct gimple_opt_pass pass_early_reassoc =
>  {
>  {
>   GIMPLE_PASS,
> -  "reassoc",                           /* name */
> +  "reassoc1",                          /* name */
>   gate_tree_ssa_reassoc,               /* gate */
> -  execute_reassoc,                     /* execute */
> +  execute_early_reassoc,               /* execute */
>   NULL,                                        /* sub */
>   NULL,                                        /* next */
>   0,                                   /* static_pass_number */
> @@ -3103,3 +3712,24 @@ gate_tree_ssa_reassoc (void)
>     | TODO_ggc_collect                 /* todo_flags_finish */
>  }
>  };
> +
> +struct gimple_opt_pass pass_late_reassoc =
> +{
> + {
> +  GIMPLE_PASS,
> +  "reassoc2",                          /* name */
> +  gate_tree_ssa_reassoc,               /* gate */
> +  execute_late_reassoc,                        /* execute */
> +  NULL,                                        /* sub */
> +  NULL,                                        /* next */
> +  0,                                   /* static_pass_number */
> +  TV_TREE_REASSOC,                     /* tv_id */
> +  PROP_cfg | PROP_ssa,                 /* properties_required */
> +  0,                                   /* properties_provided */
> +  0,                                   /* properties_destroyed */
> +  0,                                   /* todo_flags_start */
> +  TODO_verify_ssa
> +    | TODO_verify_flow
> +    | TODO_ggc_collect                 /* todo_flags_finish */
> + }
> +};
>
>
