On Mon, Apr 8, 2013 at 5:10 PM, Joern Rennecke <joern.renne...@embecosm.com> wrote:
> This is basically the same patch as attached to the PR, except that I
> have changed the goto-loop into a do-while loop with a new comment;
> this caused the need for a lot of reformatting.

Can you please include a testcase that shows the effect of this?

> bootstrapped & regtested on i686-pc-linux-gnu.
>
> 2013-04-08  Joern Rennecke  <joern.renne...@embecosm.com>
>
>         * tree-ssa-math-opts.c (mult_to_fma_pass): New file static struct.
>         (convert_mult_to_fma): In first pass, don't use an fms construct
>         when we don't have an fms operation, but fmna.

It's fnma, I believe.

>         (execute_optimize_widening_mul): Add a second pass if
>         convert_mult_to_fma requests it.
>
> Index: gcc/tree-ssa-math-opts.c
> ===================================================================
> --- gcc/tree-ssa-math-opts.c    (revision 197578)
> +++ gcc/tree-ssa-math-opts.c    (working copy)
> @@ -2461,6 +2461,12 @@ convert_plusminus_to_widen (gimple_stmt_
>    return true;
>  }
>
> +static struct
> +{
> +  bool second_pass;
> +  bool retry_request;
> +} mult_to_fma_pass;
> +
>  /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2
>     with uses in additions and subtractions to form fused multiply-add
>     operations.  Returns true if successful and MUL_STMT should be removed.  */
> @@ -2570,6 +2576,22 @@ convert_mult_to_fma (gimple mul_stmt, tr
>            return false;
>          }
>
> +      /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed
> +         by a MULT_EXPR that we'll visit later, we might be able to
> +         get a more profitable match with fnma.
> +         OTOH, if we don't, a negate / fma pair has likely lower latency
> +         that a mult / subtract pair.  */

This makes it sound as though this is purely an ordering issue; thus, for

  x = a * b;
  y = c * d;
  z = x - y;

instead of

  y = c * d;
  z = a * b + (-y);

you want to generate

  x = a * b;
  z = -(c * d + (-x));

I fail to see why you need two passes for this rather than considering the
case that the immediate use stmt of the multiplication we start from
combines another multiplication with a MINUS_EXPR.  That is ...

> +      if (use_code == MINUS_EXPR && !negate_p
> +          && gimple_assign_rhs1 (use_stmt) == result
> +          && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing
> +          && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing
> +          && mult_to_fma_pass.second_pass == false)

... see if that is the case here and simply do not do the transform.
A second pass will not recover from that without destroying the fnma
pattern (testcase?).

Richard.

> +        {
> +          /* ??? Could make setting of retry_request dependent on some
> +             rtx_cost measure we evaluate beforehand.  */
> +          mult_to_fma_pass.retry_request = true;
> +          return false;
> +        }
>        /* We can't handle a * b + a * b.  */
>        if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt))
>          return false;
> @@ -2657,76 +2679,89 @@ execute_optimize_widening_mul (void)
>
>    memset (&widen_mul_stats, 0, sizeof (widen_mul_stats));
>
> -  FOR_EACH_BB (bb)
> -    {
> -      gimple_stmt_iterator gsi;
>
> -      for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
> -        {
> -          gimple stmt = gsi_stmt (gsi);
> -          enum tree_code code;
> +  /* We may run one or two passes.  In the first pass, if have fnma,
> +     but not fms, we don't synthesize fms so that we can get the maximum
> +     matches for fnma.  If we have therefore skipped opportunities to
> +     synthesize fms, we'll run a second pass where we use any such
> +     opportunities that still remain.  */
> +  mult_to_fma_pass.retry_request = false;
> +  do
> +    {
> +      mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request;
> +      FOR_EACH_BB (bb)
> +        {
> +          gimple_stmt_iterator gsi;
>
> -          if (is_gimple_assign (stmt))
> +          for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);)
>             {
> -              code = gimple_assign_rhs_code (stmt);
> -              switch (code)
> +              gimple stmt = gsi_stmt (gsi);
> +              enum tree_code code;
> +
> +              if (is_gimple_assign (stmt))
>                 {
> -                case MULT_EXPR:
> -                  if (!convert_mult_to_widen (stmt, &gsi)
> -                      && convert_mult_to_fma (stmt,
> -                                              gimple_assign_rhs1 (stmt),
> -                                              gimple_assign_rhs2 (stmt)))
> +                  code = gimple_assign_rhs_code (stmt);
> +                  switch (code)
>                     {
> -                      gsi_remove (&gsi, true);
> -                      release_defs (stmt);
> -                      continue;
> -                    }
> -                  break;
> -
> -                case PLUS_EXPR:
> -                case MINUS_EXPR:
> -                  convert_plusminus_to_widen (&gsi, stmt, code);
> -                  break;
> +                    case MULT_EXPR:
> +                      if (!convert_mult_to_widen (stmt, &gsi)
> +                          && convert_mult_to_fma (stmt,
> +                                                  gimple_assign_rhs1 (stmt),
> +                                                  gimple_assign_rhs2 (stmt)))
> +                        {
> +                          gsi_remove (&gsi, true);
> +                          release_defs (stmt);
> +                          continue;
> +                        }
> +                      break;
> +
> +                    case PLUS_EXPR:
> +                    case MINUS_EXPR:
> +                      convert_plusminus_to_widen (&gsi, stmt, code);
> +                      break;
>
> -                default:;
> +                    default:;
> +                    }
>                 }
> -            }
> -          else if (is_gimple_call (stmt)
> -                   && gimple_call_lhs (stmt))
> -            {
> -              tree fndecl = gimple_call_fndecl (stmt);
> -              if (fndecl
> -                  && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
> +              else if (is_gimple_call (stmt)
> +                       && gimple_call_lhs (stmt))
>                 {
> -                  switch (DECL_FUNCTION_CODE (fndecl))
> +                  tree fndecl = gimple_call_fndecl (stmt);
> +                  if (fndecl
> +                      && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL)
>                     {
> -                    case BUILT_IN_POWF:
> -                    case BUILT_IN_POW:
> -                    case BUILT_IN_POWL:
> -                      if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST
> -                          && REAL_VALUES_EQUAL
> -                             (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
> -                              dconst2)
> -                          && convert_mult_to_fma (stmt,
> -                                                  gimple_call_arg (stmt, 0),
> -                                                  gimple_call_arg (stmt, 0)))
> -                        {
> -                          unlink_stmt_vdef (stmt);
> -                          if (gsi_remove (&gsi, true)
> -                              && gimple_purge_dead_eh_edges (bb))
> -                            cfg_changed = true;
> -                          release_defs (stmt);
> -                          continue;
> -                        }
> +                      switch (DECL_FUNCTION_CODE (fndecl))
> +                        {
> +                        case BUILT_IN_POWF:
> +                        case BUILT_IN_POW:
> +                        case BUILT_IN_POWL:
> +                          if ((TREE_CODE (gimple_call_arg (stmt, 1))
> +                               == REAL_CST)
> +                              && (REAL_VALUES_EQUAL
> +                                  (TREE_REAL_CST (gimple_call_arg (stmt, 1)),
> +                                   dconst2))
> +                              && (convert_mult_to_fma
> +                                  (stmt, gimple_call_arg (stmt, 0),
> +                                   gimple_call_arg (stmt, 0))))
> +                            {
> +                              unlink_stmt_vdef (stmt);
> +                              if (gsi_remove (&gsi, true)
> +                                  && gimple_purge_dead_eh_edges (bb))
> +                                cfg_changed = true;
> +                              release_defs (stmt);
> +                              continue;
> +                            }
>                           break;
>
> -                    default:;
> +                        default:;
> +                        }
>                     }
>                 }
> +              gsi_next (&gsi);
>             }
> -          gsi_next (&gsi);
>         }
>     }
> +  while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request);
>
>    statistics_counter_event (cfun, "widening multiplications inserted",
>                              widen_mul_stats.widen_mults_inserted);
>
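
Not part of the original thread: a minimal sketch of the kind of testcase
being asked for above, assuming a double-precision target that provides fnma
but not fms.  The function names are made up, and the scan directives a
committed testcase would need are omitted because they are target-dependent.

/* Both multiplications feed the subtraction.  With fnma available but fms
   not, the preferable form is x = a * b; z = fnma (c, d, x), i.e. x - c * d,
   rather than negating one of the products and using a plain fma.  */
double
fnma_candidate (double a, double b, double c, double d)
{
  return a * b - c * d;
}

/* Only one multiplication feeds the subtraction here, so without fms the
   expected form is a negate / fma pair: a * b + (-c).  */
double
negate_fma_candidate (double a, double b, double c)
{
  return a * b - c;
}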