This is basically the same patch as the one attached to the PR, except that I have changed the goto loop into a do-while loop with a new comment, which required a lot of reformatting.
Bootstrapped and regression-tested on i686-pc-linux-gnu.
2013-04-08 Joern Rennecke <joern.renne...@embecosm.com> * tree-ssa-math-opts.c (mult_to_fma_pass): New file static struct. (convert_mult_to_fma): In first pass, don't use an fms construct when we don't have an fms operation, but fnma. (execute_optimize_widening_mul): Add a second pass if convert_mult_to_fma requests it. Index: gcc/tree-ssa-math-opts.c =================================================================== --- gcc/tree-ssa-math-opts.c (revision 197578) +++ gcc/tree-ssa-math-opts.c (working copy) @@ -2461,6 +2461,12 @@ convert_plusminus_to_widen (gimple_stmt_ return true; } +static struct +{ + bool second_pass; + bool retry_request; +} mult_to_fma_pass; + /* Combine the multiplication at MUL_STMT with operands MULOP1 and MULOP2 with uses in additions and subtractions to form fused multiply-add operations. Returns true if successful and MUL_STMT should be removed. */ @@ -2570,6 +2576,22 @@ convert_mult_to_fma (gimple mul_stmt, tr return false; } + /* If the subtrahend (gimple_assign_rhs2 (use_stmt)) is computed + by a MULT_EXPR that we'll visit later, we might be able to + get a more profitable match with fnma. + OTOH, if we don't, a negate / fma pair has likely lower latency + than a mult / subtract pair. */ + if (use_code == MINUS_EXPR && !negate_p + && gimple_assign_rhs1 (use_stmt) == result + && optab_handler (fms_optab, TYPE_MODE (type)) == CODE_FOR_nothing + && optab_handler (fnma_optab, TYPE_MODE (type)) != CODE_FOR_nothing + && mult_to_fma_pass.second_pass == false) + { + /* ??? Could make setting of retry_request dependent on some + rtx_cost measure we evaluate beforehand. */ + mult_to_fma_pass.retry_request = true; + return false; + } /* We can't handle a * b + a * b. 
*/ if (gimple_assign_rhs1 (use_stmt) == gimple_assign_rhs2 (use_stmt)) return false; @@ -2657,76 +2679,89 @@ execute_optimize_widening_mul (void) memset (&widen_mul_stats, 0, sizeof (widen_mul_stats)); - FOR_EACH_BB (bb) - { - gimple_stmt_iterator gsi; - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) - { - gimple stmt = gsi_stmt (gsi); - enum tree_code code; + /* We may run one or two passes. In the first pass, if we have fnma, + but not fms, we don't synthesize fms so that we can get the maximum + matches for fnma. If we have therefore skipped opportunities to + synthesize fms, we'll run a second pass where we use any such + opportunities that still remain. */ + mult_to_fma_pass.retry_request = false; + do + { + mult_to_fma_pass.second_pass = mult_to_fma_pass.retry_request; + FOR_EACH_BB (bb) + { + gimple_stmt_iterator gsi; - if (is_gimple_assign (stmt)) + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi);) { - code = gimple_assign_rhs_code (stmt); - switch (code) + gimple stmt = gsi_stmt (gsi); + enum tree_code code; + + if (is_gimple_assign (stmt)) { - case MULT_EXPR: - if (!convert_mult_to_widen (stmt, &gsi) - && convert_mult_to_fma (stmt, - gimple_assign_rhs1 (stmt), - gimple_assign_rhs2 (stmt))) + code = gimple_assign_rhs_code (stmt); + switch (code) { - gsi_remove (&gsi, true); - release_defs (stmt); - continue; - } - break; - - case PLUS_EXPR: - case MINUS_EXPR: - convert_plusminus_to_widen (&gsi, stmt, code); - break; + case MULT_EXPR: + if (!convert_mult_to_widen (stmt, &gsi) + && convert_mult_to_fma (stmt, + gimple_assign_rhs1 (stmt), + gimple_assign_rhs2 (stmt))) + { + gsi_remove (&gsi, true); + release_defs (stmt); + continue; + } + break; + + case PLUS_EXPR: + case MINUS_EXPR: + convert_plusminus_to_widen (&gsi, stmt, code); + break; - default:; + default:; + } } - } - else if (is_gimple_call (stmt) - && gimple_call_lhs (stmt)) - { - tree fndecl = gimple_call_fndecl (stmt); - if (fndecl - && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) + else 
if (is_gimple_call (stmt) + && gimple_call_lhs (stmt)) { - switch (DECL_FUNCTION_CODE (fndecl)) + tree fndecl = gimple_call_fndecl (stmt); + if (fndecl + && DECL_BUILT_IN_CLASS (fndecl) == BUILT_IN_NORMAL) { - case BUILT_IN_POWF: - case BUILT_IN_POW: - case BUILT_IN_POWL: - if (TREE_CODE (gimple_call_arg (stmt, 1)) == REAL_CST - && REAL_VALUES_EQUAL - (TREE_REAL_CST (gimple_call_arg (stmt, 1)), - dconst2) - && convert_mult_to_fma (stmt, - gimple_call_arg (stmt, 0), - gimple_call_arg (stmt, 0))) - { - unlink_stmt_vdef (stmt); - if (gsi_remove (&gsi, true) - && gimple_purge_dead_eh_edges (bb)) - cfg_changed = true; - release_defs (stmt); - continue; - } + switch (DECL_FUNCTION_CODE (fndecl)) + { + case BUILT_IN_POWF: + case BUILT_IN_POW: + case BUILT_IN_POWL: + if ((TREE_CODE (gimple_call_arg (stmt, 1)) + == REAL_CST) + && (REAL_VALUES_EQUAL + (TREE_REAL_CST (gimple_call_arg (stmt, 1)), + dconst2)) + && (convert_mult_to_fma + (stmt, gimple_call_arg (stmt, 0), + gimple_call_arg (stmt, 0)))) + { + unlink_stmt_vdef (stmt); + if (gsi_remove (&gsi, true) + && gimple_purge_dead_eh_edges (bb)) + cfg_changed = true; + release_defs (stmt); + continue; + } break; - default:; + default:; + } } } + gsi_next (&gsi); } - gsi_next (&gsi); } } + while (!mult_to_fma_pass.second_pass && mult_to_fma_pass.retry_request); statistics_counter_event (cfun, "widening multiplications inserted", widen_mul_stats.widen_mults_inserted);