On Tue, Jan 05, 2021 at 01:55:21PM +0100, Richard Biener wrote: > > Note, I have no idea why the bswap code needs TODO_update_ssa if it changed > > things, for the vuses it copies them from the surrounding vuses, which looks > > correct to me. Perhaps because it uses force_gimple_operand_gsi* in a few > > spots in bswap_replace? Confused... > > .. that shouldn't cause updating SSA to be necessary. Maybe it at some > point did not update virtual operands appropriately.
Ok, I've committed the following version without the TODO_update_ssa, which passed another bootstrap/regtest on x86_64-linux and i686-linux. 2021-01-05 Jakub Jelinek <ja...@redhat.com> PR tree-optimization/96239 * gimple-ssa-store-merging.c (maybe_optimize_vector_constructor): New function. (get_status_for_store_merging): Don't return BB_INVALID for blocks with potential bswap optimizable CONSTRUCTORs. (pass_store_merging::execute): Optimize vector CONSTRUCTORs with bswap if possible. * gcc.dg/tree-ssa/pr96239.c: New test. --- gcc/gimple-ssa-store-merging.c.jj 2020-12-16 13:07:51.729733816 +0100 +++ gcc/gimple-ssa-store-merging.c 2020-12-16 16:02:06.238868137 +0100 @@ -1255,6 +1255,75 @@ bswap_replace (gimple_stmt_iterator gsi, return tgt; } +/* Try to optimize an assignment CUR_STMT with CONSTRUCTOR on the rhs + using bswap optimizations. CDI_DOMINATORS need to be + computed on entry. Return true if it has been optimized and + TODO_update_ssa is needed. */ + +static bool +maybe_optimize_vector_constructor (gimple *cur_stmt) +{ + tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; + struct symbolic_number n; + bool bswap; + + gcc_assert (is_gimple_assign (cur_stmt) + && gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR); + + tree rhs = gimple_assign_rhs1 (cur_stmt); + if (!VECTOR_TYPE_P (TREE_TYPE (rhs)) + || !INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + || gimple_assign_lhs (cur_stmt) == NULL_TREE) + return false; + + HOST_WIDE_INT sz = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + switch (sz) + { + case 16: + load_type = bswap_type = uint16_type_node; + break; + case 32: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing) + { + load_type = uint32_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP32); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + case 64: + if (builtin_decl_explicit_p (BUILT_IN_BSWAP64) + && (optab_handler (bswap_optab, DImode) != CODE_FOR_nothing + || (word_mode == SImode + && builtin_decl_explicit_p (BUILT_IN_BSWAP32) + && optab_handler (bswap_optab, SImode) != CODE_FOR_nothing))) + { + load_type = uint64_type_node; + fndecl = builtin_decl_explicit (BUILT_IN_BSWAP64); + bswap_type = TREE_VALUE (TYPE_ARG_TYPES (TREE_TYPE (fndecl))); + } + else + return false; + break; + default: + return false; + } + + gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); + if (!ins_stmt || n.range != (unsigned HOST_WIDE_INT) sz) + return false; + + if (bswap && !fndecl && n.range != 16) + return false; + + memset (&nop_stats, 0, sizeof (nop_stats)); + memset (&bswap_stats, 0, sizeof (bswap_stats)); + return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, + bswap_type, load_type, &n, bswap) != NULL_TREE; +} + /* Find manual byte swap implementations as well as load in a given endianness. Byte swaps are turned into a bswap builtin invokation while endian loads are converted to bswap builtin invokation or @@ -5126,6 +5195,7 @@ static enum basic_block_status get_status_for_store_merging (basic_block bb) { unsigned int num_statements = 0; + unsigned int num_constructors = 0; gimple_stmt_iterator gsi; edge e; @@ -5138,9 +5208,27 @@ get_status_for_store_merging (basic_bloc if (store_valid_for_store_merging_p (stmt) && ++num_statements >= 2) break; + + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR) + { + tree rhs = gimple_assign_rhs1 (stmt); + if (VECTOR_TYPE_P (TREE_TYPE (rhs)) + && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))) + && gimple_assign_lhs (stmt) != NULL_TREE) + { + HOST_WIDE_INT sz + = int_size_in_bytes (TREE_TYPE (rhs)) * BITS_PER_UNIT; + if (sz == 16 || sz == 32 || sz == 64) + { + num_constructors = 1; + break; + } + } + } } - if (num_statements == 0) + if (num_statements == 0 && num_constructors == 0) return BB_INVALID; if (cfun->can_throw_non_call_exceptions && cfun->eh @@ -5149,7 +5237,7 @@ get_status_for_store_merging (basic_bloc && e->dest == bb->next_bb) return BB_EXTENDED_VALID; - return num_statements >= 2 ? BB_VALID : BB_INVALID; + return (num_statements >= 2 || num_constructors) ? BB_VALID : BB_INVALID; } /* Entry point for the pass. Go over each basic block recording chains of @@ -5189,9 +5278,10 @@ pass_store_merging::execute (function *f if (dump_file && (dump_flags & TDF_DETAILS)) fprintf (dump_file, "Processing basic block <%d>:\n", bb->index); - for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); gsi_next (&gsi)) + for (gsi = gsi_after_labels (bb); !gsi_end_p (gsi); ) { gimple *stmt = gsi_stmt (gsi); + gsi_next (&gsi); if (is_gimple_debug (stmt)) continue; @@ -5207,6 +5297,11 @@ pass_store_merging::execute (function *f continue; } + if (is_gimple_assign (stmt) + && gimple_assign_rhs_code (stmt) == CONSTRUCTOR + && maybe_optimize_vector_constructor (stmt)) + continue; + if (store_valid_for_store_merging_p (stmt)) changed |= process_store (stmt); else --- gcc/testsuite/gcc.dg/tree-ssa/pr96239.c.jj 2020-12-16 16:10:30.013256862 +0100 +++ gcc/testsuite/gcc.dg/tree-ssa/pr96239.c 2020-12-16 16:11:08.802822537 +0100 @@ -0,0 +1,18 @@ +/* PR tree-optimization/96239 */ +/* { dg-do compile { target { ilp32 || lp64 } } } */ +/* { dg-options "-O3 -fdump-tree-optimized" } */ +/* { dg-final { scan-tree-dump " r>> 8;" "optimized" { target bswap } } } */ + +union U { unsigned char c[2]; unsigned short s; }; + +unsigned short +foo (unsigned short x) +{ + union U u; + u.s = x; + unsigned char v = u.c[0]; + unsigned char w = u.c[1]; + u.c[0] = w; + u.c[1] = v; + return u.s; +} Jakub