On Thu, Aug 7, 2014 at 12:55 PM, Thomas Preud'homme <thomas.preudho...@arm.com> wrote:
> Hi all,
>
> Currently, when an expression doing a manual load or bswap is detected, the
> bswap optimization replaces it with an actual load and/or bswap instruction
> without considering whether the intermediate values computed in the
> expression are reused later. If that is the case, the construction of these
> values has to be retained, and the sum of the load and/or bswap instruction
> and the instructions to construct those values might be more expensive than
> the initial fully manual expression. This patch aims at detecting such a case
> and cancelling the bswap optimization. In addition, it takes advantage of the
> infrastructure necessary for the detection to also clean up the stmts that
> become useless when the bswap optimization is performed.
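For illustration, the reuse pattern the patch guards against looks roughly like
this (a distilled form of the new optimize-bswapsi-3.c test added below; the
function is only a sketch, not part of the patch):

  #include <stdint.h>

  /* 'low' escapes through *low_out, so the stmts that build it must stay
     even if the whole expression is recognized as a little-endian load:
     emitting a load/bswap on top of them can make the code larger.  */
  uint32_t
  read_le32_reuse (const unsigned char *data, uint32_t *low_out)
  {
    uint32_t low = data[0] | (data[1] << 8);
    uint32_t ret = low | (data[2] << 16) | (data[3] << 24);
    *low_out = low;   /* extra use outside the load expression */
    return ret;
  }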
Comments below

> *** gcc/ChangeLog ***
>
> 2014-08-01  Thomas Preud'homme  <thomas.preudho...@arm.com>
>
> 	* tree-ssa-math-opts.c (struct usedtree): New.
> 	(find_bswap_or_nop_1): Change prototype to take a hashtable and a list
> 	of struct usedtree. Fill these respectively with visited stmts and
> 	trees (along with the stmts where they are defined) that would not be
> 	defined if the bswap optimization is applied. Adapt recursive calls to
> 	the prototype change.
> 	(find_bswap_or_nop): Adapt to find_bswap_or_nop_1 prototype change.
> 	Pass the hashtable and the list of struct usedtree from
> 	pass_optimize_bswap::execute ().
> 	(do_bswap_p): New.
> 	(bswap_replace): Fix typo in heading comment. Stop returning whether
> 	the bswap optimization was performed since this is now handled by
> 	do_bswap_p (). Move alignment check to do_bswap_p ().
> 	(free_usedtrees_list): New.
> 	(pass_optimize_bswap::execute): Add allocation and deallocation
> 	handling of the hashtable of browsed stmts. Free the list of struct
> 	usedtree at the end of each iteration using free_usedtrees_list () and
> 	the new bswap_check_end_iter label. Move some of the logic to perform
> 	the bswap optimization to do_bswap_p (). Set gsi after performing the
> 	bswap optimization and loop manually via the new
> 	bswap_check_start_iter label so that all stmts are checked for
> 	load/bswap even when cur_stmt is moved around by bswap_replace ().
>
> *** gcc/testsuite/ChangeLog ***
>
> 2014-08-01  Thomas Preud'homme  <thomas.preudho...@arm.com>
>
> 	* gcc.dg/optimize-bswapsi-2.c (read_le32_1): Add an intermediate
> 	variable in the mix to check that it is optimized when there is no
> 	use outside the expression computing a load/bswap.
> 	(read_be32_1): Likewise.
> 	* gcc.dg/optimize-bswapsi-3.c: New. Create read_le32_1 () and
> 	read_be32_1 () based on the identically named functions in
> 	gcc.dg/optimize-bswapsi-2.c but reuse the intermediate variable so
> 	as to check that the bswap optimization is not performed in these cases.
> > diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c > b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c > index de6e697..df7856b 100644 > --- a/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c > +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-2.c > @@ -14,7 +14,9 @@ struct uint32_st { > > uint32_t read_le32_1 (void) > { > - return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); > + uint32_t low = data[0] | (data[1] << 8); > + uint32_t ret = low | (data[2] << 16) | (data[3] << 24); > + return ret; > } > > uint32_t read_le32_2 (struct uint32_st data) > @@ -30,7 +32,9 @@ uint32_t read_le32_3 (unsigned char *data) > > uint32_t read_be32_1 (void) > { > - return data[3] | (data[2] << 8) | (data[1] << 16) | (data[0] << 24); > + uint32_t low = data[3] | (data[2] << 8); > + uint32_t ret = low | (data[1] << 16) | (data[0] << 24); > + return ret; > } > > uint32_t read_be32_2 (struct uint32_st data) > diff --git a/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c > b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c > new file mode 100644 > index 0000000..dc48d9d > --- /dev/null > +++ b/gcc/testsuite/gcc.dg/optimize-bswapsi-3.c > @@ -0,0 +1,31 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target bswap32 } */ > +/* { dg-require-effective-target stdint_types } */ > +/* { dg-options "-O2 -fdump-tree-bswap" } */ > +/* { dg-additional-options "-march=z900" { target s390-*-* } } */ > + > +#include <stdint.h> > + > +extern unsigned char data[4]; > + > +/* No "bswap" optimization as low is reused */ > +uint32_t read_le32_1 (unsigned char *data, uint32_t *low_neg) > +{ > + uint32_t low = data[0] | (data[1] << 8); > + uint32_t ret = low | (data[2] << 16) | (data[3] << 24); > + *low_neg = low; > + return ret; > +} > + > +/* No "bswap" optimization as low is reused */ > +uint32_t read_be32_1 (unsigned char *data, uint32_t *low_neg) > +{ > + uint32_t low = data[3] | (data[2] << 8); > + uint32_t ret = low | (data[1] << 16) | (data[0] << 24); > + *low_neg = low; > + return ret; > +} > + > +/* { dg-final { scan-tree-dump-not "32 bit load in target endianness found > at" "bswap" } } */ > +/* { dg-final { scan-tree-dump-not "32 bit bswap implementation found at" > "bswap" } } */ > +/* { dg-final { cleanup-tree-dump "bswap" } } */ > diff --git a/gcc/tree-ssa-math-opts.c b/gcc/tree-ssa-math-opts.c > index 705793b..9f71145 100644 > --- a/gcc/tree-ssa-math-opts.c > +++ b/gcc/tree-ssa-math-opts.c > @@ -1641,6 +1641,18 @@ struct symbolic_number { > #define CMPXCHG (sizeof (int64_t) < 8 ? 0 : \ > (uint64_t)0x01020304 << 32 | 0x05060708) > > +/* Structure to memorize all the intermediate trees created to compute a > bswap > + or a load along with the stmt where they are used. This allows to check > + whether some trees are reused outside the bswap or load performed and > avoid > + doing the optimization in that case since it might lead to more > computation. > + It also allows to remove all the stmts that create these trees in case the > + optimization is performed. */ > +struct usedtree { > + tree t; > + gimple stmt; > + struct usedtree *next; > +}; > + > /* Perform a SHIFT or ROTATE operation by COUNT bits on symbolic > number N. Return false if the requested operation is not permitted > on a symbolic number. */ > @@ -1803,19 +1815,26 @@ find_bswap_or_nop_load (gimple stmt, tree ref, struct > symbolic_number *n) > return true; > } > > -/* find_bswap_or_nop_1 invokes itself recursively with N and tries to perform > - the operation given by the rhs of STMT on the result. 
If the operation > - could successfully be executed the function returns a gimple stmt whose > - rhs's first tree is the expression of the source operand and NULL > - otherwise. */ > +/* find_bswap_or_nop_1 invokes itself recursively trying to perform the > + operation given by the rhs of STMT on the symbolic number represented by N > + until LIMIT recursions have been done or the variable that is the source > of > + the successive expressions has been found. The function also keeps track > of > + the intermediate trees defined in the expression with the stmt where they > + are defined (in USED_TREES) and records the stmt that were visited (in > + BROWSED_STMTS_HTAB). If the operation could successfully be executed the > + function returns a gimple stmt whose rhs's first tree is the expression of > + the source operand and NULL otherwise. */ > > static gimple > -find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit) > +find_bswap_or_nop_1 (gimple stmt, struct symbolic_number *n, int limit, > + htab_t browsed_stmts_htab, struct usedtree **used_trees) > { > + void **slot; > enum tree_code code; > tree rhs1, rhs2 = NULL; > gimple rhs1_stmt, rhs2_stmt, source_stmt1; > enum gimple_rhs_class rhs_class; > + struct usedtree *new_used; > > if (!limit || !is_gimple_assign (stmt)) > return NULL; > @@ -1823,7 +1842,20 @@ find_bswap_or_nop_1 (gimple stmt, struct > symbolic_number *n, int limit) > rhs1 = gimple_assign_rhs1 (stmt); > > if (find_bswap_or_nop_load (stmt, rhs1, n)) > - return stmt; > + { > + slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT); > + gcc_assert (slot); > + if (!*slot) > + { > + *slot = stmt; > + new_used = (struct usedtree *) xmalloc (sizeof (*used_trees)); > + new_used->t = gimple_assign_lhs (stmt); > + new_used->stmt = stmt; > + new_used->next = *used_trees; > + *used_trees = new_used; Hmm. I am a little bit concerned about the malloc traffic generated here. So why not use a vec<usedtree>, get rid of the ->next pointer and use a hash_map <gimple, unsigned> to associate the stmt with an index into the vector? The whole scheme wouldn't exactly fly with the idea of eventually using a lattice to propagate the symbolic numbers, but well... > + } > + return stmt; > + } > > if (TREE_CODE (rhs1) != SSA_NAME) > return NULL; > @@ -1835,6 +1867,18 @@ find_bswap_or_nop_1 (gimple stmt, struct > symbolic_number *n, int limit) > if (rhs_class == GIMPLE_BINARY_RHS) > rhs2 = gimple_assign_rhs2 (stmt); > > + slot = htab_find_slot (browsed_stmts_htab, stmt, INSERT); > + gcc_assert (slot); > + if (!*slot) > + { > + *slot = stmt; > + new_used = (struct usedtree *) xmalloc (sizeof (*used_trees)); > + new_used->t = gimple_assign_lhs (stmt); > + new_used->stmt = stmt; > + new_used->next = *used_trees; > + *used_trees = new_used; > + } > + > /* Handle unary rhs and binary rhs with integer constants as second > operand. */ > > @@ -1851,12 +1895,14 @@ find_bswap_or_nop_1 (gimple stmt, struct > symbolic_number *n, int limit) > && code != CONVERT_EXPR) > return NULL; > > - source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1); > + source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, n, limit - 1, > + browsed_stmts_htab, used_trees); > > /* If find_bswap_or_nop_1 returned NULL, STMT is a leaf node and > we have to initialize the symbolic number. 
*/ > if (!source_stmt1) > { > + > if (gimple_assign_load_p (stmt) > || !init_symbolic_number (n, rhs1)) > return NULL; > @@ -1942,12 +1988,14 @@ find_bswap_or_nop_1 (gimple stmt, struct > symbolic_number *n, int limit) > switch (code) > { > case BIT_IOR_EXPR: > - source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1); > + source_stmt1 = find_bswap_or_nop_1 (rhs1_stmt, &n1, limit - 1, > + browsed_stmts_htab, used_trees); > > if (!source_stmt1) > return NULL; > > - source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1); > + source_stmt2 = find_bswap_or_nop_1 (rhs2_stmt, &n2, limit - 1, > + browsed_stmts_htab, used_trees); > > if (!source_stmt2) > return NULL; > @@ -2044,16 +2092,19 @@ find_bswap_or_nop_1 (gimple stmt, struct > symbolic_number *n, int limit) > return NULL; > } > > -/* Check if STMT completes a bswap implementation or a read in a given > - endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP > - accordingly. It also sets N to represent the kind of operations > - performed: size of the resulting expression and whether it works on > - a memory source, and if so alias-set and vuse. At last, the > - function returns a stmt whose rhs's first tree is the source > - expression. */ > +/* Check if STMT completes a bswap implementation or a load in a given > + endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP accordingly. > + It also sets N to represent the kind of operations performed: size of the > + resulting expression and whether it works on a memory source, and if so > + alias-set and vuse. All the intermediate trees defined in such an > + implementation are recorded along the stmt where they are defined (in > + USED_TREES) as well as the stmts that are part of that implementation (in > + BROWSED_STMTS_HTAB). At last, the function returns a stmt whose rhs's > first > + tree is the source expression. */ > > static gimple > -find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap) > +find_bswap_or_nop (gimple stmt, struct symbolic_number *n, bool *bswap, > + htab_t browsed_stmts_htab, struct usedtree **used_trees) > { > /* The number which the find_bswap_or_nop_1 result should match in order > to have a full byte swap. The number is shifted to the right > @@ -2071,7 +2122,8 @@ find_bswap_or_nop (gimple stmt, struct symbolic_number > *n, bool *bswap) > in libgcc, and for initial shift/and operation of the src operand. */ > limit = TREE_INT_CST_LOW (TYPE_SIZE_UNIT (gimple_expr_type (stmt))); > limit += 1 + (int) ceil_log2 ((unsigned HOST_WIDE_INT) limit); > - source_stmt = find_bswap_or_nop_1 (stmt, n, limit); > + source_stmt = find_bswap_or_nop_1 (stmt, n, limit, browsed_stmts_htab, > + used_trees); > > if (!source_stmt) > return NULL; > @@ -2146,16 +2198,73 @@ public: > > }; // class pass_optimize_bswap > > +/* Tests whether the bswap optimization can be performed and is likely to > bring > + performance improvements. This decision is made based on: > + - whether the operation performed is a BSWAP or a simple load > + - whether the operation can be replaced by an instruction (FNDECL is non > + NULL) > + - the stmt reading/loading from source operand (SRC_STMT) > + - whether a the source is in memory (given by N->base_addr) > + - whether intermediate trees defined in the expression being optimized are > + reused elsewhere. 
This is determined from the list of tree defined along > + with the stmt where they are defined (in USED_TREES), a hashtable of the > + stmts that are part of the expression (in BROWSED_STMTS_HTAB) and the > + outermost stmt of the expression (in STMT). */ > + > +static bool > +do_bswap_p (gimple stmt, gimple src_stmt, tree fndecl, > + tree load_type ATTRIBUTE_UNUSED, struct symbolic_number *n, > + bool bswap, htab_t browsed_stmts_htab, struct usedtree > *used_trees) > +{ > + unsigned align ATTRIBUTE_UNUSED; > + struct usedtree *used; > + bool ret; > + > + if (bswap && !fndecl) > + return false; > + > + if (n->base_addr) > + { > + tree src = gimple_assign_rhs1 (src_stmt); > + align = get_object_alignment (src); > + if (bswap > + && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type)) > + && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align)) > + return false; > + } > + > + for (ret = true, used = used_trees; ret && used; used = used->next) > + { > + gimple using_stmt; > + imm_use_iterator imm_use; > + > + if (used->stmt == stmt) > + continue; > + > + FOR_EACH_IMM_USE_STMT(using_stmt, imm_use, used->t) > + { > + if (!is_gimple_debug (using_stmt) > + && !htab_find_slot (browsed_stmts_htab, using_stmt, NO_INSERT)) > + { > + ret = false; > + BREAK_FROM_IMM_USE_STMT (imm_use); > + } > + } > + } > + > + return ret; > +} > + > /* Perform the bswap optimization: replace the statement CUR_STMT at > GSI with a load of type, VUSE and set-alias as described by N if a > memory source is involved (N->base_addr is non null), followed by > the builtin bswap invocation in FNDECL if BSWAP is true. SRC_STMT > gives where should the replacement be made. It also gives the > - source on which CUR_STMT is operating via its rhs's first tree nad > + source on which CUR_STMT is operating via its rhs's first tree and > N->range gives the size of the expression involved for maintaining > some statistics. */ > > -static bool > +static void > bswap_replace (gimple cur_stmt, gimple_stmt_iterator gsi, gimple src_stmt, > tree fndecl, tree bswap_type, tree load_type, > struct symbolic_number *n, bool bswap) > @@ -2175,12 +2284,6 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator > gsi, gimple src_stmt, > gimple addr_stmt, load_stmt; > unsigned align; > > - align = get_object_alignment (src); > - if (bswap > - && align < GET_MODE_ALIGNMENT (TYPE_MODE (load_type)) > - && SLOW_UNALIGNED_ACCESS (TYPE_MODE (load_type), align)) > - return false; > - > gsi_move_before (&gsi, &gsi_ins); > gsi = gsi_for_stmt (cur_stmt); > > @@ -2198,6 +2301,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator > gsi, gimple src_stmt, > } > > /* Perform the load. */ > + align = get_object_alignment (src); > aligned_load_type = load_type; > if (align < TYPE_ALIGN (load_type)) > aligned_load_type = build_aligned_type (load_type, align); > @@ -2243,7 +2347,7 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator > gsi, gimple src_stmt, > (int)n->range); > print_gimple_stmt (dump_file, cur_stmt, 0, 0); > } > - return true; > + return; > } > else > { > @@ -2300,7 +2404,32 @@ bswap_replace (gimple cur_stmt, gimple_stmt_iterator > gsi, gimple src_stmt, > > gsi_insert_after (&gsi, call, GSI_SAME_STMT); > gsi_remove (&gsi, true); > - return true; > +} > + > +/* Free the struct usedtree elements from the list USED_TREES. If > REMOVE_STMTS > + is true, the stmts they reference (CUR_STMT excepted) are removed and the > + ssa names of their lhs is released from the function FUN. 
*/ > + > +static void > +free_usedtrees_list (struct usedtree *used_trees, bool remove_stmts, > + gimple cur_stmt, function *fun) > +{ > + gimple_stmt_iterator gsi_rm; > + struct usedtree *used; > + > + while ((used = used_trees)) > + { > + /* Do not mark stmt for removal if it's the replaced one. */ > + if (remove_stmts && used->stmt != cur_stmt) > + { > + tree lhs = gimple_assign_lhs (used->stmt); > + gsi_rm = gsi_for_stmt (used->stmt); > + gsi_remove (&gsi_rm, true); > + release_ssa_name_fn (fun, lhs); > + } > + used_trees = used->next; > + free (used); > + } At this point I'd rather leave DCE to DCE ... > } > > /* Find manual byte swap implementations as well as load in a given > @@ -2315,6 +2444,7 @@ pass_optimize_bswap::execute (function *fun) > bool bswap16_p, bswap32_p, bswap64_p; > bool changed = false; > tree bswap16_type = NULL_TREE, bswap32_type = NULL_TREE, bswap64_type = > NULL_TREE; > + htab_t browsed_stmts_htab; > > if (BITS_PER_UNIT != 8) > return 0; > @@ -2350,6 +2480,9 @@ pass_optimize_bswap::execute (function *fun) > memset (&nop_stats, 0, sizeof (nop_stats)); > memset (&bswap_stats, 0, sizeof (bswap_stats)); > > + browsed_stmts_htab = htab_create_alloc (16, htab_hash_pointer, > + htab_eq_pointer, 0, xcalloc, free); > + > FOR_EACH_BB_FN (bb, fun) > { > gimple_stmt_iterator gsi; > @@ -2360,19 +2493,29 @@ pass_optimize_bswap::execute (function *fun) > patterns, the wider variant wouldn't be detected. */ > for (gsi = gsi_last_bb (bb); !gsi_end_p (gsi); gsi_prev (&gsi)) > { > - gimple src_stmt, cur_stmt = gsi_stmt (gsi); > + gimple_stmt_iterator gsi_cont; > + gimple src_stmt, cur_stmt; > tree fndecl = NULL_TREE, bswap_type = NULL_TREE, load_type; > struct symbolic_number n; > - bool bswap; > + struct usedtree *used_trees; > + bool bswap, rm_stmts; > + > +bswap_check_start_iter: Ick - do we really have to use gotos here? Can you refactor this a bit to avoid it? I think the overall idea is sound and if you have time to adjust according to my comments that would be nice. Sorry for the very late review. Thanks, Richard. > + rm_stmts = false; > + cur_stmt = gsi_stmt (gsi); > > if (!is_gimple_assign (cur_stmt) > || gimple_assign_rhs_code (cur_stmt) != BIT_IOR_EXPR) > continue; > > - src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap); > + htab_empty (browsed_stmts_htab); > + used_trees = NULL; > + > + src_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, > + browsed_stmts_htab, &used_trees); > > if (!src_stmt) > - continue; > + goto bswap_check_end_iter; > > switch (n.range) > { > @@ -2401,15 +2544,37 @@ pass_optimize_bswap::execute (function *fun) > } > break; > default: > - continue; > + goto bswap_check_end_iter; > } > > - if (bswap && !fndecl) > - continue; > - > - if (bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type, > - load_type, &n, bswap)) > - changed = true; > + if (!do_bswap_p (cur_stmt, src_stmt, fndecl, load_type, &n, bswap, > + browsed_stmts_htab, used_trees)) > + goto bswap_check_end_iter; > + > + changed = true; > + gsi_cont = gsi_for_stmt (cur_stmt); > + gsi_next (&gsi_cont); > + bswap_replace (cur_stmt, gsi, src_stmt, fndecl, bswap_type, > + load_type, &n, bswap); > + rm_stmts = true; > + > +bswap_check_end_iter: > + free_usedtrees_list (used_trees, rm_stmts, cur_stmt, fun); > + > + /* cur_stmt may have been moved in bswap_replace (in case of a > + load) and/or replaced (in case of bswap). Ideally, we want the > + next iteration to check the first stmt that was before cur_stmt > + initially that is not an unused stmt. 
However it is easier and > + simpler to allow the next iteration to check the changed stmt. > */ > + if (rm_stmts) > + { > + if (gsi_end_p (gsi_cont)) > + gsi_cont = gsi_last_bb (bb); > + else > + gsi_prev (&gsi_cont); > + gsi = gsi_cont; > + goto bswap_check_start_iter; > + } > } > } > > Tested via bootstrap on x86_64-linux-gnu and no regression of the > testsuite (new tests passing) as well as through running the testsuite > compiled by arm-none-eabi-gcc cross compiler under qemu emulating > Cortex M3 without any regression (new tests passing). > > Ok for trunk? > > Best regards, > > Thomas > >
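To make the vec + hash_map suggestion in the review above concrete, here is a
minimal sketch of what the bookkeeping could look like. It is illustrative
only: the names (used_trees, stmt_index, record_used_tree) are hypothetical,
and it assumes the hash_map template mentioned in the review is available; it
shows the shape of the idea, not a drop-in replacement for the patch.

  struct usedtree
  {
    tree t;
    gimple stmt;
  };

  /* One growable array of intermediate defs instead of a malloc'd linked
     list, plus a map from stmt to its index in that array instead of a
     separate pointer htab.  */
  static vec<usedtree> used_trees;
  static hash_map<gimple, unsigned> *stmt_index;

  /* Record STMT and its lhs the first time it is visited.  Returns true
     if STMT had already been recorded.  */

  static bool
  record_used_tree (gimple stmt)
  {
    if (stmt_index->get (stmt))
      return true;
    stmt_index->put (stmt, used_trees.length ());
    usedtree u;
    u.t = gimple_assign_lhs (stmt);
    u.stmt = stmt;
    used_trees.safe_push (u);
    return false;
  }

  /* At the end of each attempted match, instead of walking a ->next chain
     and freeing nodes:  */
  /*   used_trees.truncate (0);  stmt_index->empty ();  */

Keeping the defs in one vector would also turn the removal loop in
free_usedtrees_list into a plain FOR_EACH_VEC_ELT walk and would avoid the
per-node malloc traffic the review is concerned about.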