This patch uses `lowpart_subreg` for the base register initialization, instead of zero-extending it. We had tried this solution before, but it left undefined bytes in the upper part of the register. That shouldn't happen, as we are supposed to write the whole register when the load is eliminated. The problem occurred when there were multiple stores with the same offset as the load: a register move was generated for each of them, overwriting the bit inserts that had been emitted before them.
In order to overcome this, we are removing redundant stores from the sequence, i.e. stores that write to addresses that will be overwritten by stores that come after them in the sequence. We are using the same bitmap that is used for the load elimination check, to keep track of the bytes that are written by each store. Also, we are now allowing the load to be eliminated even when there are overlaps between the stores, as there is no obvious reason why we shouldn't do that, we just want the stores to cover all of the load's bytes. Bootstrapped/regtested on AArch64 and x86_64. PR rtl-optimization/119884 gcc/ChangeLog: * avoid-store-forwarding.cc (process_store_forwarding): Use `lowpart_subreg` for the base register initialization, and remove redundant stores from the store/load sequence. gcc/testsuite/ChangeLog: * gcc.target/i386/pr119884.c: New test. Signed-off-by: Konstantinos Eleftheriou <konstantinos.elefther...@vrull.eu> --- Changes in v3: - Remove redundant stores, instead of generating a register move for the first store that has the same offset as the load only. Changes in v2: - Use `lowpart_subreg` for the base register initialization, but only for the first store that has the same offset as the load. Changes in v1: - Add a check for the register modes to match before calling `emit_mov_insn`. gcc/avoid-store-forwarding.cc | 45 ++++++++++++++++++------ gcc/testsuite/gcc.target/i386/pr119884.c | 13 +++++++ 2 files changed, 48 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr119884.c diff --git a/gcc/avoid-store-forwarding.cc b/gcc/avoid-store-forwarding.cc index 5d960adec359..f88a001e5717 100644 --- a/gcc/avoid-store-forwarding.cc +++ b/gcc/avoid-store-forwarding.cc @@ -176,20 +176,28 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn, /* Memory sizes should be constants at this stage. 
*/ HOST_WIDE_INT load_size = MEM_SIZE (load_mem).to_constant (); - /* If the stores cover all the bytes of the load without overlap then we can - eliminate the load entirely and use the computed value instead. */ + /* If the stores cover all the bytes of the load, then we can eliminate + the load entirely and use the computed value instead. + We can also eliminate stores on addresses that are overwritten + by later stores. */ sbitmap forwarded_bytes = sbitmap_alloc (load_size); bitmap_clear (forwarded_bytes); unsigned int i; store_fwd_info* it; + auto_vec<store_fwd_info> redundant_stores; + auto_vec<int> store_ind_to_remove; FOR_EACH_VEC_ELT (stores, i, it) { HOST_WIDE_INT store_size = MEM_SIZE (it->store_mem).to_constant (); - if (bitmap_bit_in_range_p (forwarded_bytes, it->offset, + if (bitmap_is_range_set_p (forwarded_bytes, it->offset, it->offset + store_size - 1)) - break; + { + redundant_stores.safe_push (*it); + store_ind_to_remove.safe_push (i); + continue; + } bitmap_set_range (forwarded_bytes, it->offset, store_size); } @@ -215,6 +223,11 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn, fprintf (dump_file, "(Load elimination candidate)\n"); } + /* Remove redundant stores from the vector. */ + store_ind_to_remove.reverse (); + for (int i : store_ind_to_remove) + stores.ordered_remove (i); + rtx load = single_set (load_insn); rtx dest; @@ -231,18 +244,16 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn, { it->mov_reg = gen_reg_rtx (GET_MODE (it->store_mem)); rtx_insn *insns = NULL; + const bool has_zero_offset = it->offset == 0; /* If we're eliminating the load then find the store with zero offset and use it as the base register to avoid a bit insert if possible. 
*/ - if (load_elim && it->offset == 0) + if (load_elim && has_zero_offset) { start_sequence (); - machine_mode dest_mode = GET_MODE (dest); - rtx base_reg = it->mov_reg; - if (known_gt (GET_MODE_BITSIZE (dest_mode), - GET_MODE_BITSIZE (GET_MODE (it->mov_reg)))) - base_reg = gen_rtx_ZERO_EXTEND (dest_mode, it->mov_reg); + rtx base_reg = lowpart_subreg (GET_MODE (dest), it->mov_reg, + GET_MODE (it->mov_reg)); if (base_reg) { @@ -380,6 +391,16 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn, print_rtl_single (dump_file, insn); } } + + if (redundant_stores.length () > 0) + { + fprintf (dump_file, "\nRedundant stores that have been removed:\n"); + FOR_EACH_VEC_ELT (redundant_stores, i, it) + { + fprintf (dump_file, " "); + print_rtl_single (dump_file, it->store_insn); + } + } } stats_sf_avoided++; @@ -399,6 +420,10 @@ process_store_forwarding (vec<store_fwd_info> &stores, rtx_insn *load_insn, delete_insn (it->store_insn); } + /* Delete redundant stores. */ + FOR_EACH_VEC_ELT (redundant_stores, i, it) + delete_insn (it->store_insn); + df_insn_rescan (load_insn); if (load_elim) diff --git a/gcc/testsuite/gcc.target/i386/pr119884.c b/gcc/testsuite/gcc.target/i386/pr119884.c new file mode 100644 index 000000000000..34d5b689244d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr119884.c @@ -0,0 +1,13 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -fno-dse -favoid-store-forwarding" } */ + +typedef __attribute__((__vector_size__(64))) char V; +char c; +V v; + +char +foo() +{ + v *= c; + return v[0]; +} \ No newline at end of file -- 2.49.0