On Tue, May 9, 2023 at 9:06 AM liuhongt via Gcc-patches <gcc-patches@gcc.gnu.org> wrote: > > The patch doesn't handle: > 1. cast64_to_32, > 2. memory source with rsize < range. > > Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. > Ok for trunk?
OK and sorry for the delay. Richard. > gcc/ChangeLog: > > PR middle-end/108938 > * gimple-ssa-store-merging.cc (is_bswap_or_nop_p): New > function, cut from original find_bswap_or_nop function. > (find_bswap_or_nop): Add a new parameter, detect bswap + > rotate and save rotate result in the new parameter. > (bswap_replace): Add a new parameter to indicate rotate and > generate rotate stmt if needed. > (maybe_optimize_vector_constructor): Adjust for new rotate > parameter in the upper 2 functions. > (pass_optimize_bswap::execute): Ditto. > (imm_store_chain_info::output_merged_store): Ditto. > > gcc/testsuite/ChangeLog: > > * gcc.target/i386/pr108938-1.c: New test. > * gcc.target/i386/pr108938-2.c: New test. > * gcc.target/i386/pr108938-3.c: New test. > * gcc.target/i386/pr108938-load-1.c: New test. > * gcc.target/i386/pr108938-load-2.c: New test. > --- > gcc/gimple-ssa-store-merging.cc | 130 ++++++++++++++---- > gcc/testsuite/gcc.target/i386/pr108938-1.c | 79 +++++++++++ > gcc/testsuite/gcc.target/i386/pr108938-2.c | 35 +++++ > gcc/testsuite/gcc.target/i386/pr108938-3.c | 26 ++++ > .../gcc.target/i386/pr108938-load-1.c | 69 ++++++++++ > .../gcc.target/i386/pr108938-load-2.c | 30 ++++ > 6 files changed, 342 insertions(+), 27 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr108938-load-2.c > > diff --git a/gcc/gimple-ssa-store-merging.cc b/gcc/gimple-ssa-store-merging.cc > index df7afd2fd78..9cb574fa315 100644 > --- a/gcc/gimple-ssa-store-merging.cc > +++ b/gcc/gimple-ssa-store-merging.cc > @@ -893,6 +893,37 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, > uint64_t *cmpxchg, > n->range *= BITS_PER_UNIT; > } > > +/* Helper function for find_bswap_or_nop, > + Return true if N is a swap or nop with MASK. */ > +static bool > +is_bswap_or_nop_p (uint64_t n, uint64_t cmpxchg, > + uint64_t cmpnop, uint64_t* mask, > + bool* bswap) > +{ > + *mask = ~(uint64_t) 0; > + if (n == cmpnop) > + *bswap = false; > + else if (n == cmpxchg) > + *bswap = true; > + else > + { > + int set = 0; > + for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) > + if ((n & msk) == 0) > + *mask &= ~msk; > + else if ((n & msk) == (cmpxchg & msk)) > + set++; > + else > + return false; > + > + if (set < 2) > + return false; > + *bswap = true; > + } > + return true; > +} > + > + > /* Check if STMT completes a bswap implementation or a read in a given > endianness consisting of ORs, SHIFTs and ANDs and sets *BSWAP > accordingly. It also sets N to represent the kind of operations > @@ -903,7 +934,7 @@ find_bswap_or_nop_finalize (struct symbolic_number *n, > uint64_t *cmpxchg, > > gimple * > find_bswap_or_nop (gimple *stmt, struct symbolic_number *n, bool *bswap, > - bool *cast64_to_32, uint64_t *mask) > + bool *cast64_to_32, uint64_t *mask, uint64_t* l_rotate) > { > tree type_size = TYPE_SIZE_UNIT (TREE_TYPE (gimple_get_lhs (stmt))); > if (!tree_fits_uhwi_p (type_size)) > @@ -984,29 +1015,57 @@ find_bswap_or_nop (gimple *stmt, struct > symbolic_number *n, bool *bswap, > } > > uint64_t cmpxchg, cmpnop; > + uint64_t orig_range = n->range * BITS_PER_UNIT; > find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop, cast64_to_32); > > /* A complete byte swap should make the symbolic number to start with > the largest digit in the highest order byte. Unchanged symbolic > number indicates a read with same endianness as target architecture. */ > - *mask = ~(uint64_t) 0; > - if (n->n == cmpnop) > - *bswap = false; > - else if (n->n == cmpxchg) > - *bswap = true; > - else > + *l_rotate = 0; > + uint64_t tmp_n = n->n; > + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) > { > - int set = 0; > - for (uint64_t msk = MARKER_MASK; msk; msk <<= BITS_PER_MARKER) > - if ((n->n & msk) == 0) > - *mask &= ~msk; > - else if ((n->n & msk) == (cmpxchg & msk)) > - set++; > - else > - return NULL; > - if (set < 2) > + /* Try bswap + lrotate. */ > + /* TODO, handle cast64_to_32 and big/litte_endian memory > + source when rsize < range. */ > + if (n->range == orig_range > + && ((orig_range == 32 > + && optab_handler (rotl_optab, SImode) != CODE_FOR_nothing) > + || (orig_range == 64 > + && optab_handler (rotl_optab, DImode) != CODE_FOR_nothing)) > + && (tmp_n & MARKER_MASK) < orig_range / BITS_PER_UNIT) > + { > + uint64_t range = (orig_range / BITS_PER_UNIT) * BITS_PER_MARKER; > + uint64_t count = (tmp_n & MARKER_MASK) * BITS_PER_MARKER; > + /* .i.e. hanlde 0x203040506070800 when lower byte is zero. */ > + if (!count) > + { > + for (uint64_t i = 1; i != range / BITS_PER_MARKER; i++) > + { > + count = (tmp_n >> i * BITS_PER_MARKER) & MARKER_MASK; > + if (count) > + { > + /* Count should be meaningful not 0xff. */ > + if (count <= range / BITS_PER_MARKER) > + { > + count = (count + i) * BITS_PER_MARKER % range; > + break; > + } > + else > + return NULL; > + } > + } > + } > + tmp_n = tmp_n >> count | tmp_n << (range - count); > + if (orig_range == 32) > + tmp_n &= (1ULL << 32) - 1; > + if (!is_bswap_or_nop_p (tmp_n, cmpxchg, cmpnop, mask, bswap)) > + return NULL; > + *l_rotate = count / BITS_PER_MARKER * BITS_PER_UNIT; > + gcc_assert (*bswap); > + } > + else > return NULL; > - *bswap = true; > } > > /* Useless bit manipulation performed by code. */ > @@ -1099,10 +1158,10 @@ bswap_view_convert (gimple_stmt_iterator *gsi, tree > type, tree val, > tree > bswap_replace (gimple_stmt_iterator gsi, gimple *ins_stmt, tree fndecl, > tree bswap_type, tree load_type, struct symbolic_number *n, > - bool bswap, uint64_t mask) > + bool bswap, uint64_t mask, uint64_t l_rotate) > { > tree src, tmp, tgt = NULL_TREE; > - gimple *bswap_stmt, *mask_stmt = NULL; > + gimple *bswap_stmt, *mask_stmt = NULL, *rotl_stmt = NULL; > tree_code conv_code = NOP_EXPR; > > gimple *cur_stmt = gsi_stmt (gsi); > @@ -1332,6 +1391,16 @@ bswap_replace (gimple_stmt_iterator gsi, gimple > *ins_stmt, tree fndecl, > tmp = tgt; > } > > + if (l_rotate) > + { > + tree m = build_int_cst (bswap_type, l_rotate); > + tmp = make_temp_ssa_name (bswap_type, NULL, > + mask_stmt ? "bswapmaskdst" : "bswapdst"); > + gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); > + rotl_stmt = gimple_build_assign (tgt, LROTATE_EXPR, tmp, m); > + tmp = tgt; > + } > + > /* Convert the result if necessary. */ > if (!useless_type_conversion_p (TREE_TYPE (tgt), bswap_type)) > { > @@ -1344,7 +1413,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple > *ins_stmt, tree fndecl, > gsi_insert_after (&gsi2, convert_stmt, GSI_SAME_STMT); > } > > - gimple_set_lhs (mask_stmt ? mask_stmt : bswap_stmt, tmp); > + gimple_set_lhs (rotl_stmt ? rotl_stmt > + : mask_stmt ? mask_stmt : bswap_stmt, tmp); > > if (dump_file) > { > @@ -1361,6 +1431,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple > *ins_stmt, tree fndecl, > > if (cur_stmt) > { > + if (rotl_stmt) > + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); > if (mask_stmt) > gsi_insert_after (&gsi, mask_stmt, GSI_SAME_STMT); > gsi_insert_after (&gsi, bswap_stmt, GSI_SAME_STMT); > @@ -1371,6 +1443,8 @@ bswap_replace (gimple_stmt_iterator gsi, gimple > *ins_stmt, tree fndecl, > gsi_insert_before (&gsi, bswap_stmt, GSI_SAME_STMT); > if (mask_stmt) > gsi_insert_before (&gsi, mask_stmt, GSI_SAME_STMT); > + if (rotl_stmt) > + gsi_insert_after (&gsi, rotl_stmt, GSI_SAME_STMT); > } > return tgt; > } > @@ -1432,9 +1506,9 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) > } > > bool cast64_to_32; > - uint64_t mask; > + uint64_t mask, l_rotate; > gimple *ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, > - &cast64_to_32, &mask); > + &cast64_to_32, &mask, &l_rotate); > if (!ins_stmt > || n.range != (unsigned HOST_WIDE_INT) sz > || cast64_to_32 > @@ -1447,7 +1521,8 @@ maybe_optimize_vector_constructor (gimple *cur_stmt) > memset (&nop_stats, 0, sizeof (nop_stats)); > memset (&bswap_stats, 0, sizeof (bswap_stats)); > return bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > - bswap_type, load_type, &n, bswap, mask) != NULL_TREE; > + bswap_type, load_type, &n, bswap, mask, > + l_rotate) != NULL_TREE; > } > > /* Find manual byte swap implementations as well as load in a given > @@ -1502,7 +1577,7 @@ pass_optimize_bswap::execute (function *fun) > enum tree_code code; > struct symbolic_number n; > bool bswap, cast64_to_32; > - uint64_t mask; > + uint64_t mask, l_rotate; > > /* This gsi_prev (&gsi) is not part of the for loop because cur_stmt > might be moved to a different basic block by bswap_replace and > gsi > @@ -1542,7 +1617,7 @@ pass_optimize_bswap::execute (function *fun) > } > > ins_stmt = find_bswap_or_nop (cur_stmt, &n, &bswap, > - &cast64_to_32, &mask); > + &cast64_to_32, &mask, &l_rotate); > > if (!ins_stmt) > continue; > @@ -1579,7 +1654,8 @@ pass_optimize_bswap::execute (function *fun) > continue; > > if (bswap_replace (gsi_for_stmt (cur_stmt), ins_stmt, fndecl, > - bswap_type, load_type, &n, bswap, mask)) > + bswap_type, load_type, &n, bswap, mask, > + l_rotate)) > changed = true; > } > } > @@ -4271,7 +4347,7 @@ imm_store_chain_info::output_merged_store > (merged_store_group *group) > } > bswap_res = bswap_replace (gsi_start (seq), ins_stmt, fndecl, > bswap_type, load_type, n, bswap, > - ~(uint64_t) 0); > + ~(uint64_t) 0, 0); > gcc_assert (bswap_res); > } > > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-1.c > b/gcc/testsuite/gcc.target/i386/pr108938-1.c > new file mode 100644 > index 00000000000..67f245d1441 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-1.c > @@ -0,0 +1,79 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 6 { target { ! ia32 } } > } } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 9 { target ia32 } } } */ > + > +#include<stdint.h> > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64 (uint64_t x) > +{ > + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_1 (uint64_t x) > +{ > + return ((((uint64_t)(0) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(x) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_2 (uint64_t x) > +{ > + return ((((uint64_t)(x) & (uint64_t)0x00000000000000ffULL) << 0) | > + (((uint64_t)(x) & (uint64_t)0x000000000000ff00ULL) << 48) | > + (((uint64_t)(x) & (uint64_t)0x0000000000ff0000ULL) << 32) | > + (((uint64_t)(x) & (uint64_t)0x00000000ff000000ULL) << 16) | > + (((uint64_t)(x) & (uint64_t)0x000000ff00000000ULL) >> 0) | > + (((uint64_t)(x) & (uint64_t)0x0000ff0000000000ULL) >> 16) | > + (((uint64_t)(x) & (uint64_t)0x00ff000000000000ULL) >> 32) | > + (((uint64_t)(0) & (uint64_t)0xff00000000000000ULL) >> 48)); > +} > + > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32 (uint32_t x) > +{ > + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_1 (uint32_t x) > +{ > + return ((((uint32_t)(0) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_2 (uint32_t x) > +{ > + return ((((uint32_t)(x) & (uint32_t)0x00000000000000ffULL) << 8) | > + (((uint32_t)(0) & (uint32_t)0x000000000000ff00ULL) >> 8) | > + (((uint32_t)(x) & (uint32_t)0x0000000000ff0000ULL) << 8) | > + (((uint32_t)(x) & (uint32_t)0x00000000ff000000ULL) >> 8)); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-2.c > b/gcc/testsuite/gcc.target/i386/pr108938-2.c > new file mode 100644 > index 00000000000..47a2c89f1c4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-2.c > @@ -0,0 +1,35 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +#include "pr108938-1.c" > + > +int main () > +{ > + uint64_t a = 0x0807060504030201ULL; > + uint64_t res = swap_rotate_64 (a); > + if (res != 0x0203040506070801ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_1 (a); > + if (res != 0x0203040506070800ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_2 (a); > + if (res != 0x0203040506070001ULL) > + __builtin_abort (); > + > + uint32_t b = 0x04030201; > + uint32_t res2 = swap_rotate_32 (b); > + if (res2 != 0x03040102) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_1 (b); > + if (res2 != 0x03040002) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_2 (b); > + if (res2 != 0x03040100) > + __builtin_abort (); > + > + return 0; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-3.c > b/gcc/testsuite/gcc.target/i386/pr108938-3.c > new file mode 100644 > index 00000000000..32ac544c7ed > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-3.c > @@ -0,0 +1,26 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -ftree-vectorize -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 2 { target { ! ia32 } } > } } */ > +/* { dg-final { scan-assembler-times "bswap\[\t ]+" 3 { target ia32 } } } */ > + > +void > +foo1 (char* a, unsigned int* __restrict b) > +{ > + a[0] = b[0] >> 24; > + a[1] = b[0] >> 16; > + a[2] = b[0] >> 8; > + a[3] = b[0]; > + a[4] = b[1] >> 24; > + a[5] = b[1] >> 16; > + a[6] = b[1] >> 8; > + a[7] = b[1]; > +} > + > +void > +foo2 (char* a, short* __restrict b) > +{ > + a[0] = b[0] >> 8; > + a[1] = b[0]; > + a[2] = b[1] >> 8; > + a[3] = b[1]; > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-1.c > b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c > new file mode 100644 > index 00000000000..50d3a505c81 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-1.c > @@ -0,0 +1,69 @@ > +/* { dg-do compile } */ > +/* { dg-options "-O2 -mno-movbe" } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 5 { target { ! ia32 } } > } } */ > +/* { dg-final { scan-assembler-times "bswap\[ \t\]+" 8 { target ia32 } } } */ > + > +#include<stdint.h> > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 0 | > + (uint64_t)(x[1]) << 56 | > + (uint64_t)(x[2]) << 48 | > + (uint64_t)(x[3]) << 40 | > + (uint64_t)(x[4]) << 32 | > + (uint64_t)(x[5]) << 24 | > + (uint64_t)(x[6]) << 16 | > + (uint64_t)(x[7]) << 8); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_1 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 24 | > + (uint64_t)(x[1]) << 16 | > + (uint64_t)(x[2]) << 8 | > + (uint64_t)(0) << 0 | > + (uint64_t)(x[4]) << 56 | > + (uint64_t)(x[5]) << 48 | > + (uint64_t)(x[6]) << 40 | > + (uint64_t)(x[7]) << 32); > +} > + > +uint64_t > +__attribute__((noipa)) > +swap_rotate_64_mask_2 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 0 | > + (uint64_t)(x[1]) << 56 | > + (uint64_t)(x[2]) << 48 | > + (uint64_t)(0) << 40 | > + (uint64_t)(x[4]) << 32 | > + (uint64_t)(x[5]) << 24 | > + (uint64_t)(x[6]) << 16 | > + (uint64_t)(x[7]) << 8); > +} > + > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 8 | > + (uint64_t)(x[1]) << 0 | > + (uint64_t)(x[2]) << 24 | > + (uint64_t)(x[3]) << 16); > +} > + > +uint32_t > +__attribute__((noipa)) > +swap_rotate_32_mask_1 (unsigned char* x) > +{ > + return ((uint64_t)(x[0]) << 8 | > + (uint64_t)(0) << 0 | > + (uint64_t)(x[2]) << 24 | > + (uint64_t)(x[3]) << 16); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr108938-load-2.c > b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c > new file mode 100644 > index 00000000000..51a8102f995 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr108938-load-2.c > @@ -0,0 +1,30 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2" } */ > + > +#include "pr108938-load-1.c" > + > +int main () > +{ > + unsigned char a[8] = {1, 2, 3, 4, 5, 6, 7, 8}; > + uint64_t res = swap_rotate_64 (a); > + if (res != 0x0203040506070801ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_1 (a); > + if (res != 0x0506070801020300ULL) > + __builtin_abort (); > + > + res = swap_rotate_64_mask_2 (a); > + if (res != 0x0203000506070801ULL) > + __builtin_abort (); > + > + uint32_t res2 = swap_rotate_32 (a); > + if (res2 != 0x03040102) > + __builtin_abort (); > + > + res2 = swap_rotate_32_mask_1 (a); > + if (res2 != 0x03040100) > + __builtin_abort (); > + > + return 0; > +} > -- > 2.39.1.388.g2fc9e9ca3c >