I am testing the following additional pattern for match.pd to fix PR81502, resulting in the desired optimization to:
bar: .LFB526: .cfi_startproc movl %edi, %eax ret The pattern optimizes a BIT_FIELD_REF on a BIT_INSERT_EXPR by either extracting from the destination or the inserted value. Bootstrap and regtest running on x86_64-unknown-linux-gnu. Richard. 2017-07-27 Richard Biener <rguent...@suse.de> PR tree-optimization/81502 * match.pd: Add pattern combining BIT_INSERT_EXPR with BIT_FIELD_REF. * gcc.target/i386/pr81502.c: New testcase. Index: gcc/match.pd =================================================================== *** gcc/match.pd (revision 250620) --- gcc/match.pd (working copy) *************** DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) *** 4178,4180 **** --- 4178,4195 ---- { CONSTRUCTOR_ELT (ctor, idx / k)->value; }) (BIT_FIELD_REF { CONSTRUCTOR_ELT (ctor, idx / k)->value; } @1 { bitsize_int ((idx % k) * width); }))))))))) + + /* Simplify a bit extraction from a bit insertion for the cases with + the inserted element fully covering the extraction or the insertion + not touching the extraction. */ + (simplify + (BIT_FIELD_REF (bit_insert @0 @1 @ipos) @rsize @rpos) + (switch + (if (wi::leu_p (@ipos, @rpos) + && wi::leu_p (wi::add (@rpos, @rsize), + wi::add (@ipos, TYPE_PRECISION (TREE_TYPE (@1))))) + (BIT_FIELD_REF @1 @rsize { wide_int_to_tree (bitsizetype, + wi::sub (@rpos, @ipos)); })) + (if (wi::geu_p (@ipos, wi::add (@rpos, @rsize)) + || wi::geu_p (@rpos, wi::add (@ipos, TYPE_PRECISION (TREE_TYPE (@1))))) + (BIT_FIELD_REF @0 @rsize @rpos)))) Index: gcc/testsuite/gcc.target/i386/pr81502.c =================================================================== *** gcc/testsuite/gcc.target/i386/pr81502.c (nonexistent) --- gcc/testsuite/gcc.target/i386/pr81502.c (working copy) *************** *** 0 **** --- 1,34 ---- + /* { dg-do compile { target lp64 } } */ + /* { dg-options "-O2 -msse2" } */ + + #include <emmintrin.h> + + #define SIZE (sizeof (void *)) + + static int foo(unsigned char (*foo)[SIZE]) + { + __m128i acc = _mm_set_epi32(0, 0, 0, 0); + size_t i = 0; + for(; i + 
sizeof(__m128i) <= SIZE; i += sizeof(__m128i)) { + __m128i word; + __builtin_memcpy(&word, foo + i, sizeof(__m128i)); + acc = _mm_add_epi32(word, acc); + } + if (i != SIZE) { + __m128i word = _mm_set_epi32(0, 0, 0, 0); + __builtin_memcpy(&word, foo + i, SIZE - i); // (1) + acc = _mm_add_epi32(word, acc); + } + int res; + __builtin_memcpy(&res, &acc, sizeof(res)); + return res; + } + + int bar(void *ptr) + { + unsigned char buf[SIZE]; + __builtin_memcpy(buf, &ptr, SIZE); + return foo((unsigned char(*)[SIZE])buf); + } + + /* { dg-final { scan-assembler-times "mov" 1 } } */