Hi! As can be seen in the testcase below, the *<rotate_insn><mode>3_mask insn/splitter is able to optimize only the case where the and is performed in SImode and the result then subreged into QImode; if the masking is already done in QImode, we don't handle it.
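To make the two cases concrete, here is a rough sketch (not part of the patch; the function names are made up and the mapping to RTL shapes is just my reading of the above) of an int count vs. an unsigned char count:

unsigned int
rotl_int_count (unsigned int x, unsigned int y)
{
  /* The mask is applied while the count is still SImode and the result
     is then subreged to QImode for the shift count; the existing
     *<rotate_insn><mode>3_mask pattern already drops the redundant and.  */
  y &= 31;
  return (x << y) | (x >> (-y & 31));
}

unsigned int
rotl_char_count (unsigned int x, unsigned char y)
{
  /* The mask is applied directly in QImode; this is the case the new
     pattern below is meant to handle.  The and is redundant either way,
     because rol/ror only use the low bits of the %cl count.  */
  y &= 31;
  return (x << y) | (x >> (-y & 31));
}

In both functions the and can go away because the mask is exactly the mode bitsize minus one, which is what the new pattern's condition on operands[3] checks.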
Fixed by adding another pattern.  Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

2017-10-11  Jakub Jelinek  <ja...@redhat.com>

	PR target/82498
	* config/i386/i386.md (*<rotate_insn><mode>3_mask_1): New
	define_insn_and_split.

	* gcc.target/i386/pr82498.c: New test.

--- gcc/config/i386/i386.md.jj	2017-10-10 11:54:11.000000000 +0200
+++ gcc/config/i386/i386.md	2017-10-11 19:24:27.673606778 +0200
@@ -11187,6 +11187,26 @@ (define_insn_and_split "*<rotate_insn><m
      (clobber (reg:CC FLAGS_REG))])]
   "operands[2] = gen_lowpart (QImode, operands[2]);")
 
+(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
+  [(set (match_operand:SWI48 0 "nonimmediate_operand")
+	(any_rotate:SWI48
+	  (match_operand:SWI48 1 "nonimmediate_operand")
+	  (and:QI
+	    (match_operand:QI 2 "register_operand")
+	    (match_operand:QI 3 "const_int_operand"))))
+   (clobber (reg:CC FLAGS_REG))]
+  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
+   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
+      == GET_MODE_BITSIZE (<MODE>mode)-1
+   && can_create_pseudo_p ()"
+  "#"
+  "&& 1"
+  [(parallel
+     [(set (match_dup 0)
+	   (any_rotate:SWI48 (match_dup 1)
+			     (match_dup 2)))
+      (clobber (reg:CC FLAGS_REG))])])
+
 ;; Implement rotation using two double-precision
 ;; shift instructions and a scratch register.
--- gcc/testsuite/gcc.target/i386/pr82498.c.jj	2017-10-11 20:21:10.677088306 +0200
+++ gcc/testsuite/gcc.target/i386/pr82498.c	2017-10-11 20:22:31.569101564 +0200
@@ -0,0 +1,52 @@
+/* PR target/82498 */
+/* { dg-do compile } */
+/* { dg-options "-O2 -mtune=generic -masm=att" } */
+/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
+
+unsigned
+f1 (unsigned x, unsigned char y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f2 (unsigned x, unsigned y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f3 (unsigned x, unsigned short y)
+{
+  if (y == 0)
+    return x;
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
+}
+
+unsigned
+f4 (unsigned x, unsigned char y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f5 (unsigned x, unsigned int y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}
+
+unsigned
+f6 (unsigned x, unsigned short y)
+{
+  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
+  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
+}

	Jakub