This patch builds on the recent improvements to TImode rotations (and Jakub's fixes to shldq/shrdq patterns). Now that expanding a TImode rotation can never fail, it is safe to allow general_operand constraints on the QImode shift amounts in rotlv1ti3 and rotrv1ti3 patterns. I've also made an additional tweak to ix86_expand_v1ti_to_ti to use vec_extract via V2DImode, which avoid using memory and takes advantage vpextrq on recent hardware.
For the following test case: typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); } GCC with -O2 -mavx2 would previously generate: rotr: vmovdqa %xmm0, -24(%rsp) movq -16(%rsp), %rdx movl %edi, %ecx xorl %esi, %esi movq -24(%rsp), %rax shrdq %rdx, %rax shrq %cl, %rdx testb $64, %dil cmovne %rdx, %rax cmovne %rsi, %rdx negl %ecx xorl %edi, %edi andl $127, %ecx vmovq %rax, %xmm2 movq -24(%rsp), %rax vpinsrq $1, %rdx, %xmm2, %xmm1 movq -16(%rsp), %rdx shldq %rax, %rdx salq %cl, %rax testb $64, %cl cmovne %rax, %rdx cmovne %rdi, %rax vmovq %rax, %xmm3 vpinsrq $1, %rdx, %xmm3, %xmm0 vpor %xmm1, %xmm0, %xmm0 ret with this patch, we now generate: rotr: movl %edi, %ecx vpextrq $1, %xmm0, %rax vmovq %xmm0, %rdx shrdq %rax, %rdx vmovq %xmm0, %rsi shrdq %rsi, %rax andl $64, %ecx movq %rdx, %rsi cmovne %rax, %rsi cmove %rax, %rdx vmovq %rsi, %xmm0 vpinsrq $1, %rdx, %xmm0, %xmm0 ret This patch has been tested on x86_64-pc-linux-gnu with make bootstrap and make -k check with no new failures. Ok for mainline? 2021-11-28 Roger Sayle <ro...@nextmovesoftware.com> gcc/ChangeLog * config/i386/i386-expand.c (ix86_expand_v1ti_to_ti): Perform the conversion via V2DImode using vec_extractv2didi on TARGET_SSE2. * config/i386/sse.md (rotlv1ti3, rotrv1ti3): Change constraint on QImode shift amounts from const_int_operand to general_operand. gcc/testsuite/ChangeLog * gcc.target/i386/sse2-v1ti-rotate.c: New test case. Thanks in advance, Roger --
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 088e6af..1e9734b 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -6162,7 +6162,17 @@ static rtx ix86_expand_v1ti_to_ti (rtx x) { rtx result = gen_reg_rtx (TImode); - emit_move_insn (result, gen_lowpart (TImode, x)); + if (TARGET_SSE2) + { + rtx temp = gen_reg_rtx (V2DImode); + emit_move_insn (temp, gen_lowpart (V2DImode, x)); + rtx lo = gen_lowpart (DImode, result); + emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx)); + rtx hi = gen_highpart (DImode, result); + emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx)); + } + else + emit_move_insn (result, gen_lowpart (TImode, x)); return result; } diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index 2764a25..459eec9 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -15169,7 +15169,7 @@ [(set (match_operand:V1TI 0 "register_operand") (rotate:V1TI (match_operand:V1TI 1 "register_operand") - (match_operand:QI 2 "const_int_operand")))] + (match_operand:QI 2 "general_operand")))] "TARGET_SSE2 && TARGET_64BIT" { ix86_expand_v1ti_rotate (ROTATE, operands); @@ -15180,7 +15180,7 @@ [(set (match_operand:V1TI 0 "register_operand") (rotatert:V1TI (match_operand:V1TI 1 "register_operand") - (match_operand:QI 2 "const_int_operand")))] + (match_operand:QI 2 "general_operand")))] "TARGET_SSE2 && TARGET_64BIT" { ix86_expand_v1ti_rotate (ROTATERT, operands); diff --git a/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c new file mode 100644 index 0000000..b4b2814 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse2-v1ti-rotate.c @@ -0,0 +1,11 @@ +/* { dg-do compile { target int128 } } */ +/* { dg-options "-O2 -msse2" } */ +/* { dg-require-effective-target sse2 } */ + +typedef unsigned __int128 uv1ti __attribute__ ((__vector_size__ (16))); + +uv1ti rotr(uv1ti x, unsigned int i) { return (x >> i) | (x << (128-i)); } +uv1ti rotl(uv1ti x, unsigned int i) { return (x << i) | (x >> (128-i)); } + +/* { dg-final { scan-assembler-not "shrq" } } */ +/* { dg-final { scan-assembler-not "salq" } } */