Hi! This patch cleans up the vector/vector shifts; there is no need to write them with lots of vec_selects/vec_concats etc. Additionally, it hooks them up to the standard vlshr<mode>3, vashl<mode>3 and vashr<mode>3 expanders so that the vectorizer can use them. The V16QImode and V8HImode expanders XOP provides probably aren't very useful for autovectorization of C/C++ code, because the front ends will use int shifts in that case and we can't prove that using narrower shifts is ok (except for left shifts, if the vectorizer had a guarantee that shift counts larger than the element width just produce zero instead of being clipped).
Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk? 2011-10-27 Jakub Jelinek <ja...@redhat.com> * config/i386/sse.md (VI4SD_AVX2): Removed. (VI48_AVX2): New iterator. (vlshr<mode>3, vashl<mode>3): For VI48_AVX2 modes implement for TARGET_AVX2, for V4SImode also for TARGET_XOP if !TARGET_AVX2. (vashr<mode>3): For VI4_AVX2 modes implement for TARGET_AVX2, for V4SImode also for TARGET_XOP if !TARGET_AVX2. (avx2_ashrvv8si, avx2_ashrvv4si, avx2_<lshift>vv8si, avx2_<lshift>vv2di): Removed. (avx2_ashrv<mode>): New insn with VI4_AVX2 iterator. (avx2_<lshift>v<mode>): Macroize using VI48_AVX2 iterator. Simplify pattern. * gcc.dg/vshift-1.c: New test. * gcc.dg/vshift-2.c: New test. * gcc.target/i386/xop-vshift-1.c: New test. * gcc.target/i386/xop-vshift-2.c: New test. * gcc.target/i386/avx2-vshift-1.c: New test. --- gcc/config/i386/sse.md.jj 2011-10-27 18:26:28.000000000 +0200 +++ gcc/config/i386/sse.md 2011-10-27 18:23:52.000000000 +0200 @@ -125,8 +125,9 @@ (define_mode_iterator VI248_AVX2 (V8SI "TARGET_AVX2") V4SI (V4DI "TARGET_AVX2") V2DI]) -(define_mode_iterator VI4SD_AVX2 - [V4SI V4DI]) +(define_mode_iterator VI48_AVX2 + [(V8SI "TARGET_AVX2") V4SI + (V4DI "TARGET_AVX2") V2DI]) (define_mode_iterator V48_AVX2 [V4SF V2DF @@ -11268,9 +11269,9 @@ (define_insn "xop_vrotl<mode>3" ;; XOP packed shift instructions. 
;; FIXME: add V2DI back in (define_expand "vlshr<mode>3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(match_operand:VI12_128 0 "register_operand" "") + (match_operand:VI12_128 1 "register_operand" "") + (match_operand:VI12_128 2 "register_operand" "")] "TARGET_XOP" { rtx neg = gen_reg_rtx (<MODE>mode); @@ -11279,10 +11280,27 @@ (define_expand "vlshr<mode>3" DONE; }) +(define_expand "vlshr<mode>3" + [(match_operand:VI48_AVX2 0 "register_operand" "") + (match_operand:VI48_AVX2 1 "register_operand" "") + (match_operand:VI48_AVX2 2 "register_operand" "")] + "TARGET_AVX2 || (<MODE>mode == V4SImode && TARGET_XOP)" +{ + if (<MODE>mode == V4SImode && !TARGET_AVX2) + { + rtx neg = gen_reg_rtx (V4SImode); + emit_insn (gen_negv4si2 (neg, operands[2])); + emit_insn (gen_xop_lshlv4si3 (operands[0], operands[1], neg)); + DONE; + } + emit_insn (gen_avx2_lshrv<mode> (operands[0], operands[1], operands[2])); + DONE; +}) + (define_expand "vashr<mode>3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(match_operand:VI12_128 0 "register_operand" "") + (match_operand:VI12_128 1 "register_operand" "") + (match_operand:VI12_128 2 "register_operand" "")] "TARGET_XOP" { rtx neg = gen_reg_rtx (<MODE>mode); @@ -11291,16 +11309,46 @@ (define_expand "vashr<mode>3" DONE; }) +(define_expand "vashr<mode>3" + [(match_operand:VI4_AVX2 0 "register_operand" "") + (match_operand:VI4_AVX2 1 "register_operand" "") + (match_operand:VI4_AVX2 2 "register_operand" "")] + "TARGET_AVX2 || (<MODE>mode == V4SImode && TARGET_XOP)" +{ + if (<MODE>mode == V4SImode && !TARGET_AVX2) + { + rtx neg = gen_reg_rtx (V4SImode); + emit_insn (gen_negv4si2 (neg, operands[2])); + emit_insn (gen_xop_ashlv4si3 (operands[0], operands[1], neg)); + DONE; + } + emit_insn (gen_avx2_ashrv<mode> (operands[0], 
operands[1], operands[2])); + DONE; +}) + (define_expand "vashl<mode>3" - [(match_operand:VI124_128 0 "register_operand" "") - (match_operand:VI124_128 1 "register_operand" "") - (match_operand:VI124_128 2 "register_operand" "")] + [(match_operand:VI12_128 0 "register_operand" "") + (match_operand:VI12_128 1 "register_operand" "") + (match_operand:VI12_128 2 "register_operand" "")] "TARGET_XOP" { emit_insn (gen_xop_ashl<mode>3 (operands[0], operands[1], operands[2])); DONE; }) +(define_expand "vashl<mode>3" + [(match_operand:VI48_AVX2 0 "register_operand" "") + (match_operand:VI48_AVX2 1 "register_operand" "") + (match_operand:VI48_AVX2 2 "register_operand" "")] + "TARGET_AVX2 || (<MODE>mode == V4SImode && TARGET_XOP)" +{ + if (<MODE>mode == V4SImode && !TARGET_AVX2) + emit_insn (gen_xop_ashlv4si3 (operands[0], operands[1], operands[2])); + else + emit_insn (gen_avx2_lshlv<mode> (operands[0], operands[1], operands[2])); + DONE; +}) + (define_insn "xop_ashl<mode>3" [(set (match_operand:VI_128 0 "register_operand" "=x,x") (if_then_else:VI_128 @@ -12402,249 +12450,28 @@ (define_expand "avx2_inserti128" DONE; }) -(define_insn "avx2_ashrvv8si" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (vec_concat:V8SI - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_operand:V8SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V8SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))) - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - 
(match_dup 1) - (parallel [(const_int 0)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)])))))))] - "TARGET_AVX2" - "vpsravd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) - -(define_insn "avx2_ashrvv4si" - [(set (match_operand:V4SI 0 "register_operand" "=x") - (vec_concat:V4SI - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_operand:V4SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V4SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (ashiftrt:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))))] +(define_insn "avx2_ashrv<mode>" + [(set (match_operand:VI4_AVX2 0 "register_operand" "=x") + (ashiftrt:VI4_AVX2 (match_operand:VI4_AVX2 1 "register_operand" "x") + (match_operand:VI4_AVX2 2 "nonimmediate_operand" + "xm")))] "TARGET_AVX2" "vpsravd\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "prefix" "vex") - (set_attr "mode" "TI")]) - -(define_insn "avx2_<lshift>vv8si" - [(set (match_operand:V8SI 0 "register_operand" "=x") - (vec_concat:V8SI - (vec_concat:V4SI - (vec_concat:V2SI - 
(lshift:SI - (vec_select:SI - (match_operand:V8SI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:SI - (match_operand:V8SI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)]))))) - (vec_concat:V4SI - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 0)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 0)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:V2SI - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift:SI - (vec_select:SI - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:SI - (match_dup 2) - (parallel [(const_int 3)])))))))] - "TARGET_AVX2" - "vp<lshift_insn>vd\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set_attr "mode" "OI")]) + (set_attr "mode" "<sseinsnmode>")]) (define_insn "avx2_<lshift>v<mode>" - [(set (match_operand:VI4SD_AVX2 0 "register_operand" "=x") - (vec_concat:VI4SD_AVX2 - (vec_concat:<ssehalfvecmode> - (lshift:<ssescalarmode> - (vec_select:<ssescalarmode> - (match_operand:VI4SD_AVX2 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:<ssescalarmode> - (match_operand:VI4SD_AVX2 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift:<ssescalarmode> - (vec_select:<ssescalarmode> - (match_dup 1) - (parallel [(const_int 1)])) - 
(vec_select:<ssescalarmode> - (match_dup 2) - (parallel [(const_int 1)])))) - (vec_concat:<ssehalfvecmode> - (lshift:<ssescalarmode> - (vec_select:<ssescalarmode> - (match_dup 1) - (parallel [(const_int 2)])) - (vec_select:<ssescalarmode> - (match_dup 2) - (parallel [(const_int 2)]))) - (lshift:<ssescalarmode> - (vec_select:<ssescalarmode> - (match_dup 1) - (parallel [(const_int 3)])) - (vec_select:<ssescalarmode> - (match_dup 2) - (parallel [(const_int 3)]))))))] + [(set (match_operand:VI48_AVX2 0 "register_operand" "=x") + (lshift:VI48_AVX2 (match_operand:VI48_AVX2 1 "register_operand" "x") + (match_operand:VI48_AVX2 2 "nonimmediate_operand" + "xm")))] "TARGET_AVX2" "vp<lshift_insn>v<ssemodesuffix>\t{%2, %1, %0|%0, %1, %2}" [(set_attr "type" "sseishft") (set_attr "prefix" "vex") (set_attr "mode" "<sseinsnmode>")]) -(define_insn "avx2_<lshift>vv2di" - [(set (match_operand:V2DI 0 "register_operand" "=x") - (vec_concat:V2DI - (lshift:DI - (vec_select:DI - (match_operand:V2DI 1 "register_operand" "x") - (parallel [(const_int 0)])) - (vec_select:DI - (match_operand:V2DI 2 "nonimmediate_operand" "xm") - (parallel [(const_int 0)]))) - (lshift:DI - (vec_select:DI - (match_dup 1) - (parallel [(const_int 1)])) - (vec_select:DI - (match_dup 2) - (parallel [(const_int 1)])))))] - "TARGET_AVX2" - "vp<lshift_insn>vq\t{%2, %1, %0|%0, %1, %2}" - [(set_attr "type" "sseishft") - (set_attr "prefix" "vex") - (set_attr "mode" "TI")]) - (define_insn "avx_vec_concat<mode>" [(set (match_operand:V_256 0 "register_operand" "=x,x") (vec_concat:V_256 --- gcc/testsuite/gcc.dg/vshift-1.c.jj 2011-10-27 17:49:29.000000000 +0200 +++ gcc/testsuite/gcc.dg/vshift-1.c 2011-10-27 17:50:04.000000000 +0200 @@ -0,0 +1,132 @@ +/* { dg-do run } */ +/* { dg-options "-O3" } */ + +#include <stdlib.h> + +#define N 64 + +#ifndef TYPE1 +#define TYPE1 int +#define TYPE2 long long +#endif + +signed TYPE1 a[N], b[N], g[N]; +unsigned TYPE1 c[N], h[N]; +signed TYPE2 d[N], e[N], j[N]; +unsigned TYPE2 f[N], k[N]; + 
+__attribute__((noinline)) void +f1 (void) +{ + int i; + for (i = 0; i < N; i++) + g[i] = a[i] << b[i]; +} + +__attribute__((noinline)) void +f2 (void) +{ + int i; + for (i = 0; i < N; i++) + g[i] = a[i] >> b[i]; +} + +__attribute__((noinline)) void +f3 (void) +{ + int i; + for (i = 0; i < N; i++) + h[i] = c[i] >> b[i]; +} + +__attribute__((noinline)) void +f4 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] << e[i]; +} + +__attribute__((noinline)) void +f5 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] >> e[i]; +} + +__attribute__((noinline)) void +f6 (void) +{ + int i; + for (i = 0; i < N; i++) + k[i] = f[i] >> e[i]; +} + +__attribute__((noinline)) void +f7 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] << b[i]; +} + +__attribute__((noinline)) void +f8 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] >> b[i]; +} + +__attribute__((noinline)) void +f9 (void) +{ + int i; + for (i = 0; i < N; i++) + k[i] = f[i] >> b[i]; +} + +int +main () +{ + int i; + for (i = 0; i < N; i++) + { + asm (""); + c[i] = (random () << 1) | (random () & 1); + b[i] = (i * 85) & (sizeof (TYPE1) * __CHAR_BIT__ - 1); + a[i] = c[i]; + d[i] = (random () << 1) | (random () & 1); + d[i] |= (unsigned long long) c[i] << 32; + e[i] = (i * 85) & (sizeof (TYPE2) * __CHAR_BIT__ - 1); + f[i] = d[i]; + } + f1 (); + f3 (); + f4 (); + f6 (); + for (i = 0; i < N; i++) + if (g[i] != (signed TYPE1) (a[i] << b[i]) + || h[i] != (unsigned TYPE1) (c[i] >> b[i]) + || j[i] != (signed TYPE2) (d[i] << e[i]) + || k[i] != (unsigned TYPE2) (f[i] >> e[i])) + abort (); + f2 (); + f5 (); + f9 (); + for (i = 0; i < N; i++) + if (g[i] != (signed TYPE1) (a[i] >> b[i]) + || j[i] != (signed TYPE2) (d[i] >> e[i]) + || k[i] != (unsigned TYPE2) (f[i] >> b[i])) + abort (); + f7 (); + for (i = 0; i < N; i++) + if (j[i] != (signed TYPE2) (d[i] << b[i])) + abort (); + f8 (); + for (i = 0; i < N; i++) + if (j[i] != (signed TYPE2) (d[i] >> b[i])) + abort (); + return 0; +} --- 
gcc/testsuite/gcc.dg/vshift-2.c.jj 2011-10-27 17:50:12.000000000 +0200 +++ gcc/testsuite/gcc.dg/vshift-2.c 2011-10-27 17:50:40.000000000 +0200 @@ -0,0 +1,7 @@ +/* { dg-do run } */ +/* { dg-options "-O3" } */ + +#define TYPE1 char +#define TYPE2 short + +#include "vshift-1.c" --- gcc/testsuite/gcc.target/i386/xop-vshift-1.c.jj 2011-10-27 17:45:38.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/xop-vshift-1.c 2011-10-27 17:47:00.000000000 +0200 @@ -0,0 +1,140 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mxop" } */ +/* { dg-require-effective-target xop } */ + +#ifndef CHECK_H +#define CHECK_H "xop-check.h" +#endif + +#ifndef TEST +#define TEST xop_test +#endif + +#include CHECK_H + +#define N 64 + +#ifndef TYPE1 +#define TYPE1 int +#define TYPE2 long long +#endif + +signed TYPE1 a[N], b[N], g[N]; +unsigned TYPE1 c[N], h[N]; +signed TYPE2 d[N], e[N], j[N]; +unsigned TYPE2 f[N], k[N]; + +__attribute__((noinline)) void +f1 (void) +{ + int i; + for (i = 0; i < N; i++) + g[i] = a[i] << b[i]; +} + +__attribute__((noinline)) void +f2 (void) +{ + int i; + for (i = 0; i < N; i++) + g[i] = a[i] >> b[i]; +} + +__attribute__((noinline)) void +f3 (void) +{ + int i; + for (i = 0; i < N; i++) + h[i] = c[i] >> b[i]; +} + +__attribute__((noinline)) void +f4 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] << e[i]; +} + +__attribute__((noinline)) void +f5 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] >> e[i]; +} + +__attribute__((noinline)) void +f6 (void) +{ + int i; + for (i = 0; i < N; i++) + k[i] = f[i] >> e[i]; +} + +__attribute__((noinline)) void +f7 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] << b[i]; +} + +__attribute__((noinline)) void +f8 (void) +{ + int i; + for (i = 0; i < N; i++) + j[i] = d[i] >> b[i]; +} + +__attribute__((noinline)) void +f9 (void) +{ + int i; + for (i = 0; i < N; i++) + k[i] = f[i] >> b[i]; +} + +static void +TEST () +{ + int i; + for (i = 0; i < N; i++) + { + asm (""); + c[i] = (random () << 1) | (random () & 
1); + b[i] = (i * 85) & (sizeof (TYPE1) * __CHAR_BIT__ - 1); + a[i] = c[i]; + d[i] = (random () << 1) | (random () & 1); + d[i] |= (unsigned long long) c[i] << 32; + e[i] = (i * 85) & (sizeof (TYPE2) * __CHAR_BIT__ - 1); + f[i] = d[i]; + } + f1 (); + f3 (); + f4 (); + f6 (); + for (i = 0; i < N; i++) + if (g[i] != (signed TYPE1) (a[i] << b[i]) + || h[i] != (unsigned TYPE1) (c[i] >> b[i]) + || j[i] != (signed TYPE2) (d[i] << e[i]) + || k[i] != (unsigned TYPE2) (f[i] >> e[i])) + abort (); + f2 (); + f5 (); + f9 (); + for (i = 0; i < N; i++) + if (g[i] != (signed TYPE1) (a[i] >> b[i]) + || j[i] != (signed TYPE2) (d[i] >> e[i]) + || k[i] != (unsigned TYPE2) (f[i] >> b[i])) + abort (); + f7 (); + for (i = 0; i < N; i++) + if (j[i] != (signed TYPE2) (d[i] << b[i])) + abort (); + f8 (); + for (i = 0; i < N; i++) + if (j[i] != (signed TYPE2) (d[i] >> b[i])) + abort (); +} --- gcc/testsuite/gcc.target/i386/xop-vshift-2.c.jj 2011-10-27 17:47:18.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/xop-vshift-2.c 2011-10-27 17:47:48.000000000 +0200 @@ -0,0 +1,8 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mxop" } */ +/* { dg-require-effective-target xop } */ + +#define TYPE1 char +#define TYPE2 short + +#include "xop-vshift-1.c" --- gcc/testsuite/gcc.target/i386/avx2-vshift-1.c.jj 2011-10-27 17:48:27.000000000 +0200 +++ gcc/testsuite/gcc.target/i386/avx2-vshift-1.c 2011-10-27 17:49:00.000000000 +0200 @@ -0,0 +1,13 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mavx2" } */ +/* { dg-require-effective-target avx2 } */ + +#ifndef CHECK_H +#define CHECK_H "avx2-check.h" +#endif + +#ifndef TEST +#define TEST avx2_test +#endif + +#include "xop-vshift-1.c" Jakub