Hello!
The attached patch optimizes v2df (x2) -> v4sf,v4si conversion sequences
for AVX from:
vcvtpd2psx 48(%rsp), %xmm1
vcvtpd2psx 64(%rsp), %xmm0
vmovlhps %xmm0, %xmm1, %xmm0
vmovaps %xmm0, 32(%rsp)
to:
vmovapd 64(%rsp), %xmm0
vinsertf128 $0x1, 80(%rsp), %ymm0, %ymm0
vcvtpd2psy %ymm0, %xmm0
vmovaps %xmm0, 32(%rsp)
Please note that only one conversion instruction is needed.
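For reference, a conversion loop of roughly the following shape (a
hypothetical minimal example, not part of the patch or the new tests)
is the kind of code that reaches the vec_pack_trunc_v2df expander when
vectorized with -O2 -ftree-vectorize -mavx:

void
convert (const double *d, float *f)
{
  int i;

  /* Vectorized double -> float conversion; with the patch the two
     128-bit cvtpd2ps instructions become one 256-bit vcvtpd2psy on
     the concatenated input.  */
  for (i = 0; i < 4; i++)
    f[i] = (float) d[i];
}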
In a similar way, the patch optimizes floor/ceil/round followed by
conversion to int from:
vroundpd $1, 32(%rsp), %xmm1
vroundpd $1, 48(%rsp), %xmm0
vcvttpd2dqx %xmm1, %xmm1
vcvttpd2dqx %xmm0, %xmm0
vpunpcklqdq %xmm0, %xmm1, %xmm0
vmovdqa %xmm0, 16(%rsp)
to:
vroundpd $1, 64(%rsp), %xmm1
vroundpd $1, 80(%rsp), %xmm0
vinsertf128 $0x1, %xmm0, %ymm1, %ymm0
vcvttpd2dqy %ymm0, %xmm0
vmovdqa %xmm0, 32(%rsp)
Ideally, this would be just "vcvtpd2psy 64(%rsp), %xmm0" or "vroundpd
$1, 64(%rsp), %ymm1", but the vectorizer does not (yet) support mixed
vectorization factors.
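The new run-time tests below exercise exactly this kind of sequence; a
reduced sketch of the vectorized loop (assuming -O2 -ffast-math
-ftree-vectorize -mavx, the options used in the added tests) would be:

#include <math.h>

#define NUM 4

double a[NUM];
int r[NUM];

void
do_floor (void)
{
  int i;

  /* floor followed by conversion to int; vectorized as vroundpd
     plus the pack-to-int sequence shown above.  */
  for (i = 0; i < NUM; i++)
    r[i] = (int) floor (a[i]);
}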
The patch also changes a couple of patterns to use simpler SSE
patterns together with a vec_concat pattern to generate equivalent code.
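For the 256-bit v4df variants, a double -> int conversion over eight
elements, e.g. (again a hypothetical illustration, not taken from the
testsuite), is the kind of code that reaches the vec_pack_sfix_trunc_v4df
expander:

void
trunc8 (const double *d, int *r)
{
  int i;

  /* With 256-bit vectorization this packs two V4DF -> V4SI
     truncations into one V8SI result.  */
  for (i = 0; i < 8; i++)
    r[i] = (int) d[i];
}

That expander now emits two fix_truncv4dfv4si2 insns and an
avx_vec_concatv8si instead of the cvttpd2dq256_2/vperm2f128
combination, generating equivalent code from simpler RTL.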
2011-11-15 Uros Bizjak <[email protected]>
* config/i386/sse.md (vec_pack_trunc_v2df): Optimize sequence for AVX.
(vec_pack_sfix_trunc_v2df): Ditto.
(vec_pack_sfix_v2df): Ditto.
(vec_pack_sfix_trunc_v4df): Generate fix_truncv4dfv4si2 and
avx_vec_concatv8si patterns.
(vec_pack_sfix_v4df): Generate avx_cvtpd2dq256 and
avx_vec_concatv8si patterns.
testsuite/ChangeLog:
2011-11-15 Uros Bizjak <[email protected]>
* gcc.target/i386/avx-cvt-2-vec.c: New test.
* gcc.target/i386/avx-floor-sfix-2-vec.c: Ditto.
* gcc.target/i386/avx-ceil-sfix-2-vec.c: Ditto.
* gcc.target/i386/avx-rint-sfix-2-vec.c: Ditto.
* gcc.target/i386/avx-round-sfix-2-vec.c: Ditto.
Tested on x86_64-pc-linux-gnu {,-m32} on an AVX target, committed to mainline SVN.
Uros.
Index: config/i386/sse.md
===================================================================
--- config/i386/sse.md (revision 181378)
+++ config/i386/sse.md (working copy)
@@ -3038,14 +3038,25 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SFmode);
- r2 = gen_reg_rtx (V4SFmode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvtpd2ps (r1, operands[1]));
- emit_insn (gen_sse2_cvtpd2ps (r2, operands[2]));
- emit_insn (gen_sse_movlhps (operands[0], r1, r2));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_avx_cvtpd2ps256 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SFmode);
+ tmp1 = gen_reg_rtx (V4SFmode);
+
+ emit_insn (gen_sse2_cvtpd2ps (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvtpd2ps (tmp1, operands[2]));
+ emit_insn (gen_sse_movlhps (operands[0], tmp0, tmp1));
+ }
DONE;
})
@@ -3057,12 +3068,12 @@
{
rtx r1, r2;
- r1 = gen_reg_rtx (V8SImode);
- r2 = gen_reg_rtx (V8SImode);
+ r1 = gen_reg_rtx (V4SImode);
+ r2 = gen_reg_rtx (V4SImode);
- emit_insn (gen_avx_cvttpd2dq256_2 (r1, operands[1]));
- emit_insn (gen_avx_cvttpd2dq256_2 (r2, operands[2]));
- emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+ emit_insn (gen_fix_truncv4dfv4si2 (r1, operands[1]));
+ emit_insn (gen_fix_truncv4dfv4si2 (r2, operands[2]));
+ emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
DONE;
})
@@ -3072,16 +3083,28 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SImode);
- r2 = gen_reg_rtx (V4SImode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvttpd2dq (r1, operands[1]));
- emit_insn (gen_sse2_cvttpd2dq (r2, operands[2]));
- emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
- gen_lowpart (V2DImode, r1),
- gen_lowpart (V2DImode, r2)));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_fix_truncv4dfv4si2 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SImode);
+ tmp1 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_cvttpd2dq (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvttpd2dq (tmp1, operands[2]));
+ emit_insn
+ (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+ gen_lowpart (V2DImode, tmp0),
+ gen_lowpart (V2DImode, tmp1)));
+ }
DONE;
})
@@ -3126,12 +3149,12 @@
{
rtx r1, r2;
- r1 = gen_reg_rtx (V8SImode);
- r2 = gen_reg_rtx (V8SImode);
+ r1 = gen_reg_rtx (V4SImode);
+ r2 = gen_reg_rtx (V4SImode);
- emit_insn (gen_avx_cvtpd2dq256_2 (r1, operands[1]));
- emit_insn (gen_avx_cvtpd2dq256_2 (r2, operands[2]));
- emit_insn (gen_avx_vperm2f128v8si3 (operands[0], r1, r2, GEN_INT (0x20)));
+ emit_insn (gen_avx_cvtpd2dq256 (r1, operands[1]));
+ emit_insn (gen_avx_cvtpd2dq256 (r2, operands[2]));
+ emit_insn (gen_avx_vec_concatv8si (operands[0], r1, r2));
DONE;
})
@@ -3141,16 +3164,28 @@
(match_operand:V2DF 2 "nonimmediate_operand" "")]
"TARGET_SSE2"
{
- rtx r1, r2;
+ rtx tmp0, tmp1;
- r1 = gen_reg_rtx (V4SImode);
- r2 = gen_reg_rtx (V4SImode);
+ if (TARGET_AVX && !TARGET_PREFER_AVX128)
+ {
+ tmp0 = gen_reg_rtx (V4DFmode);
+ tmp1 = force_reg (V2DFmode, operands[1]);
- emit_insn (gen_sse2_cvtpd2dq (r1, operands[1]));
- emit_insn (gen_sse2_cvtpd2dq (r2, operands[2]));
- emit_insn (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
- gen_lowpart (V2DImode, r1),
- gen_lowpart (V2DImode, r2)));
+ emit_insn (gen_avx_vec_concatv4df (tmp0, tmp1, operands[2]));
+ emit_insn (gen_avx_cvtpd2dq256 (operands[0], tmp0));
+ }
+ else
+ {
+ tmp0 = gen_reg_rtx (V4SImode);
+ tmp1 = gen_reg_rtx (V4SImode);
+
+ emit_insn (gen_sse2_cvtpd2dq (tmp0, operands[1]));
+ emit_insn (gen_sse2_cvtpd2dq (tmp1, operands[2]));
+ emit_insn
+ (gen_vec_interleave_lowv2di (gen_lowpart (V2DImode, operands[0]),
+ gen_lowpart (V2DImode, tmp0),
+ gen_lowpart (V2DImode, tmp1)));
+ }
DONE;
})
Index: gcc.target/i386/avx-ceil-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-ceil-sfix-2-vec.c (revision 0)
+++ gcc.target/i386/avx-ceil-sfix-2-vec.c (revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double ceil (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+ int i, sign = 1;
+ double f = rand ();
+
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1) * f * M_PI * sign;
+ if (i < (NUM / 2))
+ {
+ if ((i % 6) == 0)
+ f = f * src[i];
+ }
+ else if (i == (NUM / 2))
+ f = rand ();
+ else if ((i % 6) == 0)
+ f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+ sign = -sign;
+ }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+ double a[NUM];
+ int r[NUM];
+ int i;
+
+ init_src (a);
+
+ for (i = 0; i < NUM; i++)
+ r[i] = (int) ceil (a[i]);
+
+ /* check results: */
+ for (i = 0; i < NUM; i++)
+ if (r[i] != (int) ceil (a[i]))
+ abort();
+}
Index: gcc.target/i386/avx-rint-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-rint-sfix-2-vec.c (revision 0)
+++ gcc.target/i386/avx-rint-sfix-2-vec.c (revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double rint (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+ int i, sign = 1;
+ double f = rand ();
+
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1) * f * M_PI * sign;
+ if (i < (NUM / 2))
+ {
+ if ((i % 6) == 0)
+ f = f * src[i];
+ }
+ else if (i == (NUM / 2))
+ f = rand ();
+ else if ((i % 6) == 0)
+ f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+ sign = -sign;
+ }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+ double a[NUM];
+ int r[NUM];
+ int i;
+
+ init_src (a);
+
+ for (i = 0; i < NUM; i++)
+ r[i] = (int) rint (a[i]);
+
+ /* check results: */
+ for (i = 0; i < NUM; i++)
+ if (r[i] != (int) rint (a[i]))
+ abort();
+}
Index: gcc.target/i386/avx-round-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-round-sfix-2-vec.c (revision 0)
+++ gcc.target/i386/avx-round-sfix-2-vec.c (revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double round (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+ int i, sign = 1;
+ double f = rand ();
+
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1) * f * M_PI * sign;
+ if (i < (NUM / 2))
+ {
+ if ((i % 6) == 0)
+ f = f * src[i];
+ }
+ else if (i == (NUM / 2))
+ f = rand ();
+ else if ((i % 6) == 0)
+ f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+ sign = -sign;
+ }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+ double a[NUM];
+ int r[NUM];
+ int i;
+
+ init_src (a);
+
+ for (i = 0; i < NUM; i++)
+ r[i] = (int) round (a[i]);
+
+ /* check results: */
+ for (i = 0; i < NUM; i++)
+ if (r[i] != (int) round (a[i]))
+ abort();
+}
Index: gcc.target/i386/avx-floor-sfix-2-vec.c
===================================================================
--- gcc.target/i386/avx-floor-sfix-2-vec.c (revision 0)
+++ gcc.target/i386/avx-floor-sfix-2-vec.c (revision 0)
@@ -0,0 +1,62 @@
+/* { dg-do run } */
+/* { dg-options "-O2 -ffast-math -ftree-vectorize -mavx" } */
+/* { dg-require-effective-target avx } */
+/* { dg-skip-if "no M_PI" { vxworks_kernel } } */
+
+#ifndef CHECK_H
+#define CHECK_H "avx-check.h"
+#endif
+
+#ifndef TEST
+#define TEST avx_test
+#endif
+
+#include CHECK_H
+
+#include <math.h>
+
+extern double floor (double);
+
+#define NUM 4
+
+static void
+__attribute__((__target__("fpmath=sse")))
+init_src (double *src)
+{
+ int i, sign = 1;
+ double f = rand ();
+
+ for (i = 0; i < NUM; i++)
+ {
+ src[i] = (i + 1) * f * M_PI * sign;
+ if (i < (NUM / 2))
+ {
+ if ((i % 6) == 0)
+ f = f * src[i];
+ }
+ else if (i == (NUM / 2))
+ f = rand ();
+ else if ((i % 6) == 0)
+ f = 1 / (f * (i + 1) * src[i] * M_PI * sign);
+ sign = -sign;
+ }
+}
+
+static void
+__attribute__((__target__("fpmath=387")))
+TEST (void)
+{
+ double a[NUM];
+ int r[NUM];
+ int i;
+
+ init_src (a);
+
+ for (i = 0; i < NUM; i++)
+ r[i] = (int) floor (a[i]);
+
+ /* check results: */
+ for (i = 0; i < NUM; i++)
+ if (r[i] != (int) floor (a[i]))
+ abort();
+}