Hi!

As the testcases show, the macros we have for -O0 for intrinsics that require
constant argument(s) should first cast the argument to the type the -O1+
inline uses and afterwards to whatever type e.g. a builtin needs.
The PR reported one macro which violated this, so I've grepped for all
double-casts and filtered out the meaningful ones, i.e. those where the
first cast, to __m{128,256,512}{,d,i}, is followed by a cast to a same-sized
__v* type with the same kind of element type (float, double, integral).
The 7 macros below were using different casts, and I've double-checked them
against the inline function types.

Bootstrapped/regtested on x86_64-linux and i686-linux, committed to trunk
and 9.3 as obvious, queued for backporting to 8.5.

2020-03-05  Jakub Jelinek  <ja...@redhat.com>

        PR target/94046
        * config/i386/avx2intrin.h (_mm_mask_i32gather_ps): Fix first cast of
        SRC and MASK arguments to __m128 from __m128d.
        (_mm256_mask_i32gather_ps): Fix first cast of MASK argument to __m256
        from __m256d.
        (_mm_mask_i64gather_ps): Fix first cast of MASK argument to __m128
        from __m128d.
        * config/i386/xopintrin.h (_mm_permute2_pd): Fix first cast of C
        argument to __m128i from __m128d.
        (_mm256_permute2_pd): Fix first cast of C argument to __m256i from
        __m256d.
        (_mm_permute2_ps): Fix first cast of C argument to __m128i from __m128.
        (_mm256_permute2_ps): Fix first cast of C argument to __m256i from
        __m256.

        * g++.target/i386/pr94046-1.C: New test.
        * g++.target/i386/pr94046-2.C: New test.

--- gcc/config/i386/avx2intrin.h.jj     2020-01-12 11:54:36.313414917 +0100
+++ gcc/config/i386/avx2intrin.h        2020-03-05 15:22:12.684531786 +0100
@@ -1736,10 +1736,10 @@ _mm256_mask_i64gather_epi32 (__m128i __s
                                        (int)SCALE)
 
 #define _mm_mask_i32gather_ps(SRC, BASE, INDEX, MASK, SCALE)    \
-  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128d)SRC,   \
+  (__m128) __builtin_ia32_gathersiv4sf ((__v4sf)(__m128)SRC,    \
                                        (float const *)BASE,     \
                                        (__v4si)(__m128i)INDEX,  \
-                                       (__v4sf)(__m128d)MASK,   \
+                                       (__v4sf)(__m128)MASK,    \
                                        (int)SCALE)
 
 #define _mm256_i32gather_ps(BASE, INDEX, SCALE)                               \
@@ -1754,7 +1754,7 @@ _mm256_mask_i64gather_epi32 (__m128i __s
   (__m256) __builtin_ia32_gathersiv8sf ((__v8sf)(__m256)SRC,   \
                                        (float const *)BASE,    \
                                        (__v8si)(__m256i)INDEX, \
-                                       (__v8sf)(__m256d)MASK,  \
+                                       (__v8sf)(__m256)MASK,   \
                                        (int)SCALE)
 
 #define _mm_i64gather_ps(BASE, INDEX, SCALE)                           \
@@ -1769,7 +1769,7 @@ _mm256_mask_i64gather_epi32 (__m128i __s
   (__m128) __builtin_ia32_gatherdiv4sf ((__v4sf)(__m128)SRC,    \
                                        (float const *)BASE,     \
                                        (__v2di)(__m128i)INDEX,  \
-                                       (__v4sf)(__m128d)MASK,   \
+                                       (__v4sf)(__m128)MASK,    \
                                        (int)SCALE)
 
 #define _mm256_i64gather_ps(BASE, INDEX, SCALE)                                \
--- gcc/config/i386/xopintrin.h.jj      2020-01-12 11:54:36.336414570 +0100
+++ gcc/config/i386/xopintrin.h 2020-03-05 15:40:31.663241763 +0100
@@ -814,25 +814,25 @@ _mm256_permute2_ps (__m256 __X, __m256 _
 #define _mm_permute2_pd(X, Y, C, I)                                    \
   ((__m128d) __builtin_ia32_vpermil2pd ((__v2df)(__m128d)(X),          \
                                        (__v2df)(__m128d)(Y),           \
-                                       (__v2di)(__m128d)(C),           \
+                                       (__v2di)(__m128i)(C),           \
                                        (int)(I)))
 
 #define _mm256_permute2_pd(X, Y, C, I)                                 \
   ((__m256d) __builtin_ia32_vpermil2pd256 ((__v4df)(__m256d)(X),       \
                                           (__v4df)(__m256d)(Y),        \
-                                          (__v4di)(__m256d)(C),        \
+                                          (__v4di)(__m256i)(C),        \
                                           (int)(I)))
 
 #define _mm_permute2_ps(X, Y, C, I)                                    \
   ((__m128) __builtin_ia32_vpermil2ps ((__v4sf)(__m128)(X),            \
                                       (__v4sf)(__m128)(Y),             \
-                                      (__v4si)(__m128)(C),             \
+                                      (__v4si)(__m128i)(C),            \
                                       (int)(I)))
 
 #define _mm256_permute2_ps(X, Y, C, I)                                 \
   ((__m256) __builtin_ia32_vpermil2ps256 ((__v8sf)(__m256)(X),         \
                                          (__v8sf)(__m256)(Y),          \
-                                         (__v8si)(__m256)(C),          \
+                                         (__v8si)(__m256i)(C),         \
                                          (int)(I)))
 #endif /* __OPTIMIZE__ */
 
--- gcc/testsuite/g++.target/i386/pr94046-1.C.jj        2020-03-05 16:08:37.992247115 +0100
+++ gcc/testsuite/g++.target/i386/pr94046-1.C   2020-03-05 16:07:46.908004233 +0100
@@ -0,0 +1,55 @@
+// PR target/94046
+// { dg-do compile }
+// { dg-options "-O2 -mavx2 -mxop" }
+
+#include <x86intrin.h>
+
+#define S(x) struct x { operator __##x (); };
+S (m128)
+S (m128d)
+S (m128i)
+S (m256)
+S (m256d)
+S (m256i)
+
+__m128
+f1 (m128 src, float const *base, m128i idx, m128 mask)
+{
+  return _mm_mask_i32gather_ps (src, base, idx, mask, 2);
+}
+
+__m256
+f2 (m256 src, float const *base, m256i idx, m256 mask)
+{
+  return _mm256_mask_i32gather_ps (src, base, idx, mask, 2);
+}
+
+__m128
+f3 (m128 src, float const *base, m128i idx, m128 mask)
+{
+  return _mm_mask_i64gather_ps (src, base, idx, mask, 2);
+}
+
+__m128d
+f4 (m128d x, m128d y, m128i c)
+{
+  return _mm_permute2_pd (x, y, c, 3);
+}
+
+__m128
+f5 (m128 x, m128 y, m128i c)
+{
+  return _mm_permute2_ps (x, y, c, 3);
+}
+
+__m256d
+f6 (m256d x, m256d y, m256i c)
+{
+  return _mm256_permute2_pd (x, y, c, 3);
+}
+
+__m256
+f7 (m256 x, m256 y, m256i c)
+{
+  return _mm256_permute2_ps (x, y, c, 3);
+}
--- gcc/testsuite/g++.target/i386/pr94046-2.C.jj        2020-03-05 16:08:44.321153311 +0100
+++ gcc/testsuite/g++.target/i386/pr94046-2.C   2020-03-05 16:08:58.667940674 +0100
@@ -0,0 +1,5 @@
+// PR target/94046
+// { dg-do compile }
+// { dg-options "-O0 -mavx2 -mxop" }
+
+#include "pr94046-1.C"

        Jakub

Reply via email to